In [1]:
import pandas as pd

In [11]:
# Load the daily minimum temperature data. The "Date" column is parsed as
# datetimes and then promoted to the index, giving `df` a DatetimeIndex.
df = (
    pd.read_csv("data/daily-minimum-temperatures.csv", parse_dates=["Date"])
    .set_index("Date")
)
df.head()

Unnamed: 0_level_0,Temp
Date,Unnamed: 1_level_1
1981-01-01,20.7
1981-01-02,17.9
1981-01-03,18.8
1981-01-04,14.6
1981-01-05,15.8


In [12]:
# Print a summary of the frame: index type and range, per-column dtype and
# non-null count, and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 3650 entries, 1981-01-01 to 1990-12-31
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Temp    3650 non-null   float64
dtypes: float64(1)
memory usage: 57.0 KB


In [13]:
# (rows, columns) of the loaded frame.
df.shape

(3650, 1)

# Date Time Features

In [18]:
# Derive calendar features from the DatetimeIndex; each component becomes its
# own column on `df`.
date_index = df.index
df["year"] = date_index.year
df["month"] = date_index.month
df["day"] = date_index.day

In [19]:
# Preview the frame, now including the year/month/day columns.
df.head()

Unnamed: 0_level_0,Temp,year,month,day
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1981-01-01,20.7,1981,1,1
1981-01-02,17.9,1981,1,2
1981-01-03,18.8,1981,1,3
1981-01-04,14.6,1981,1,4
1981-01-05,15.8,1981,1,5


# Lag Features

In [24]:
# Keep only the temperature column for the lag-feature section. From here on
# `df` is a Series named "Temp", not a DataFrame.
# The isinstance guard keeps this cell safe to re-run: once `df` is already the
# Series, indexing it with "Temp" again would raise a KeyError.
if isinstance(df, pd.DataFrame):
    df = df["Temp"]

In [26]:
# `df` is now the "Temp" Series indexed by date; preview its first rows.
df.head()

Date
1981-01-01    20.7
1981-01-02    17.9
1981-01-03    18.8
1981-01-04    14.6
1981-01-05    15.8
Name: Temp, dtype: float64

In [28]:
# Shift the series forward by one step: each date now holds the previous
# day's value, and the first row becomes NaN.
df.shift(periods=1).head()

Date
1981-01-01     NaN
1981-01-02    20.7
1981-01-03    17.9
1981-01-04    18.8
1981-01-05    14.6
Name: Temp, dtype: float64

In [50]:
# Replace the Date index with a plain integer index. `df` is the "Temp" Series
# at this point, so dropping the index keeps the values and the series name.
temp = df.reset_index(drop=True)
# One-step lag of the temperature series (first value is NaN).
shifted_1_temp = temp.shift(periods=1)

In [52]:
# Put the current value and its one-step lag side by side; `keys` supplies the
# column labels directly.
data = pd.concat([temp, shifted_1_temp], axis=1, keys=["t", "t-1"])
data.head()

Unnamed: 0,t,t-1
0,20.7,
1,17.9,20.7
2,18.8,17.9
3,14.6,18.8
4,15.8,14.6


In [53]:
# Build the current value plus its 1-, 2- and 3-step lags as columns.
# A shift of 0 is simply the unshifted series.
lagged = [temp.shift(lag) for lag in range(4)]
data = pd.concat(lagged, axis=1)

data.columns = ["t", "t-1", "t-2", "t-3"]
data.head()

Unnamed: 0,t,t-1,t-2,t-3
0,20.7,,,
1,17.9,20.7,,
2,18.8,17.9,20.7,
3,14.6,18.8,17.9,20.7
4,15.8,14.6,18.8,17.9


In [54]:
# Shape of the lag-feature frame, still including the leading rows whose lags
# are NaN.
data.shape

(3650, 4)

In [55]:
# Shape after dropping every row that contains a NaN, i.e. the leading rows
# whose lags reach back before the start of the series.
data.dropna().shape

(3647, 4)

# Features from Rolling Window Statistics

In [59]:
# The one-step-lagged series (first value NaN) that the rolling window below
# is built on.
shifted_1_temp.head()

0     NaN
1    20.7
2    17.9
3    18.8
4    14.6
Name: Temp, dtype: float64

In [62]:
# The unshifted, integer-indexed temperature series.
temp.head()

0    20.7
1    17.9
2    18.8
3    14.6
4    15.8
Name: Temp, dtype: float64

In [60]:
# Mean of the two observations preceding each target value. The input series
# is already shifted by one step, so a width-2 window ending at row i covers
# original rows i-2 and i-1; the first two rows have no full window and are NaN.
previous_two_mean = shifted_1_temp.rolling(window=2).mean()
data = pd.DataFrame({"mean(t-1, t)": previous_two_mean, "t+1": temp})
data.head()

Unnamed: 0,"mean(t-1, t)",t+1
0,,20.7
1,,17.9
2,19.3,18.8
3,18.35,14.6
4,16.7,15.8


# Rolling (Width 3) and Expanding Window Statistics

In [66]:
width = 3
shifted = temp.shift(width - 1)
window = shifted.rolling(window=width)

data = pd.concat(
    [
        window.min(), 
        window.mean(), 
        window.max(), 
        temp
    ], 
    axis=1
)

data.columns = ["min", "mean", "max", "t+1"]
data.head(7)

Unnamed: 0,min,mean,max,t+1
0,,,,20.7
1,,,,17.9
2,,,,18.8
3,,,,14.6
4,17.9,19.133333,20.7,15.8
5,14.6,17.1,18.8,15.8
6,14.6,16.4,18.8,15.8


In [69]:
window = temp.expanding()
data = pd.concat(
    [
        window.min(), 
        window.mean(), 
        window.max(), 
        temp
    ], 
    axis=1
)
data.columns = ['min', 'mean', 'max', 't+1']
data.head(7)

Unnamed: 0,min,mean,max,t+1
0,20.7,20.7,20.7,20.7
1,17.9,19.3,20.7,17.9
2,17.9,19.133333,20.7,18.8
3,14.6,18.0,20.7,14.6
4,14.6,17.56,20.7,15.8
5,14.6,17.266667,20.7,15.8
6,14.6,17.057143,20.7,15.8
