# Feature engineering on time series

## Load data

In [1]:
# Location of the joined FRED dataset, relative to this notebook.
path = "../../data/FRED/FRED_joined.parquet"

In [2]:
import pandas as pd

# Only the two series used in this notebook are needed, so the column
# projection is pushed into the reader instead of loading every column of the
# file and discarding most of them afterwards. The reader returns a fresh
# frame, so no defensive .copy() is required before later cells modify it.
df = pd.read_parquet(path, columns=['MR', 'CPI'])

df

Unnamed: 0_level_0,MR,CPI
date,Unnamed: 1_level_1,Unnamed: 2_level_1
1948-01-01,,
1948-02-01,,
...,...,...
2025-09-11,6.35,3.419811
2025-09-12,6.35,3.419811


## Linear regression

### Visualize relationship

In [3]:
import plotly.express as px

# Scatter of CPI (y) against MR (x) with an OLS trendline fitted by plotly.
fig = px.scatter(
    data_frame=df,
    x="MR",
    y="CPI",
    trendline='ols',
    width=800,
    height=600,
)
fig

### Regression summary

In [4]:
# get_trendline_results returns a frame of fit objects; the figure has a
# single trendline, so the fit sits in the first row / first column.
r = px.get_trendline_results(fig).iloc[0, 0].summary2()
r

0,1,2,3
Model:,OLS,Adj. R-squared:,0.446
Dependent Variable:,y,AIC:,17898.972
Date:,2025-10-06 19:58,BIC:,17911.7584
No. Observations:,4417,Log-Likelihood:,-8947.5
Df Model:,1,F-statistic:,3556.0
Df Residuals:,4415,Prob (F-statistic):,0.0
R-squared:,0.446,Scale:,3.3668

0,1,2,3,4,5,6
,Coef.,Std.Err.,t,P>|t|,[0.025,0.975]
const,0.3420,0.0702,4.8745,0.0000,0.2045,0.4796
x1,0.5345,0.0090,59.6296,0.0000,0.5169,0.5520

0,1,2,3
Omnibus:,863.104,Durbin-Watson:,1.406
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1834.46
Skew:,1.137,Prob(JB):,0.0
Kurtosis:,5.19,Condition No.:,20.0


## Filter data

Unnamed: 0_level_0,CPI,MR
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2007-01-01,3.136734,6.18
2007-01-04,3.139873,6.18
...,...,...
2009-12-24,1.220101,5.05
2009-12-31,1.195432,5.14


In [6]:
# Same CPI-vs-MR scatter with OLS trendline, drawn from the current `df`.
# NOTE(review): the filtering cell is not shown here; the table above suggests
# `df` now covers 2007-2009 only -- confirm.
fig = px.scatter(
    data_frame=df,
    x="MR",
    y="CPI",
    trendline='ols',
    width=800,
    height=600,
)
fig

In [7]:
# Pull the single fitted OLS model out of the figure and render its summary.
r = (
    px.get_trendline_results(fig)
    .iloc[0, 0]
    .summary2()
)
r

0,1,2,3
Model:,OLS,Adj. R-squared:,0.506
Dependent Variable:,y,AIC:,149.8142
Date:,2025-10-06 19:58,BIC:,156.2871
No. Observations:,188,Log-Likelihood:,-72.907
Df Model:,1,F-statistic:,192.4
Df Residuals:,186,Prob (F-statistic):,1.6999999999999998e-30
R-squared:,0.508,Scale:,0.12853

0,1,2,3,4,5,6
,Coef.,Std.Err.,t,P>|t|,[0.025,0.975]
const,-0.9512,0.2496,-3.8116,0.0002,-1.4435,-0.4589
x1,0.5932,0.0428,13.8694,0.0000,0.5088,0.6776

0,1,2,3
Omnibus:,43.982,Durbin-Watson:,1.816
Prob(Omnibus):,0.0,Jarque-Bera (JB):,15.612
Skew:,-0.48,Prob(JB):,0.0
Kurtosis:,1.966,Condition No.:,57.0


## Feature engineering

### Discretize temporal column

Unnamed: 0_level_0,CPI,MR,period
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2007-01-01,3.136734,6.18,
2007-01-04,3.139873,6.18,
...,...,...,...
2009-12-24,1.220101,5.05,
2009-12-31,1.195432,5.14,


In [9]:
# One facet (and one OLS trendline) per period; rows with missing values are
# dropped first. matches=None unlinks the axes so each facet gets its own range.
fig = (
    px.scatter(
        data_frame=df.dropna(),
        x="MR",
        y="CPI",
        trendline='ols',
        color='period',
        facet_col='period',
    )
    .update_layout(width=1200, height=400)
    .update_xaxes(matches=None)
    .update_yaxes(matches=None)
)

fig

In [10]:
# One fitted model per facet, keyed by the period label.
r = (
    px.get_trendline_results(fig)
    .set_index('period')
)
r

Unnamed: 0_level_0,px_fit_results
period,Unnamed: 1_level_1
Before,<statsmodels.regression.linear_model.Regressio...
During,<statsmodels.regression.linear_model.Regressio...
After,<statsmodels.regression.linear_model.Regressio...


In [11]:
from modules import utils

# Turn the per-period fit objects into a tabular summary via the project helper.
df_results = utils.collect_lr_results(r['px_fit_results'])
df_results

Unnamed: 0_level_0,Coef,StdErr,t,p,R²,n
Period,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Before,-0.334342,0.048049,-6.958392,5.047275e-08,0.587475,36
During,0.486619,0.044029,11.052174,9.9819e-14,0.753316,42
After,0.351124,0.027444,12.794007,6.663134e-16,0.799694,43


### Reduce granularity

Unnamed: 0_level_0,CPI,MR,period
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2007-01-01,3.138303,6.180,
2007-01-11,3.144581,6.220,
...,...,...,...
2009-12-16,1.232435,4.995,
2009-12-26,1.195432,5.140,


In [14]:
# Per-period scatter with OLS trendlines on the current `df`, rows with
# missing values removed.
fig = px.scatter(
    df.dropna(),
    x="MR",
    y="CPI",
    color='period',
    facet_col='period',
    trendline='ols',
)

# Size the figure and let every facet use its own axis ranges.
fig.update_layout(width=1200, height=400).update_xaxes(matches=None).update_yaxes(matches=None)

fig

In [15]:
# Fitted trendline models from the figure, indexed by period.
r = px.get_trendline_results(fig).set_index(keys='period')
r

Unnamed: 0_level_0,px_fit_results
period,Unnamed: 1_level_1
Before,<statsmodels.regression.linear_model.Regressio...
During,<statsmodels.regression.linear_model.Regressio...
After,<statsmodels.regression.linear_model.Regressio...


In [16]:
# Summarise the per-period fits with the project helper.
df_results = utils.collect_lr_results(r['px_fit_results'])
df_results

Unnamed: 0_level_0,Coef,StdErr,t,p,R²,n
Period,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Before,-0.326453,0.067017,-4.871173,0.0001060734,0.55533,21
During,0.504086,0.0616,8.183163,4.045715e-08,0.752709,24
After,0.365826,0.036379,10.056017,4.406968e-10,0.808189,26


Unnamed: 0_level_0,baseline,baseline,baseline,baseline,baseline,baseline,granularity_reduced,granularity_reduced,granularity_reduced,granularity_reduced,granularity_reduced,granularity_reduced
Unnamed: 0_level_1,Coef,StdErr,t,p,R²,n,Coef,StdErr,t,p,R²,n
Period,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
Before,-0.334342,0.048049,-6.958392,5.047275e-08,0.587475,36,-0.326453,0.067017,-4.871173,0.0001060734,0.55533,21
During,0.486619,0.044029,11.052174,9.9819e-14,0.753316,42,0.504086,0.0616,8.183163,4.045715e-08,0.752709,24
After,0.351124,0.027444,12.794007,6.663134e-16,0.799694,43,0.365826,0.036379,10.056017,4.406968e-10,0.808189,26


### Transform numerical columns

#### Rolling windows

Unnamed: 0_level_0,CPI,MR,period,CPI_roll
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2007-01-01,3.138303,6.180,,
2007-01-11,3.144581,6.220,,
...,...,...,...,...
2009-12-16,1.232435,4.995,,1.411987
2009-12-26,1.195432,5.140,,1.378816


#### Create lag features

Unnamed: 0_level_0,CPI,MR,period,CPI_roll,CPI_lag
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2007-01-01,3.138303,6.180,,,2.748785
2007-01-11,3.144581,6.220,,,2.697956
...,...,...,...,...,...
2009-12-16,1.232435,4.995,,1.411987,
2009-12-26,1.195432,5.140,,1.378816,


In [21]:
# Project helper call comparing the raw CPI series with its shifted
# counterpart as explanatory variables for MR, split by period.
# NOTE(review): in the table above CPI_lag is populated at the start of the
# series and NaN at the end, which looks like a forward (future-value) shift
# rather than a lag -- confirm the shift direction in the cell that builds it.
utils.plot_explanatory_regression_grid(
    df,
    target="MR",
    categorical="period",
    explanatory_baseline="CPI",
    explanatory_transformed="CPI_lag",
)

#### Compare model performance

#### From plotly library

In [22]:
# Fit per-period OLS trendlines for each candidate CPI feature and stack the
# summaries into one table keyed by feature name.
#
# The complete-case frame does not depend on the feature being fitted, so it
# is computed once up front instead of once per loop iteration. Dropping rows
# with a NaN in *any* column also keeps the sample identical across features.
df_complete = df.dropna()

results = {}

xs = ['CPI', 'CPI_roll', 'CPI_lag']

for x in xs:
    # NOTE(review): the feature is on the y-axis here (feature regressed on
    # MR); slopes are therefore not directly comparable with models that use
    # MR as the response -- confirm this direction is intended.
    f = px.scatter(
        data_frame=df_complete, x="MR", y=x,
        trendline='ols', color='period', facet_col='period'
    )

    # One fitted model per period facet.
    r = px.get_trendline_results(f).set_index('period')

    results[x] = utils.collect_lr_results(r.px_fit_results)

# Dict keys become the outer index level (feature name).
pd.concat(results).style

Unnamed: 0_level_0,Unnamed: 1_level_0,Coef,StdErr,t,p,R²,n
Unnamed: 0_level_1,Period,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
CPI,Before,-0.326453,0.067017,-4.871173,0.000106,0.55533,21
CPI,During,0.504086,0.0616,8.183163,0.0,0.752709,24
CPI,After,0.365826,0.036379,10.056017,0.0,0.808189,26
CPI_roll,Before,0.324601,0.103242,3.144083,0.005343,0.342225,21
CPI_roll,During,0.306327,0.042825,7.152926,0.0,0.699307,24
CPI_roll,After,0.344938,0.028426,12.134786,0.0,0.859856,26
CPI_lag,Before,-0.135163,0.060246,-2.243501,0.036978,0.20943,21
CPI_lag,During,-0.371006,0.091155,-4.070083,0.000508,0.429543,24
CPI_lag,After,0.47914,0.054658,8.766153,0.0,0.762012,26


#### From statsmodels library

In [23]:
import statsmodels.formula.api as smf
import pandas as pd

# Complete cases only, so all three models are fitted on the same rows.
df_model = df.dropna()

# Each model explains MR with one CPI feature interacted with the period
# factor (`*` expands to main effects plus the interaction).
baseline = smf.ols("MR ~ CPI * C(period)", data=df_model).fit()
with_rolling = smf.ols("MR ~ CPI_roll * C(period)", data=df_model).fit()
with_lags = smf.ols("MR ~ CPI_lag * C(period)", data=df_model).fit()

# A single label -> model mapping keeps names and metrics aligned, instead of
# maintaining several parallel lists by hand.
fitted_models = {
    'Baseline': baseline,
    'With Rolling': with_rolling,
    'With Lags': with_lags,
}

# Column labels use a proper superscript two (the previous labels were
# mis-encoded and rendered as garbled characters).
df_comparison = pd.DataFrame(
    [
        {
            'Model': label,
            'R²': model.rsquared,
            'Adj R²': model.rsquared_adj,
            'AIC': model.aic,
        }
        for label, model in fitted_models.items()
    ]
)

df_comparison

Unnamed: 0,Model,R²,Adj R²,AIC
0,Baseline,0.889388,0.88088,-25.157163
1,With Rolling,0.895719,0.887697,-29.341361
2,With Lags,0.832311,0.819412,4.38482
