# 02 - Feature Engineering Notebook
**Goal:** Clean the raw data, add derived features, and create lagged and rolling-average features.

- Implement 1-, 3-, 6-, and 12-month (1M/3M/6M/12M) lags for the economic features.

In [1]:
# Load raw csv data
import pandas as pd
import numpy as np

# Raw inputs: the first CSV column is parsed as a datetime index for both frames.
combined = pd.read_csv("data/raw/combined_raw.csv", index_col=0, parse_dates=True)
fred_only = pd.read_csv("data/raw/fred_only.csv", index_col=0, parse_dates=True)

# Toggle which raw file feeds the rest of the notebook.
USE_BIGMAC = True
df = combined.copy() if USE_BIGMAC else fred_only.copy()

# The shift / pct_change / rolling features built later in this notebook are
# positional, so rows must be in chronological order. Sort explicitly instead
# of trusting the order in which the CSV happened to be written.
df = df.sort_index()

df.head()

Unnamed: 0,fed_funds_lower,fed_funds_upper,fed_funds_mid,local_price,cpi,unemployment_rate,m2_money_supply,treasury_10yr_yield,yield_curve_spread
2000-04-30,6.0,6.0,6.0,2.24,170.9,3.8,4767.8,5.990526,-0.413158
2000-05-31,6.5,6.5,6.5,2.24,171.2,4.0,4755.7,6.440455,-0.369091
2000-06-30,6.5,6.5,6.5,2.24,172.2,4.0,4773.6,6.097273,-0.384545
2000-07-31,6.5,6.5,6.5,2.24,172.7,4.0,4791.3,6.054,-0.285
2000-08-31,6.5,6.5,6.5,2.24,172.7,4.1,4819.5,5.826087,-0.402609


In [11]:
# --- Target selection ---------------------------------------------------
# Exactly one of the Fed funds series is used as the prediction target.
TARGET = "fed_funds_mid"   # alternatives: fed_funds_upper / fed_funds_lower
fed_cols = ["fed_funds_mid", "fed_funds_upper", "fed_funds_lower"]

# Fed series other than the chosen target that are actually present in df.
cols_to_drop = [name for name in fed_cols if name in df.columns and name != TARGET]

# Copy the chosen series into "target", then discard the unused Fed series.
df = df.assign(target=df[TARGET]).drop(columns=cols_to_drop)

# Feature matrix = every column except "target"; y = the target series.
# NOTE(review): X still carries the TARGET column itself, which is identical
# to y at the same timestamp -- confirm the modelling step excludes it (or
# shifts the target) so it does not act as a leaked label.
X = df.drop(columns="target")
y = df["target"]

# Remember the un-engineered column names; feature builders iterate over them.
original_cols = list(X.columns)

df.head()



Unnamed: 0,fed_funds_mid,local_price,cpi,unemployment_rate,m2_money_supply,treasury_10yr_yield,yield_curve_spread,target
2000-04-30,6.0,2.24,170.9,3.8,4767.8,5.990526,-0.413158,6.0
2000-05-31,6.5,2.24,171.2,4.0,4755.7,6.440455,-0.369091,6.5
2000-06-30,6.5,2.24,172.2,4.0,4773.6,6.097273,-0.384545,6.5
2000-07-31,6.5,2.24,172.7,4.0,4791.3,6.054,-0.285,6.5
2000-08-31,6.5,2.24,172.7,4.1,4819.5,5.826087,-0.402609,6.5


In [12]:
def add_lags(data, cols, lags=[1, 3, 6, 12]):
    """Return a copy of ``data`` extended with lagged versions of ``cols``.

    For every column in ``cols`` and every offset in ``lags`` a new column
    named ``"<col>_lag<offset>"`` is appended, holding that column shifted
    down by ``offset`` rows (the first ``offset`` rows become NaN).

    Parameters
    ----------
    data : pandas.DataFrame
        Source frame; it is not modified.
    cols : iterable of str
        Names of the columns to lag.
    lags : iterable of int
        Row offsets passed to ``Series.shift``.

    Returns
    -------
    pandas.DataFrame
        ``data`` joined with the new lag columns.
    """
    shifted = {
        f"{name}_lag{offset}": data[name].shift(offset)
        for name in cols
        for offset in lags
    }
    # Build all columns first and join once, rather than inserting one by one.
    lag_frame = pd.DataFrame(shifted, index=data.index)
    return data.join(lag_frame)

def add_pct_change(data, cols, periods=[1, 3, 12]):
    """Return a copy of ``data`` extended with percent-change columns.

    For every column in ``cols`` and every span in ``periods`` a new column
    named ``"<col>_pct_change<span>"`` is appended, holding
    ``Series.pct_change(span)`` of that column.

    A base value of exactly zero makes the ratio +/-inf. Those values are
    replaced with NaN so that a later ``dropna()`` removes the affected rows
    instead of letting infinities flow into the feature matrix.

    Parameters
    ----------
    data : pandas.DataFrame
        Source frame; it is not modified.
    cols : iterable of str
        Names of the columns to transform.
    periods : iterable of int
        Row spans passed to ``Series.pct_change``.

    Returns
    -------
    pandas.DataFrame
        ``data`` joined with the new percent-change columns.
    """
    new_cols = {}
    for col in cols:
        for p in periods:
            change = data[col].pct_change(p)
            # Division by a zero base yields +/-inf, which dropna() ignores.
            new_cols[f"{col}_pct_change{p}"] = change.replace([np.inf, -np.inf], np.nan)
    return data.join(pd.DataFrame(new_cols, index=data.index))

def add_rolling_means(data, cols, windows=[3, 6, 12]):
    """Return a copy of ``data`` extended with trailing rolling-mean columns.

    For every column in ``cols`` and every size in ``windows`` a new column
    named ``"<col>_rollmean<size>"`` is appended, holding the mean over a
    rolling window of ``size`` rows. Rows without a full window are NaN.

    Parameters
    ----------
    data : pandas.DataFrame
        Source frame; it is not modified.
    cols : iterable of str
        Names of the columns to average.
    windows : iterable of int
        Window sizes passed to ``Series.rolling``.

    Returns
    -------
    pandas.DataFrame
        ``data`` joined with the new rolling-mean columns.
    """
    averaged = {}
    for name in cols:
        series = data[name]
        averaged.update(
            {f"{name}_rollmean{size}": series.rolling(window=size).mean() for size in windows}
        )
    rolling_frame = pd.DataFrame(averaged, index=data.index)
    return data.join(rolling_frame)

# Start from the un-engineered columns of df every time this cell runs.
# DataFrame.join raises ValueError on overlapping column names, so feeding the
# already-extended X back in (i.e. re-running the cell) would crash; rebuilding
# from df also restores any rows a later dropna() removed from X.
X = df[original_cols]

# Add lag features
X = add_lags(X, original_cols)

# Add percent change features
X = add_pct_change(X, original_cols)

# Add rolling mean features
X = add_rolling_means(X, original_cols)


In [None]:
# Row-over-row difference features, added only for source columns that exist
# in the selected dataset.
diff_features = {
    "local_price": "local_price_trend",
    "yield_curve_spread": "yield_curve_spread_change",
}
for source_col, diff_col in diff_features.items():
    if source_col in X.columns:
        X[diff_col] = X[source_col].diff()

# Interaction term: needs both inputs to be present.
if {"cpi", "unemployment_rate"}.issubset(X.columns):
    X["cpi_unemp_interaction"] = X["cpi"] * X["unemployment_rate"]

In [None]:
# Lag, percent-change, rolling and diff features are NaN for the earliest
# rows; keep only rows where every feature is defined, then restrict the
# target to those same index labels.
X = X.dropna(axis=0, how="any")
y = y.loc[X.index]

# Features and target side by side in one frame (X itself is left untouched).
df_engineered = X.assign(target=y)

# Quick size report: how many rows were lost and how wide the result is.
for label, value in (
    ("Original rows:", len(df)),
    ("Rows after feature engineering:", len(df_engineered)),
    ("Columns in final dataset:", df_engineered.shape[1]),
):
    print(label, value)

df_engineered.head()
