In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from fredapi import Fred
from dotenv import load_dotenv

from sklearn.model_selection import train_test_split, TimeSeriesSplit, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor

%matplotlib inline

# Notebook-wide pandas display settings: truncate long frames to 20 printed
# rows, but never hide columns.
pd.options.display.max_rows = 20
pd.options.display.max_columns = None

print("Libraries loaded.")

In [None]:
# Load .env file so FRED_API_KEY becomes visible through the environment.
load_dotenv()

# A blank `FRED_API_KEY=` entry yields an empty string rather than None, so
# test for falsiness: both "missing" and "empty" are rejected here instead of
# being handed to the Fred client.
fred_api_key = os.getenv("FRED_API_KEY")
if not fred_api_key:
    raise ValueError("FRED_API_KEY not found. Check your .env file.")

# Client object used by every `fred.get_series(...)` call below.
fred = Fred(api_key=fred_api_key)
print("FRED API connected.")

In [None]:
# Raw Big Mac Index export; one row per (country, date).
bigmac_path = "data/raw/big-mac-source-data.csv"
bigmac = pd.read_csv(bigmac_path)
print("Big Mac Index loaded. Rows:", len(bigmac))

# Keep only the US price series and put it on a month-end grid.
# Row filter and column selection happen in a single `.loc`, and the date
# conversion goes through `.assign`, so no column is ever assigned onto a
# column-subset frame (the pattern pandas flags with SettingWithCopyWarning).
bigmac_us = (
    bigmac.loc[bigmac["iso_a3"] == "USA", ["date", "local_price"]]
    .assign(date=lambda frame: pd.to_datetime(frame["date"]))
    .set_index("date")
    .resample("ME")   # month-end bins
    .ffill()          # carry the last published price into months without one
)
bigmac_us.head(100)

In [None]:
# Build one continuous fed-funds target table (lower / upper / mid) by
# stitching the single-target series onto the target-range series.

# Pre-2008 single target rate
dfedtar = fred.get_series("DFEDTAR")  # single target rate (discontinued)
dfedtar = dfedtar.to_frame("fed_funds_target_single")
dfedtar.index = pd.to_datetime(dfedtar.index)

# Convert single target into pseudo upper/lower range: with only one rate,
# lower bound, upper bound and midpoint all take the same value.
dfedtar["fed_funds_lower"] = dfedtar["fed_funds_target_single"]
dfedtar["fed_funds_upper"] = dfedtar["fed_funds_target_single"]
dfedtar["fed_funds_mid"]   = dfedtar["fed_funds_target_single"]

# Keep only columns we'll use (same names and order as the post-2008 frame)
dfedtar = dfedtar[["fed_funds_lower", "fed_funds_upper", "fed_funds_mid"]]

# Post-2008 target range
ffr_upper = fred.get_series("DFEDTARU").to_frame("fed_funds_upper")
ffr_lower = fred.get_series("DFEDTARL").to_frame("fed_funds_lower")

ffr_upper.index = pd.to_datetime(ffr_upper.index)
ffr_lower.index = pd.to_datetime(ffr_lower.index)

# Merge modern upper/lower
ffr_post = pd.concat([ffr_lower, ffr_upper], axis=1)

# Compute midpoint
ffr_post["fed_funds_mid"] = (ffr_post["fed_funds_lower"] + ffr_post["fed_funds_upper"]) / 2

# Stitch pre + post together. Single-target rows dated 2008-12-01 or later are
# discarded, so from that date on only the range series contributes.
ffr_pre = dfedtar[dfedtar.index < "2008-12-01"]
ffr_full = pd.concat([ffr_pre, ffr_post], axis=0)
ffr_full = ffr_full.sort_index()

# Convert to month-end frequency and forward fill. "ME" is the alias used by
# every other resample/date_range call in this notebook; the previous "M"
# spelling is the deprecated form of the same offset.
ffr_full = ffr_full.resample("ME").ffill()

ffr_full.head(20)

In [None]:
# FRED series id -> column name used throughout the rest of the notebook.
series_dict = {
    "CPIAUCSL": "cpi",
    "UNRATE": "unemployment_rate",
    "M2SL": "m2_money_supply",
    "DGS10": "treasury_10yr_yield",
    "T10Y2Y": "yield_curve_spread"
}

# Download each series, label it, and average it into month-end buckets so
# that all series share one monthly grid regardless of native frequency.
fred_frames = []
for fred_code, label in series_dict.items():
    monthly = fred.get_series(fred_code).to_frame(name=label)
    monthly.index = pd.to_datetime(monthly.index)
    fred_frames.append(monthly.resample("ME").mean())

# One column per indicator, aligned on the month-end index.
fred_combined = pd.concat(fred_frames, axis=1)
fred_combined.head()

In [None]:
# Build the FRED-only dataset (no Big Mac involved)

# Step 1: first month in which every macro series has a value, plus a
# per-series report of where each one begins.
fred_start = fred_combined.dropna().index.min()
for name, series in fred_combined.items():
    print(name, "starts at", series.dropna().index.min())

# Step 2: month-end timeline from that start to the latest date present in
# either the fed-funds table or the macro table.
full_index_fred = pd.date_range(
    start=fred_start,
    end=max(ffr_full.index.max(), fred_combined.index.max()),
    freq="ME",
)

# Steps 3-4: put both tables on the shared timeline, then forward-fill any
# months a table does not cover.
ffr_fred = ffr_full.reindex(full_index_fred).ffill()
macro_fred = fred_combined.reindex(full_index_fred).ffill()

# Step 5: combine side by side and drop rows that still contain NaN
# (forward-fill cannot populate leading gaps).
fred_only = pd.concat([ffr_fred, macro_fred], axis=1).dropna()

fred_only.head(20)

In [None]:
# Align every dataset to the span covered by the Big Mac Index.
# NOTE: `bigmac_us` and `fred_combined` are overwritten with their re-indexed
# versions below, so cells above this one see the trimmed frames if re-run
# afterwards.

# 1-2. Month-end timeline from the first non-missing Big Mac row to the last
#      Big Mac row.
bigmac_start = bigmac_us.dropna().index.min()
full_index = pd.date_range(start=bigmac_start, end=bigmac_us.index.max(), freq="ME")

# 3-4. Re-index onto the shared timeline. The fed-funds and macro tables are
#      forward-filled; the Big Mac table is only re-indexed.
ffr = ffr_full.reindex(full_index).ffill()
bigmac_us = bigmac_us.reindex(full_index)
fred_combined = fred_combined.reindex(full_index).ffill()

# 5. Final unified dataset: rates, Big Mac price, macro indicators.
combined = pd.concat([ffr, bigmac_us, fred_combined], axis=1).sort_index()
combined.tail(20)

In [None]:
# Per-column count of missing values in the merged dataset. It is printed
# explicitly because only the final expression of a cell is rendered; a bare
# `missing_report` in the middle of the cell produces no output.
missing_report = combined.isna().sum()
print(missing_report)

# Persist the Big Mac-aligned dataset.
combined_output_path = "data/raw/combined_raw.csv"
combined.to_csv(combined_output_path)
print("Raw dataset saved to:", combined_output_path)

# Persist the FRED-only dataset.
output_path = "data/raw/fred_only.csv"
fred_only.to_csv(output_path)
print("Raw dataset saved to:", output_path)

In [None]:
# Load raw csv data
from sklearn.model_selection import train_test_split

# The export step writes the merged dataset to "data/raw/combined_raw.csv";
# read that same file (the previous "data/raw/combined.csv" path is never
# written by this notebook). The first CSV column holds the dates and becomes
# a parsed datetime index.
combined = pd.read_csv("data/raw/combined_raw.csv", index_col=0, parse_dates=True)
fred_only = pd.read_csv("data/raw/fred_only.csv", index_col=0, parse_dates=True)

# Work on a copy so `combined` stays as loaded from disk.
df = combined.copy()

df.head()

In [None]:
# Column to predict; can be changed later to "fed_funds_upper" or "fed_funds_lower".
TARGET = "fed_funds_mid"

# Features: everything except the three fed-funds columns, so neither the
# target nor its sibling bounds appear among the predictors.
X = df.drop(columns=["fed_funds_mid", "fed_funds_upper", "fed_funds_lower"])

# Target series, copied so later edits to `df` do not affect it.
y = df.loc[:, TARGET].copy()

In [None]:
def add_lags(data, cols, lags=(1, 3, 6, 12)):
    """Return a copy of ``data`` with lagged versions of ``cols`` appended.

    For every column in ``cols`` and every ``lag`` in ``lags`` a column named
    ``"{col}_lag{lag}"`` is added holding ``data[col].shift(lag)``; the first
    ``lag`` rows of each new column are therefore NaN.

    Parameters
    ----------
    data : pandas.DataFrame
        Source frame. It is not modified; the work is done on a copy so that
        re-running the calling cell starts from the same input.
    cols : iterable of str
        Columns to lag.
    lags : iterable of int, default (1, 3, 6, 12)
        Number of rows to shift by.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with the lag columns appended in (col, lag) order.
    """
    out = data.copy()
    for col in cols:
        for lag in lags:
            out[f"{col}_lag{lag}"] = out[col].shift(lag)
    return out

# Snapshot the column names present right now; the same list drives the lag,
# percent-change and rolling-mean features below.
lag_columns = list(X.columns)
X = add_lags(data=X, cols=lag_columns)

def add_pct_change(data, cols, periods=(1, 3, 12)):
    """Return a copy of ``data`` with percent-change versions of ``cols`` appended.

    For every column in ``cols`` and every ``p`` in ``periods`` a column named
    ``"{col}_pct_change{p}"`` is added holding ``data[col].pct_change(p)``.

    A percent change measured from a value of exactly zero is +/-infinity.
    Those entries are stored as NaN instead, so a later ``dropna()`` removes
    them rather than letting infinities reach model fitting.

    Parameters
    ----------
    data : pandas.DataFrame
        Source frame. It is not modified; the work is done on a copy.
    cols : iterable of str
        Columns to transform.
    periods : iterable of int, default (1, 3, 12)
        Row offsets over which the change is measured.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with the percent-change columns appended in
        (col, period) order.
    """
    out = data.copy()
    for col in cols:
        for p in periods:
            change = out[col].pct_change(p)
            # x / 0 -> +/-inf; treat it as missing like any other undefined value.
            out[f"{col}_pct_change{p}"] = change.replace([np.inf, -np.inf], np.nan)
    return out

# Percent changes are computed for the same base columns that were lagged.
X = add_pct_change(data=X, cols=lag_columns)

def add_rolling_means(data, cols, windows=(3, 6, 12)):
    """Return a copy of ``data`` with trailing rolling means of ``cols`` appended.

    For every column in ``cols`` and every ``w`` in ``windows`` a column named
    ``"{col}_rollmean{w}"`` is added holding
    ``data[col].rolling(window=w).mean()``; the first ``w - 1`` rows of each
    new column are NaN because the window is not yet full.

    Parameters
    ----------
    data : pandas.DataFrame
        Source frame. It is not modified; the work is done on a copy.
    cols : iterable of str
        Columns to smooth.
    windows : iterable of int, default (3, 6, 12)
        Window lengths in rows.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with the rolling-mean columns appended in
        (col, window) order.
    """
    out = data.copy()
    for col in cols:
        for w in windows:
            out[f"{col}_rollmean{w}"] = out[col].rolling(window=w).mean()
    return out

# Rolling means for the same base columns as the lag / percent-change features.
X = add_rolling_means(data=X, cols=lag_columns)

# Hand-built features, appended in this order:
#   local_price_trend         - row-over-row change in the Big Mac price
#   yield_curve_spread_change - row-over-row change in the yield-curve spread
#   cpi_unemp_interaction     - product of CPI and the unemployment rate
X = X.assign(
    local_price_trend=X["local_price"].diff(),
    yield_curve_spread_change=X["yield_curve_spread"].diff(),
    cpi_unemp_interaction=X["cpi"] * X["unemployment_rate"],
)

In [None]:
# Join features and target so rows are dropped from both together, then remove
# every row containing a NaN (lags, rolling windows and diffs leave the
# earliest rows incomplete).
full = pd.concat([X, y], axis=1)
full = full.dropna()

X_final = full.drop(columns=[TARGET])
y_final = full[TARGET]

# Printed explicitly: a bare tuple in the middle of a cell is not displayed.
print(X_final.shape, y_final.shape)

# Persist the model-ready matrices.
X_final.to_csv("data/X_features.csv")
y_final.to_csv("data/y_target.csv")

In [None]:
"""
# TODO: Replace this when feature-engineering is finished
df = pd.read_csv("processed_bigmac_fred.csv")

# Identify features & target
X = df.drop(columns=["ffr_target"])
y = df["ffr_target"]
"""
# Load X and y with proper index handling: the first CSV column is the row
# index in both files.
X = pd.read_csv("data/X_features.csv", index_col=0)

# The target file has a single data column; squeeze it into a Series.
y = pd.read_csv("data/y_target.csv", index_col=0).squeeze("columns")
y = y.astype(float)

# Confirm shapes
print(X.shape, y.shape)

# Features and target must describe the same rows in the same order. Report
# the result, and stop here if they do not match instead of training on
# misaligned data.
index_aligned = X.index.equals(y.index)
print(index_aligned)   # Should be True
if not index_aligned:
    raise ValueError(
        "X and y indexes differ; regenerate data/X_features.csv and "
        "data/y_target.csv together."
    )

In [None]:
# Chronological split: the first 80% of rows train the models, the final 20%
# are held out. No shuffling, so the test period always follows the training
# period.
# The cut point is derived from X itself. `df` is the raw frame from before
# feature engineering dropped incomplete rows, so its length is not the length
# of X and would misplace the boundary.
split_index = int(len(X) * 0.8)
X_train, X_test = X.iloc[:split_index], X.iloc[split_index:]
y_train, y_test = y.iloc[:split_index], y.iloc[split_index:]

In [None]:
# Two constant forecasts, each one value repeated for every test row:
#   baseline_pred         - mean of the training target
#   baseline_pred_persist - last observed training target
baseline_pred = np.full(len(y_test), y_train.mean(), dtype=float)
baseline_pred_persist = np.full(len(y_test), y_train.iloc[-1], dtype=float)

def evaluate(y_true, y_pred):
    """Score predictions against the true values.

    Returns a dict with three entries, in this order:
      "MAE"  - mean absolute error
      "RMSE" - square root of the mean squared error
      "R2"   - coefficient of determination
    """
    metrics = {"MAE": mean_absolute_error(y_true, y_pred)}
    metrics["RMSE"] = np.sqrt(mean_squared_error(y_true, y_pred))
    metrics["R2"] = r2_score(y_true, y_pred)
    return metrics

# Score both constant baselines on the held-out period.
for label, prediction in (
    ("Mean baseline:", baseline_pred),
    ("Persistence baseline:", baseline_pred_persist),
):
    print(label, evaluate(y_test, prediction))

In [None]:
# Candidate regressors, keyed by the label used in tables and plot legends.
models = {
    "Linear Regression": LinearRegression(),
    "Ridge": Ridge(alpha=1.0),
    "Lasso": Lasso(alpha=0.01),
    "ElasticNet": ElasticNet(alpha=0.01, l1_ratio=0.5),
    "SVR (RBF)": SVR(kernel="rbf", C=10, gamma=0.1),
    "Random Forest": RandomForestRegressor(n_estimators=200, random_state=42)
}

results = {}      # label -> metrics dict from evaluate()
predictions = {}  # label -> test-set predictions, kept for the plot below

# Fit each model on the training period and score it on the held-out period.
# Predictions are stored so the plot reuses them instead of predicting again.
for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    predictions[name] = preds
    results[name] = evaluate(y_test, preds)

# One row per model, one column per metric. Printed explicitly because a bare
# expression in the middle of a cell is not displayed.
results_df = pd.DataFrame(results).T
print(results_df)

# Overlay every model's test-set predictions on the true target.
plt.figure(figsize=(12,5))
plt.plot(y_test.values, label="True")

for name, pred in predictions.items():
    plt.plot(pred, label=name)

plt.legend()
plt.title("Model Predictions vs True FFR")
plt.show()

# Feature importances from the fitted random forest, smallest to largest so
# the most important feature ends up at the top of the horizontal bar chart.
rf = models["Random Forest"]
importances = pd.Series(rf.feature_importances_, index=X.columns)
importances.sort_values().plot(kind="barh", figsize=(10,6))
plt.title("Random Forest Feature Importance")
plt.show()

# Persist the comparison table that was printed above.
results_df.to_csv("model_performance.csv")