# Phase 4: Machine Learning Models

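This notebook trains XGBoost, LightGBM, and Random Forest regressors on lagged and rolling-window features and validates them on a time-ordered hold-out split (no shuffling).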

In [None]:
import sys
from pathlib import Path
# Put the parent of the current working directory at the front of the import
# path so the `src.*` imports below resolve.
# NOTE(review): presumably the notebook is run from a sub-folder of the
# project root (e.g. `notebooks/`) — confirm the kernel's working directory.
sys.path.insert(0, str(Path.cwd().parent))

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from src.data_loader import load_and_merge_data, load_raw_tables
from src.feature_engineering import build_ml_features
from src.models.ml_models import (
    get_time_series_cv,
    fit_xgboost,
    fit_lightgbm,
    fit_random_forest,
)
from src.metrics import mae, evaluate_forecasts

## 1. Prepare Data

In [None]:
df = load_and_merge_data()
tables = load_raw_tables()

# Collapse all rows that share a date into a single daily record: summed
# sales, summed onpromotion, and the first holiday flag seen for that date.
daily = (
    df.groupby('date')
    .agg(
        sales=('sales', 'sum'),
        onpromotion=('onpromotion', 'sum'),
        is_holiday=('is_holiday', 'first'),
    )
    .reset_index()
)
# A missing holiday flag is treated as "not a holiday"; store the flag as 0/1.
daily['is_holiday'] = daily['is_holiday'].fillna(False).astype(int)

# Build the feature table, discard rows that still contain NaNs, and renumber
# the remaining rows from zero.
feat_df = (
    build_ml_features(daily, lags=[1, 7, 14, 30], rolling_windows=[7, 14, 30])
    .dropna()
    .reset_index(drop=True)
)

# Every column other than the date key and the target is a model input.
non_feature_cols = {'date', 'sales'}
feature_cols = [col for col in feat_df.columns if col not in non_feature_cols]
print('Features:', feature_cols)

## 2. Time-Series Split

In [None]:
# Target vector and feature matrix as plain arrays.
y = feat_df['sales'].values
X = feat_df[feature_cols].values

# Ordered hold-out (no shuffling): the leading 85% of rows are used for
# training and the remaining rows for validation.
train_end_idx = int(len(feat_df) * 0.85)
train_rows = slice(None, train_end_idx)
val_rows = slice(train_end_idx, None)

X_train, y_train = X[train_rows], y[train_rows]
X_val, y_val = X[val_rows], y[val_rows]

print(f'Train: {len(X_train)}, Val: {len(X_val)}')

## 3. Train Models

In [None]:
# Fit each model on the training rows and immediately score the validation
# rows with it.
# NOTE(review): the two boosters also receive (X_val, y_val) at fit time —
# presumably as an evaluation / early-stopping set; confirm in
# src.models.ml_models, since that would make the metrics below optimistic.
xgb_model = fit_xgboost(X_train, y_train, X_val, y_val)
y_xgb = xgb_model.predict(X_val)

lgb_model = fit_lightgbm(X_train, y_train, X_val, y_val)
y_lgb = lgb_model.predict(X_val)

# The random forest is fitted on the training rows only.
rf_model = fit_random_forest(X_train, y_train)
y_rf = rf_model.predict(X_val)

## 4. Metrics

In [None]:
# Validation predictions keyed by the model name shown in the results table.
val_predictions = {
    'XGBoost': y_xgb,
    'LightGBM': y_lgb,
    'RandomForest': y_rf,
}

# Score every model against the same validation targets.
results = {
    model_name: evaluate_forecasts(y_val, y_hat)
    for model_name, y_hat in val_predictions.items()
}

# One column per model, rounded to two decimals; left as the final expression
# so the notebook renders the table.
pd.DataFrame(results).round(2)

## 5. Feature Importance

In [None]:
# Two side-by-side panels, one per gradient-boosting model.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))
ax_xgb, ax_lgb = axes

# Importances are labelled with the feature names and sorted ascending, so the
# largest bar ends up at the top of each horizontal bar chart.
imp_xgb = pd.Series(xgb_model.feature_importances_, index=feature_cols).sort_values()
imp_xgb.plot.barh(ax=ax_xgb, title='XGBoost')

imp_lgb = pd.Series(lgb_model.feature_importances_, index=feature_cols).sort_values()
imp_lgb.plot.barh(ax=ax_lgb, title='LightGBM')

plt.tight_layout()
plt.show()

## 6. Actual vs Predicted

In [None]:
# Dates belonging to the validation rows. The arrays X_val / y_val were cut by
# position (X[train_end_idx:]), so the dates are selected by position too.
# A label-based `.loc[train_end_idx:]` slice only matches those rows while
# feat_df carries a fresh 0..n-1 index; `.iloc` stays aligned regardless.
val_dates = feat_df['date'].iloc[train_end_idx:]

# Overlay the actual series and each model's validation forecast on one axis.
fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(val_dates.values, y_val, label='Actual', color='black', alpha=0.8)
ax.plot(val_dates.values, y_xgb, label='XGBoost', alpha=0.7)
ax.plot(val_dates.values, y_lgb, label='LightGBM', alpha=0.7)
ax.plot(val_dates.values, y_rf, label='RandomForest', alpha=0.7)
ax.legend()
ax.set_title('ML Forecasts vs Actual')
ax.set_ylabel('Sales')
# Tilt the date tick labels so they do not overlap.
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()