# Model Training and Evaluation
## Electricity Demand Forecasting

This notebook implements and evaluates machine learning models for electricity demand forecasting using the leakage-free feature set.

**Key Components:**
- **Data Loading**: Loads the cleaned, leakage-free dataset (`engineered_features_essential.csv`).
- **Strict Chronological Split**: 70% Train, 15% Validation, 15% Test. NO shuffling.
- **Leakage Prevention**: Freezes all statistical parameters (percentiles, z-scores, scaling means/stds) on the TRAINING set and applies them to Validation/Test.
- **Model Training**: Baseline (Linear, Ridge) and Advanced (Random Forest, XGBoost, LightGBM).
- **Evaluation**: Comprehensive metrics (MAE, RMSE, MAPE) on validation and test sets.
- **Export**: Saves models, predictions, and most importantly, the **frozen parameters** for production use.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
import os
import json
import joblib
warnings.filterwarnings('ignore')

# Modeling libraries
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error

# Advanced models
import xgboost as xgb
import lightgbm as lgb

# Plot appearance: matplotlib style sheet plus seaborn colour palette.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')

# Startup banner: an 80-character rule above and below the title, followed by
# the wall-clock time at which this cell ran.
banner_rule = '=' * 80
for banner_line in (banner_rule, 'ELECTRICITY DEMAND FORECASTING - MODEL TRAINING', banner_rule):
    print(banner_line)
print(f'Notebook initialized at: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}')

## 1. Data Loading and feature verification

In [None]:
data_path = '../data/'
input_file = f'{data_path}engineered_features_essential.csv'

# Fail fast with an actionable message when the upstream artifact is missing.
if not os.path.exists(input_file):
    raise FileNotFoundError(f"Input file not found: {input_file}. Please ensure feature engineering has generated the essential dataset.")

# Read the CSV and convert the `time` column to datetimes (column position is kept).
df = pd.read_csv(input_file).assign(time=lambda frame: pd.to_datetime(frame['time']))

print(f'‚úì Loaded dataset: {df.shape[0]} rows √ó {df.shape[1]} columns')
print(f'  Time range: {df["time"].min()} to {df["time"].max()}')

# Columns that are never model inputs: the target, the timestamp and descriptive metadata.
metadata_cols = ['time', 'demand', 'city', 'season', 'region', 'climate_zone', 
                 'day_phase', 'date', 'holiday_name', 'dawn', 'sunrise', 'sunset', 'dusk']


def _is_model_feature(col):
    """True when `col` is not listed as metadata and holds a numeric dtype."""
    return col not in metadata_cols and pd.api.types.is_numeric_dtype(df[col])


def _is_leakage_candidate(col):
    """True when the column name hints at forward-looking information.

    Parenthesised exactly as Python evaluates the unparenthesised form
    (`and` binds tighter than `or`).
    """
    return 'future' in col or ('target' in col and col != 'demand')


# Any remaining non-numeric column is filtered out by the dtype check.
feature_cols = list(filter(_is_model_feature, df.columns))

print(f'‚úì Identified {len(feature_cols)} numeric features for modeling')

# FINAL LEAKAGE CHECK: remove name-flagged columns from both the frame and the feature list.
drop_leakage = list(filter(_is_leakage_candidate, df.columns))
if drop_leakage:
    print(f'‚ö†Ô∏è Dropping potential leakage columns: {drop_leakage}')
    df = df.drop(columns=drop_leakage)
    feature_cols = [col for col in feature_cols if col not in drop_leakage]



## 2. Chronological Train/Validation/Test Split
We use a strict time-based split to simulate real-world forecasting. **No shuffling** is allowed.

In [None]:
# Split Ratios: 70% Train, 15% Val, 15% Test
if df.empty:
    raise ValueError('Cannot split an empty dataset.')

# Positional slicing is only chronological when rows are in time order, so sort
# first. mergesort is stable: rows sharing a timestamp keep their relative order.
df = df.sort_values('time', kind='mergesort').reset_index(drop=True)

n_rows = len(df)
times = df['time']

# Snap each nominal boundary back to the first row of the timestamp it lands on.
# Rows that share a timestamp (the data has a `city` column, so presumably several
# cities per hour -- verify) then never straddle two splits, which is what the
# strict `<` assertions below require.
train_end = int(times.searchsorted(times.iloc[int(0.70 * n_rows)], side='left'))
val_end = int(times.searchsorted(times.iloc[int(0.85 * n_rows)], side='left'))

train_df = df.iloc[:train_end].copy()
val_df = df.iloc[train_end:val_end].copy()
test_df = df.iloc[val_end:].copy()

print(f'Train set:      {len(train_df)} samples ({train_df["time"].min()} to {train_df["time"].max()})')
print(f'Validation set: {len(val_df)} samples ({val_df["time"].min()} to {val_df["time"].max()})')
print(f'Test set:       {len(test_df)} samples ({test_df["time"].min()} to {test_df["time"].max()})')

# An empty split would make the comparisons below operate on NaT and fail with a
# misleading "leakage" message, so report that case explicitly.
assert len(train_df) > 0 and len(val_df) > 0 and len(test_df) > 0, \
    "Dataset has too few distinct timestamps: at least one split is empty"

# Verify no overlap
assert train_df['time'].max() < val_df['time'].min(), "Leakage detected: Train overlaps Validation"
assert val_df['time'].max() < test_df['time'].min(), "Leakage detected: Validation overlaps Test"
print('‚úì Chronological split verified.')

## 3. Handling Statistical Features (Leakage Prevention)
For features like `demand_percentile` or `z-scores` that rely on distributions, we must calculate statistics on the **training set only** and freeze them.

In [None]:
# FEATURE FREEZING & IMPUTATION

# 1. Imputation: compute per-feature means on TRAIN only and reuse them for
#    Validation/Test so that no statistic is derived from held-out rows.
train_means = train_df[feature_cols].mean()

# A feature with no observed value in the training window has a NaN mean, and
# fillna(NaN) would silently leave the gaps in place for the model fits that
# follow. Fall back to 0.0 for those columns and say so.
all_nan_features = train_means.index[train_means.isna()].tolist()
if all_nan_features:
    print(f'Features with no training observations (imputed with 0.0): {all_nan_features}')
    train_means = train_means.fillna(0.0)

for split_df in (train_df, val_df, test_df):
    # Item assignment mutates the frame the loop variable refers to.
    split_df[feature_cols] = split_df[feature_cols].fillna(train_means)

# 2. Frozen Percentiles (if applicable)
# Quartiles of training demand per (city, hour), keyed as '<city>_<hour>'.
# A single groupby pass replaces one boolean-mask scan per city/hour pair.
percentile_bounds = {}
for (city, hour), subset in train_df.groupby(['city', 'hour'])['demand']:
    # Only whole hours 0..23 are recorded.
    if hour not in range(24):
        continue
    percentile_bounds[f'{city}_{int(hour)}'] = {
        label: float(subset.quantile(q))
        for label, q in (('p25', 0.25), ('p50', 0.50), ('p75', 0.75))
    }

# Save frozen percentiles
with open(f'{data_path}frozen_percentiles.json', 'w') as f:
    json.dump(percentile_bounds, f)

def apply_frozen_percentiles(df_in, bounds):
    """Return a copy of `df_in`; the percentile mapping is not implemented yet.

    Args:
        df_in: DataFrame to process. It is never modified.
        bounds: Frozen percentile bounds. Currently unused.

    Returns:
        A new DataFrame with the same content as `df_in`.
    """
    # Placeholder for logic mapping values to percentiles using frozen bounds
    return df_in.copy()

# Status messages for the two steps performed earlier in this cell:
# train-mean imputation and writing the percentile bounds JSON under data_path.
print('‚úì Imputation using training means applied.')
print(f'‚úì Frozen percentile bounds saved to {data_path}frozen_percentiles.json')

## 4. Feature Scaling

In [None]:
# Raw feature matrices and targets for each split.
X_train, y_train = train_df[feature_cols], train_df['demand']
X_val, y_val = val_df[feature_cols], val_df['demand']
X_test, y_test = test_df[feature_cols], test_df['demand']

scaler = StandardScaler()


def _as_feature_frame(scaled_values, source_df):
    """Wrap a scaled array in a DataFrame carrying the feature names and the split's index."""
    return pd.DataFrame(scaled_values, columns=feature_cols, index=source_df.index)


# The scaler's statistics come from TRAIN only; Val/Test are transformed with them.
X_train_scaled = _as_feature_frame(scaler.fit_transform(X_train), train_df)
X_val_scaled = _as_feature_frame(scaler.transform(X_val), val_df)
X_test_scaled = _as_feature_frame(scaler.transform(X_test), test_df)

# Save scaler for production
joblib.dump(scaler, f'{data_path}scaler.pkl')
print('‚úì Scaler fitted on training data and saved.')

## 5. Model Training

In [None]:
# Maps each model name to the metrics dict returned by evaluate() for that model.
results = {}

def evaluate(y_true, y_pred, set_name):
    """Compute regression metrics for one set of predictions.

    Args:
        y_true: Observed target values.
        y_pred: Predicted target values.
        set_name: Label of the evaluated split. Accepted but not used here.

    Returns:
        Dict with keys 'MAE', 'RMSE', 'MAPE' (multiplied by 100) and 'R2'.
    """
    squared_error = mean_squared_error(y_true, y_pred)
    return {
        'MAE': mean_absolute_error(y_true, y_pred),
        'RMSE': np.sqrt(squared_error),
        'MAPE': mean_absolute_percentage_error(y_true, y_pred) * 100,
        'R2': r2_score(y_true, y_pred),
    }

# Candidate estimators. Hyperparameters are fixed here; nothing is fitted yet.
lr = LinearRegression()
ridge = Ridge(alpha=1.0)
rf = RandomForestRegressor(n_estimators=100, max_depth=15, n_jobs=-1, random_state=42)
xgb_model = xgb.XGBRegressor(n_estimators=100, max_depth=6, learning_rate=0.1, n_jobs=-1, random_state=42)
lgb_model = lgb.LGBMRegressor(n_estimators=100, max_depth=6, learning_rate=0.1, n_jobs=-1, random_state=42, verbose=-1)

# (results key, progress message, estimator, training matrix, validation matrix)
# The two linear models use the standardized features; the tree ensembles use the raw ones.
training_plan = [
    ('LinearRegression', 'Training Linear Regression...', lr, X_train_scaled, X_val_scaled),
    ('Ridge', 'Training Ridge Regression...', ridge, X_train_scaled, X_val_scaled),
    ('RandomForest', 'Training Random Forest (this may take a while)...', rf, X_train, X_val),
    ('XGBoost', 'Training XGBoost...', xgb_model, X_train, X_val),
    ('LightGBM', 'Training LightGBM...', lgb_model, X_train, X_val),
]

# Fit each candidate in order, predict on validation, and record its metrics.
val_predictions = {}
for model_key, progress_msg, estimator, fit_matrix, val_matrix in training_plan:
    print(progress_msg)
    estimator.fit(fit_matrix, y_train)
    val_predictions[model_key] = estimator.predict(val_matrix)
    results[model_key] = evaluate(y_val, val_predictions[model_key], 'Validation')

# Keep the per-model prediction variables available under their usual names.
lr_preds = val_predictions['LinearRegression']
ridge_preds = val_predictions['Ridge']
rf_preds = val_predictions['RandomForest']
xgb_preds = val_predictions['XGBoost']
lgb_preds = val_predictions['LightGBM']

# One row per model, lowest validation MAE first.
print('\nModel Evaluation Results (Validation Set):')
res_df = pd.DataFrame(results).T.sort_values('MAE')
print(res_df)

## 6. Final Evaluation on Test Set
We select the best performing model from validation and evaluate it on the hold-out Test set.

In [None]:
# res_df is sorted by ascending validation MAE, so its first index label is the winner.
best_model_name = res_df.index[0]
print(f'\nüèÜ Best Model: {best_model_name}')

# Tree ensembles were fitted on raw features, so they must predict on raw features.
# The linear models were fitted on standardized features and predict on those.
tree_based = {'RandomForest': rf, 'XGBoost': xgb_model, 'LightGBM': lgb_model}
if best_model_name in tree_based:
    final_model = tree_based[best_model_name]
    test_features = X_test
else:
    # Any name other than 'Ridge' falls back to the plain linear regression.
    final_model = ridge if best_model_name == 'Ridge' else lr
    test_features = X_test_scaled
test_preds = final_model.predict(test_features)

test_metrics = evaluate(y_test, test_preds, 'Test')
print(f'Test Set Performance ({best_model_name}):')
print(test_metrics)

# Export Results
# Validation metrics for every candidate model.
res_df.to_csv(f'{data_path}model_validation_results.csv')

# Test-set predictions alongside the identifying columns and the actual demand.
output_file = f'{data_path}test_predictions.csv'
test_df_export = test_df.assign(predicted_demand=test_preds)
test_df_export[['time', 'city', 'demand', 'predicted_demand']].to_csv(output_file, index=False)

print(f'‚úì Predictions saved to {output_file}')

# Persist the selected model as well; only the scaler is saved elsewhere in the
# notebook. The linear models expect inputs transformed with the saved scaler.
model_file = f'{data_path}best_model.pkl'
joblib.dump(final_model, model_file)
print(f'Best model ({best_model_name}) saved to {model_file}')

# Feature Importance: only for models that expose `feature_importances_`.
if hasattr(final_model, 'feature_importances_'):
    unsorted_importances = pd.DataFrame({
        'feature': feature_cols,
        'importance': final_model.feature_importances_,
    })
    # Most influential features first.
    importances = unsorted_importances.sort_values('importance', ascending=False)
    importances.to_csv(f'{data_path}feature_importance.csv', index=False)
    print(f'‚úì Feature importance saved to {data_path}feature_importance.csv')

    top_features = importances.head(10)
    print('\nTop 10 Features:')
    print(top_features)