# XGBoost Baseline: Charging Demand Forecast

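This notebook runs the full baseline pipeline (data loading, hourly aggregation, feature engineering, XGBoost training, evaluation, and an optional SHAP explanation) on the synthetic dataset in `/data/raw/`. If the data cannot be loaded, the first code cell generates the synthetic dataset before continuing.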


In [None]:
# 1. Imports and warning configuration
import warnings
# NOTE(review): with no category or module filter, this call silences every
# warning for the rest of the session, including deprecation notices raised by
# the libraries imported below. Consider narrowing it to specific categories.
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.metrics import mean_absolute_error, mean_squared_error

from src.data_pipeline import load_sessions, aggregate_hourly
from src.features import add_time_features, add_lag_features

# Load session data. If loading fails, generate the synthetic dataset and
# retry once; a failure on the retry is allowed to propagate.
try:
    df = load_sessions()
except Exception as e:
    # The broad catch also fires for problems other than a missing file, so
    # report the actual error instead of assuming the data was "not found".
    print(f'Could not load data ({type(e).__name__}: {e}), generating synthetic data...')
    import data.synthetic_generator as gen  # only needed on the fallback path
    gen.generate_synthetic_data(n_sites=6, n_days=60)
    df = load_sessions()

print('Loaded rows:', len(df))

df.head()

In [None]:
# 2. Aggregate & features
# Build the hourly feature table as one chain so that re-running this cell
# always starts again from the raw sessions in `df`.
hourly = (
    aggregate_hourly(df)
    .assign(hour=lambda frame: pd.to_datetime(frame['hour']))  # ensure datetime dtype
    .pipe(add_time_features, ts_col='hour')
    .pipe(add_lag_features, group_col='site_id', target='sessions')
    # Drops every row with a missing value in any column.
    # NOTE(review): presumably this removes the warm-up rows of the lag/rolling
    # features; confirm no other column carries NaNs.
    .dropna()
    .reset_index(drop=True)  # renumber rows 0..n-1 after the drop
)

hourly.head()

In [None]:
# 3. Train-test split (time-based)
# A positional cut is only a time-based split when the rows are in time order,
# so order them chronologically first. The stable sort keeps the existing
# relative order of rows that share a timestamp, and the index reset makes each
# row label equal to its position (test labels run from split_idx to n-1).
hourly_sorted = hourly.sort_values('hour', kind='stable').reset_index(drop=True)

# Use first 90% of data (earliest rows) for training, the rest for testing
split_idx = int(len(hourly_sorted) * 0.9)
train = hourly_sorted.iloc[:split_idx]
test = hourly_sorted.iloc[split_idx:]

# Model inputs; the target is the hourly session count
features = ['hour_of_day','dayofweek','is_weekend','hour_sin','hour_cos','lag_1','lag_24','rmean_24']
X_train, y_train = train[features], train['sessions']
X_test, y_test = test[features], test['sessions']

print('Train size', X_train.shape, 'Test size', X_test.shape)

In [None]:
# 4. Train XGBoost
# Wrap the splits in DMatrix objects for the native XGBoost training API
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)
params = {'objective':'reg:squarederror','max_depth':6,'eta':0.1,'seed':42}
bst = xgb.train(params, dtrain, num_boost_round=250)

# predictions: one value per test row, in the same order as X_test
preds = bst.predict(dtest)
mae = mean_absolute_error(y_test, preds)
# Compute RMSE as sqrt(MSE). The `squared=False` keyword of mean_squared_error
# was deprecated in scikit-learn 1.4 and removed in later releases, where it
# raises TypeError; the square root works on every version.
rmse = np.sqrt(mean_squared_error(y_test, preds))
print(f'MAE: {mae:.3f}, RMSE: {rmse:.3f}')

In [None]:
# 5. Visualize example site
# Take the example from the test split itself so the plot always has rows to
# show, even when some sites do not appear in the test period.
example_site = test['site_id'].unique()[0]

# `preds` holds one prediction per test row in row order, so a positional
# boolean mask selects matching predictions without relying on index labels.
site_mask = (test['site_id'] == example_site).to_numpy()
site_test = test[site_mask]

plt.figure(figsize=(12,4))
plt.plot(site_test['hour'], site_test['sessions'], label='actual')
plt.plot(site_test['hour'], preds[site_mask], label='pred')
plt.xlabel('hour')
plt.ylabel('sessions')
plt.legend()
plt.title(f'Site {example_site} actual vs pred (test set)')
plt.show()

In [None]:
# 6. Save model artifact (optional)
import os
import joblib

# NOTE(review): joblib pickles the Booster object. XGBoost's Model IO guide
# recommends Booster.save_model for artifacts that must load across library
# versions; confirm joblib is what the consumers of this file expect.
artifact_dir = 'models'
artifact_path = 'models/xgb_baseline.joblib'

os.makedirs(artifact_dir, exist_ok=True)  # no error if the directory already exists
joblib.dump(bst, artifact_path)
print('Saved model to models/xgb_baseline.joblib')

In [None]:
# 7. SHAP explainability (optional, may take time)
# Best-effort cell: any failure, including a missing `shap` package, is
# reported and the notebook carries on.
try:
    import shap

    expl = shap.TreeExplainer(bst)
    # SHAP values are computed for the test-set features and then summarised
    shap_values = expl.shap_values(X_test)
    shap.summary_plot(shap_values, X_test)
except Exception as err:
    print('SHAP not installed or failed:', err)
