# Model trainer

In [3]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from typing import Tuple, Optional
import lightgbm as lgb
from sklearn.metrics import mean_absolute_percentage_error
import joblib

In [4]:
# Read the gold-layer feature table (features plus the `24h_later_load` target)
# and preview its first three rows.
gold_path = 'data/gold/df.parquet'
df = pd.read_parquet(gold_path)
df.head(3)

Unnamed: 0,24h_later_load,year,month,day,hour,weekday,1h_ago_load,2h_ago_load,3h_ago_load,24h_ago_load,7d_ago_load,8h_min,8h_max,8h_median,24h_min,24h_max,24h_median,7d_min,7d_max,7d_median
2015-01-08 01:00:00+01:00,6999,2015,1,8,1,3,7605.0,8000.0,8212.0,7528.0,7633.0,7499.0,9361.0,8568.0,7347.0,9447.0,8695.0,6380.0,9447.0,7496.0
2015-01-08 02:00:00+01:00,6864,2015,1,8,2,3,7499.0,7605.0,8000.0,7510.0,7654.0,7423.0,9361.0,8212.0,7347.0,9447.0,8695.0,6380.0,9447.0,7466.0
2015-01-08 03:00:00+01:00,6784,2015,1,8,3,3,7423.0,7499.0,7605.0,7349.0,7397.0,7303.0,9361.0,8000.0,7303.0,9447.0,8695.0,6380.0,9447.0,7463.0


In [5]:
def backtesting(Xy: pd.DataFrame, model: lgb.LGBMRegressor, starting_ts: Optional[pd.Timestamp] = None, window_size: Optional[int] = None, use_every_nth_ts: int = 1) -> Tuple[pd.DataFrame, float]:
    """Walk-forward backtest of `model` on the `24h_later_load` target.

    The evaluation starts either
    - at the `starting_ts` timestamp (takes precedence when given), or
    - `window_size` rows before the last row of `Xy`.

    For every `use_every_nth_ts`-th index value at or after the starting point
    (the "cutoff"), the model is refitted on all rows strictly before the
    cutoff and asked to predict the single row at the cutoff.

    Parameters
    ----------
    Xy : pd.DataFrame
        Features plus the `24h_later_load` target column. `Xy.index` is used for
        the time-based split; with `window_size`, the start is taken positionally
        from the end, so the index is assumed to be sorted ascending.
    model : lgb.LGBMRegressor
        Any estimator exposing `fit(X, y)` and `predict(X)`. It is refitted in
        place on every iteration, so on return it holds the fit from the last
        evaluated cutoff (not a fit on all of `Xy`).
    starting_ts : pd.Timestamp, optional
        First cutoff timestamp (inclusive).
    window_size : int, optional
        Number of trailing rows to backtest over; only used when `starting_ts`
        is None.
    use_every_nth_ts : int
        Evaluate only every n-th cutoff to reduce run time.

    Returns
    -------
    Tuple[pd.DataFrame, float]
        A frame with columns `cutoff_ts`, `predicted_24h_later_load`,
        `24h_later_load`, and the MAPE over those rows expressed in percent.

    Raises
    ------
    AssertionError
        If the target column is missing or `window_size >= len(Xy)`.
    ValueError
        If neither `starting_ts` nor `window_size` is given, if `window_size`
        or `use_every_nth_ts` is not positive, or if no rows lie at or after
        the starting point.
    """
    target_col = '24h_later_load'

    assert target_col in Xy.columns, f'`Xy` must contain the target column `{target_col}`.'

    if use_every_nth_ts < 1:
        raise ValueError('`use_every_nth_ts` must be a positive integer.')

    if starting_ts is None:
        if window_size is None:
            raise ValueError('One of `starting_ts`, `window_size` must not be None.')
        if window_size < 1:
            # `Xy.index[-0]` would silently select the FIRST row, not the last.
            raise ValueError('`window_size` must be a positive integer.')
        # Only checked here: `window_size` may legitimately be None when
        # `starting_ts` is supplied.
        assert window_size < len(Xy), '`window_size` must be smaller than the number of rows in `Xy`.'

        starting_ts = Xy.index[-window_size]

    cutoff_timestamps = Xy.index[Xy.index >= starting_ts].to_list()[::use_every_nth_ts]
    if not cutoff_timestamps:
        raise ValueError('No rows at or after `starting_ts`; nothing to backtest.')

    # The feature/target split does not depend on the cutoff, so do it once.
    X, y = Xy.drop(columns=[target_col]), Xy[target_col]

    cutoff_ts_to_y = {}
    for cutoff_ts in tqdm(cutoff_timestamps):

        # Split train:val by time.
        # NOTE(review): rows in the 24h before `cutoff_ts` presumably carry a
        # target that is only observed after `cutoff_ts`; confirm whether they
        # should be excluded from training to avoid look-ahead leakage.
        train_mask = Xy.index < cutoff_ts
        val_mask = Xy.index == cutoff_ts

        # Train model on everything strictly before the cutoff
        model.fit(X[train_mask], y[train_mask])

        # Compute prediction in 24h for the row at the cutoff
        yhat_val = model.predict(X[val_mask])

        cutoff_ts_to_y[cutoff_ts] = (yhat_val[0], y[val_mask].iloc[0])

    results_df = pd.DataFrame({
        'cutoff_ts': list(cutoff_ts_to_y.keys()),
        'predicted_24h_later_load': [pred for pred, _ in cutoff_ts_to_y.values()],
        '24h_later_load': [actual for _, actual in cutoff_ts_to_y.values()]
    })

    # sklearn returns a fraction; scale to percent.
    mape = mean_absolute_percentage_error(results_df['24h_later_load'], results_df['predicted_24h_later_load']) * 100

    return results_df, mape

In [6]:
# Walk-forward evaluation over the final month of data: 30 days of hourly rows,
# scoring every 10th timestamp to keep the run time manageable.
backtest_rows = 24 * 30
reg = lgb.LGBMRegressor(n_estimators=100, force_row_wise=True, verbose=0)
results_df, mape = backtesting(
    Xy=df,
    model=reg,
    window_size=backtest_rows,
    use_every_nth_ts=10,
)
print(f'Backtested MAPE: {mape:.2f}%')
results_df.head(3)

100%|███████████████████████████████████████████| 72/72 [00:20<00:00,  3.45it/s]

Backtested MAPE: 5.31%





Unnamed: 0,cutoff_ts,predicted_24h_later_load,24h_later_load
0,2024-08-27 16:00:00+02:00,6811.455757,7171
1,2024-08-28 02:00:00+02:00,4973.135595,5065
2,2024-08-28 12:00:00+02:00,6783.165025,7051


In [7]:
# Persist the regressor to disk.
# Note: `reg` was last fitted inside `backtesting`, i.e. on the rows before the
# final evaluated cutoff, not on the full dataset.
model_path = 'model.joblib'
joblib.dump(reg, model_path)

['model.joblib']