In [1]:
from darts.metrics import mape
from darts import TimeSeries
import pandas as pd
import seaborn as sns
import numpy as np
import lightgbm as lgb
from tqdm import tqdm

In [2]:
df = pd.read_parquet('data/clean/df.parquet').drop(columns=['24h_later_forecast'])
df.head(3)

Unnamed: 0_level_0,forecast_dt,24h_later_load
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
2014-12-14 00:00:00,2014-12-15 00:00:00,6131
2014-12-14 01:00:00,2014-12-15 01:00:00,5842
2014-12-14 02:00:00,2014-12-15 02:00:00,5715


# Build baseline

Use the load from the previous hour to predict the load in 24h

In [3]:
# For each timestamp, figure out the hourly load starting 1h ago
existing_timestamps = set(df.index)
ts_to_load = {}
for ts in tqdm(df.index):
    query_ts = ts - pd.Timedelta(25, 'h') # We want the hourly load starting at this ts
    last_hour_load = None
    if query_ts in existing_timestamps:
        last_hour_load = df.loc[query_ts]['24h_later_load']
    ts_to_load[ts] = last_hour_load
  

100%|██████████████████████████████████| 85319/85319 [00:07<00:00, 11678.30it/s]


In [4]:
# Build a df out of the collected data
previous_load_df = pd.DataFrame(
    data={'1h_ago_load': ts_to_load.values()},
    index=ts_to_load.keys(),
)
previous_load_df.index.name = 'datetime'
print(f"The previous hour's load was missing for {previous_load_df['1h_ago_load'].isna().sum()}/{len(previous_load_df)} records")

previous_load_df = previous_load_df.dropna()
previous_load_df['1h_ago_load'] = previous_load_df['1h_ago_load'].astype(int)
previous_load_df.head(3)

The previous hour's load was missing for 133/85319 records


Unnamed: 0_level_0,1h_ago_load
datetime,Unnamed: 1_level_1
2014-12-15 01:00:00,6131
2014-12-15 02:00:00,5842
2014-12-15 03:00:00,5715


In [5]:
# Enrich the df with last hour's load
df = pd.merge(df, previous_load_df, on='datetime')
df = df.dropna()
df['1h_ago_load'] = df['1h_ago_load'].astype(int) 
df.head(3)

Unnamed: 0_level_0,forecast_dt,24h_later_load,1h_ago_load
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2014-12-15 01:00:00,2014-12-16 01:00:00,6292,6131
2014-12-15 02:00:00,2014-12-16 02:00:00,6255,5842
2014-12-15 03:00:00,2014-12-16 03:00:00,6189,5715


In [6]:
# Split train:val
datetime_cutoff = pd.Timestamp('2024-08-01')
val_df = df[df.index >= datetime_cutoff]

# Compute MAPE
val_gt_ts = TimeSeries.from_dataframe(val_df, value_cols=['24h_later_load'], freq='h')
val_forecast_ts = TimeSeries.from_dataframe(val_df, value_cols=['1h_ago_load'], freq='h')

print('Val MAPE: ', mape(val_gt_ts, val_forecast_ts))

Val MAPE:  8.324114141082365


# Build smarter baseline

Use as features:
- The last hour's load
- The datetime attribute (day, month, hour, weekday

In [7]:
# Enrich the df with the datetime attributes
df['month'] = df.index.month
df['day'] = df.index.day
df['hour'] = df.index.hour
df['weekday'] = df.index.weekday
df.head(3)

Unnamed: 0_level_0,forecast_dt,24h_later_load,1h_ago_load,month,day,hour,weekday
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2014-12-15 01:00:00,2014-12-16 01:00:00,6292,6131,12,15,1,0
2014-12-15 02:00:00,2014-12-16 02:00:00,6255,5842,12,15,2,0
2014-12-15 03:00:00,2014-12-16 03:00:00,6189,5715,12,15,3,0


In [8]:
# Build Xy
Xy = df[['month', 'day', 'hour', 'weekday', '1h_ago_load', '24h_later_load']]

# Split train:val
datetime_cutoff = pd.Timestamp('2024-08-01')
Xy_train = Xy[Xy.index < datetime_cutoff]
Xy_val = Xy[Xy.index >= datetime_cutoff]

# Split X,y
X_train, y_train = Xy_train.drop(columns=['24h_later_load']), Xy_train['24h_later_load']
X_val, y_val = Xy_val.drop(columns=['24h_later_load']), Xy_val['24h_later_load']

In [9]:
reg = lgb.LGBMRegressor(n_estimators=100, force_row_wise=True)
reg.fit(X_train, y_train)

[LightGBM] [Info] Total Bins 331
[LightGBM] [Info] Number of data points in the train set: 83976, number of used features: 5
[LightGBM] [Info] Start training from score 7103.204058


In [10]:
# Compute MAPE
y_train_ts = TimeSeries.from_values(y_train)
yhat_train_ts = TimeSeries.from_values(reg.predict(X_train))

y_val_ts = TimeSeries.from_values(y_val)
yhat_val_ts = TimeSeries.from_values(reg.predict(X_val))

print('Train MAPE:', mape(y_train_ts, yhat_train_ts))
print('Val MAPE:', mape(y_val_ts, yhat_val_ts))

Train MAPE: 4.289144387042031
Val MAPE: 8.108838142196232


# Backtesting

In real life, each model is trained with all the historical data available, and the load prediction for in 24h is then made.

Hence, we need to build some backtesting. 
For each timestamp, a new model will be trained. 
It will then be used to predict the load in 24h.

In [11]:
def backtesting(Xy, model, starting_ts=pd.Timestamp('2024-08-01'), use_every_nth_ts=1):
    cutoff_ts = Xy[Xy.index >= starting_ts].index.to_list()
    
    cutoff_ts_to_y = {}
    for ts in tqdm(cutoff_ts[::use_every_nth_ts]):    
        
        # Split train:val
        Xy_train = Xy[Xy.index < ts]
        Xy_val = Xy[Xy.index == ts]
        
        # Split X,y
        X_train, y_train = Xy_train.drop(columns=['24h_later_load']), Xy_train['24h_later_load']
        X_val, y_val = Xy_val.drop(columns=['24h_later_load']), Xy_val['24h_later_load']
    
        # Train model
        model.fit(X_train, y_train)
    
        # Compute prediction in 24h
        yhat_val = model.predict(X_val) 
    
        cutoff_ts_to_y[ts] = (yhat_val[0], y_val.iloc[0])
        
    return pd.DataFrame({
        'cutoff_ts': cutoff_ts_to_y.keys(), 
        'predicted_24h_later_load': [e[0] for e in cutoff_ts_to_y.values()], 
        '24h_later_load': [e[1] for e in cutoff_ts_to_y.values()]
    })
        

In [13]:
reg = lgb.LGBMRegressor(n_estimators=100, force_row_wise=True, verbose=0)
results_df = backtesting(Xy, model=reg, starting_ts=pd.Timestamp('2024-08-01'), use_every_nth_ts=1)
results_df.head(3)

100%|███████████████████████████████████████| 1210/1210 [03:15<00:00,  6.19it/s]


Unnamed: 0,cutoff_ts,predicted_24h_later_load,24h_later_load
0,2024-08-01 00:00:00,5378.943071,5029
1,2024-08-01 01:00:00,5294.366448,4868
2,2024-08-01 02:00:00,5176.87358,4664


In [14]:
print(f'Backtested MAPE: {
    mape(
        TimeSeries.from_values(results_df.predicted_24h_later_load),
        TimeSeries.from_values(results_df['24h_later_load'])
    )
}')

Backtested MAPE: 6.701911108383431


# Add last week's load

As feature, use 
- Load 24h ago
- Load a week ago
- Datetime attributes

In [15]:
def get_load(df, timedelta, colname):
    """
    timedelta: pd.Timedelta from which we would like the hourly load, starting at the CURRENT timestamp
    i.e. if timedelta == 1h, we would like the hourly load of the current timestamp - 1h, which corresponds to the 24h_later_load of the current timestamp - 24h - 1h 
    """
    
    # For each timestamp, figure out the hourly load starting 1h ago
    existing_timestamps = set(df.index)
    ts_to_load = {}
    for ts in tqdm(df.index):
        query_ts = ts - pd.Timedelta(24, 'h') - timedelta # We want the hourly load starting at this ts
        last_hour_load = None
        if query_ts in existing_timestamps:
            last_hour_load = df.loc[query_ts]['24h_later_load']
        ts_to_load[ts] = last_hour_load

    # Build a df out of the collected data
    previous_load_df = pd.DataFrame(
        data={colname: ts_to_load.values()},
        index=ts_to_load.keys(),
    )
    previous_load_df.index.name = 'datetime'
    print(f"The previous hour's load was missing for {previous_load_df[colname].isna().sum()}/{len(previous_load_df)} records")
    
    previous_load_df = previous_load_df.dropna()
    previous_load_df[colname] = previous_load_df[colname].astype(int)

    return previous_load_df

In [16]:
previous_load_df = get_load(df, timedelta=pd.Timedelta(24, 'h'), colname='24h_ago_load')
previous_load_df.head(3)

100%|███████████████████████████████████| 85186/85186 [00:08<00:00, 9475.20it/s]


The previous hour's load was missing for 259/85186 records


Unnamed: 0_level_0,24h_ago_load
datetime,Unnamed: 1_level_1
2014-12-17 01:00:00,6292
2014-12-17 02:00:00,6255
2014-12-17 03:00:00,6189


In [17]:
# Enrich the df with the load from 24h ago, and 7 days ago
df = pd.merge(df, previous_load_df, on='datetime')
df = df.dropna()
df['24h_ago_load'] = df['24h_ago_load'].astype(int) 
df.head(3)

Unnamed: 0_level_0,forecast_dt,24h_later_load,1h_ago_load,month,day,hour,weekday,24h_ago_load
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2014-12-17 01:00:00,2014-12-18 01:00:00,6514,6682,12,17,1,2,6292
2014-12-17 02:00:00,2014-12-18 02:00:00,6373,6444,12,17,2,2,6255
2014-12-17 03:00:00,2014-12-18 03:00:00,6291,6377,12,17,3,2,6189


In [18]:
previous_load_df = get_load(df, timedelta=pd.Timedelta(7, 'd'), colname='7d_ago_load')
previous_load_df.head(3)

100%|███████████████████████████████████| 84927/84927 [00:09<00:00, 9230.54it/s]


The previous hour's load was missing for 616/84927 records


Unnamed: 0_level_0,7d_ago_load
datetime,Unnamed: 1_level_1
2015-01-11 01:00:00,7017
2015-01-11 02:00:00,6969
2015-01-11 03:00:00,6820


In [19]:
# Enrich the df with the load from 24h ago, and 7 days ago
df = pd.merge(df, previous_load_df, on='datetime')
df = df.dropna()
df['7d_ago_load'] = df['7d_ago_load'].astype(int) 
df.head(3)

Unnamed: 0_level_0,forecast_dt,24h_later_load,1h_ago_load,month,day,hour,weekday,24h_ago_load,7d_ago_load
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2015-01-11 01:00:00,2015-01-12 01:00:00,7231,6703,1,11,1,6,6798,7017
2015-01-11 02:00:00,2015-01-12 02:00:00,7226,6433,1,11,2,6,6701,6969
2015-01-11 03:00:00,2015-01-12 03:00:00,7177,6419,1,11,3,6,6638,6820


In [20]:
# Build Xy
Xy = df[[
    'month', 'day', 'hour', 'weekday', 
    '1h_ago_load', 
    '24h_ago_load',
    '7d_ago_load',
    '24h_later_load'
]]
Xy.head(3)

Unnamed: 0_level_0,month,day,hour,weekday,1h_ago_load,24h_ago_load,7d_ago_load,24h_later_load
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2015-01-11 01:00:00,1,11,1,6,6703,6798,7017,7231
2015-01-11 02:00:00,1,11,2,6,6433,6701,6969,7226
2015-01-11 03:00:00,1,11,3,6,6419,6638,6820,7177


In [21]:
reg = lgb.LGBMRegressor(n_estimators=100, force_row_wise=True, verbose=0)
results_df = backtesting(Xy, model=reg, starting_ts=pd.Timestamp('2024-08-01'), use_every_nth_ts=1)
results_df.head(3)

100%|███████████████████████████████████████| 1210/1210 [03:34<00:00,  5.65it/s]


Unnamed: 0,cutoff_ts,predicted_24h_later_load,24h_later_load
0,2024-08-01 00:00:00,5365.530974,5029
1,2024-08-01 01:00:00,5094.63665,4868
2,2024-08-01 02:00:00,4903.311637,4664


In [22]:
print(f'Backtested MAPE: {
    mape(
        TimeSeries.from_values(results_df.predicted_24h_later_load),
        TimeSeries.from_values(results_df['24h_later_load'])
    )
}')

Backtested MAPE: 6.122156907209566


# Add [-48h;-24h] and [-8d, -1d] statistics

As features, start using
- Min/Max/Median load 

In [23]:
def compute_stat(current_time, start_n_hours_ago, stats):
    start_time = current_time -  pd.Timedelta(24, 'h') - pd.Timedelta(hours=start_n_hours_ago)
    end_time = current_time - pd.Timedelta(24, 'h') 
    
    relevant_data = df.loc[start_time:end_time, '24h_later_load']

    if len(relevant_data) == 0:
        return np.nan
    
    return [stat(relevant_data.values) for stat in stats]

In [24]:
# Compute previous day's stats
df['previous_day_stats'] = df.index.to_series().apply(lambda x: compute_stat(x, start_n_hours_ago=24, stats=[np.median, np.min, np.max]))
df = df.dropna()
df.head(3)

Unnamed: 0_level_0,forecast_dt,24h_later_load,1h_ago_load,month,day,hour,weekday,24h_ago_load,7d_ago_load,previous_day_stats
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2015-01-12 01:00:00,2015-01-13 01:00:00,7363,6983,1,12,1,0,6433,6971,"[7231.0, 7231, 7231]"
2015-01-12 02:00:00,2015-01-13 02:00:00,7288,7231,1,12,2,0,6419,7177,"[7228.5, 7226, 7231]"
2015-01-12 03:00:00,2015-01-13 03:00:00,7213,7226,1,12,3,0,6475,7224,"[7226.0, 7177, 7231]"


In [25]:
df['previous_day_median'] = df.previous_day_stats.apply(lambda x: x[0])
df['previous_day_min'] = df.previous_day_stats.apply(lambda x: x[1])
df['previous_day_max'] = df.previous_day_stats.apply(lambda x: x[2])

In [26]:
# Compute previous week's stats
df['previous_week_stats'] = df.index.to_series().apply(lambda x: compute_stat(x, start_n_hours_ago=24*7, stats=[np.median, np.min, np.max]))
df = df.dropna()
df.head(3)

Unnamed: 0_level_0,forecast_dt,24h_later_load,1h_ago_load,month,day,hour,weekday,24h_ago_load,7d_ago_load,previous_day_stats,previous_day_median,previous_day_min,previous_day_max,previous_week_stats
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2015-01-13 01:00:00,2015-01-14 01:00:00,6938,7512,1,13,1,1,7231,7516,"[8710.0, 7095, 9357]",8710.0,7095,9357,"[7363.0, 7363, 7363]"
2015-01-13 02:00:00,2015-01-14 02:00:00,6792,7363,1,13,2,1,7226,7466,"[8710.0, 7095, 9357]",8710.0,7095,9357,"[7325.5, 7288, 7363]"
2015-01-13 03:00:00,2015-01-14 03:00:00,6747,7288,1,13,3,1,7177,7434,"[8710.0, 7095, 9357]",8710.0,7095,9357,"[7288.0, 7213, 7363]"


In [27]:
df['previous_week_median'] = df.previous_week_stats.apply(lambda x: x[0])
df['previous_week_min'] = df.previous_week_stats.apply(lambda x: x[1])
df['previous_week_max'] = df.previous_week_stats.apply(lambda x: x[2])

In [28]:
# Build Xy
Xy = df[[
    'month', 'day', 'hour', 'weekday', 
    '24h_ago_load', 
    '7d_ago_load', 
    'previous_day_median', 'previous_day_min', 'previous_day_max',
    'previous_week_median', 'previous_week_min', 'previous_week_max',
    '24h_later_load'
]]
Xy.head(3)

Unnamed: 0_level_0,month,day,hour,weekday,24h_ago_load,7d_ago_load,previous_day_median,previous_day_min,previous_day_max,previous_week_median,previous_week_min,previous_week_max,24h_later_load
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2015-01-13 01:00:00,1,13,1,1,7231,7516,8710.0,7095,9357,7363.0,7363,7363,6938
2015-01-13 02:00:00,1,13,2,1,7226,7466,8710.0,7095,9357,7325.5,7288,7363,6792
2015-01-13 03:00:00,1,13,3,1,7177,7434,8710.0,7095,9357,7288.0,7213,7363,6747


In [29]:
reg = lgb.LGBMRegressor(n_estimators=100, force_row_wise=True, verbose=0)
results_df = backtesting(Xy, model=reg, starting_ts=pd.Timestamp('2024-08-01'), use_every_nth_ts=1)
results_df.head(3)

100%|███████████████████████████████████████| 1210/1210 [04:27<00:00,  4.53it/s]


Unnamed: 0,cutoff_ts,predicted_24h_later_load,24h_later_load
0,2024-08-01 00:00:00,4850.78868,5029
1,2024-08-01 01:00:00,4742.036682,4868
2,2024-08-01 02:00:00,4807.964745,4664


In [30]:
print(f'Backtested MAPE: {
    mape(
        TimeSeries.from_values(results_df.predicted_24h_later_load),
        TimeSeries.from_values(results_df['24h_later_load'])
    )
}')

Backtested MAPE: 6.111648176413032


# Try with a bigger # estimators

In [31]:
reg = lgb.LGBMRegressor(n_estimators=10_000, force_row_wise=True, verbose=0)
results_df = backtesting(Xy, model=reg, starting_ts=pd.Timestamp('2024-08-01'), use_every_nth_ts=10)
results_df.head(3)

100%|█████████████████████████████████████████| 121/121 [21:41<00:00, 10.76s/it]


Unnamed: 0,cutoff_ts,predicted_24h_later_load,24h_later_load
0,2024-08-01 00:00:00,5234.827663,5029
1,2024-08-01 10:00:00,5436.994413,4821
2,2024-08-01 20:00:00,4865.741561,4987


In [32]:
print(f'Backtested MAPE: {
    mape(
        TimeSeries.from_values(results_df.predicted_24h_later_load),
        TimeSeries.from_values(results_df['24h_later_load'])
    )
}')

Backtested MAPE: 3.7407192447540645


## TODO

Build website to showcase project
- Landing page shows a pretty plot with the historical data and prediction
- One page about data exploration (EDA), showing plots, seasonality, etc.
- One page about the modelling