# Run this notebook 
include complete calendar and public holidays

In [0]:
%pip install lightgbm

Python interpreter will be restarted.
Python interpreter will be restarted.


In [0]:
import logging
logger = spark._jvm.org.apache.log4j
logging.getLogger("py4j.java_gateway").setLevel(logging.ERROR)

In [0]:
%run /Users/ebizindavyi@delhaize.be/Data_preparation/Parse_xls_files



In [0]:
%run /Users/ebizindavyi@delhaize.be/Evaluation/Evaluation_functions

In [0]:
import pandas as pd
import numpy as np
import datetime as dt
import lightgbm as lgb
from reusable.featurestore.features_schoolholidays import school_holidays
from pyspark.sql import functions as f


## Evaluation Params

In [0]:
actual_cols = ['Dry Actuals', 'Dry Actuals']
fc_cols = ['Dry Fc', 'LGBM_FC']
error_suffixes = ['Dry_WOW', 'Dry_LGBM']
error_to_calc = ["Bias", "AbsError", "SqError","RelAbsError"]

## Load inbound data

In [0]:
inbound_2022_dir = '/dbfs/mnt/dataplatform/acc/DataScience/sandbox/ebiz/Inbound_FC/data/input/Ecom-Ops-status_2022.xlsx'

In [0]:
inbound_2022_df = create_master_df(inbound_2022_dir)


In [0]:
calendar_2022 = pd.DataFrame({'Dates':pd.date_range(dt.datetime(2022,1,3), periods=363, freq='d')})
inbound_2022_dry = calendar_2022.merge(inbound_2022_df[['Dates','Dry Actuals']], on='Dates',how='left').fillna(0).copy()


In [0]:
inbound_120days_df=inbound_2022_dry.copy()
inbound_120days_df

Unnamed: 0,Dates,Dry Actuals
0,2022-01-03,84224.0
1,2022-01-04,49564.0
2,2022-01-05,45890.0
3,2022-01-06,26976.0
4,2022-01-07,37044.0
...,...,...
358,2022-12-27,63764.0
359,2022-12-28,47680.0
360,2022-12-29,39552.0
361,2022-12-30,61119.0


### Get holidays

In [0]:
flemish_holidays = (
  school_holidays()
  .filter(
    (f.col('is_dutch_region_holiday')==1)
    &
    (f.col('holiday_name').isNull())
    )
  .withColumn('holiday', f.lit('flemish_region_holiday'))
  .withColumn('ds', f.to_date(f.col('date')))
  .select('holiday','ds')
  .toPandas()
)

walloon_holidays =(
  school_holidays()
  .filter(
    (f.col('is_french_region_holiday')==1)
    &
    (f.col('holiday_name').isNull())
    )
  .withColumn('holiday', f.lit('walloon_region_holiday'))
  .withColumn('ds', f.to_date(f.col('date')))
  .select('holiday','ds')
  .toPandas()
)

holidays = pd.concat((flemish_holidays, walloon_holidays))
#change ds to datetime
holidays['ds'] = pd.to_datetime(holidays['ds'])


## LightGBM


##Parameters

In [0]:
#lgbm parameter
backtest_start_date = pd.to_datetime('2022-01-24') #('2022-01-03)
backtest_end_date =  pd.to_datetime('2022-08-30') #('2022-09-30) 

#shift periods to create features(lags)
shift_period_1 = 7
shift_period_2 = 14
shift_period_3 = 21

# Initialize lgbm parameters 
params = {
    'objective': 'regression',  
    'metric': 'rmse',           
    'boosting_type': 'gbdt',    
    # 'max_depth':2,      
}
# variables 
feature_col = ['x_7days', 'x_14days', 'x_21days']
pred_period=6
end_horizon = 7
fc_freq=7

# ::::Daily_pred::::
# end_horizon = 8 #length of the forecasting
# pred_period = 1
# fc_freq= 1
training_period= 120

### Clean the data

In [0]:
shift_periods = [shift_period_1, shift_period_2, shift_period_3]
column_names = ['x_7days', 'x_14days', 'x_21days']

for i, period in enumerate(shift_periods):
    inbound_120days_df[column_names[i]] = inbound_120days_df['Dry Actuals'].shift(period)


In [0]:
inbound_120days_df

Unnamed: 0,Dates,Dry Actuals,x_7days,x_14days,x_21days
21,2022-01-24,102266.0,,,
22,2022-01-25,80584.0,,,
23,2022-01-26,70974.0,,,
24,2022-01-27,67565.0,,,
25,2022-01-28,57571.0,,,
...,...,...,...,...,...
358,2022-12-27,63764.0,76525.0,58807.0,64654.0
359,2022-12-28,47680.0,75921.0,123475.0,64759.0
360,2022-12-29,39552.0,58697.0,68514.0,94568.0
361,2022-12-30,61119.0,62334.0,65979.0,76287.0


In [0]:
#drop Nan and sundays where Actuals== 0
inbound_120days_df = inbound_120days_df.dropna().loc[inbound_120days_df['Dry Actuals'] != 0]


### Backtest

In [0]:
backtest_results = pd.DataFrame()

inbound_2022_train = inbound_120days_df[['Dates', 'Dry Actuals']].copy()#original non filtered dataset, 


# Iterate through each backtesting period
while backtest_start_date + dt.timedelta(days=pred_period) <= backtest_end_date:
  print(f"Backtesting on: {backtest_start_date}")
  
  start_train_date = backtest_start_date
  end_train_date = start_train_date + dt.timedelta(days=training_period)
  print(f"Backtesting training ending on: {end_train_date}") 

  lgbm_train = inbound_2022_train[(inbound_2022_train['Dates'] >= start_train_date) &(inbound_2022_train['Dates'] < end_train_date)].copy()
  print(f"print the shape:{lgbm_train.shape} ")
  # Convert Dates column to datetime
  lgbm_train['Dates'] = pd.to_datetime(lgbm_train['Dates'])
    
  #Train features 
  train_features_df = inbound_120days_df[(lgbm_train['Dates'] >= start_train_date) &(inbound_2022_dry['Dates'] < end_train_date)].copy()
  features_values_train=train_features_df.iloc[:, -3:]

  # Filter the dates with values in Dry Actuals 
  features_train = features_values_train.copy() 
  target = lgbm_train['Dry Actuals']
  
  # Create LightGBM dataset
  lgb_train = lgb.Dataset(features_train, label=target)
    
  # Model training
  m = lgb.train(params, lgb_train)

  # Prediction for the current period
  future_dates = pd.date_range(start=end_train_date+dt.timedelta(days=15), periods=pred_period, freq='D')

  future_features = pd.DataFrame({'ds': future_dates})
  future_features['ds'] = pd.to_datetime(future_features['ds'])

  future_features = inbound_120days_df[inbound_120days_df['Dates'].isin(future_dates)].iloc[:, -3:].values ### Changed the source df

  prediction_dates = inbound_120days_df[inbound_120days_df['Dates'].isin(future_dates)]['Dates'] ### New variable to extract valid dates from the future date range
  
  #forecast using lgbm model m

  forecast = m.predict(future_features, num_iteration=m.best_iteration,predict_disable_shape_check=True)

  lgbm_pred = pd.DataFrame({'ds': prediction_dates, 'yhat': forecast}) ### Use dates form df rather than the original date range
    
  backtest_results = pd.concat([backtest_results, lgbm_pred])
    
  # Increase training period after each iterations 
  backtest_start_date += dt.timedelta(days=end_horizon)
 
print("Backtesting completed.")


Backtesting on: 2022-01-24 00:00:00
Backtesting training ending on: 2022-05-24 00:00:00
print the shape:(76, 2) 


  train_features_df = inbound_120days_df[(lgbm_train['Dates'] >= start_train_date) &(inbound_2022_dry['Dates'] < end_train_date)].copy()


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 81
[LightGBM] [Info] Number of data points in the train set: 76, number of used features: 3
[LightGBM] [Info] Start training from score 64640.855263
Backtesting on: 2022-01-31 00:00:00
Backtesting training ending on: 2022-05-31 00:00:00
print the shape:(80, 2) 


  train_features_df = inbound_120days_df[(lgbm_train['Dates'] >= start_train_date) &(inbound_2022_dry['Dates'] < end_train_date)].copy()


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 84
[LightGBM] [Info] Number of data points in the train set: 80, number of used features: 3
[LightGBM] [Info] Start training from score 64766.800000
Backtesting on: 2022-02-07 00:00:00
Backtesting training ending on: 2022-06-07 00:00:00
print the shape:(85, 2) 


  train_features_df = inbound_120days_df[(lgbm_train['Dates'] >= start_train_date) &(inbound_2022_dry['Dates'] < end_train_date)].copy()


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 90
[LightGBM] [Info] Number of data points in the train set: 85, number of used features: 3
[LightGBM] [Info] Start training from score 64526.788235
Backtesting on: 2022-02-14 00:00:00
Backtesting training ending on: 2022-06-14 00:00:00
print the shape:(91, 2) 


  train_features_df = inbound_120days_df[(lgbm_train['Dates'] >= start_train_date) &(inbound_2022_dry['Dates'] < end_train_date)].copy()


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 96
[LightGBM] [Info] Number of data points in the train set: 91, number of used features: 3
[LightGBM] [Info] Start training from score 65060.406593
Backtesting on: 2022-02-21 00:00:00
Backtesting training ending on: 2022-06-21 00:00:00
print the shape:(96, 2) 


  train_features_df = inbound_120days_df[(lgbm_train['Dates'] >= start_train_date) &(inbound_2022_dry['Dates'] < end_train_date)].copy()


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99
[LightGBM] [Info] Number of data points in the train set: 96, number of used features: 3
[LightGBM] [Info] Start training from score 65373.416667
Backtesting on: 2022-02-28 00:00:00
Backtesting training ending on: 2022-06-28 00:00:00
print the shape:(96, 2) 


  train_features_df = inbound_120days_df[(lgbm_train['Dates'] >= start_train_date) &(inbound_2022_dry['Dates'] < end_train_date)].copy()


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99
[LightGBM] [Info] Number of data points in the train set: 96, number of used features: 3
[LightGBM] [Info] Start training from score 65505.145833
Backtesting on: 2022-03-07 00:00:00
Backtesting training ending on: 2022-07-05 00:00:00
print the shape:(96, 2) 


  train_features_df = inbound_120days_df[(lgbm_train['Dates'] >= start_train_date) &(inbound_2022_dry['Dates'] < end_train_date)].copy()


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99
[LightGBM] [Info] Number of data points in the train set: 96, number of used features: 3
[LightGBM] [Info] Start training from score 65387.229167
Backtesting on: 2022-03-14 00:00:00
Backtesting training ending on: 2022-07-12 00:00:00
print the shape:(96, 2) 


  train_features_df = inbound_120days_df[(lgbm_train['Dates'] >= start_train_date) &(inbound_2022_dry['Dates'] < end_train_date)].copy()


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99
[LightGBM] [Info] Number of data points in the train set: 96, number of used features: 3
[LightGBM] [Info] Start training from score 64554.572917
Backtesting on: 2022-03-21 00:00:00
Backtesting training ending on: 2022-07-19 00:00:00
print the shape:(96, 2) 


  train_features_df = inbound_120days_df[(lgbm_train['Dates'] >= start_train_date) &(inbound_2022_dry['Dates'] < end_train_date)].copy()


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99
[LightGBM] [Info] Number of data points in the train set: 96, number of used features: 3
[LightGBM] [Info] Start training from score 63449.416667
Backtesting on: 2022-03-28 00:00:00
Backtesting training ending on: 2022-07-26 00:00:00
print the shape:(95, 2) 


  train_features_df = inbound_120days_df[(lgbm_train['Dates'] >= start_train_date) &(inbound_2022_dry['Dates'] < end_train_date)].copy()


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99
[LightGBM] [Info] Number of data points in the train set: 95, number of used features: 3
[LightGBM] [Info] Start training from score 62831.157895
Backtesting on: 2022-04-04 00:00:00
Backtesting training ending on: 2022-08-02 00:00:00
print the shape:(95, 2) 


  train_features_df = inbound_120days_df[(lgbm_train['Dates'] >= start_train_date) &(inbound_2022_dry['Dates'] < end_train_date)].copy()


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99
[LightGBM] [Info] Number of data points in the train set: 95, number of used features: 3
[LightGBM] [Info] Start training from score 61808.915789
Backtesting on: 2022-04-11 00:00:00
Backtesting training ending on: 2022-08-09 00:00:00
print the shape:(96, 2) 


  train_features_df = inbound_120days_df[(lgbm_train['Dates'] >= start_train_date) &(inbound_2022_dry['Dates'] < end_train_date)].copy()


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99
[LightGBM] [Info] Number of data points in the train set: 96, number of used features: 3
[LightGBM] [Info] Start training from score 60976.979167
Backtesting on: 2022-04-18 00:00:00
Backtesting training ending on: 2022-08-16 00:00:00
print the shape:(95, 2) 


  train_features_df = inbound_120days_df[(lgbm_train['Dates'] >= start_train_date) &(inbound_2022_dry['Dates'] < end_train_date)].copy()


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99
[LightGBM] [Info] Number of data points in the train set: 95, number of used features: 3
[LightGBM] [Info] Start training from score 59828.652632
Backtesting on: 2022-04-25 00:00:00
Backtesting training ending on: 2022-08-23 00:00:00
print the shape:(95, 2) 


  train_features_df = inbound_120days_df[(lgbm_train['Dates'] >= start_train_date) &(inbound_2022_dry['Dates'] < end_train_date)].copy()


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99
[LightGBM] [Info] Number of data points in the train set: 95, number of used features: 3
[LightGBM] [Info] Start training from score 60764.084211
Backtesting on: 2022-05-02 00:00:00
Backtesting training ending on: 2022-08-30 00:00:00
print the shape:(96, 2) 


  train_features_df = inbound_120days_df[(lgbm_train['Dates'] >= start_train_date) &(inbound_2022_dry['Dates'] < end_train_date)].copy()


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99
[LightGBM] [Info] Number of data points in the train set: 96, number of used features: 3
[LightGBM] [Info] Start training from score 60852.822917
Backtesting on: 2022-05-09 00:00:00
Backtesting training ending on: 2022-09-06 00:00:00
print the shape:(96, 2) 


  train_features_df = inbound_120days_df[(lgbm_train['Dates'] >= start_train_date) &(inbound_2022_dry['Dates'] < end_train_date)].copy()


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99
[LightGBM] [Info] Number of data points in the train set: 96, number of used features: 3
[LightGBM] [Info] Start training from score 60918.072917
Backtesting on: 2022-05-16 00:00:00
Backtesting training ending on: 2022-09-13 00:00:00
print the shape:(96, 2) 


  train_features_df = inbound_120days_df[(lgbm_train['Dates'] >= start_train_date) &(inbound_2022_dry['Dates'] < end_train_date)].copy()


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99
[LightGBM] [Info] Number of data points in the train set: 96, number of used features: 3
[LightGBM] [Info] Start training from score 61711.166667
Backtesting on: 2022-05-23 00:00:00
Backtesting training ending on: 2022-09-20 00:00:00
print the shape:(97, 2) 


  train_features_df = inbound_120days_df[(lgbm_train['Dates'] >= start_train_date) &(inbound_2022_dry['Dates'] < end_train_date)].copy()


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 102
[LightGBM] [Info] Number of data points in the train set: 97, number of used features: 3
[LightGBM] [Info] Start training from score 61732.484536
Backtesting on: 2022-05-30 00:00:00
Backtesting training ending on: 2022-09-27 00:00:00
print the shape:(99, 2) 


  train_features_df = inbound_120days_df[(lgbm_train['Dates'] >= start_train_date) &(inbound_2022_dry['Dates'] < end_train_date)].copy()


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 102
[LightGBM] [Info] Number of data points in the train set: 99, number of used features: 3
[LightGBM] [Info] Start training from score 61721.686869
Backtesting on: 2022-06-06 00:00:00
Backtesting training ending on: 2022-10-04 00:00:00
print the shape:(99, 2) 


  train_features_df = inbound_120days_df[(lgbm_train['Dates'] >= start_train_date) &(inbound_2022_dry['Dates'] < end_train_date)].copy()


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 102
[LightGBM] [Info] Number of data points in the train set: 99, number of used features: 3
[LightGBM] [Info] Start training from score 61609.656566
Backtesting on: 2022-06-13 00:00:00
Backtesting training ending on: 2022-10-11 00:00:00
print the shape:(100, 2) 


  train_features_df = inbound_120days_df[(lgbm_train['Dates'] >= start_train_date) &(inbound_2022_dry['Dates'] < end_train_date)].copy()


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 105
[LightGBM] [Info] Number of data points in the train set: 100, number of used features: 3
[LightGBM] [Info] Start training from score 61792.360000
Backtesting on: 2022-06-20 00:00:00
Backtesting training ending on: 2022-10-18 00:00:00
print the shape:(100, 2) 


  train_features_df = inbound_120days_df[(lgbm_train['Dates'] >= start_train_date) &(inbound_2022_dry['Dates'] < end_train_date)].copy()


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 105
[LightGBM] [Info] Number of data points in the train set: 100, number of used features: 3
[LightGBM] [Info] Start training from score 61761.460000
Backtesting on: 2022-06-27 00:00:00
Backtesting training ending on: 2022-10-25 00:00:00
print the shape:(100, 2) 


  train_features_df = inbound_120days_df[(lgbm_train['Dates'] >= start_train_date) &(inbound_2022_dry['Dates'] < end_train_date)].copy()


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 105
[LightGBM] [Info] Number of data points in the train set: 100, number of used features: 3
[LightGBM] [Info] Start training from score 61610.720000
Backtesting on: 2022-07-04 00:00:00
Backtesting training ending on: 2022-11-01 00:00:00
print the shape:(100, 2) 


  train_features_df = inbound_120days_df[(lgbm_train['Dates'] >= start_train_date) &(inbound_2022_dry['Dates'] < end_train_date)].copy()


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 105
[LightGBM] [Info] Number of data points in the train set: 100, number of used features: 3
[LightGBM] [Info] Start training from score 61347.660000
Backtesting on: 2022-07-11 00:00:00
Backtesting training ending on: 2022-11-08 00:00:00
print the shape:(99, 2) 


  train_features_df = inbound_120days_df[(lgbm_train['Dates'] >= start_train_date) &(inbound_2022_dry['Dates'] < end_train_date)].copy()


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 102
[LightGBM] [Info] Number of data points in the train set: 99, number of used features: 3
[LightGBM] [Info] Start training from score 63188.525253
Backtesting on: 2022-07-18 00:00:00
Backtesting training ending on: 2022-11-15 00:00:00
print the shape:(98, 2) 


  train_features_df = inbound_120days_df[(lgbm_train['Dates'] >= start_train_date) &(inbound_2022_dry['Dates'] < end_train_date)].copy()


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 102
[LightGBM] [Info] Number of data points in the train set: 98, number of used features: 3
[LightGBM] [Info] Start training from score 65489.020408
Backtesting on: 2022-07-25 00:00:00
Backtesting training ending on: 2022-11-22 00:00:00
print the shape:(99, 2) 


  train_features_df = inbound_120days_df[(lgbm_train['Dates'] >= start_train_date) &(inbound_2022_dry['Dates'] < end_train_date)].copy()


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 102
[LightGBM] [Info] Number of data points in the train set: 99, number of used features: 3
[LightGBM] [Info] Start training from score 66974.434343
Backtesting on: 2022-08-01 00:00:00
Backtesting training ending on: 2022-11-29 00:00:00
print the shape:(99, 2) 


  train_features_df = inbound_120days_df[(lgbm_train['Dates'] >= start_train_date) &(inbound_2022_dry['Dates'] < end_train_date)].copy()


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 102
[LightGBM] [Info] Number of data points in the train set: 99, number of used features: 3
[LightGBM] [Info] Start training from score 68682.717172
Backtesting on: 2022-08-08 00:00:00
Backtesting training ending on: 2022-12-06 00:00:00
print the shape:(99, 2) 


  train_features_df = inbound_120days_df[(lgbm_train['Dates'] >= start_train_date) &(inbound_2022_dry['Dates'] < end_train_date)].copy()


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 102
[LightGBM] [Info] Number of data points in the train set: 99, number of used features: 3
[LightGBM] [Info] Start training from score 69917.171717
Backtesting on: 2022-08-15 00:00:00
Backtesting training ending on: 2022-12-13 00:00:00
print the shape:(99, 2) 


  train_features_df = inbound_120days_df[(lgbm_train['Dates'] >= start_train_date) &(inbound_2022_dry['Dates'] < end_train_date)].copy()


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 102
[LightGBM] [Info] Number of data points in the train set: 99, number of used features: 3
[LightGBM] [Info] Start training from score 71187.202020
Backtesting on: 2022-08-22 00:00:00
Backtesting training ending on: 2022-12-20 00:00:00
print the shape:(101, 2) 


  train_features_df = inbound_120days_df[(lgbm_train['Dates'] >= start_train_date) &(inbound_2022_dry['Dates'] < end_train_date)].copy()


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 105
[LightGBM] [Info] Number of data points in the train set: 101, number of used features: 3
[LightGBM] [Info] Start training from score 71026.000000
Backtesting completed.


In [0]:
backtest_results

Unnamed: 0,ds,yhat,Dates,LGBM_FC
156,2022-06-08,78813.232565,2022-06-08,78813.232565
157,2022-06-09,81681.015987,2022-06-09,81681.015987
158,2022-06-10,79552.502570,2022-06-10,79552.502570
159,2022-06-11,58389.246499,2022-06-11,58389.246499
161,2022-06-13,68145.201898,2022-06-13,68145.201898
...,...,...,...,...
357,2022-12-26,58371.947732,2022-12-26,58371.947732
359,2022-12-28,63531.174858,2022-12-28,63531.174858
360,2022-12-29,56616.228980,2022-12-29,56616.228980
361,2022-12-30,68406.621469,2022-12-30,68406.621469


### Reformat backtest

In [0]:
backtest_results['Dates']= backtest_results['ds']
backtest_results['LGBM_FC']= backtest_results['yhat']
backtest_df = backtest_results[['Dates','LGBM_FC']].reset_index(drop=True).copy()
backtest_df['Dates'] = pd.to_datetime(backtest_df['Dates'])

### Join the backtest to the inbound

In [0]:
###Remove the data quality issues where FC is exactly te same as actuals

backtest_df_merge = pd.merge(inbound_2022_df[inbound_2022_df['Dry Fc']!=inbound_2022_df['Dry Actuals']],backtest_df, on="Dates", how='inner')[['Dates','Dry Actuals','Dry Fc','LGBM_FC']]
backtest_df_merge.head(20)

Unnamed: 0,Dates,Dry Actuals,Dry Fc,LGBM_FC
0,2022-06-08,53805.0,57255.313386,78813.232565
1,2022-06-09,58254.0,57574.319188,81681.015987
2,2022-06-10,56166.0,55072.698355,79552.50257
3,2022-06-11,61616.0,58815.70035,58389.246499
4,2022-06-13,99560.0,96667.372564,68145.201898
5,2022-06-15,62152.0,61393.937213,75340.344247
6,2022-06-16,47822.0,57778.462574,57915.033559
7,2022-06-17,63600.0,55121.389425,66476.662601
8,2022-06-18,66165.0,59001.605554,74036.38752
9,2022-06-20,100734.0,96801.0244,66052.556926


## Evaluate

### Create error columns

In [0]:
for error in  error_to_calc:
  for i in range(len(actual_cols)):
    calculate_errors(df_eval= backtest_df_merge, actuals_col=actual_cols[i], fc_col= fc_cols[i],error_to_calculate=error,error_colname_suffix=error_suffixes[i])
    

In [0]:
backtest_df_merge

Unnamed: 0,Dates,Dry Actuals,Dry Fc,LGBM_FC,Bias_Dry_WOW,Bias_Dry_LGBM,AbsError_Dry_WOW,AbsError_Dry_LGBM,SqError_Dry_WOW,SqError_Dry_LGBM,RelAbsError_Dry_WOW,Outside_range_Dry_WOW,RelAbsError_Dry_LGBM,Outside_range_Dry_LGBM
0,2022-06-08,53805.0,57255.313386,78813.232565,3450.313386,25008.232565,3450.313386,25008.232565,1.190466e+07,6.254117e+08,0.064126,False,0.464794,True
1,2022-06-09,58254.0,57574.319188,81681.015987,-679.680812,23427.015987,679.680812,23427.015987,4.619660e+05,5.488251e+08,0.011668,False,0.402153,True
2,2022-06-10,56166.0,55072.698355,79552.502570,-1093.301645,23386.502570,1093.301645,23386.502570,1.195308e+06,5.469285e+08,0.019466,False,0.416382,True
3,2022-06-11,61616.0,58815.700350,58389.246499,-2800.299650,-3226.753501,2800.299650,3226.753501,7.841678e+06,1.041194e+07,0.045448,False,0.052369,False
4,2022-06-13,99560.0,96667.372564,68145.201898,-2892.627436,-31414.798102,2892.627436,31414.798102,8.367293e+06,9.868895e+08,0.029054,False,0.315536,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140,2022-12-26,90911.0,95488.000000,58371.947732,4577.000000,-32539.052268,4577.000000,32539.052268,2.094893e+07,1.058790e+09,0.050346,False,0.357922,True
141,2022-12-28,47680.0,71916.320000,63531.174858,24236.320000,15851.174858,24236.320000,15851.174858,5.873992e+08,2.512597e+08,0.508312,True,0.332449,True
142,2022-12-29,39552.0,60346.346000,56616.228980,20794.346000,17064.228980,20794.346000,17064.228980,4.324048e+08,2.911879e+08,0.525747,True,0.431438,True
143,2022-12-30,61119.0,52203.888000,68406.621469,-8915.112000,7287.621469,8915.112000,7287.621469,7.947922e+07,5.310943e+07,0.145865,True,0.119237,True


### Calculate global metrics

In [0]:
global_metrics_dry = describe_inbound_fc(inbound_fc=backtest_df_merge, cols_to_keep=['Dry Actuals','Dry Fc','LGBM_FC','Bias_Dry_WOW','Bias_Dry_LGBM', 'AbsError_Dry_WOW','AbsError_Dry_LGBM','SqError_Dry_WOW','SqError_Dry_LGBM','RelAbsError_Dry_WOW','RelAbsError_Dry_LGBM', 'Outside_range_Dry_WOW', 'Outside_range_Dry_LGBM'])

  described_df = inbound_fc.describe(include='all')[cols_to_keep].rename_axis('Metrics').reset_index(drop=False).copy()


In [0]:
global_metrics_dry

Unnamed: 0,Metrics,Dry Actuals,Dry Fc,LGBM_FC,Bias_Dry_WOW,Bias_Dry_LGBM,AbsError_Dry_WOW,AbsError_Dry_LGBM,SqError_Dry_WOW,SqError_Dry_LGBM,RelAbsError_Dry_WOW,RelAbsError_Dry_LGBM,Outside_range_Dry_WOW,Outside_range_Dry_LGBM
0,count,145.0,145.0,145.0,145.0,145.0,145.0,145.0,145.0,145.0,145.0,145.0,145,145
1,unique,,,,,,,,,,,,2,2
2,top,,,,,,,,,,,,False,True
3,freq,,,,,,,,,,,,90,112
4,first,,,,,,,,,,,,,
5,last,,,,,,,,,,,,,
6,mean,63844.951724,66432.019795,62697.759803,2587.068071,-1147.191921,6122.56163,17713.741926,81208540.0,605707900.0,0.126875,0.297003,,
7,std,23805.820095,23053.863537,10029.711595,8662.164122,24669.596347,6635.23989,17145.218427,256367200.0,1408421000.0,0.244094,0.38431,,
8,min,20444.0,35712.572,39857.007491,-19209.068,-105445.02485,4.0,33.1379,16.0,1098.12,8.1e-05,0.000673,,
9,25%,49238.0,52992.704386,55414.432294,-2499.2,-12219.660543,2163.796,6213.91285,4682013.0,38612710.0,0.033877,0.118269,,


### Visualize profiles

In [0]:
plot_two_inbound_fc(inbound_df=backtest_df_merge, actuals_col='Dry Actuals', fc_col_1='Dry Fc', fc_col_2= 'LGBM_FC', show_out_of_range=True, date_col='Dates', outside_range_col_1='Outside_range_Dry_WOW', outside_range_col_2='Outside_range_Dry_LGBM')

### Visualize errors

In [0]:
plot_inbound_two_errors(inbound_df=backtest_df_merge, error_col_1='Bias_Dry_WOW', error_col_2= 'Bias_Dry_LGBM', show_out_of_range=True, date_col='Dates', outside_range_col_1='Outside_range_Dry_WOW', outside_range_col_2='Outside_range_Dry_LGBM')

In [0]:
plot_inbound_two_errors(inbound_df=backtest_df_merge, error_col_1='AbsError_Dry_WOW', error_col_2= 'AbsError_Dry_LGBM', show_out_of_range=True, date_col='Dates', outside_range_col_1='Outside_range_Dry_WOW', outside_range_col_2='Outside_range_Dry_LGBM')