# Run this notebook 
include complete calendar and public holidays

In [0]:
%pip install lightgbm

Python interpreter will be restarted.
Python interpreter will be restarted.


In [0]:
import logging
logger = spark._jvm.org.apache.log4j
logging.getLogger("py4j.java_gateway").setLevel(logging.ERROR)

In [0]:
%run /Users/ebizindavyi@delhaize.be/Data_preparation/Parse_xls_files



In [0]:
%run /Users/ebizindavyi@delhaize.be/Evaluation/Evaluation_functions

In [0]:
import pandas as pd
import numpy as np
import datetime as dt
import lightgbm as lgb
from reusable.featurestore.features_schoolholidays import school_holidays
from pyspark.sql import functions as f
import holidays



## Evaluation Params

In [0]:
actual_cols = ['Dry Actuals', 'Dry Actuals']
fc_cols = ['Dry Fc', 'LGBM_FC']
error_suffixes = ['Dry_WOW', 'Dry_LGBM']
error_to_calc = ["Bias", "AbsError", "SqError","RelAbsError"]

## Load inbound data

In [0]:
inbound_2022_dir = '/dbfs/mnt/dataplatform/acc/DataScience/sandbox/ebiz/Inbound_FC/data/input/Ecom-Ops-status_2022.xlsx'

In [0]:
inbound_2022_df = create_master_df(inbound_2022_dir)


In [0]:
calendar_2022 = pd.DataFrame({'Dates':pd.date_range(dt.datetime(2022,1,3), periods=363, freq='d')})
inbound_2022_dry = calendar_2022.merge(inbound_2022_df[['Dates','Dry Actuals']], on='Dates',how='left').fillna(0).copy()

## LightGBM


##Parameters

In [0]:
#lgbm parameter
backtest_start_date = pd.to_datetime('2022-01-03')
backtest_end_date =  pd.to_datetime('2022-12-01')

#shift periods to create features(lags)
shift_period_1 = 7
shift_period_2 = 14
shift_period_3 = 21

# Initialize lgbm parameters 
params = {
    'objective': 'regression',  
    'metric': 'rmse',           
    'boosting_type': 'gbdt',    
    # 'max_depth':2,      
}
# variables 
feature_col = ['x_7days', 'x_14days', 'x_21days']
pred_period=6
end_horizon = 7
fc_freq= 7
training_period= 30

### Clean the data

removes all sundays and holidays in the complete dataset

In [0]:
shift_periods = [shift_period_1, shift_period_2, shift_period_3]
column_names = ['x_7days', 'x_14days', 'x_21days']

for i, period in enumerate(shift_periods):
    inbound_2022_dry[column_names[i]] = inbound_2022_dry['Dry Actuals'].shift(period)


In [0]:
inbound_2022_dry

Unnamed: 0,Dates,Dry Actuals,x_7days,x_14days,x_21days
0,2022-01-03,84224.0,,,
1,2022-01-04,49564.0,,,
2,2022-01-05,45890.0,,,
3,2022-01-06,26976.0,,,
4,2022-01-07,37044.0,,,
...,...,...,...,...,...
358,2022-12-27,63764.0,75921.0,68514.0,76287.0
359,2022-12-28,47680.0,58697.0,65979.0,63301.0
360,2022-12-29,39552.0,62334.0,56286.0,54256.0
361,2022-12-30,61119.0,55510.0,50336.0,52050.0


In [0]:
#drop Nan and sundays and holidays with Zero in Actuals 
inbound_2022_dry = inbound_2022_dry.dropna().loc[inbound_2022_dry['Dry Actuals'] != 0].reset_index(drop=True)


In [0]:
inbound_2022_dry

Unnamed: 0,Dates,Dry Actuals,x_7days,x_14days,x_21days
0,2022-01-24,102266.0,81534.0,71540.0,84224.0
1,2022-01-25,80584.0,72886.0,63644.0,49564.0
2,2022-01-26,70974.0,53783.0,66433.0,45890.0
3,2022-01-27,67565.0,0.0,52802.0,26976.0
4,2022-01-28,57571.0,52025.0,43554.0,37044.0
...,...,...,...,...,...
275,2022-12-27,63764.0,75921.0,68514.0,76287.0
276,2022-12-28,47680.0,58697.0,65979.0,63301.0
277,2022-12-29,39552.0,62334.0,56286.0,54256.0
278,2022-12-30,61119.0,55510.0,50336.0,52050.0


### Backtest

In [0]:
backtest_results = pd.DataFrame()

inbound_2022_train = inbound_2022_dry[['Dates', 'Dry Actuals']].copy()#original non filtered dataset, 


# Iterate through each backtesting period
while backtest_start_date + dt.timedelta(days=pred_period) <= backtest_end_date:
  print(f"Backtesting on: {backtest_start_date}")
  
  start_train_date = backtest_start_date
  # end_train_date = backtest_start_date + dt.timedelta(days=training_period)
  end_train_date = start_train_date + dt.timedelta(days=training_period)
  print(f"Backtesting training ending on: {end_train_date}") 

  lgbm_train = inbound_2022_train[(inbound_2022_train['Dates'] >= start_train_date) &(inbound_2022_train['Dates'] < end_train_date)].copy()
  print(f"print the shape:{lgbm_train.shape} ")
  # Convert Dates column to datetime
  lgbm_train['Dates'] = pd.to_datetime(lgbm_train['Dates'])
    
  #Train features 
  train_features_df = inbound_2022_dry[(lgbm_train['Dates'] >= start_train_date) &(inbound_2022_dry['Dates'] < end_train_date)].copy()
  features_values_train=train_features_df.iloc[:, -3:]

  # Filter the dates with values in Dry Actuals 
  features_train = features_values_train.copy() 
  target = lgbm_train['Dry Actuals']
  
  # Create LightGBM dataset
  lgb_train = lgb.Dataset(features_train, label=target)
    
  # Model training
  m = lgb.train(params, lgb_train)

  # Prediction for the current period
  future_dates = pd.date_range(start=end_train_date+dt.timedelta(days=15), periods=pred_period, freq='D')

  future_features = pd.DataFrame({'ds': future_dates})
  future_features['ds'] = pd.to_datetime(future_features['ds'])

  future_features = inbound_2022_dry[inbound_2022_dry['Dates'].isin(future_dates)].iloc[:, -3:].values ### Changed the source df

  prediction_dates = inbound_2022_dry[inbound_2022_dry['Dates'].isin(future_dates)]['Dates'] ### New variable to extract valid dates from the future date range
  
  #forecast using lgbm model m

  forecast = m.predict(future_features, num_iteration=m.best_iteration,predict_disable_shape_check=True)

  lgbm_pred = pd.DataFrame({'ds': prediction_dates, 'yhat': forecast}) ### Use dates form df rather than the original date range
    
  backtest_results = pd.concat([backtest_results, lgbm_pred])
    
  # Increase training period after each iterations 
  backtest_start_date += dt.timedelta(days=end_horizon)
 
print("Backtesting completed.")


Backtesting on: 2022-01-03 00:00:00
Backtesting training ending on: 2022-02-02 00:00:00
print the shape:(8, 2) 
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 8, number of used features: 0
[LightGBM] [Info] Start training from score 75415.000000
Backtesting on: 2022-01-10 00:00:00
Backtesting training ending on: 2022-02-09 00:00:00
print the shape:(13, 2) 
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 13, number of used features: 0
[LightGBM] [Info] Start training from score 72862.923077
Backtesting on: 2022-01-17 00:00:00
Backtesting training ending on: 2022-02-16 00:00:00
print the shape:(18, 2) 
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 18, number of used features: 0
[LightGBM] [Info] Start training from score 74504.277778


ng] Stopped training because there are no more leaves that meet the split requirements
Backtesting on: 2022-11-14 00:00:00
Backtes

In [0]:
backtest_results

Unnamed: 0,ds,yhat
19,2022-02-17,75415.000000
20,2022-02-18,75415.000000
21,2022-02-19,75415.000000
22,2022-02-21,75415.000000
23,2022-02-22,75415.000000
...,...,...
274,2022-12-26,79813.840000
275,2022-12-27,79813.840000
277,2022-12-29,75812.038462
278,2022-12-30,75812.038462


### Reformat backtest

In [0]:
backtest_results['Dates']= backtest_results['ds']
backtest_results['LGBM_FC']= backtest_results['yhat']
backtest_df = backtest_results[['Dates','LGBM_FC']].reset_index(drop=True).copy()
backtest_df['Dates'] = pd.to_datetime(backtest_df['Dates'])

### Join the backtest to the inbound

In [0]:
###Remove the data quality issues where FC is exactly te same as actuals

backtest_df_merge = pd.merge(inbound_2022_df[inbound_2022_df['Dry Fc']!=inbound_2022_df['Dry Actuals']],backtest_df, on="Dates", how='inner')[['Dates','Dry Actuals','Dry Fc','LGBM_FC']]
backtest_df_merge

Unnamed: 0,Dates,Dry Actuals,Dry Fc,LGBM_FC
0,2022-02-17,29278.0,57180.579027,75415.000000
1,2022-02-18,37975.0,57286.371442,75415.000000
2,2022-02-19,52677.0,60975.818450,75415.000000
3,2022-02-21,96552.0,91629.000000,75415.000000
4,2022-02-22,70073.0,65231.000000,75415.000000
...,...,...,...,...
211,2022-12-26,90911.0,95488.000000,79813.840000
212,2022-12-27,63764.0,71459.270000,79813.840000
213,2022-12-29,39552.0,60346.346000,75812.038462
214,2022-12-30,61119.0,52203.888000,75812.038462


## Evaluate

### Create error columns

In [0]:
for error in  error_to_calc:
  for i in range(len(actual_cols)):
    calculate_errors(df_eval= backtest_df_merge, actuals_col=actual_cols[i], fc_col= fc_cols[i],error_to_calculate=error,error_colname_suffix=error_suffixes[i])
    

In [0]:
backtest_df_merge

Unnamed: 0,Dates,Dry Actuals,Dry Fc,LGBM_FC,Bias_Dry_WOW,Bias_Dry_LGBM,AbsError_Dry_WOW,AbsError_Dry_LGBM,SqError_Dry_WOW,SqError_Dry_LGBM,RelAbsError_Dry_WOW,Outside_range_Dry_WOW,RelAbsError_Dry_LGBM,Outside_range_Dry_LGBM
0,2022-02-17,29278.0,57180.579027,75415.000000,27902.579027,46137.000000,27902.579027,46137.000000,7.785539e+08,2.128623e+09,0.953022,True,1.575825,True
1,2022-02-18,37975.0,57286.371442,75415.000000,19311.371442,37440.000000,19311.371442,37440.000000,3.729291e+08,1.401754e+09,0.508529,True,0.985912,True
2,2022-02-19,52677.0,60975.818450,75415.000000,8298.818450,22738.000000,8298.818450,22738.000000,6.887039e+07,5.170166e+08,0.157542,True,0.431649,True
3,2022-02-21,96552.0,91629.000000,75415.000000,-4923.000000,-21137.000000,4923.000000,21137.000000,2.423593e+07,4.467728e+08,0.050988,False,0.218918,True
4,2022-02-22,70073.0,65231.000000,75415.000000,-4842.000000,5342.000000,4842.000000,5342.000000,2.344496e+07,2.853696e+07,0.069099,False,0.076235,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
211,2022-12-26,90911.0,95488.000000,79813.840000,4577.000000,-11097.160000,4577.000000,11097.160000,2.094893e+07,1.231470e+08,0.050346,False,0.122066,True
212,2022-12-27,63764.0,71459.270000,79813.840000,7695.270000,16049.840000,7695.270000,16049.840000,5.921718e+07,2.575974e+08,0.120684,True,0.251707,True
213,2022-12-29,39552.0,60346.346000,75812.038462,20794.346000,36260.038462,20794.346000,36260.038462,4.324048e+08,1.314790e+09,0.525747,True,0.916769,True
214,2022-12-30,61119.0,52203.888000,75812.038462,-8915.112000,14693.038462,8915.112000,14693.038462,7.947922e+07,2.158854e+08,0.145865,True,0.240401,True


In [0]:

days_out_range= backtest_df_merge.groupby('Dates').agg({'Outside_range_Dry_LGBM':'sum'}).reset_index()
days_out_range


Unnamed: 0,Dates,Outside_range_Dry_LGBM
0,2022-02-17,1
1,2022-02-18,1
2,2022-02-19,1
3,2022-02-21,1
4,2022-02-22,0
...,...,...
211,2022-12-26,1
212,2022-12-27,1
213,2022-12-29,1
214,2022-12-30,1


In [0]:
filter_df = days_out_range[days_out_range['Outside_range_Dry_LGBM'] != 0].reset_index(drop=True)
filter_df

Unnamed: 0,Dates,Outside_range_Dry_LGBM
0,2022-02-17,1
1,2022-02-18,1
2,2022-02-19,1
3,2022-02-21,1
4,2022-02-24,1
...,...,...
156,2022-12-26,1
157,2022-12-27,1
158,2022-12-29,1
159,2022-12-30,1


### Calculate global metrics

In [0]:
global_metrics_dry = describe_inbound_fc(inbound_fc=backtest_df_merge, cols_to_keep=['Dry Actuals','Dry Fc','LGBM_FC','Bias_Dry_WOW','Bias_Dry_LGBM', 'AbsError_Dry_WOW','AbsError_Dry_LGBM','SqError_Dry_WOW','SqError_Dry_LGBM','RelAbsError_Dry_WOW','RelAbsError_Dry_LGBM', 'Outside_range_Dry_WOW', 'Outside_range_Dry_LGBM'])





In [0]:
global_metrics_dry

Unnamed: 0,Metrics,Dry Actuals,Dry Fc,LGBM_FC,Bias_Dry_WOW,Bias_Dry_LGBM,AbsError_Dry_WOW,AbsError_Dry_LGBM,SqError_Dry_WOW,SqError_Dry_LGBM,RelAbsError_Dry_WOW,RelAbsError_Dry_LGBM,Outside_range_Dry_WOW,Outside_range_Dry_LGBM
0,count,216.0,216.0,216.0,216.0,216.0,216.0,216.0,216.0,216.0,216.0,216.0,216,216
1,unique,,,,,,,,,,,,2,2
2,top,,,,,,,,,,,,False,True
3,freq,,,,,,,,,,,,123,161
4,first,,,,,,,,,,,,,
5,last,,,,,,,,,,,,,
6,mean,66188.310185,66986.756716,67297.023587,798.446531,1108.713402,6925.023979,18501.489781,94033630.0,636863500.0,0.12107,0.29635,,
7,std,24165.070047,20972.820934,6814.589513,9686.615123,25270.352239,6803.821466,17202.570276,269284800.0,1554360000.0,0.152434,0.274349,,
8,min,20444.0,35712.572,51331.92,-59204.84,-122167.730769,4.0,251.4,16.0,63201.96,8.1e-05,0.003799,,
9,25%,49944.75,54915.24,65587.461538,-4641.545,-8873.614583,2454.73725,6362.76,6027309.0,40485660.0,0.035944,0.098473,,


### Visualize profiles

In [0]:
plot_two_inbound_fc(inbound_df=backtest_df_merge, actuals_col='Dry Actuals', fc_col_1='Dry Fc', fc_col_2= 'LGBM_FC', show_out_of_range=True, date_col='Dates', outside_range_col_1='Outside_range_Dry_WOW', outside_range_col_2='Outside_range_Dry_LGBM')

### Visualize errors

In [0]:
plot_inbound_two_errors(inbound_df=backtest_df_merge, error_col_1='Bias_Dry_WOW', error_col_2= 'Bias_Dry_LGBM', show_out_of_range=True, date_col='Dates', outside_range_col_1='Outside_range_Dry_WOW', outside_range_col_2='Outside_range_Dry_LGBM')

In [0]:
plot_inbound_two_errors(inbound_df=backtest_df_merge, error_col_1='AbsError_Dry_WOW', error_col_2= 'AbsError_Dry_LGBM', show_out_of_range=True, date_col='Dates', outside_range_col_1='Outside_range_Dry_WOW', outside_range_col_2='Outside_range_Dry_LGBM')