# Run this notebook on the Demand forecast cluster

In [0]:
%pip install lightgbm

Python interpreter will be restarted.
Python interpreter will be restarted.


In [0]:
import logging
# Only let ERROR-level (and above) messages from the py4j gateway logger through.
logging.getLogger("py4j.java_gateway").setLevel(level=logging.ERROR)
# Handle to the JVM-side log4j package, reached through the Spark session's gateway.
logger = spark._jvm.org.apache.log4j

In [0]:
%run /Users/ebizindavyi@delhaize.be/Data_preparation/Parse_xls_files



In [0]:
%run /Users/ebizindavyi@delhaize.be/Evaluation/Evaluation_functions

In [0]:
import pandas as pd
import numpy as np
import datetime as dt
import lightgbm as lgb
from reusable.featurestore.features_schoolholidays import school_holidays
from pyspark.sql import functions as f


## Evaluation Params

In [0]:
# Position i of actual_cols / fc_cols / error_suffixes describes one comparison:
# the forecast column fc_cols[i] is scored against actual_cols[i] and the resulting
# error columns are tagged with error_suffixes[i].
fc_cols = ['Dry Fc', 'LGBM_FC']
actual_cols = ['Dry Actuals'] * len(fc_cols)
error_suffixes = ['Dry_WOW', 'Dry_LGBM']

# Error measures passed one at a time to calculate_errors.
error_to_calc = [
  "Bias",
  "AbsError",
  "SqError",
  "RelAbsError",
]

## Load inbound data

In [0]:
# Full path (DBFS mount) of the 2022 Ecom-Ops status workbook used as input.
inbound_2022_dir = (
  '/dbfs/mnt/dataplatform/acc/DataScience/sandbox/ebiz/Inbound_FC/data/input/'
  'Ecom-Ops-status_2022.xlsx'
)

In [0]:
# Build the inbound DataFrame from the 2022 workbook. create_master_df is not defined
# in this notebook; presumably it comes from the %run of Parse_xls_files above — confirm.
inbound_2022_df = create_master_df(inbound_2022_dir)


In [0]:
# calendar_2022 = pd.DataFrame({'Dates':pd.date_range(dt.datetime(2022,1,3), periods=363, freq='d')})
# inbound_2022_dry = calendar_2022.merge(inbound_2022_df[['Dates','Dry Actuals']], on='Dates',how='left').fillna(0).copy()

### Get holidays

In [0]:
def _region_school_holidays(region_flag_col, holiday_label):
  """Return a pandas frame of school-holiday dates for one region.

  Keeps the rows of school_holidays() where `region_flag_col` equals 1 and
  'holiday_name' is null, labels them with `holiday_label` in a 'holiday'
  column, and exposes the 'date' column converted to a date as 'ds'.

  Args:
    region_flag_col: name of the region indicator column to filter on.
    holiday_label: constant written to the 'holiday' column.

  Returns:
    pandas DataFrame with the columns 'holiday' and 'ds'.
  """
  flagged_for_region = f.col(region_flag_col) == 1
  without_holiday_name = f.col('holiday_name').isNull()
  return (
    school_holidays()
    .filter(flagged_for_region & without_holiday_name)
    .withColumn('holiday', f.lit(holiday_label))
    .withColumn('ds', f.to_date(f.col('date')))
    .select('holiday', 'ds')
    .toPandas()
  )


flemish_holidays = _region_school_holidays('is_dutch_region_holiday', 'flemish_region_holiday')
walloon_holidays = _region_school_holidays('is_french_region_holiday', 'walloon_region_holiday')

# Stack both regions; the original row indexes of each frame are kept as-is.
holidays = pd.concat([flemish_holidays, walloon_holidays])


## LightGBM


### Parameters

In [0]:
# --- Backtest window -------------------------------------------------------
# First and last dates used by the backtest loop's stopping condition.
backtest_start_date = pd.to_datetime('2022-01-03')
backtest_end_date = pd.to_datetime('2022-09-30')

# --- Lag features ----------------------------------------------------------
# Shift sizes used to build the x_7days / x_14days / x_21days feature columns.
shift_period_1, shift_period_2, shift_period_3 = 7, 14, 21

# --- LightGBM --------------------------------------------------------------
# Training parameters handed to lgb.train.
params = dict(
  objective='regression',
  metric='rmse',
  boosting_type='gbdt',
)

# --- Forecast set-up -------------------------------------------------------
horizon = 8           # days between the end of the training window and the first predicted date
freq_forecast = 1
expected_pred = 1     # number of days predicted per iteration, and the step of the backtest loop

training_period = 90  # length of each training window, in days

### Backtest

In [0]:
# Rolling backtest of the LightGBM model on 'Dry Actuals'.
#
# For each window start between backtest_start_date and backtest_end_date (stepping by
# expected_pred days) a model is trained on the rows dated in
# [window start, window start + training_period days) and used to predict the
# expected_pred day(s) that begin horizon days after the end of that window.
# The predictions end up in backtest_results with the columns 'ds' and 'yhat'.
#
# The window start is tracked in a local cursor, so backtest_start_date is left
# untouched and this cell can be re-run without re-running the parameter cell.

lag_feature_cols = ['x_7days', 'x_14days', 'x_21days']

inbound_2022_train = inbound_2022_df[['Dates', 'Dry Actuals']].copy() #original filtered dataset, no sundays&NANs

# NOTE(review): shift() lags by rows, not by calendar days. If Sundays / missing days are
# absent from inbound_2022_df, a 7-row lag is more than 7 calendar days — confirm intended.
inbound_2022_train['x_7days'] = inbound_2022_df['Dry Actuals'].shift(shift_period_1)
inbound_2022_train['x_14days'] = inbound_2022_df['Dry Actuals'].shift(shift_period_2)
inbound_2022_train['x_21days'] = inbound_2022_df['Dry Actuals'].shift(shift_period_3)

fold_predictions = []  # one small frame per iteration, concatenated once at the end
window_step = dt.timedelta(days=expected_pred)
window_start = backtest_start_date

# Iterate through each backtesting period
while window_start + window_step <= backtest_end_date:
  print(f"Backtesting on: {window_start}")

  end_train_date = window_start + dt.timedelta(days=training_period)
  print(f"Backtesting training ending on: {end_train_date}")

  in_train_window = (
    (inbound_2022_train['Dates'] >= window_start)
    & (inbound_2022_train['Dates'] < end_train_date)
  )
  train_window = inbound_2022_train[in_train_window]
  print(f"print the shape:{train_window.shape} ")

  # Rows with a missing date, target or lag value cannot be used for training.
  train_window = train_window.dropna()

  # Rows of the dataset that fall on the date(s) to predict; a date without a row
  # in the dataset simply yields no prediction.
  future_dates = pd.date_range(start=end_train_date + dt.timedelta(days=horizon), periods=expected_pred, freq='D')
  target_rows = inbound_2022_train[inbound_2022_train['Dates'].isin(future_dates)]

  if train_window.empty:
    # Nothing to fit on (e.g. every row in the window still has NaN lags).
    print(f"No usable training rows for window starting {window_start}; skipped.")
  elif not target_rows.empty:
    # Create LightGBM dataset and train the model for this window
    lgb_train = lgb.Dataset(train_window[lag_feature_cols], label=train_window['Dry Actuals'])
    m = lgb.train(params, lgb_train)

    # Forecast using lgbm model m
    forecast = m.predict(
      target_rows[lag_feature_cols].values,
      num_iteration=m.best_iteration,
      predict_disable_shape_check=True,
    )

    # Keep the dates (and index) of the dataset rows rather than the raw date range.
    fold_predictions.append(pd.DataFrame({'ds': target_rows['Dates'], 'yhat': forecast}))

  # Slide the training window forward
  window_start += window_step

if fold_predictions:
  backtest_results = pd.concat(fold_predictions)
else:
  # Keep the expected columns available to downstream cells even without predictions.
  backtest_results = pd.DataFrame(columns=['ds', 'yhat'])

print("Backtesting completed.")


Backtesting on: 2022-01-03 00:00:00
Backtesting training ending on: 2022-04-03 00:00:00
print the shape:(75, 5) 
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 57
[LightGBM] [Info] Number of data points in the train set: 54, number of used features: 3
[LightGBM] [Info] Start training from score 65105.407407
Backtesting on: 2022-01-04 00:00:00
Backtesting training ending on: 2022-04-04 00:00:00
print the shape:(74, 5) 
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 57
[LightGBM] [Info] Number of data points in the train set: 54, number of used features: 3
[LightGBM] [Info] Start training from score 65105.407407
Backtesting on: 2022-01-05 00:00:00
Backtesting training ending on: 2022-04-05 00:00:00
print the shape:(74, 5) 
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 60
[LightGBM] [Info] Number of data points in the train set: 55, number of used features: 3
[LightGBM]

In [0]:
# Display the collected backtest predictions (columns 'ds' and 'yhat').
backtest_results

Unnamed: 0,ds,yhat
80,2022-04-11,67653.722495
81,2022-04-12,69991.422135
82,2022-04-13,64246.388556
83,2022-04-14,60849.638747
84,2022-04-15,58772.315150
...,...,...
293,2022-12-27,54320.344001
294,2022-12-28,61504.879302
295,2022-12-29,55829.256966
296,2022-12-30,55446.292318


### Reformat backtest

In [0]:
# Expose the predictions under the column names used by the evaluation step:
# 'ds' is duplicated as 'Dates' and 'yhat' as 'LGBM_FC'.
backtest_results = backtest_results.assign(
  Dates=backtest_results['ds'],
  LGBM_FC=backtest_results['yhat'],
)

# Two-column frame with a fresh 0..n-1 index and 'Dates' as datetimes.
backtest_df = (
  backtest_results
  .loc[:, ['Dates', 'LGBM_FC']]
  .reset_index(drop=True)
  .assign(Dates=lambda frame: pd.to_datetime(frame['Dates']))
)

### Join the backtest to the inbound

In [0]:
# Remove the data quality issues where the forecast is exactly the same as the actuals,
# then inner-join the remaining rows with the LightGBM backtest on 'Dates'.
backtest_df_merge = (
  inbound_2022_df
  .loc[lambda inbound: inbound['Dry Fc'] != inbound['Dry Actuals']]
  .merge(backtest_df, on="Dates", how='inner')
  [['Dates', 'Dry Actuals', 'Dry Fc', 'LGBM_FC']]
)
backtest_df_merge

Unnamed: 0,Dates,Dry Actuals,Dry Fc,LGBM_FC
0,2022-04-11,108071.0,102679.523110,67653.722495
1,2022-04-12,63663.0,67732.138787,69991.422135
2,2022-04-13,59816.0,68789.651233,64246.388556
3,2022-04-14,63475.0,66165.818167,60849.638747
4,2022-04-15,53515.0,63996.940710,58772.315150
...,...,...,...,...
212,2022-12-27,63764.0,71459.270000,54320.344001
213,2022-12-28,47680.0,71916.320000,61504.879302
214,2022-12-29,39552.0,60346.346000,55829.256966
215,2022-12-30,61119.0,52203.888000,55446.292318


## Evaluate

### Create error columns

In [0]:
# For every error measure, score each (actuals, forecast) pair; the three parameter
# lists are walked in lockstep so each pair gets its own column suffix.
for error in error_to_calc:
  for actuals_name, fc_name, suffix in zip(actual_cols, fc_cols, error_suffixes):
    calculate_errors(
      df_eval=backtest_df_merge,
      actuals_col=actuals_name,
      fc_col=fc_name,
      error_to_calculate=error,
      error_colname_suffix=suffix,
    )
    

In [0]:
# Display the merged frame after the error-calculation loop has run.
backtest_df_merge


Unnamed: 0,Dates,Dry Actuals,Dry Fc,LGBM_FC,Bias_Dry_WOW,Bias_Dry_LGBM,AbsError_Dry_WOW,AbsError_Dry_LGBM,SqError_Dry_WOW,SqError_Dry_LGBM,RelAbsError_Dry_WOW,Outside_range_Dry_WOW,RelAbsError_Dry_LGBM,Outside_range_Dry_LGBM
0,2022-04-11,108071.0,102679.523110,67653.722495,-5391.476890,-40417.277505,5391.476890,40417.277505,2.906802e+07,1.633556e+09,0.049888,False,0.373988,True
1,2022-04-12,63663.0,67732.138787,69991.422135,4069.138787,6328.422135,4069.138787,6328.422135,1.655789e+07,4.004893e+07,0.063917,False,0.099405,False
2,2022-04-13,59816.0,68789.651233,64246.388556,8973.651233,4430.388556,8973.651233,4430.388556,8.052642e+07,1.962834e+07,0.150021,True,0.074067,False
3,2022-04-14,63475.0,66165.818167,60849.638747,2690.818167,-2625.361253,2690.818167,2625.361253,7.240502e+06,6.892522e+06,0.042392,False,0.041361,False
4,2022-04-15,53515.0,63996.940710,58772.315150,10481.940710,5257.315150,10481.940710,5257.315150,1.098711e+08,2.763936e+07,0.195869,True,0.098240,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
212,2022-12-27,63764.0,71459.270000,54320.344001,7695.270000,-9443.655999,7695.270000,9443.655999,5.921718e+07,8.918264e+07,0.120684,True,0.148103,True
213,2022-12-28,47680.0,71916.320000,61504.879302,24236.320000,13824.879302,24236.320000,13824.879302,5.873992e+08,1.911273e+08,0.508312,True,0.289951,True
214,2022-12-29,39552.0,60346.346000,55829.256966,20794.346000,16277.256966,20794.346000,16277.256966,4.324048e+08,2.649491e+08,0.525747,True,0.411541,True
215,2022-12-30,61119.0,52203.888000,55446.292318,-8915.112000,-5672.707682,8915.112000,5672.707682,7.947922e+07,3.217961e+07,0.145865,True,0.092814,False


In [0]:

# Per date, sum the 'Outside_range_Dry_LGBM' flags of the LightGBM forecast.
days_out_range = (
  backtest_df_merge
  .groupby('Dates')
  .agg({'Outside_range_Dry_LGBM': 'sum'})
  .reset_index()
)
days_out_range

Unnamed: 0,Dates,Outside_range_Dry_LGBM
0,2022-04-11,1
1,2022-04-12,0
2,2022-04-13,0
3,2022-04-14,0
4,2022-04-15,0
...,...,...
212,2022-12-27,1
213,2022-12-28,1
214,2022-12-29,1
215,2022-12-30,0


In [0]:
# Keep only the dates whose summed out-of-range flag is non-zero.
has_out_of_range = days_out_range['Outside_range_Dry_LGBM'].ne(0)
filter_df = days_out_range[has_out_of_range]
filter_df

Unnamed: 0,Dates,Outside_range_Dry_LGBM
0,2022-04-11,1
5,2022-04-16,1
7,2022-04-20,1
8,2022-04-21,1
9,2022-04-22,1
...,...,...
211,2022-12-26,1
212,2022-12-27,1
213,2022-12-28,1
214,2022-12-29,1


### Calculate global metrics

In [0]:
# Summarise the actuals, both forecasts and every derived error column.
global_metrics_dry = describe_inbound_fc(
  inbound_fc=backtest_df_merge,
  cols_to_keep=[
    'Dry Actuals', 'Dry Fc', 'LGBM_FC',
    'Bias_Dry_WOW', 'Bias_Dry_LGBM',
    'AbsError_Dry_WOW', 'AbsError_Dry_LGBM',
    'SqError_Dry_WOW', 'SqError_Dry_LGBM',
    'RelAbsError_Dry_WOW', 'RelAbsError_Dry_LGBM',
    'Outside_range_Dry_WOW', 'Outside_range_Dry_LGBM',
  ],
)

  described_df = inbound_fc.describe(include='all')[cols_to_keep].rename_axis('Metrics').reset_index(drop=False).copy()


In [0]:
# Display the summary table returned by describe_inbound_fc.
global_metrics_dry

Unnamed: 0,Metrics,Dry Actuals,Dry Fc,LGBM_FC,Bias_Dry_WOW,Bias_Dry_LGBM,AbsError_Dry_WOW,AbsError_Dry_LGBM,SqError_Dry_WOW,SqError_Dry_LGBM,RelAbsError_Dry_WOW,RelAbsError_Dry_LGBM,Outside_range_Dry_WOW,Outside_range_Dry_LGBM
0,count,217.0,217.0,217.0,217.0,217.0,217.0,217.0,217.0,217.0,217.0,217.0,217,217
1,unique,,,,,,,,,,,,2,2
2,top,,,,,,,,,,,,False,True
3,freq,,,,,,,,,,,,126,150
4,first,,,,,,,,,,,,,
5,last,,,,,,,,,,,,,
6,mean,65115.456221,67041.670539,63296.557369,1926.214318,-1818.898852,6763.496803,16768.741443,100419500.0,553857200.0,0.126827,0.268651,,
7,std,23593.470839,20801.654616,10273.73528,9856.820775,23518.027869,7411.324025,16550.795938,323841400.0,1243664000.0,0.215607,0.316412,,
8,min,20444.0,35712.572,42059.704688,-59204.84,-107377.902516,4.0,72.23696,16.0,5218.178,8.1e-05,0.001305,,
9,25%,49605.0,55072.698355,54694.970396,-2789.838,-12242.844989,2211.934,5257.31515,4892652.0,27639360.0,0.033983,0.085995,,


### Visualize profiles

In [0]:
# Plot the actuals together with the existing forecast and the LightGBM forecast,
# with the out-of-range flags of both forecasts switched on.
plot_two_inbound_fc(
  inbound_df=backtest_df_merge,
  actuals_col='Dry Actuals',
  fc_col_1='Dry Fc',
  fc_col_2='LGBM_FC',
  show_out_of_range=True,
  date_col='Dates',
  outside_range_col_1='Outside_range_Dry_WOW',
  outside_range_col_2='Outside_range_Dry_LGBM',
)

### Visualize errors

In [0]:
# Compare the bias of the two forecasts over time.
plot_inbound_two_errors(
  inbound_df=backtest_df_merge,
  error_col_1='Bias_Dry_WOW',
  error_col_2='Bias_Dry_LGBM',
  show_out_of_range=True,
  date_col='Dates',
  outside_range_col_1='Outside_range_Dry_WOW',
  outside_range_col_2='Outside_range_Dry_LGBM',
)

In [0]:
# Compare the absolute error of the two forecasts over time.
plot_inbound_two_errors(
  inbound_df=backtest_df_merge,
  error_col_1='AbsError_Dry_WOW',
  error_col_2='AbsError_Dry_LGBM',
  show_out_of_range=True,
  date_col='Dates',
  outside_range_col_1='Outside_range_Dry_WOW',
  outside_range_col_2='Outside_range_Dry_LGBM',
)