# Run this notebook on the Demand forecast cluster

In [0]:
%pip install prophet

In [0]:
import logging
logger = spark._jvm.org.apache.log4j
logging.getLogger("py4j.java_gateway").setLevel(logging.ERROR)

In [0]:
%run /Users/ebizindavyi@delhaize.be/Data_preparation/Parse_xls_files

In [0]:
%run /Users/ebizindavyi@delhaize.be/Evaluation/Evaluation_functions

In [0]:
import pandas as pd
import datetime as dt
from reusable.featurestore.features_schoolholidays import school_holidays
from pyspark.sql import functions as f
from prophet import Prophet

## Parameters

In [0]:
#Backtest
fc_frequency=7
backtest_start_date='2022-01-03'
backtest_end_date='2022-12-31'

#Forecast
start_horizon = 8
end_horizon = 14

#Prophet
changepoint_prior_scale =  0.01
seasonality_prior_scale = 1.0
training_period = 120


In [0]:
#Eval parameters
actual_cols = ['Dry Actuals', 'Dry Actuals']
fc_cols = ['Dry Fc', 'PROPHET_FC']
error_suffixes = ['Dry_WOW', 'Dry_PROPHET']
error_to_calc = ["Bias", "AbsError", "SqError","RelAbsError"]

## Load inbound data

In [0]:
inbound_2022_dir = '/dbfs/mnt/dataplatform/acc/DataScience/sandbox/ebiz/Inbound_FC/data/input/Ecom-Ops-status_2022.xlsx'

In [0]:
inbound_2022_df = create_master_df(inbound_2022_dir)

In [0]:
calendar_2022 = pd.DataFrame({'Dates':pd.date_range(dt.datetime(2022,1,3), periods=363, freq='d')})
inbound_2022_dry = calendar_2022.merge(inbound_2022_df[['Dates','Dry Actuals']], on='Dates',how='left').fillna(0).copy()

### Get holidays

In [0]:
flemish_holidays = (
  school_holidays()
  .filter(
    (f.col('is_dutch_region_holiday')==1)
    &
    (f.col('holiday_name').isNull())
    )
  .withColumn('holiday', f.lit('flemish_region_holiday'))
  .withColumn('ds', f.to_date(f.col('date')))
  .select('holiday','ds')
  .toPandas()
)

walloon_holidays =(
  school_holidays()
  .filter(
    (f.col('is_french_region_holiday')==1)
    &
    (f.col('holiday_name').isNull())
    )
  .withColumn('holiday', f.lit('walloon_region_holiday'))
  .withColumn('ds', f.to_date(f.col('date')))
  .select('holiday','ds')
  .toPandas()
)

holidays = pd.concat((flemish_holidays, walloon_holidays))


In [0]:
holidays

Unnamed: 0,holiday,ds
0,flemish_region_holiday,2015-01-02
1,flemish_region_holiday,2015-01-03
2,flemish_region_holiday,2015-01-04
3,flemish_region_holiday,2015-02-16
4,flemish_region_holiday,2015-02-17
...,...,...
872,walloon_region_holiday,2023-08-23
873,walloon_region_holiday,2023-08-24
874,walloon_region_holiday,2023-08-25
875,walloon_region_holiday,2023-08-26


## Prophet

### Backtest

In [0]:
backtest_results = pd.DataFrame()
starting_dates = pd.date_range(start=backtest_start_date, end=backtest_end_date,freq=f'{fc_frequency}D')

for start_date in starting_dates:
  print(f"Backtesting on: {start_date}")
  start_train_date = start_date
  end_train_date = start_train_date + dt.timedelta(days=training_period)

  ## Prepare the dataset to have it compatible with Prophet
  inbound_2022_dry_prophet = inbound_2022_dry[(inbound_2022_dry['Dates']>=start_train_date)&(inbound_2022_dry['Dates']<end_train_date)].copy()
  inbound_2022_dry_prophet['ds']= inbound_2022_dry_prophet['Dates']
  inbound_2022_dry_prophet['y']= inbound_2022_dry_prophet['Dry Actuals']
  inbound_2022_dry_prophet = inbound_2022_dry_prophet[['ds','y']]

  ## Fit model
  m = Prophet(holidays=holidays, changepoint_prior_scale=changepoint_prior_scale, seasonality_prior_scale=seasonality_prior_scale).add_country_holidays(country_name='BE').fit(inbound_2022_dry_prophet)
  future = m.make_future_dataframe(periods=end_horizon)

  ## Forecast
  forecast = m.predict(future)
  prophet_pred = forecast.tail(end_horizon-start_horizon+1)[['ds','yhat']].copy()

  # Append results
  backtest_results = pd.concat([backtest_results,prophet_pred])


In [0]:
backtest_results

Unnamed: 0,ds,yhat
127,2022-05-10,75834.758081
128,2022-05-11,60604.560348
129,2022-05-12,49186.083172
130,2022-05-13,56093.411058
131,2022-05-14,46303.206245
...,...,...
15,2023-01-10,-90436.924945
16,2023-01-11,-96997.642814
17,2023-01-12,-103558.360683
18,2023-01-13,-110119.078552


### Reformat backtest

In [0]:
backtest_results['Dates']= backtest_results['ds']
backtest_results['PROPHET_FC']= backtest_results['yhat']
backtest_df = backtest_results[['Dates','PROPHET_FC']].reset_index(drop=True).copy()

In [0]:
backtest_df

Unnamed: 0,Dates,PROPHET_FC
0,2022-05-10,75834.758081
1,2022-05-11,60604.560348
2,2022-05-12,49186.083172
3,2022-05-13,56093.411058
4,2022-05-14,46303.206245
...,...,...
359,2023-01-10,-90436.924945
360,2023-01-11,-96997.642814
361,2023-01-12,-103558.360683
362,2023-01-13,-110119.078552


### Join the backtest to the inbound

In [0]:
###Remove the data quality issues where FC is exactly te same as actuals

backtest_df_merge = pd.merge(inbound_2022_df[inbound_2022_df['Dry Fc']!=inbound_2022_df['Dry Actuals']],backtest_df, on="Dates", how='inner')[['Dates','Dry Fc','Dry Actuals','PROPHET_FC']]
backtest_df_merge

Unnamed: 0,Dates,Dry Fc,Dry Actuals,PROPHET_FC
0,2022-05-10,66717.729732,66459.0,75834.758081
1,2022-05-11,63909.213527,49313.0,60604.560348
2,2022-05-12,59648.343564,43377.0,49186.083172
3,2022-05-13,57664.833847,46556.0,56093.411058
4,2022-05-14,59931.051337,77061.0,46303.206245
...,...,...,...,...
189,2022-12-27,71459.270000,63764.0,85368.118171
190,2022-12-28,71916.320000,47680.0,77879.495703
191,2022-12-29,60346.346000,39552.0,68690.717810
192,2022-12-30,52203.888000,61119.0,65367.314212


## Evaluate

### Create error columns

In [0]:
for error in  error_to_calc:
  for i in range(len(actual_cols)):
    calculate_errors(df_eval= backtest_df_merge, actuals_col=actual_cols[i], fc_col= fc_cols[i],error_to_calculate=error,error_colname_suffix=error_suffixes[i])

In [0]:
backtest_df_merge

Unnamed: 0,Dates,Dry Fc,Dry Actuals,PROPHET_FC,Bias_Dry_WOW,Bias_Dry_PROPHET,AbsError_Dry_WOW,AbsError_Dry_PROPHET,SqError_Dry_WOW,SqError_Dry_PROPHET,RelAbsError_Dry_WOW,Outside_range_Dry_WOW,RelAbsError_Dry_PROPHET,Outside_range_Dry_PROPHET
0,2022-05-10,66717.729732,66459.0,75834.758081,258.729732,9375.758081,258.729732,9375.758081,6.694107e+04,8.790484e+07,0.003893,False,0.141076,True
1,2022-05-11,63909.213527,49313.0,60604.560348,14596.213527,11291.560348,14596.213527,11291.560348,2.130494e+08,1.274993e+08,0.295991,True,0.228977,True
2,2022-05-12,59648.343564,43377.0,49186.083172,16271.343564,5809.083172,16271.343564,5809.083172,2.647566e+08,3.374545e+07,0.375115,True,0.133921,True
3,2022-05-13,57664.833847,46556.0,56093.411058,11108.833847,9537.411058,11108.833847,9537.411058,1.234062e+08,9.096221e+07,0.238612,True,0.204859,True
4,2022-05-14,59931.051337,77061.0,46303.206245,-17129.948663,-30757.793755,17129.948663,30757.793755,2.934351e+08,9.460419e+08,0.222291,True,0.399136,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
189,2022-12-27,71459.270000,63764.0,85368.118171,7695.270000,21604.118171,7695.270000,21604.118171,5.921718e+07,4.667379e+08,0.120684,True,0.338814,True
190,2022-12-28,71916.320000,47680.0,77879.495703,24236.320000,30199.495703,24236.320000,30199.495703,5.873992e+08,9.120095e+08,0.508312,True,0.633379,True
191,2022-12-29,60346.346000,39552.0,68690.717810,20794.346000,29138.717810,20794.346000,29138.717810,4.324048e+08,8.490649e+08,0.525747,True,0.736719,True
192,2022-12-30,52203.888000,61119.0,65367.314212,-8915.112000,4248.314212,8915.112000,4248.314212,7.947922e+07,1.804817e+07,0.145865,True,0.069509,False


### Calculate global metrics

In [0]:
global_metrics_dry = describe_inbound_fc(inbound_fc=backtest_df_merge, cols_to_keep=['Dry Fc',	'Dry Actuals', 'PROPHET_FC', 'Bias_Dry_WOW', 'AbsError_Dry_WOW', 'SqError_Dry_WOW', 'RelAbsError_Dry_WOW', 'Outside_range_Dry_WOW', 'Bias_Dry_PROPHET', 'AbsError_Dry_PROPHET', 'SqError_Dry_PROPHET', 'RelAbsError_Dry_PROPHET', 'Outside_range_Dry_PROPHET'])

In [0]:
global_metrics_dry

Unnamed: 0,Metrics,Dry Fc,Dry Actuals,PROPHET_FC,Bias_Dry_WOW,AbsError_Dry_WOW,SqError_Dry_WOW,RelAbsError_Dry_WOW,Outside_range_Dry_WOW,Bias_Dry_PROPHET,AbsError_Dry_PROPHET,SqError_Dry_PROPHET,RelAbsError_Dry_PROPHET,Outside_range_Dry_PROPHET
0,count,194.0,194.0,194.0,194.0,194.0,194.0,194.0,194,194.0,194.0,194.0,194.0,194
1,unique,,,,,,,,2,,,,,2
2,top,,,,,,,,False,,,,,True
3,freq,,,,,,,,115,,,,,129
4,first,,,,,,,,,,,,,
5,last,,,,,,,,,,,,,
6,mean,66842.199314,64945.71134,65943.261461,1896.487974,6573.6206,98786280.0,0.123144,,997.550121,12130.926348,315329200.0,0.194672,
7,std,21349.594299,23780.149202,20240.41938,9781.759559,7474.070884,335580600.0,0.216504,,17775.34082,13001.582427,1033251000.0,0.212553,
8,min,35712.572,20444.0,30801.477692,-59204.84,4.0,16.0,8.1e-05,,-113737.91195,34.499596,1190.222,0.000307,
9,25%,54612.935945,49627.0,52382.34053,-2522.39,2083.976504,4345082.0,0.033756,,-5737.296805,4350.953027,18930800.0,0.081474,


### Visualize profiles

In [0]:
plot_two_inbound_fc(inbound_df=backtest_df_merge, actuals_col='Dry Actuals', fc_col_1='Dry Fc', fc_col_2= 'PROPHET_FC', show_out_of_range=True, date_col='Dates', outside_range_col_1='Outside_range_Dry_WOW', outside_range_col_2='Outside_range_Dry_PROPHET')

### Visualize errors

In [0]:
plot_inbound_two_errors(inbound_df=backtest_df_merge, error_col_1='Bias_Dry_WOW', error_col_2= 'Bias_Dry_PROPHET', show_out_of_range=True, date_col='Dates', outside_range_col_1='Outside_range_Dry_WOW', outside_range_col_2='Outside_range_Dry_PROPHET')

In [0]:
plot_inbound_two_errors(inbound_df=backtest_df_merge, error_col_1='AbsError_Dry_WOW', error_col_2= 'AbsError_Dry_PROPHET', show_out_of_range=True, date_col='Dates', outside_range_col_1='Outside_range_Dry_WOW', outside_range_col_2='Outside_range_Dry_PROPHET')