# Run this notebook on the Demand forecast cluster

In [0]:
%pip install prophet

In [0]:
import logging
logger = spark._jvm.org.apache.log4j
logging.getLogger("py4j.java_gateway").setLevel(logging.ERROR)

In [0]:
%run /Users/ebizindavyi@delhaize.be/Data_preparation/Parse_xls_files

In [0]:
%run /Users/ebizindavyi@delhaize.be/Evaluation/Evaluation_functions

In [0]:
import pandas as pd
import datetime as dt
from reusable.featurestore.features_schoolholidays import school_holidays
from pyspark.sql import functions as f
from prophet import Prophet

## Parameters

In [0]:
#Backtest
fc_frequency=7
backtest_start_date='2022-01-03'
backtest_end_date='2022-12-31'

#Forecast
start_horizon = 15
end_horizon = 21

#Prophet
changepoint_prior_scale =  0.01
seasonality_prior_scale = 1.0
training_period = 120


In [0]:
#Eval parameters
actual_cols = ['Dry Actuals', 'Dry Actuals']
fc_cols = ['Dry Fc', 'PROPHET_FC']
error_suffixes = ['Dry_WOW', 'Dry_PROPHET']
error_to_calc = ["Bias", "AbsError", "SqError","RelAbsError"]

## Load inbound data

In [0]:
inbound_2022_dir = '/dbfs/mnt/dataplatform/acc/DataScience/sandbox/ebiz/Inbound_FC/data/input/Ecom-Ops-status_2022.xlsx'

In [0]:
inbound_2022_df = create_master_df(inbound_2022_dir)

In [0]:
calendar_2022 = pd.DataFrame({'Dates':pd.date_range(dt.datetime(2022,1,3), periods=363, freq='d')})
inbound_2022_dry = calendar_2022.merge(inbound_2022_df[['Dates','Dry Actuals']], on='Dates',how='left').fillna(0).copy()

### Get holidays

In [0]:
flemish_holidays = (
  school_holidays()
  .filter(
    (f.col('is_dutch_region_holiday')==1)
    &
    (f.col('holiday_name').isNull())
    )
  .withColumn('holiday', f.lit('flemish_region_holiday'))
  .withColumn('ds', f.to_date(f.col('date')))
  .select('holiday','ds')
  .toPandas()
)

walloon_holidays =(
  school_holidays()
  .filter(
    (f.col('is_french_region_holiday')==1)
    &
    (f.col('holiday_name').isNull())
    )
  .withColumn('holiday', f.lit('walloon_region_holiday'))
  .withColumn('ds', f.to_date(f.col('date')))
  .select('holiday','ds')
  .toPandas()
)

holidays = pd.concat((flemish_holidays, walloon_holidays))


## Prophet

### Backtest

In [0]:
backtest_results = pd.DataFrame()
starting_dates = pd.date_range(start=backtest_start_date, end=backtest_end_date,freq=f'{fc_frequency}D')

for start_date in starting_dates:
  print(f"Backtesting on: {start_date}")
  start_train_date = start_date
  end_train_date = start_train_date + dt.timedelta(days=training_period)

  ## Prepare the dataset to have it compatible with Prophet
  inbound_2022_dry_prophet = inbound_2022_dry[(inbound_2022_dry['Dates']>=start_train_date)&(inbound_2022_dry['Dates']<end_train_date)].copy()
  inbound_2022_dry_prophet['ds']= inbound_2022_dry_prophet['Dates']
  inbound_2022_dry_prophet['y']= inbound_2022_dry_prophet['Dry Actuals']
  inbound_2022_dry_prophet = inbound_2022_dry_prophet[['ds','y']]

  ## Fit model
  m = Prophet(holidays=holidays, changepoint_prior_scale=changepoint_prior_scale, seasonality_prior_scale=seasonality_prior_scale).add_country_holidays(country_name='BE').fit(inbound_2022_dry_prophet)
  future = m.make_future_dataframe(periods=end_horizon)

  ## Forecast
  forecast = m.predict(future)
  prophet_pred = forecast.tail(end_horizon-start_horizon+1)[['ds','yhat']].copy()

  # Append results
  backtest_results = pd.concat([backtest_results,prophet_pred])


### Reformat backtest

In [0]:
backtest_results['Dates']= backtest_results['ds']
backtest_results['PROPHET_FC']= backtest_results['yhat']
backtest_df = backtest_results[['Dates','PROPHET_FC']].reset_index(drop=True).copy()

In [0]:
backtest_df

Unnamed: 0,Dates,PROPHET_FC
0,2022-05-17,76135.609671
1,2022-05-18,60905.411939
2,2022-05-19,49486.934762
3,2022-05-20,56394.262648
4,2022-05-21,46604.057836
...,...,...
359,2023-01-17,-136361.950028
360,2023-01-18,-142922.667898
361,2023-01-19,-149483.385767
362,2023-01-20,-156044.103636


### Join the backtest to the inbound

In [0]:
###Remove the data quality issues where FC is exactly te same as actuals

backtest_df_merge = pd.merge(inbound_2022_df[inbound_2022_df['Dry Fc']!=inbound_2022_df['Dry Actuals']],backtest_df, on="Dates", how='inner')[['Dates','Dry Fc','Dry Actuals','PROPHET_FC']]
backtest_df_merge

Unnamed: 0,Dates,Dry Fc,Dry Actuals,PROPHET_FC
0,2022-05-17,62385.511763,63737.0,76135.609671
1,2022-05-18,60376.833054,45119.0,60905.411939
2,2022-05-19,56748.692723,46023.0,49486.934762
3,2022-05-20,55594.135447,48034.0,56394.262648
4,2022-05-23,112076.065450,94552.0,103882.698909
...,...,...,...,...
183,2022-12-27,71459.270000,63764.0,91905.024342
184,2022-12-28,71916.320000,47680.0,84824.272376
185,2022-12-29,60346.346000,39552.0,75715.940144
186,2022-12-30,52203.888000,61119.0,73216.704798


## Evaluate

### Create error columns

In [0]:
for error in  error_to_calc:
  for i in range(len(actual_cols)):
    calculate_errors(df_eval= backtest_df_merge, actuals_col=actual_cols[i], fc_col= fc_cols[i],error_to_calculate=error,error_colname_suffix=error_suffixes[i])

In [0]:
backtest_df_merge

Unnamed: 0,Dates,Dry Fc,Dry Actuals,PROPHET_FC,Bias_Dry_WOW,Bias_Dry_PROPHET,AbsError_Dry_WOW,AbsError_Dry_PROPHET,SqError_Dry_WOW,SqError_Dry_PROPHET,RelAbsError_Dry_WOW,Outside_range_Dry_WOW,RelAbsError_Dry_PROPHET,Outside_range_Dry_PROPHET
0,2022-05-17,62385.511763,63737.0,76135.609671,-1351.488237,12398.609671,1351.488237,12398.609671,1.826520e+06,1.537255e+08,0.021204,False,0.194528,True
1,2022-05-18,60376.833054,45119.0,60905.411939,15257.833054,15786.411939,15257.833054,15786.411939,2.328015e+08,2.492108e+08,0.338169,True,0.349884,True
2,2022-05-19,56748.692723,46023.0,49486.934762,10725.692723,3463.934762,10725.692723,3463.934762,1.150405e+08,1.199884e+07,0.233051,True,0.075265,False
3,2022-05-20,55594.135447,48034.0,56394.262648,7560.135447,8360.262648,7560.135447,8360.262648,5.715565e+07,6.989399e+07,0.157391,True,0.174049,True
4,2022-05-23,112076.065450,94552.0,103882.698909,17524.065450,9330.698909,17524.065450,9330.698909,3.070929e+08,8.706194e+07,0.185338,True,0.098683,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
183,2022-12-27,71459.270000,63764.0,91905.024342,7695.270000,28141.024342,7695.270000,28141.024342,5.921718e+07,7.919173e+08,0.120684,True,0.441331,True
184,2022-12-28,71916.320000,47680.0,84824.272376,24236.320000,37144.272376,24236.320000,37144.272376,5.873992e+08,1.379697e+09,0.508312,True,0.779033,True
185,2022-12-29,60346.346000,39552.0,75715.940144,20794.346000,36163.940144,20794.346000,36163.940144,4.324048e+08,1.307831e+09,0.525747,True,0.914339,True
186,2022-12-30,52203.888000,61119.0,73216.704798,-8915.112000,12097.704798,8915.112000,12097.704798,7.947922e+07,1.463545e+08,0.145865,True,0.197937,True


### Calculate global metrics

In [0]:
global_metrics_dry = describe_inbound_fc(inbound_fc=backtest_df_merge, cols_to_keep=['Dry Fc',	'Dry Actuals', 'PROPHET_FC', 'Bias_Dry_WOW', 'AbsError_Dry_WOW', 'SqError_Dry_WOW', 'RelAbsError_Dry_WOW', 'Outside_range_Dry_WOW', 'Bias_Dry_PROPHET', 'AbsError_Dry_PROPHET', 'SqError_Dry_PROPHET', 'RelAbsError_Dry_PROPHET', 'Outside_range_Dry_PROPHET'])

In [0]:
global_metrics_dry

Unnamed: 0,Metrics,Dry Fc,Dry Actuals,PROPHET_FC,Bias_Dry_WOW,AbsError_Dry_WOW,SqError_Dry_WOW,RelAbsError_Dry_WOW,Outside_range_Dry_WOW,Bias_Dry_PROPHET,AbsError_Dry_PROPHET,SqError_Dry_PROPHET,RelAbsError_Dry_PROPHET,Outside_range_Dry_PROPHET
0,count,188.0,188.0,188.0,188.0,188.0,188.0,188.0,188,188.0,188.0,188.0,188.0,188
1,unique,,,,,,,,2,,,,,2
2,top,,,,,,,,False,,,,,True
3,freq,,,,,,,,113,,,,,116
4,first,,,,,,,,,,,,,
5,last,,,,,,,,,,,,,
6,mean,66795.983907,64968.297872,66265.448658,1827.686035,6463.435359,97176580.0,0.120991,,1297.150785,13062.516673,368317500.0,0.209647,
7,std,21513.663962,23866.963461,20581.350234,9712.772011,7463.031688,340144700.0,0.21818,,19198.841748,14097.706349,1091051000.0,0.229069,
8,min,35712.572,20444.0,29638.930404,-59204.84,4.0,16.0,8.1e-05,,-112486.431449,69.909987,4887.406,0.001314,
9,25%,54467.9925,49836.25,51331.541044,-2542.59,2137.189501,4569703.0,0.033836,,-4261.277961,3744.357935,14020450.0,0.070037,


### Visualize profiles

In [0]:
plot_two_inbound_fc(inbound_df=backtest_df_merge, actuals_col='Dry Actuals', fc_col_1='Dry Fc', fc_col_2= 'PROPHET_FC', show_out_of_range=True, date_col='Dates', outside_range_col_1='Outside_range_Dry_WOW', outside_range_col_2='Outside_range_Dry_PROPHET')

### Visualize errors

In [0]:
plot_inbound_two_errors(inbound_df=backtest_df_merge, error_col_1='Bias_Dry_WOW', error_col_2= 'Bias_Dry_PROPHET', show_out_of_range=True, date_col='Dates', outside_range_col_1='Outside_range_Dry_WOW', outside_range_col_2='Outside_range_Dry_PROPHET')

In [0]:
plot_inbound_two_errors(inbound_df=backtest_df_merge, error_col_1='AbsError_Dry_WOW', error_col_2= 'AbsError_Dry_PROPHET', show_out_of_range=True, date_col='Dates', outside_range_col_1='Outside_range_Dry_WOW', outside_range_col_2='Outside_range_Dry_PROPHET')