In [4]:
# Core libraries
import numpy as np
import pandas as pd
import calendar
import os
import sys
import pickle

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
# import altair_ally as aly

# Time series analysis
from statsmodels.tsa.api import SimpleExpSmoothing, Holt, seasonal_decompose, SARIMAX
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Forecasting models
# from pmdarima import auto_arima

# ML models
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor

# Preprocessing and pipelines
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import make_pipeline

# Model evaluation and cross-validation
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import (
    train_test_split, TimeSeriesSplit, cross_validate, GridSearchCV
)
import shap 

# Optimization
from scipy.optimize import minimize

# sktime
from sktime.forecasting.model_selection import ForecastingGridSearchCV, ExpandingWindowSplitter
from sktime.forecasting.compose import make_reduction
from sktime.forecasting.base import ForecastingHorizon

%matplotlib inline

### Read in data 

In [5]:
# Load the pre-split modelling data. The first CSV column becomes the index
# and pandas is asked to parse it as dates.
csv_options = dict(index_col=0, parse_dates=True)
train_df = pd.read_csv('../data/modelling/train.csv', **csv_options)
test_df = pd.read_csv('../data/modelling/test.csv', **csv_options)

In [66]:
# Separate the regression target from the remaining columns for both splits.
target_column = 'total_sales_normalized'

X_train, y_train = train_df.drop(columns=target_column), train_df[target_column]

X_test, y_test = test_df.drop(columns=target_column), test_df[target_column]

### Regression model - base

In [85]:
# Feature groups for the baseline model. Every categorical feature is paired
# with its explicit level order; with OneHotEncoder(drop='first') the first
# listed level of each feature is the one that gets dropped.
numerical_features = ['hours_opened', 'avg_temperature', 'rain', 'snow']
category_levels = {
    'is_long_weekend': [False, True],
    'is_HCF': [False, True],
    'season': ['Winter', 'Spring', 'Summer', 'Fall'],
    'day_of_week': ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'],
    'is_holiday': [False, True],
}
categorical_features = list(category_levels)
category_orders = list(category_levels.values())
# Per-item sales columns are explicitly excluded from the design matrix.
drop_features = ['item_A_sales', 'item_B_sales', 'item_C_sales']

preprocessor = make_column_transformer(
    (OneHotEncoder(drop='first', categories=category_orders), categorical_features),
    (StandardScaler(), numerical_features),
    ('drop', drop_features),
)

# One-hot encode + scale, then ordinary least squares.
lr_pipe = make_pipeline(preprocessor, LinearRegression())

In [86]:
# Fit the baseline pipeline on the training split, then predict on both splits.
lr_pipe.fit(X_train, y_train)
y_train_pred, y_pred = (
    lr_pipe.predict(features) for features in (X_train, X_test)
)

In [87]:
# Test-split mean absolute error of the baseline linear regression.
mean_absolute_error(y_true=y_test, y_pred=y_pred)

275.40270288760547

In [88]:
# Stack actuals and predictions into one long frame, tagged by `label`,
# so Plotly can draw one line per label.
sales_column = 'total_sales_normalized'

y_train_for_plot = pd.DataFrame(y_train).assign(label='train')
y_test_for_plot = pd.DataFrame(y_test).assign(label='test')
lr_pred_for_plot = pd.DataFrame({sales_column: y_pred}, index=y_test.index).assign(label='test_prediction')
y_train_pred_for_plot = pd.DataFrame({sales_column: y_train_pred}, index=y_train.index).assign(label='train_prediction')

# Both prediction series map to the same colour.
prediction_color = '#8bc34a'
custom_colors = dict(
    train='#1f77b4',
    test='#d62728',
    test_prediction=prediction_color,
    train_prediction=prediction_color,
)

# Concatenation order is kept as-is because it determines the trace order.
plot_frames = [y_train_for_plot, y_test_for_plot, lr_pred_for_plot, y_train_pred_for_plot]
lr_plot_fig = px.line(
    pd.concat(plot_frames),
    y="total_sales_normalized",
    color='label',
    title='Prediction results - linear regression',
    color_discrete_map=custom_colors,
)

lr_plot_fig.show()

In [89]:
# Recover the names of the transformed columns in ColumnTransformer output
# order (one-hot columns first, then the scaled numeric columns) and pair
# them with the fitted regression coefficients.
fitted_transformer = lr_pipe.named_steps['columntransformer']
columns = [
    feature_name
    for step_name in ('onehotencoder', 'standardscaler')
    for feature_name in fitted_transformer.named_transformers_[step_name].get_feature_names_out().tolist()
]

coef = lr_pipe.named_steps['linearregression'].coef_

lr_coef = pd.DataFrame(dict(features=columns, coefficient=coef))

# Largest positive coefficients first.
lr_coef.sort_values('coefficient', ascending=False).round(2)

Unnamed: 0,features,coefficient
9,day_of_week_Saturday,2274.71
10,day_of_week_Sunday,1947.18
11,is_holiday_True,1618.98
8,day_of_week_Friday,1326.85
3,season_Summer,1290.69
2,season_Spring,588.4
1,is_HCF_True,378.34
0,is_long_weekend_True,308.53
6,day_of_week_Wednesday,147.19
7,day_of_week_Thursday,126.52


In [93]:
# Per-weekday comparison of actual vs. predicted sales on the test split.
# NOTE: `prediction_error` is signed (prediction minus actual), so despite the
# `mae_` prefix the grouped means measure average bias, not absolute error.
pred_results = test_df.assign(y_pred=y_pred)
pred_results = pred_results.assign(
    prediction_error=pred_results['y_pred'] - pred_results['total_sales_normalized']
)

summary_columns = ['total_sales_normalized', 'y_pred', 'prediction_error']
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

mae_grouped_df = pred_results.groupby('day_of_week')[summary_columns].mean().round(2)
# Ratio of the (already rounded) mean error to the mean actual sales; a fraction, not x100.
relative_error = mae_grouped_df['prediction_error'] / mae_grouped_df['total_sales_normalized']
mae_grouped_df['error_percentage'] = relative_error.round(3)
# Display in calendar order instead of groupby's sorted order.
mae_grouped_df.loc[weekday_order]

Unnamed: 0_level_0,total_sales_normalized,y_pred,prediction_error,error_percentage
day_of_week,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Monday,2410.92,2663.18,252.27,0.105
Tuesday,2215.2,2237.79,22.59,0.01
Wednesday,2478.06,2371.13,-106.93,-0.043
Thursday,2726.48,2675.13,-51.35,-0.019
Friday,4334.14,4075.58,-258.56,-0.06
Saturday,4961.4,4614.0,-347.4,-0.07
Sunday,4614.48,4249.07,-365.41,-0.079


In [None]:
# Feature groups, re-declared with the same contents as in the baseline cell.
numerical_features = [
    'hours_opened',
    'avg_temperature',
    'rain',
    'snow',
]
categorical_features = [
    'is_long_weekend',
    'is_HCF',
    'season',
    'day_of_week',
    'is_holiday',
]

In [236]:
# Fit the baseline specification with statsmodels to obtain the full OLS
# summary (standard errors, p-values, fit statistics).
#
# The categorical predictors are cast on a copy (`ols_train_df`) rather than
# written back into `train_df`, so this cell no longer changes the dtypes that
# other cells see. The first listed level of each feature is the reference
# level used by C(...).
ols_category_levels = {
    'is_long_weekend': [False, True],
    'is_holiday': [False, True],
    'is_HCF': [False, True],
    'day_of_week': ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'],
    'season': ['Winter', 'Spring', 'Summer', 'Fall'],
}
ols_train_df = train_df.assign(**{
    column: pd.Categorical(train_df[column], categories=levels)
    for column, levels in ols_category_levels.items()
})


formula = (
    """
        total_sales_normalized ~ hours_opened 
        + C(is_HCF) 
        + C(is_long_weekend) 
        + C(day_of_week) 
        + C(season)
        + C(is_holiday)
        + rain 
        + snow 
        + avg_temperature 
    """
    )

stat_model_ols = smf.ols(formula, data=ols_train_df).fit()
print(stat_model_ols.summary())

                              OLS Regression Results                              
Dep. Variable:     total_sales_normalized   R-squared:                       0.869
Model:                                OLS   Adj. R-squared:                  0.862
Method:                     Least Squares   F-statistic:                     121.6
Date:                    Wed, 02 Jul 2025   Prob (F-statistic):          5.38e-119
Time:                            16:41:00   Log-Likelihood:                -2339.7
No. Observations:                     310   AIC:                             4713.
Df Residuals:                         293   BIC:                             4777.
Df Model:                              16                                         
Covariance Type:                nonrobust                                         
                                  coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------

### Regression model - log sales

In [102]:
# Re-read both splits from disk (replacing whatever `train_df` / `test_df`
# currently hold) and append the natural log of the sales target.
train_df = pd.read_csv('../data/modelling/train.csv', index_col=0, parse_dates=True).assign(
    log_total_sales_normalized=lambda frame: np.log(frame['total_sales_normalized'])
)
test_df = pd.read_csv('../data/modelling/test.csv', index_col=0, parse_dates=True).assign(
    log_total_sales_normalized=lambda frame: np.log(frame['total_sales_normalized'])
)

# Only the log column is removed here, so the raw-scale sales column is still
# present in X_*_log.
log_target = 'log_total_sales_normalized'

X_train_log, y_train_log = train_df.drop(columns=log_target), train_df[log_target]

X_test_log, y_test_log = test_df.drop(columns=log_target), test_df[log_target]

In [103]:
# Same preprocessing as the baseline model, for the log-sales target. Every
# categorical feature is paired with its explicit level order; the first level
# is the one dropped by OneHotEncoder(drop='first').
numerical_features = ['hours_opened', 'avg_temperature', 'rain', 'snow']
category_levels = {
    'is_long_weekend': [False, True],
    'is_HCF': [False, True],
    'season': ['Winter', 'Spring', 'Summer', 'Fall'],
    'day_of_week': ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'],
    'is_holiday': [False, True],
}
categorical_features = list(category_levels)
category_orders = list(category_levels.values())
# The raw-scale sales column is listed here alongside the per-item sales so
# that none of them reach the regression as features.
drop_features = ['total_sales_normalized', 'item_A_sales', 'item_B_sales', 'item_C_sales']

preprocessor = make_column_transformer(
    (OneHotEncoder(drop='first', categories=category_orders), categorical_features),
    (StandardScaler(), numerical_features),
    ("drop", drop_features),
)

lr_pipe_log = make_pipeline(preprocessor, LinearRegression())

In [107]:
# Fit on the log target, then predict (still on the log scale) for both splits.
lr_pipe_log.fit(X_train_log, y_train_log)
y_train_pred_log, y_pred_log = (
    lr_pipe_log.predict(features) for features in (X_train_log, X_test_log)
)

In [109]:
# Test MAE on the original sales scale: both the log target and the log
# predictions are exponentiated before the error is computed.
mean_absolute_error(y_true=np.exp(y_test_log), y_pred=np.exp(y_pred_log))

365.47635355385404

In [112]:
# Plot actuals and predictions of the log-sales model, back-transformed with
# np.exp so the y values are on the original sales scale.
#
# The prediction frames are indexed with this section's own targets
# (`y_test_log` / `y_train_log`) instead of `y_test` / `y_train`, which belong
# to other sections and may hold a different split when cells are run out of
# order.
y_train_for_plot_log = pd.DataFrame(np.exp(y_train_log)).assign(label='train')
y_test_for_plot_log = pd.DataFrame(np.exp(y_test_log)).assign(label='test')
y_pred_for_plot_log = pd.DataFrame(np.exp(y_pred_log), columns=['log_total_sales_normalized'], index=y_test_log.index).assign(label='test_prediction')
y_train_pred_for_plot_log = pd.DataFrame(np.exp(y_train_pred_log), columns=['log_total_sales_normalized'], index=y_train_log.index).assign(label='train_prediction')

# Both prediction series map to the same colour.
custom_colors = {
    'train': '#1f77b4',
    'test': '#d62728',
    'test_prediction': '#8bc34a',
    'train_prediction': '#8bc34a'
}

log_lr_plot = px.line(pd.concat((y_train_for_plot_log, y_test_for_plot_log, y_pred_for_plot_log, y_train_pred_for_plot_log)), 
        y="log_total_sales_normalized",
        color='label', 
        title='prediction results - linear regression - log sales',
        color_discrete_map=custom_colors,
        # The column keeps its "log_" name but holds exponentiated values, so
        # relabel the axis to avoid presenting raw-scale sales as logs.
        labels={'log_total_sales_normalized': 'total_sales_normalized (back-transformed)'}
)

log_lr_plot.show()


In [113]:
# Pair the transformed column names (one-hot columns first, then the scaled
# numeric columns) with the coefficients of the log-sales regression.
fitted_transformer = lr_pipe_log.named_steps['columntransformer']
columns = [
    feature_name
    for step_name in ('onehotencoder', 'standardscaler')
    for feature_name in fitted_transformer.named_transformers_[step_name].get_feature_names_out().tolist()
]

coef = lr_pipe_log.named_steps['linearregression'].coef_

lr_coef = pd.DataFrame(dict(features=columns, coefficient=coef))

# Largest positive coefficients first.
lr_coef.sort_values('coefficient', ascending=False).round(2)

Unnamed: 0,features,coefficient
9,day_of_week_Saturday,0.7
10,day_of_week_Sunday,0.65
11,is_holiday_True,0.62
8,day_of_week_Friday,0.46
3,season_Summer,0.39
2,season_Spring,0.22
1,is_HCF_True,0.17
7,day_of_week_Thursday,0.08
0,is_long_weekend_True,0.08
13,avg_temperature,0.07


### Regression model - Item A only 

In [114]:
# Re-read both splits from disk (replacing whatever `train_df` / `test_df`
# currently hold) and use item A sales as the regression target.
csv_options = dict(index_col=0, parse_dates=True)
train_df = pd.read_csv('../data/modelling/train.csv', **csv_options)
test_df = pd.read_csv('../data/modelling/test.csv', **csv_options)

item_A_target = 'item_A_sales'

X_train_item_A, y_train_item_A = train_df.drop(columns=item_A_target), train_df[item_A_target]

X_test_item_A, y_test_item_A = test_df.drop(columns=item_A_target), test_df[item_A_target]


In [55]:
# Preprocessing + linear regression for the item A model. Every categorical
# feature is paired with its explicit level order; the first level is the one
# dropped by OneHotEncoder(drop='first').
numerical_features = ['hours_opened', 'avg_temperature', 'rain', 'snow']
category_levels = {
    'is_long_weekend': [False, True],
    'is_HCF': [False, True],
    'season': ['Winter', 'Spring', 'Summer', 'Fall'],
    'day_of_week': ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'],
    'is_holiday': [False, True],
}
categorical_features = list(category_levels)
category_orders = list(category_levels.values())
# Total sales and the other items' sales are kept out of the feature matrix.
drop_features = ['total_sales_normalized', 'item_B_sales', 'item_C_sales']

preprocessor = make_column_transformer(
    (OneHotEncoder(drop='first', categories=category_orders), categorical_features),
    (StandardScaler(), numerical_features),
    ("drop", drop_features),
)

lr_pipe_item_A = make_pipeline(preprocessor, LinearRegression())

In [115]:
# Fit the item A pipeline on the training split, then predict on both splits.
lr_pipe_item_A.fit(X_train_item_A, y_train_item_A)
y_train_pred_item_A, y_pred_item_A = (
    lr_pipe_item_A.predict(features) for features in (X_train_item_A, X_test_item_A)
)

In [116]:
# Stack item A actuals and predictions into one long frame, tagged by `label`,
# so Plotly can draw one line per label.
item_A_column = 'item_A_sales'

y_train_for_plot_item_A = pd.DataFrame(y_train_item_A).assign(label='train')
y_test_for_plot_item_A = pd.DataFrame(y_test_item_A).assign(label='test')
y_pred_pred_for_plot_item_A = pd.DataFrame({item_A_column: y_pred_item_A}, index=y_test_item_A.index).assign(label='test_prediction')
y_train_pred_for_plot_item_A = pd.DataFrame({item_A_column: y_train_pred_item_A}, index=y_train_item_A.index).assign(label='train_prediction')

# Both prediction series map to the same colour.
prediction_color = '#8bc34a'
custom_colors = dict(
    train='#1f77b4',
    test='#d62728',
    test_prediction=prediction_color,
    train_prediction=prediction_color,
)

# Concatenation order is kept as-is because it determines the trace order.
plot_frames = [
    y_train_for_plot_item_A,
    y_test_for_plot_item_A,
    y_pred_pred_for_plot_item_A,
    y_train_pred_for_plot_item_A,
]
lr_plot_fig = px.line(
    pd.concat(plot_frames),
    y="item_A_sales",
    color='label',
    title='Prediction results - linear regression - item A only',
    color_discrete_map=custom_colors,
)

lr_plot_fig.show()

In [117]:
# Pair the transformed column names (one-hot columns first, then the scaled
# numeric columns) with the coefficients of the item A regression.
fitted_transformer = lr_pipe_item_A.named_steps['columntransformer']
columns = [
    feature_name
    for step_name in ('onehotencoder', 'standardscaler')
    for feature_name in fitted_transformer.named_transformers_[step_name].get_feature_names_out().tolist()
]

coef = lr_pipe_item_A.named_steps['linearregression'].coef_

lr_coef = pd.DataFrame(dict(features=columns, coefficient=coef))

# Largest positive coefficients first.
lr_coef.sort_values('coefficient', ascending=False).round(2)

Unnamed: 0,features,coefficient
9,day_of_week_Saturday,974.4
11,is_holiday_True,824.63
10,day_of_week_Sunday,817.88
3,season_Summer,578.1
8,day_of_week_Friday,531.78
2,season_Spring,211.38
4,season_Fall,69.94
6,day_of_week_Wednesday,67.63
5,day_of_week_Tuesday,37.22
7,day_of_week_Thursday,27.45


In [118]:
# Per-weekday comparison of actual vs. predicted item A sales on the test split.
# NOTE: `prediction_error` is signed (prediction minus actual), so despite the
# `mae_` prefix the grouped means measure average bias, not absolute error.
pred_results = test_df.assign(y_pred=y_pred_item_A)
pred_results = pred_results.assign(
    prediction_error=pred_results['y_pred'] - pred_results['item_A_sales']
)

summary_columns = ['item_A_sales', 'y_pred', 'prediction_error']
mae_grouped_df = pred_results.groupby('day_of_week')[summary_columns].mean().round(2)
# Ratio of the (already rounded) mean error to the mean actual sales; a fraction, not x100.
relative_error = mae_grouped_df['prediction_error'] / mae_grouped_df['item_A_sales']
mae_grouped_df['error_percentage'] = relative_error.round(3)
# Rows are shown in groupby's sorted key order, not calendar order.
mae_grouped_df

Unnamed: 0_level_0,item_A_sales,y_pred,prediction_error,error_percentage
day_of_week,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Friday,1648.8,1497.86,-150.94,-0.092
Monday,952.82,1019.56,66.74,0.07
Saturday,2021.15,1738.59,-282.56,-0.14
Sunday,1891.52,1626.73,-264.8,-0.14
Thursday,1122.44,1000.39,-122.05,-0.109
Tuesday,904.1,838.38,-65.72,-0.073
Wednesday,1061.11,867.71,-193.39,-0.182


### Regression model - Item B only 

In [119]:
# Re-read both splits from disk (replacing whatever `train_df` / `test_df`
# currently hold) and use item B sales as the regression target.
csv_options = dict(index_col=0, parse_dates=True)
train_df = pd.read_csv('../data/modelling/train.csv', **csv_options)
test_df = pd.read_csv('../data/modelling/test.csv', **csv_options)

item_B_target = 'item_B_sales'

X_train_item_B, y_train_item_B = train_df.drop(columns=item_B_target), train_df[item_B_target]

X_test_item_B, y_test_item_B = test_df.drop(columns=item_B_target), test_df[item_B_target]

In [120]:
# Preprocessing + linear regression for the item B model. Every categorical
# feature is paired with its explicit level order; the first level is the one
# dropped by OneHotEncoder(drop='first').
numerical_features = ['hours_opened', 'avg_temperature', 'rain', 'snow']
category_levels = {
    'is_long_weekend': [False, True],
    'is_HCF': [False, True],
    'season': ['Winter', 'Spring', 'Summer', 'Fall'],
    'day_of_week': ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'],
    'is_holiday': [False, True],
}
categorical_features = list(category_levels)
category_orders = list(category_levels.values())
# Total sales and the other items' sales are kept out of the feature matrix.
drop_features = ['total_sales_normalized', 'item_A_sales', 'item_C_sales']

preprocessor = make_column_transformer(
    (OneHotEncoder(drop='first', categories=category_orders), categorical_features),
    (StandardScaler(), numerical_features),
    ("drop", drop_features),
)

lr_pipe_item_B = make_pipeline(preprocessor, LinearRegression())

In [121]:
# Fit the item B pipeline on the training split, then predict on both splits.
lr_pipe_item_B.fit(X_train_item_B, y_train_item_B)
y_train_pred_item_B, y_pred_item_B = (
    lr_pipe_item_B.predict(features) for features in (X_train_item_B, X_test_item_B)
)

In [122]:
# Stack item B actuals and predictions into one long frame, tagged by `label`,
# so Plotly can draw one line per label.
item_B_column = 'item_B_sales'

y_train_for_plot_item_B = pd.DataFrame(y_train_item_B).assign(label='train')
y_test_for_plot_item_B = pd.DataFrame(y_test_item_B).assign(label='test')
y_pred_pred_for_plot_item_B = pd.DataFrame({item_B_column: y_pred_item_B}, index=y_test_item_B.index).assign(label='test_prediction')
y_train_pred_for_plot_item_B = pd.DataFrame({item_B_column: y_train_pred_item_B}, index=y_train_item_B.index).assign(label='train_prediction')

# Both prediction series map to the same colour.
prediction_color = '#8bc34a'
custom_colors = dict(
    train='#1f77b4',
    test='#d62728',
    test_prediction=prediction_color,
    train_prediction=prediction_color,
)

# Concatenation order is kept as-is because it determines the trace order.
plot_frames = [
    y_train_for_plot_item_B,
    y_test_for_plot_item_B,
    y_pred_pred_for_plot_item_B,
    y_train_pred_for_plot_item_B,
]
lr_plot_fig = px.line(
    pd.concat(plot_frames),
    y="item_B_sales",
    color='label',
    title='Prediction results - linear regression - item B only',
    color_discrete_map=custom_colors,
)

lr_plot_fig.show()

In [76]:
# Pair the transformed column names (one-hot columns first, then the scaled
# numeric columns) with the coefficients of the item B regression.
fitted_transformer = lr_pipe_item_B.named_steps['columntransformer']
columns = [
    feature_name
    for step_name in ('onehotencoder', 'standardscaler')
    for feature_name in fitted_transformer.named_transformers_[step_name].get_feature_names_out().tolist()
]

coef = lr_pipe_item_B.named_steps['linearregression'].coef_

lr_coef = pd.DataFrame(dict(features=columns, coefficient=coef))

# Largest positive coefficients first.
lr_coef.sort_values('coefficient', ascending=False).round(2)

Unnamed: 0,features,coefficient
9,day_of_week_Saturday,916.0
10,day_of_week_Sunday,795.71
8,day_of_week_Friday,587.61
3,season_Summer,582.93
11,is_holiday_True,536.25
0,is_long_weekend_True,307.08
13,avg_temperature,217.34
2,season_Spring,216.84
7,day_of_week_Thursday,97.24
1,is_HCF_True,88.35


In [124]:
# Per-weekday comparison of actual vs. predicted item B sales on the test split.
# NOTE: `prediction_error` is signed (prediction minus actual), so despite the
# `mae_` prefix the grouped means measure average bias, not absolute error.
pred_results = test_df.assign(y_pred=y_pred_item_B)
pred_results = pred_results.assign(
    prediction_error=pred_results['y_pred'] - pred_results['item_B_sales']
)

summary_columns = ['item_B_sales', 'y_pred', 'prediction_error']
mae_grouped_df = pred_results.groupby('day_of_week')[summary_columns].mean().round(2)
# Ratio of the (already rounded) mean error to the mean actual sales; a fraction, not x100.
relative_error = mae_grouped_df['prediction_error'] / mae_grouped_df['item_B_sales']
mae_grouped_df['error_percentage'] = relative_error.round(3)
# Rows are shown in groupby's sorted key order, not calendar order.
mae_grouped_df

Unnamed: 0_level_0,item_B_sales,y_pred,prediction_error,error_percentage
day_of_week,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Friday,1745.12,1842.64,97.52,0.056
Monday,907.06,1108.74,201.68,0.222
Saturday,1920.51,2028.11,107.6,0.056
Sunday,1711.52,1817.72,106.2,0.062
Thursday,997.06,1149.12,152.06,0.153
Tuesday,840.53,968.23,127.7,0.152
Wednesday,859.95,1020.14,160.19,0.186


### Regression model - Item C only 

In [79]:
# Re-read both splits from disk (replacing whatever `train_df` / `test_df`
# currently hold) and use item C sales as the regression target.
csv_options = dict(index_col=0, parse_dates=True)
train_df = pd.read_csv('../data/modelling/train.csv', **csv_options)
test_df = pd.read_csv('../data/modelling/test.csv', **csv_options)

item_C_target = 'item_C_sales'

X_train_item_C, y_train_item_C = train_df.drop(columns=item_C_target), train_df[item_C_target]

X_test_item_C, y_test_item_C = test_df.drop(columns=item_C_target), test_df[item_C_target]

In [80]:
# Preprocessing + linear regression for the item C model. Every categorical
# feature is paired with its explicit level order; the first level is the one
# dropped by OneHotEncoder(drop='first').
numerical_features = ['hours_opened', 'avg_temperature', 'rain', 'snow']
category_levels = {
    'is_long_weekend': [False, True],
    'is_HCF': [False, True],
    'season': ['Winter', 'Spring', 'Summer', 'Fall'],
    'day_of_week': ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'],
    'is_holiday': [False, True],
}
categorical_features = list(category_levels)
category_orders = list(category_levels.values())
# Total sales and the other items' sales are kept out of the feature matrix.
drop_features = ['total_sales_normalized', 'item_A_sales', 'item_B_sales']

preprocessor = make_column_transformer(
    (OneHotEncoder(drop='first', categories=category_orders), categorical_features),
    (StandardScaler(), numerical_features),
    ("drop", drop_features),
)

lr_pipe_item_C = make_pipeline(preprocessor, LinearRegression())

In [81]:
# Fit the item C pipeline on the training split, then predict on both splits.
lr_pipe_item_C.fit(X_train_item_C, y_train_item_C)
y_train_pred_item_C, y_pred_item_C = (
    lr_pipe_item_C.predict(features) for features in (X_train_item_C, X_test_item_C)
)

In [82]:
# Stack item C actuals and predictions into one long frame, tagged by `label`,
# so Plotly can draw one line per label.
item_C_column = 'item_C_sales'

y_train_for_plot_item_C = pd.DataFrame(y_train_item_C).assign(label='train')
y_test_for_plot_item_C = pd.DataFrame(y_test_item_C).assign(label='test')
y_pred_pred_for_plot_item_C = pd.DataFrame({item_C_column: y_pred_item_C}, index=y_test_item_C.index).assign(label='test_prediction')
y_train_pred_for_plot_item_C = pd.DataFrame({item_C_column: y_train_pred_item_C}, index=y_train_item_C.index).assign(label='train_prediction')

# Both prediction series map to the same colour.
prediction_color = '#8bc34a'
custom_colors = dict(
    train='#1f77b4',
    test='#d62728',
    test_prediction=prediction_color,
    train_prediction=prediction_color,
)

# Concatenation order is kept as-is because it determines the trace order.
plot_frames = [
    y_train_for_plot_item_C,
    y_test_for_plot_item_C,
    y_pred_pred_for_plot_item_C,
    y_train_pred_for_plot_item_C,
]
lr_plot_fig = px.line(
    pd.concat(plot_frames),
    y="item_C_sales",
    color='label',
    title='Prediction results - linear regression - item C only',
    color_discrete_map=custom_colors,
)

lr_plot_fig.show()

In [83]:
# Pair the transformed column names (one-hot columns first, then the scaled
# numeric columns) with the coefficients of the item C regression.
fitted_transformer = lr_pipe_item_C.named_steps['columntransformer']
columns = [
    feature_name
    for step_name in ('onehotencoder', 'standardscaler')
    for feature_name in fitted_transformer.named_transformers_[step_name].get_feature_names_out().tolist()
]

coef = lr_pipe_item_C.named_steps['linearregression'].coef_

lr_coef = pd.DataFrame(dict(features=columns, coefficient=coef))

# Largest positive coefficients first.
lr_coef.sort_values('coefficient', ascending=False).round(2)

Unnamed: 0,features,coefficient
9,day_of_week_Saturday,194.18
10,day_of_week_Sunday,154.8
11,is_holiday_True,121.41
3,season_Summer,115.57
8,day_of_week_Friday,105.45
2,season_Spring,44.13
7,day_of_week_Thursday,14.57
6,day_of_week_Wednesday,14.05
0,is_long_weekend_True,11.78
4,season_Fall,0.2


In [84]:
# Per-weekday comparison of actual vs. predicted item C sales on the test split.
# NOTE: `prediction_error` is signed (prediction minus actual), so despite the
# `mae_` prefix the grouped means measure average bias, not absolute error.
pred_results = test_df.assign(y_pred=y_pred_item_C)
pred_results = pred_results.assign(
    prediction_error=pred_results['y_pred'] - pred_results['item_C_sales']
)

summary_columns = ['item_C_sales', 'y_pred', 'prediction_error']
mae_grouped_df = pred_results.groupby('day_of_week')[summary_columns].mean().round(2)
# Ratio of the (already rounded) mean error to the mean actual sales; a fraction, not x100.
relative_error = mae_grouped_df['prediction_error'] / mae_grouped_df['item_C_sales']
mae_grouped_df['error_percentage'] = relative_error.round(3)
# Rows are shown in groupby's sorted key order, not calendar order.
mae_grouped_df

Unnamed: 0_level_0,item_C_sales,y_pred,prediction_error,error_percentage
day_of_week,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Friday,426.76,322.02,-104.74,-0.245
Monday,289.88,218.29,-71.59,-0.247
Saturday,516.89,380.49,-136.4,-0.264
Sunday,508.83,343.25,-165.58,-0.325
Thursday,300.44,224.58,-75.86,-0.252
Tuesday,234.18,176.22,-57.97,-0.248
Wednesday,289.8,199.37,-90.43,-0.312


### Poisson Regression - number of orders 

#### Hyperparameter tuning

#### Testing random train / test splits over 1000 repetitions

In [None]:
# Feature set for the random-split experiment. Compared with the earlier
# models this adds `avg_cloud_cover_8` and `has_pop_up`, and the numeric
# columns are passed through unscaled instead of being standardised.
# NOTE(review): the section heading mentions Poisson regression, but the
# estimator built here is an ordinary LinearRegression — confirm the intent.
numerical_features = ['hours_opened', 'avg_temperature', 'rain', 'snow', 'avg_cloud_cover_8']
categorical_features = ['is_long_weekend', 'is_HCF', 'has_pop_up', 'season', 'day_of_week', 'is_holiday']
# Level order per categorical feature; the first level is the one dropped by
# OneHotEncoder(drop='first').
category_orders = [
    [False, True],  # is_long_weekend
    [False, True],  # is_HCF
    [False, True],  # has_pop_up
    ['Winter', 'Spring', 'Summer', 'Fall'],  # season
    ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'],  # day_of_week
    [False, True]  # is_holiday
]

preprocessor = make_column_transformer(
    (OneHotEncoder(drop='first', categories=category_orders), categorical_features),
    ('passthrough', numerical_features)
)

# Build a real Pipeline: a bare `(preprocessor, LinearRegression())` tuple has
# no fit/predict and cannot be used as a model.
lr_pipe = make_pipeline(preprocessor, LinearRegression())

In [None]:
# One random 90/10 split of the modelling frame.
# NOTE(review): `log_total_sales_df` is not defined in the visible part of the
# notebook — make sure the cell that builds it runs before this one.
# A fixed seed keeps this split identical across kernel restarts.
train_df, test_df = train_test_split(log_total_sales_df, test_size=0.1, random_state=0)

# Assumes the target is the last column of the frame and every other column
# is a candidate feature — TODO confirm against how the frame is built.
X_train = train_df.iloc[:, :-1]
y_train = train_df.iloc[:, -1]
X_test = test_df.iloc[:, :-1]
y_test = test_df.iloc[:, -1]

In [None]:
# Names of the transformed columns in ColumnTransformer output order: the
# one-hot columns first, then the passed-through numeric columns.
# `named_transformers_` only exists on a fitted ColumnTransformer, so fit on
# the current training split first. Passed-through columns keep their input
# names, so `numerical_features` is appended directly instead of asking the
# 'passthrough' entry for feature names.
preprocessor.fit(X_train)
index = (
    preprocessor.named_transformers_['onehotencoder'].get_feature_names_out().tolist()
    + list(numerical_features)
)

In [None]:
# Repeat the random 90/10 split `n` times and collect the test MAE of a
# freshly fitted linear regression for each split.
# NOTE(review): `log_total_sales_df` is not defined in the visible part of the
# notebook — make sure the cell that builds it runs before this one.
n = 1000
result_list = []

for i in range(n):

    # Seed each repetition with its index: every split is different, but the
    # whole experiment is reproducible across kernel restarts.
    train_df, test_df = train_test_split(log_total_sales_df, test_size=0.1, random_state=i)

    # Assumes the target is the last column of the frame — TODO confirm.
    X_train = train_df.iloc[:, :-1]
    y_train = train_df.iloc[:, -1]
    X_test = test_df.iloc[:, :-1]
    y_test = test_df.iloc[:, -1]

    # The preprocessor is re-fitted on each training split; the test split is
    # only transformed.
    x_train_transformed = preprocessor.fit_transform(X_train)
    x_test_transformed = preprocessor.transform(X_test)

    model = LinearRegression()
    model.fit(x_train_transformed, y_train)
    y_pred = model.predict(x_test_transformed)

    # Error is measured after exponentiating both sides, which presumes the
    # target column is on a log scale (as the frame's name suggests) — TODO confirm.
    mae = mean_absolute_error(np.exp(y_test), np.exp(y_pred))
    result_list.append(mae)

In [None]:
# Distribution of the test MAE across the repeated random splits.
# `plt` comes from the notebook's import cell; it is not re-imported here.
plt.hist(result_list, bins=20, edgecolor='black')
plt.title('Histogram of MAE')
plt.xlabel('MAE')
plt.ylabel('Frequency')
plt.show()