# Purpose
- Purpose of this notebook is to build baseline models;
    - Baseline 1: Predict last observation
    - Baseline 2: Predict Moving average of last 7 days
    - Baseline 3: Predict sale for the same day in the previous week
    - Baseline 4: Predict the average of the last 4 weeks for the same day

In [2]:
# autoreload
%load_ext autoreload
%autoreload 2

# change current working directory to the root of the project
import os
os.chdir(os.path.dirname(os.getcwd()))
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from tqdm import tqdm
from IPython.display import display
from sklearn.metrics import mean_absolute_error


from src import data_split

In [3]:
# import csv data into pandas dataframe
df_train = pd.read_csv('data/preprocessed/train_preprocessed.csv')
df_train

Unnamed: 0,family,store_nbr,date,sales,onpromotion
0,AUTOMOTIVE,1,2013-01-01,0.0,0
1,AUTOMOTIVE,1,2013-01-02,2.0,0
2,AUTOMOTIVE,1,2013-01-03,3.0,0
3,AUTOMOTIVE,1,2013-01-04,3.0,0
4,AUTOMOTIVE,1,2013-01-05,5.0,0
...,...,...,...,...,...
3008011,SEAFOOD,54,2017-08-11,0.0,0
3008012,SEAFOOD,54,2017-08-12,1.0,1
3008013,SEAFOOD,54,2017-08-13,2.0,0
3008014,SEAFOOD,54,2017-08-14,0.0,0


In [4]:
# combine store and family into a single column
df_train['store_family'] = df_train['store_nbr'].astype(str) + '_' + df_train['family'].astype(str)

# add lag features
# add lag features using the sales column
for i in tqdm(range(1, 29)): # 4 weeks
    df_train[f'sales_lag_{i}'] = df_train.groupby(['store_family'])['sales'].shift(i)    

print(f'{df_train.shape=}')
df_train.head()

100%|██████████| 28/28 [00:02<00:00,  9.67it/s]

df_train.shape=(3008016, 34)





Unnamed: 0,family,store_nbr,date,sales,onpromotion,store_family,sales_lag_1,sales_lag_2,sales_lag_3,sales_lag_4,...,sales_lag_19,sales_lag_20,sales_lag_21,sales_lag_22,sales_lag_23,sales_lag_24,sales_lag_25,sales_lag_26,sales_lag_27,sales_lag_28
0,AUTOMOTIVE,1,2013-01-01,0.0,0,1_AUTOMOTIVE,,,,,...,,,,,,,,,,
1,AUTOMOTIVE,1,2013-01-02,2.0,0,1_AUTOMOTIVE,0.0,,,,...,,,,,,,,,,
2,AUTOMOTIVE,1,2013-01-03,3.0,0,1_AUTOMOTIVE,2.0,0.0,,,...,,,,,,,,,,
3,AUTOMOTIVE,1,2013-01-04,3.0,0,1_AUTOMOTIVE,3.0,2.0,0.0,,...,,,,,,,,,,
4,AUTOMOTIVE,1,2013-01-05,5.0,0,1_AUTOMOTIVE,3.0,3.0,2.0,0.0,...,,,,,,,,,,


In [5]:
# split data into train and test
X_train, y_train, X_test, y_test = data_split.train_test_split(df = df_train, cutoff_date=pd.to_datetime('2017-01-01'), target_column_name='sales')

print(f'{X_train.shape=}')
display(X_train.head())
print(f'{y_train.shape=}')
display(y_train.head())

print(f'{X_test.shape=}')
display(X_test.head())
print(f'{y_test.shape=}')
display(y_test.head())


X_train.shape=(2603502, 33)


Unnamed: 0,family,store_nbr,date,onpromotion,store_family,sales_lag_1,sales_lag_2,sales_lag_3,sales_lag_4,sales_lag_5,...,sales_lag_19,sales_lag_20,sales_lag_21,sales_lag_22,sales_lag_23,sales_lag_24,sales_lag_25,sales_lag_26,sales_lag_27,sales_lag_28
0,AUTOMOTIVE,1,2013-01-01,0,1_AUTOMOTIVE,,,,,,...,,,,,,,,,,
1,AUTOMOTIVE,1,2013-01-02,0,1_AUTOMOTIVE,0.0,,,,,...,,,,,,,,,,
2,AUTOMOTIVE,1,2013-01-03,0,1_AUTOMOTIVE,2.0,0.0,,,,...,,,,,,,,,,
3,AUTOMOTIVE,1,2013-01-04,0,1_AUTOMOTIVE,3.0,2.0,0.0,,,...,,,,,,,,,,
4,AUTOMOTIVE,1,2013-01-05,0,1_AUTOMOTIVE,3.0,3.0,2.0,0.0,,...,,,,,,,,,,


y_train.shape=(2603502,)


0    0.0
1    2.0
2    3.0
3    3.0
4    5.0
Name: sales, dtype: float64

X_test.shape=(404514, 33)


Unnamed: 0,family,store_nbr,date,onpromotion,store_family,sales_lag_1,sales_lag_2,sales_lag_3,sales_lag_4,sales_lag_5,...,sales_lag_19,sales_lag_20,sales_lag_21,sales_lag_22,sales_lag_23,sales_lag_24,sales_lag_25,sales_lag_26,sales_lag_27,sales_lag_28
0,AUTOMOTIVE,1,2017-01-01,0,1_AUTOMOTIVE,2.0,4.0,3.0,12.0,5.0,...,4.0,3.0,0.0,3.0,6.0,5.0,1.0,0.0,4.0,1.0
1,AUTOMOTIVE,1,2017-01-02,0,1_AUTOMOTIVE,0.0,2.0,4.0,3.0,12.0,...,0.0,4.0,3.0,0.0,3.0,6.0,5.0,1.0,0.0,4.0
2,AUTOMOTIVE,1,2017-01-03,0,1_AUTOMOTIVE,5.0,0.0,2.0,4.0,3.0,...,5.0,0.0,4.0,3.0,0.0,3.0,6.0,5.0,1.0,0.0
3,AUTOMOTIVE,1,2017-01-04,0,1_AUTOMOTIVE,4.0,5.0,0.0,2.0,4.0,...,1.0,5.0,0.0,4.0,3.0,0.0,3.0,6.0,5.0,1.0
4,AUTOMOTIVE,1,2017-01-05,0,1_AUTOMOTIVE,1.0,4.0,5.0,0.0,2.0,...,6.0,1.0,5.0,0.0,4.0,3.0,0.0,3.0,6.0,5.0


y_test.shape=(404514,)


0    0.0
1    5.0
2    4.0
3    1.0
4    2.0
Name: sales, dtype: float64

# Baseline 1: Predict last observation

# LOCF baseline model
class BaselineModelLOCF:
    '''Prediction = last observed value'''
    
    def fit(self, X_train: pd.DataFrame, y_train: pd.Series):
        pass
        
    def predict(self, X_test: pd.DataFrame, prediction_horizon: int) -> pd.DataFrame:

        predictions = []
        for index in range(0, len(X_test), prediction_horizon):
            preds = [X_test.iloc[index]['sales_lag_1']] * prediction_horizon
            predictions.append(preds)

        predictions = predictions[:len(X_test)]

        return predictions

In [50]:
# LOCF baseline model
class BaselineModelLOCF:
    '''Prediction = last observed value'''
    
    def fit(self, X_train: pd.DataFrame, y_train: pd.Series):
        pass
        
    def predict(self, X_test: pd.DataFrame, prediction_horizon: int) -> pd.DataFrame:

        lag_features = [f'sales_lag_{i}' for i in range(1, prediction_horizon+1)]

        predictions = []
        for i in tqdm(range(prediction_horizon, len(X_test)+prediction_horizon, prediction_horizon)):
            predictions.extend(np.diag(X_test.loc[i-prediction_horizon: i-1, lag_features]))

        return np.array(predictions)

In [51]:
model = BaselineModelLOCF()
predictions = model.predict(X_test, prediction_horizon=7)

100%|██████████| 57788/57788 [00:10<00:00, 5763.63it/s]


In [52]:
test_mae = mean_absolute_error(y_test, predictions)
print(f'{test_mae=:.4f}')

test_mae=153.1509


# Baseline 2: Predict Moving average of last 7 days

In [39]:
# Moving Average model
class BaselineModelMovingAverage:
    '''Prediction = moving average of the last n days'''
    
    def fit(self, X_train: pd.DataFrame, y_train: pd.Series):
        pass
        
    def predict(self, X_test: pd.DataFrame, prediction_horizon: int, moving_average_window: int) -> np.array:
        
        lag_features = [f'sales_lag_{i}' for i in range(1, moving_average_window+1)]
        
        predictions = []
        for i in tqdm(range(prediction_horizon, len(X_test)+prediction_horizon, prediction_horizon)):
            preds = X_test.loc[i-prediction_horizon: i-1].groupby(['store_family', 'date']).apply(lambda x: x[lag_features].mean(axis=1)).values
            predictions.extend(preds)
        
        predictions = predictions[:len(X_test)]

        return np.array(predictions)

In [40]:
model = BaselineModelMovingAverage()
predictions = model.predict(X_test, prediction_horizon=7, moving_average_window=7)

100%|██████████| 57788/57788 [02:56<00:00, 326.65it/s]


In [41]:
print(f'{predictions.shape=}')
print(f'{X_test.shape=}')
print(f'{y_test.shape=}')

predictions.shape=(404514,)
X_test.shape=(404514, 33)
y_test.shape=(404514,)


In [42]:
test_mae = mean_absolute_error(y_test, predictions)
print(f'{test_mae=:.4f}')

test_mae=116.2272


# Baseline 3: Predict sale for the same day in the previous week


In [44]:
class BaselineModelPreviousWeek:
    '''Prediction = actual demand observed at t-7 days'''
    
    def fit(self, X_train: pd.DataFrame, y_train: pd.Series):
        pass
        
    def predict(self, X_test: pd.DataFrame) -> np.array:
        return X_test['sales_lag_7']

In [45]:
model = BaselineModelPreviousWeek()
predictions = model.predict(X_test)

In [46]:
test_mae = mean_absolute_error(y_test, predictions)
print(f'{test_mae=:.4f}')

test_mae=96.4037


# Baseline 4: Predict the average of the last 4 weeks for the same day


In [47]:
class BaselineModelPrevious4Weeks:
    '''Prediction = actual demand observed at t-7 days, t-14 days, t-21 days, t-28'''
    
    def fit(self, X_train: pd.DataFrame, y_train: pd.Series):
        pass
        
    def predict(self, X_test: pd.DataFrame) -> np.array:
        return (
            X_test[f'sales_lag_7'] + \
            X_test[f'sales_lag_14'] + \
            X_test[f'sales_lag_21'] + \
            X_test[f'sales_lag_28']
            ) / 4

In [48]:
model = BaselineModelPrevious4Weeks()
predictions = model.predict(X_test)

In [49]:
test_mae = mean_absolute_error(y_test, predictions)
print(f'{test_mae=:.4f}')

test_mae=86.7968


# Summary of baseline models
- **Baseline 1**: Predict last observation
    - MAE = 153.1509
- **Baseline 2**: Predict Moving average of last 7 days
    - MAE = 116.2272
- **Baseline 3**: Predict sale for the same day in the previous week
    - MAE = 96.4037
- **Baseline 4**: Predict the average of the last 4 weeks for the same day
    - MAE = 86.7968