In this notebook, we will perform different steps to train a forecasting model to forecast the total sales of all items across all stores in the next 7 days.

1. Data loading
2. Data preprocessing

    2a. Sales revenue variable
    
    2b. Event_name variable
    
    2c. Create features from 'date' variable

3. Train/ Validation split
4. Baseline model
5. Train and evaluate SGDRegressor model
6. Train and evaluate XGBRegressor model
7. Train and evaluate LinearRegression model
8. Save the best model with joblib

# 1. Data loading

In [1]:
import pandas as pd
import numpy as np

In [2]:
final_df = pd.read_csv('../data/processed/final_df_forecast.csv', low_memory=False)
final_df

Unnamed: 0,sales_qty,date,sell_price,event_name
0,0,2011-01-29,,
1,0,2011-01-29,,
2,0,2011-01-29,,
3,0,2011-01-29,,
4,0,2011-01-29,,
...,...,...,...,...
47107045,2,2015-04-18,2.88,
47107046,0,2015-04-18,2.00,
47107047,0,2015-04-18,3.98,
47107048,0,2015-04-18,1.28,


# 2. Data preprocessing

In [3]:
# Make a copy of final_df and save as final_df_clean
final_df_clean = final_df.copy()

In [4]:
final_df_clean.isna().sum()

sales_qty            0
date                 0
sell_price    12291876
event_name    43143350
dtype: int64

## 2a. Sales revenue variable

In [5]:
# Replace NaN values in column 'sell_price' with value 0
final_df_clean['sell_price'] = final_df_clean['sell_price'].replace(np.nan, 0, regex=True)

In [6]:
# Create a column named sales_revenue
final_df_clean['sales_revenue'] = final_df_clean['sales_qty'] * final_df_clean['sell_price']

In [7]:
# Drop 'sales_qty', 'sell_price' in combined_df_clean
final_df_clean = final_df_clean.drop(['sales_qty', 'sell_price'], axis=1)
final_df_clean

Unnamed: 0,date,event_name,sales_revenue
0,2011-01-29,,0.00
1,2011-01-29,,0.00
2,2011-01-29,,0.00
3,2011-01-29,,0.00
4,2011-01-29,,0.00
...,...,...,...
47107045,2015-04-18,,5.76
47107046,2015-04-18,,0.00
47107047,2015-04-18,,0.00
47107048,2015-04-18,,0.00


In [8]:
# Sort the final_df_clean by date to ensure its chronological order
final_df_clean.sort_values(by='date', inplace=True)

In [9]:
# Shift the 'sales_revenue' column by 7 days to create the target variable
final_df_clean['target_revenue'] = final_df_clean['sales_revenue'].shift(-7)
final_df_clean

Unnamed: 0,date,event_name,sales_revenue,target_revenue
0,2011-01-29,,0.00,0.0
20334,2011-01-29,,0.00,0.0
20333,2011-01-29,,0.00,0.0
20332,2011-01-29,,18.90,0.0
20331,2011-01-29,,2.42,0.0
...,...,...,...,...
47086716,2015-04-18,,0.00,
47086715,2015-04-18,,4.52,
47086714,2015-04-18,,0.00,
47086726,2015-04-18,,0.00,


In [10]:
# Replace NaN values in column 'target_revenue' with value 0
final_df_clean['target_revenue'] = final_df_clean['target_revenue'].replace(np.nan, 0, regex=True)

In [11]:
# Drop 'sales_revenue' in final_df_clean
final_df_clean = final_df_clean.drop(['sales_revenue'], axis=1)
final_df_clean

Unnamed: 0,date,event_name,target_revenue
0,2011-01-29,,0.0
20334,2011-01-29,,0.0
20333,2011-01-29,,0.0
20332,2011-01-29,,0.0
20331,2011-01-29,,0.0
...,...,...,...
47086716,2015-04-18,,0.0
47086715,2015-04-18,,0.0
47086714,2015-04-18,,0.0
47086726,2015-04-18,,0.0


## 2b. Event_name variable

In [12]:
# Drop column 'event_name'
final_df_clean = final_df_clean.drop(['event_name'], axis=1)
final_df_clean

Unnamed: 0,date,target_revenue
0,2011-01-29,0.0
20334,2011-01-29,0.0
20333,2011-01-29,0.0
20332,2011-01-29,0.0
20331,2011-01-29,0.0
...,...,...
47086716,2015-04-18,0.0
47086715,2015-04-18,0.0
47086714,2015-04-18,0.0
47086726,2015-04-18,0.0


## 2c. Create features from 'date' variable

In [13]:
# Set date column to be dataframe index
final_df_clean.set_index('date')

Unnamed: 0_level_0,target_revenue
date,Unnamed: 1_level_1
2011-01-29,0.0
2011-01-29,0.0
2011-01-29,0.0
2011-01-29,0.0
2011-01-29,0.0
...,...
2015-04-18,0.0
2015-04-18,0.0
2015-04-18,0.0
2015-04-18,0.0


In [14]:
# Convert the 'date' column to a datetime type
final_df_clean['date'] = pd.to_datetime(final_df_clean['date'])
final_df_clean

Unnamed: 0,date,target_revenue
0,2011-01-29,0.0
20334,2011-01-29,0.0
20333,2011-01-29,0.0
20332,2011-01-29,0.0
20331,2011-01-29,0.0
...,...,...
47086716,2015-04-18,0.0
47086715,2015-04-18,0.0
47086714,2015-04-18,0.0
47086726,2015-04-18,0.0


In [15]:
# Import datetime as dt
import datetime as dt

In [16]:
# Create features 'year', 'quarter', 'month', 'day_of_week' from column 'date'
final_df_clean['year'] = final_df_clean['date'].dt.year
final_df_clean['quarter'] = final_df_clean['date'].dt.quarter
final_df_clean['month'] = final_df_clean['date'].dt.month
final_df_clean['day_of_week'] = final_df_clean['date'].dt.dayofweek  # 0=Monday, 1=Tuesday, ..., 6=Sunday

final_df_clean

Unnamed: 0,date,target_revenue,year,quarter,month,day_of_week
0,2011-01-29,0.0,2011,1,1,5
20334,2011-01-29,0.0,2011,1,1,5
20333,2011-01-29,0.0,2011,1,1,5
20332,2011-01-29,0.0,2011,1,1,5
20331,2011-01-29,0.0,2011,1,1,5
...,...,...,...,...,...,...
47086716,2015-04-18,0.0,2015,2,4,5
47086715,2015-04-18,0.0,2015,2,4,5
47086714,2015-04-18,0.0,2015,2,4,5
47086726,2015-04-18,0.0,2015,2,4,5


# 3. Train/ Validation Split

In [17]:
# Import TimeSeriesSplit from sklearn.model_selection 
from sklearn.model_selection import TimeSeriesSplit

In [18]:
# Initiate TimeSeriesSplit class with n_splits=5
tscv = TimeSeriesSplit(n_splits=5)

In [19]:
# Split df_clean_sample to train_data and val_data
for train_index, val_index in tscv.split(final_df_clean):
    train_data = final_df_clean.iloc[train_index]
    val_data = final_df_clean.iloc[val_index]

In [20]:
# Save the target variable of train_data and val_data as y_train and y_val respectively
y_train = train_data['target_revenue']
y_val = val_data['target_revenue']

# 4. Baseline model

In [21]:
# Find the mean value of target variable and save as y_mean
y_mean = train_data['target_revenue'].mean()

In [22]:
# Create a numpy array with the same dimensions as for the train_data called y_base filled with this value
y_base = np.full(y_train.shape, y_mean)

In [23]:
# Import mean_squared_error from sklearn.metrics
from sklearn.metrics import mean_squared_error as mse

In [24]:
# Print the recall score of this baseline model on the training dataset
print(mse(y_train, y_base, squared=False))

9.006761214320873


# 5. Train and evaluate SGDRegressor model

In [25]:
# Import Pipeline from sklearn.pipeline, import StandardScaler, OneHotEncoder from sklearn.preprocessing, import SGDRegressor from sklearn.linear_model
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDRegressor

In [26]:
# Create a Pipeline called num_transformer with one step that contains StandardScaler
num_transformer = Pipeline(
    steps=[
        ('scaler', StandardScaler())
    ]
)

In [27]:
# Create a list called num_cols that will contain the list of columns that are numeric type
num_cols = ['year', 'quarter', 'month', 'day_of_week']

In [28]:
# Import ColumnTransformer from sklearn.compose
from sklearn.compose import ColumnTransformer

In [29]:
# Create a ColumnTransformer called preprocessor containing the following steps

# num_transformer for num_cols
# cat_transformer for cat_cols

preprocessor = ColumnTransformer(
    transformers=[
        ('num_cols', num_transformer, num_cols)
    ]
)

In [30]:
# Create a Pipeline called sgd_pipe that contains 2 steps preprocessor and another that instantiate a SGDRegressor with same parameters as previously
sgd_pipe = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('sgd', SGDRegressor())
    ]
)

In [31]:
# Fit sgd_pipe with train_data
sgd_pipe.fit(train_data, y_train)

In [32]:
# Make predictions on train_data and save as y_train_preds
y_train_preds = sgd_pipe.predict(train_data)

In [33]:
# Make predictions on val_data and save as y_val_preds
y_val_preds = sgd_pipe.predict(val_data)

In [34]:
# Display the RMSE score on the training set
print(mse(y_train, y_train_preds, squared=False))

8.992980590770673


In [35]:
# Display the RMSE score on the testing set
print(mse(y_val, y_val_preds, squared=False))

9.947596822424405


# 6. Train and evaluate XGBoost model

In [36]:
# Import xgboost as xgb
import xgboost as xgb

In [37]:
# Create a Pipeline called xgb_pipe that contains 2 steps preprocessor and another that instantiate a XGBRegressor with same parameters as previously
xgb_pipe = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('xgb', xgb.XGBRegressor())
    ]
)

In [38]:
# Fit xgb_pipe with train_data
xgb_pipe.fit(train_data, y_train)

  if is_sparse(data):


In [39]:
# Make predictions on train_data and save as y_train_preds
y_train_preds = xgb_pipe.predict(train_data)

In [40]:
# Make predictions on val_data and save as y_val_preds
y_val_preds = xgb_pipe.predict(val_data)

In [41]:
# Display the RMSE score on the training set
print(mse(y_train, y_train_preds, squared=False))

8.986935087062326


In [42]:
# Display the RMSE score on the testing set
print(mse(y_val, y_val_preds, squared=False))

9.94186988485762


# 7. Train and evaluate LinearRegression model

In [43]:
# Import LinearRegression from sklearn.linear_model
from sklearn.linear_model import LinearRegression

In [44]:
# Create a Pipeline called lr_pipe that contains 2 steps preprocessor and another that instantiate a LinearRegression with same parameters as previously
lr_pipe = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('lr', LinearRegression())
    ]
)

In [45]:
# Fit lr_pipe with train_data
lr_pipe.fit(train_data, y_train)

In [46]:
# Make predictions on train_data and save as y_train_preds
y_train_preds = lr_pipe.predict(train_data)

In [47]:
# Make predictions on val_data and save as y_val_preds
y_val_preds = lr_pipe.predict(val_data)

In [48]:
# Display the RMSE score on the training set
print(mse(y_train, y_train_preds, squared=False))

8.992566242417691


In [49]:
# Display the RMSE score on the validation set
print(mse(y_val, y_val_preds, squared=False))

9.944121194839221


# 8. Save the best model with joblib

In [50]:
# Import dump from joblib package and save lr_pipe (the best model) into models folder
from joblib import dump

dump(lr_pipe,  '../models/forecasting/lr_pipe.joblib')

['../models/forecasting/lr_pipe.joblib']