<a href="https://colab.research.google.com/github/axpat/Retail-Sales-Prediction/blob/main/Baseline_Model_Building_Notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#importing necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from datetime import datetime

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
import math

import warnings    
warnings.filterwarnings('ignore')

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
#Setting up the Folder Path where the files are  kept
file_path = '/content/drive/MyDrive/Colab Notebooks/AlmaBetter /Capstone Project/Supervised-Learning-Regression/Cleaned_data/'
train_df = pd.read_csv(file_path+'train.csv', parse_dates=['Date'])
test_df = pd.read_csv(file_path+'test.csv', parse_dates=['Date'])

In [4]:
#setting date and store as index
train_df.set_index(['Date','Store'], inplace=True)
train_df.sort_values(by=['Date','Store'], inplace=True)

test_df.set_index(['Date','Store'], inplace=True)
test_df.sort_values(by=['Date','Store'], inplace=True)

In [5]:
train_df.head(1)

Unnamed: 0_level_0,Unnamed: 1_level_0,DayOfWeek,Sales,Customers,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,Promo2,Year,Month,WeekOfYear,DayOfYear,CompetitionOpen,Promo2Open,Promo2running
Date,Store,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2013-01-01,85,2,8.34759,619,0,1,1,b,a,1870.0,0,2013,1,1,1,15.0,0.0,0


In [6]:
train_df.columns

Index(['DayOfWeek', 'Sales', 'Customers', 'Promo', 'StateHoliday',
       'SchoolHoliday', 'StoreType', 'Assortment', 'CompetitionDistance',
       'Promo2', 'Year', 'Month', 'WeekOfYear', 'DayOfYear', 'CompetitionOpen',
       'Promo2Open', 'Promo2running'],
      dtype='object')

In [7]:
test_df.head(1)

Unnamed: 0_level_0,Unnamed: 1_level_0,DayOfWeek,Sales,Customers,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,Promo2,Year,Month,WeekOfYear,DayOfYear,CompetitionOpen,Promo2Open,Promo2running
Date,Store,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2015-06-15,1,1,8.615771,586,1,0,0,c,a,1270.0,0,2015,6,25,166,81.0,0.0,0


# Test-Train-Split

In [8]:
# dividing train df into train_x and train_y 
X_train = train_df.drop('Sales', axis=1)
y_train = train_df[['Sales']]

#dividing test df into test_x and test_y
X_test = test_df.drop("Sales", axis=1)
y_test = test_df[['Sales']]

One Hot Encoding of categorical columns to convert them into numerical columns

In [9]:
# making a list of categorical columns
Categorical_columns = ['DayOfWeek','StoreType','Assortment']

In [10]:
# assigning one hot encoder
enc = OneHotEncoder(sparse=False)

In [11]:
enc.fit(X_train[Categorical_columns])

OneHotEncoder(sparse=False)

In [12]:
# getting a new list of encoded columns from Categorical_columns in train data
encoded_cols = enc.get_feature_names(Categorical_columns).tolist()

In [13]:
encoded_cols

['DayOfWeek_1',
 'DayOfWeek_2',
 'DayOfWeek_3',
 'DayOfWeek_4',
 'DayOfWeek_5',
 'DayOfWeek_6',
 'DayOfWeek_7',
 'StoreType_a',
 'StoreType_b',
 'StoreType_c',
 'StoreType_d',
 'Assortment_a',
 'Assortment_b',
 'Assortment_c']

In [14]:

#performing transformation on training data and creating new features for encoded columns
X_train[encoded_cols] = enc.transform(X_train[Categorical_columns])

In [15]:
#dropping old categorical columns from training data
X_train.drop(Categorical_columns, axis=1, inplace=True)

In [16]:
#performing transformation on testing data and creating new features for encoded columns
X_test[encoded_cols] = enc.transform(X_test[Categorical_columns])


In [17]:
#dropping old categorical columns from testing data
X_test.drop(Categorical_columns, axis=1, inplace=True)

In [18]:
X_train.columns

Index(['Customers', 'Promo', 'StateHoliday', 'SchoolHoliday',
       'CompetitionDistance', 'Promo2', 'Year', 'Month', 'WeekOfYear',
       'DayOfYear', 'CompetitionOpen', 'Promo2Open', 'Promo2running',
       'DayOfWeek_1', 'DayOfWeek_2', 'DayOfWeek_3', 'DayOfWeek_4',
       'DayOfWeek_5', 'DayOfWeek_6', 'DayOfWeek_7', 'StoreType_a',
       'StoreType_b', 'StoreType_c', 'StoreType_d', 'Assortment_a',
       'Assortment_b', 'Assortment_c'],
      dtype='object')


## Time for transformations in our data

In [19]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

X_train[list(X_train.columns)] = scaler.fit_transform(X_train[list(X_train.columns)])
X_test[list(X_test.columns)] = scaler.fit_transform(X_test[list(X_test.columns)])

scaler = StandardScaler()
y_train[list(y_train.columns)] = scaler.fit_transform(y_train[list(y_train.columns)])
y_test[list(y_test.columns)] = scaler.transform(y_test[list(y_train.columns)])

## Model Selection

Going through the assumptions of linear models, we can confidently conclude that we can go for them. Given Our Data has a lot of genuine multicollinearity and also some columns have way too many outliers than others.

Linear models like Linear regression and Logistic ones can't be used for our purpose, so we will move ahead with decision trees and randomforests, but, before that just to check.

In [20]:
lrreg = LinearRegression()

In [21]:
lrreg.fit(X_train,y_train)

LinearRegression()

In [22]:
y_train_pred = lrreg.predict(X_train)

In [23]:
y_test_pred = lrreg.predict(X_test)

In [24]:
# importing evaluation matrices
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [25]:
print(f'r2_score for training data is {r2_score(y_train_pred,y_train)}')
print(f'r2_score for training data is {r2_score(y_test_pred,y_test)}')

r2_score for training data is 0.6675621318983829
r2_score for training data is -3.831468475823385e-11


As expected high bias and high variance.

## Model 1(Baseline): DecisionTree

In [26]:

# import the regressor
from sklearn.tree import DecisionTreeRegressor 

In [27]:

# assigning a variable
dtree = DecisionTreeRegressor(random_state=68)

In [28]:
dtree.fit(X_train,y_train)

DecisionTreeRegressor(random_state=68)

In [29]:
# predicting Y_train
y_pred_train = dtree.predict(X_train)

In [30]:
# predicting Y_train
y_pred_test = dtree.predict(X_test)

In [31]:

# printing evaluation matrices for our model
print(f'r2_score for training data is {r2_score(y_pred_train,y_train)}')
print(f'r2_score for testing data is {r2_score(y_pred_test,y_test)}')

print(f'Adjusted r2_score for training data is {round(1 - (1-r2_score(y_train, y_pred_train)) * (len(y_train)-1)/(len(y_train)-X_train.shape[1]-1),6)}')
print(f'Adjusted r2_score for testing data is {round(1 - (1-r2_score(y_test, y_pred_test)) * (len(y_test)-1)/(len(y_test)-X_test.shape[1]-1),6)}')

print(f'Mean Absolute Error for training data is {mean_absolute_error(y_pred_train,y_train)}')
print(f'Mean Absolute Error for testing data is {mean_absolute_error(y_pred_test,y_test)}')

print(f'Mean Squared Error for training data is {mean_squared_error(y_pred_train,y_train)}')
print(f'Mean Squared Error for testing data is {mean_squared_error(y_pred_test,y_test)}')

r2_score for training data is 0.9999987542867533
r2_score for testing data is 0.9132782268502603
Adjusted r2_score for training data is 0.999999
Adjusted r2_score for testing data is 0.911298
Mean Absolute Error for training data is 4.189885590080446e-06
Mean Absolute Error for testing data is 0.20540436100243337
Mean Squared Error for training data is 1.2457116948123794e-06
Mean Squared Error for testing data is 0.08350558487359472


As it is famously assumed, Decision trees are indeed prone to overfitting, with r2_score of 1 on traing data, it has completely overfitted the data, while for test data, it's accuracy is around 93%.

To ease this problem of overfitting we will use Random Forest to improve our model accuracy.

# **Model 2: Random Forest**

A single tree was not able to lift much weight, so now we will use entire forest.

In [32]:
# importing random forest from ScikitLearn
from sklearn.ensemble import RandomForestRegressor

In [33]:

rf = RandomForestRegressor(n_estimators=100, random_state=68)

In [34]:
rf.fit(X_train,y_train)

RandomForestRegressor(random_state=68)

In [35]:

y_pred_train_rf = rf.predict(X_train)

In [36]:
y_pred_test_rf = rf.predict(X_test)

In [37]:

# printing evaluation matrices for our model
print(f'r2_score for training data is {r2_score(y_pred_train_rf,y_train)}')
print(f'r2_score for testing data is {r2_score(y_pred_test_rf,y_test)}')

print(f'Adjusted r2_score for training data is {round(1 - (1-r2_score(y_train, y_pred_train_rf)) * (len(y_train)-1)/(len(y_train)-X_train.shape[1]-1),6)}')
print(f'Adjusted r2_score for testing data is {round(1 - (1-r2_score(y_test, y_pred_test_rf)) * (len(y_test)-1)/(len(y_test)-X_test.shape[1]-1),6)}')

print(f'Mean Absolute Error for training data is {mean_absolute_error(y_pred_train_rf,y_train)}')
print(f'Mean Absolute Error for testing data is {mean_absolute_error(y_pred_test_rf,y_test)}')

print(f'Mean Squared Error for training data is {mean_squared_error(y_pred_train_rf,y_train)}')
print(f'Mean Squared Error for testing data is {mean_squared_error(y_pred_test_rf,y_test)}')

r2_score for training data is 0.9964356349429113
r2_score for testing data is 0.9508591528079314
Adjusted r2_score for training data is 0.9965
Adjusted r2_score for testing data is 0.952252
Mean Absolute Error for training data is 0.04334227746186602
Mean Absolute Error for testing data is 0.1538746851395498
Mean Squared Error for training data is 0.003500113007311803
Mean Squared Error for testing data is 0.0449506975157633


Our r2_score for training data is almost 99% while for test data it is 95%. It seems all good, but let's try finding best parameters to see if we can further improve our score.

# LGBM

In [38]:
# importing lgbm
from lightgbm import LGBMRegressor

In [39]:
# assigning variable
lgbm_reg = LGBMRegressor(random_state=68)

In [40]:

# fitting on training data
lgbm_reg.fit(X_train,y_train)

LGBMRegressor(random_state=68)

In [41]:
# predicting on y_train
y_train_pred_lgbm = lgbm_reg.predict(X_train)

In [42]:
# predicting on y_test
y_test_pred_lgbm = lgbm_reg.predict(X_test)

In [43]:
# printing accuracy
# printing evaluation matrices for our model
print(f'r2_score for training data is {r2_score(y_train_pred_lgbm,y_train)}')
print(f'r2_score for testing data is {r2_score(y_test_pred_lgbm,y_test)}')

print(f'Adjusted r2_score for training data is {round(1 - (1-r2_score(y_train, y_train_pred_lgbm)) * (len(y_train)-1)/(len(y_train)-X_train.shape[1]-1),6)}')
print(f'Adjusted r2_score for testing data is {round(1 - (1-r2_score(y_test, y_test_pred_lgbm)) * (len(y_test)-1)/(len(y_test)-X_test.shape[1]-1),6)}')

print(f'Mean Absolute Error for training data is {mean_absolute_error(y_train_pred_lgbm,y_train)}')
print(f'Mean Absolute Error for testing data is {mean_absolute_error(y_test_pred_lgbm,y_test)}')

print(f'Mean Squared Error for training data is {mean_squared_error(y_train_pred_lgbm,y_train)}')
print(f'Mean Squared Error for testing data is {mean_squared_error(y_test_pred_lgbm,y_test)}')

r2_score for training data is 0.9156963683049166
r2_score for testing data is 0.8854440634180716
Adjusted r2_score for training data is 0.925448
Adjusted r2_score for testing data is 0.896417
Mean Absolute Error for training data is 0.2148672858875459
Mean Absolute Error for testing data is 0.24558678172808118
Mean Squared Error for training data is 0.07454970345199258
Mean Squared Error for testing data is 0.09751441413804593


We have made predictions using default parameters of Light Gradient Boosting model, simply to get and idea of how it might perfrm on our training and test data.

In [44]:
# creating a comparison df
{'Model_name':['Decision Tree', 'Random Forest Regressor', 'LightGradient Boost Regressor'], 'Variable':['dtree', 'rf', 'lgbm_reg'], 'Train_r2':[r2_score(y_pred_train,y_train), r2_score(y_pred_train_rf,y_train), r2_score(y_train_pred_lgbm,y_train)]}

{'Model_name': ['Decision Tree',
  'Random Forest Regressor',
  'LightGradient Boost Regressor'],
 'Variable': ['dtree', 'rf', 'lgbm_reg'],
 'Train_r2': [0.9999987542867533, 0.9964356349429113, 0.9156963683049166]}

In [45]:
comparison_df = pd.DataFrame({'Model_name':['Decision Tree', 'Random Forest Regressor', 'LightGradient Boost Regressor'], 'Variable':['dtree', 'rf', 'lgbm_reg'], 'Train_r2':[r2_score(y_pred_train,y_train), r2_score(y_pred_train_rf,y_train), r2_score(y_train_pred_lgbm,y_train)]})

In [46]:
comparison_df = [r2_score(y_pred_test,y_test), r2_score(y_pred_test_rf,y_test), r2_score(y_test_pred_lgbm,y_test)]
comparison_df

[0.9132782268502603, 0.9508591528079314, 0.8854440634180716]

In [52]:
comparison_df['Train_adjusted_r2'] = [round(1 - (1-r2_score(y_train, y_pred_train)) * (len(y_train)-1)/(len(y_train)-X_train.shape[1]-1),6),
                                      round(1 - (1-r2_score(y_train, y_pred_train_rf)) * (len(y_train)-1)/(len(y_train)-X_train.shape[1]-1),6),
                                      round(1 - (1-r2_score(y_train, y_train_pred_lgbm)) * (len(y_train)-1)/(len(y_train)-X_train.shape[1]-1),6)]

TypeError: ignored

In [53]:

comparison_df['Test_adjusted_r2'] = [round(1 - (1-r2_score(y_test, y_pred_test)) * (len(y_test)-1)/(len(y_test)-X_test.shape[1]-1),6),
                                      round(1 - (1-r2_score(y_test, y_pred_test_rf)) * (len(y_test)-1)/(len(y_test)-X_test.shape[1]-1),6),
                                      round(1 - (1-r2_score(y_test, y_test_pred_lgbm)) * (len(y_test)-1)/(len(y_test)-X_test.shape[1]-1),6)]

TypeError: ignored

In [None]:

comparison_df['Train_MAE'] = [mean_absolute_error(y_pred_train,y_train),
                              mean_absolute_error(y_pred_train_rf,y_train),
                              mean_absolute_error(y_train_pred_lgbm,y_train)]

In [None]:
comparison_df['Test_MAE'] = [mean_absolute_error(y_pred_test,y_test),
                             mean_absolute_error(y_pred_test_rf,y_test),
                             mean_absolute_error(y_test_pred_lgbm,y_test)]

In [None]:
comparison_df['Train_MSE'] = [mean_squared_error(y_pred_train,y_train),
                              mean_squared_error(y_pred_train_rf,y_train),
                              mean_squared_error(y_train_pred_lgbm,y_train)]

In [None]:
comparison_df['Test_MSE'] = [mean_squared_error(y_pred_test,y_test),
                             mean_squared_error(y_pred_test_rf,y_test),
                             mean_squared_error(y_test_pred_lgbm,y_test)]

In [54]:
comparison_df['Train_RMSE'] = [math.sqrt(mean_squared_error(y_pred_train,y_train)),
                               math.sqrt(mean_squared_error(y_pred_train_rf,y_train)),
                               math.sqrt(mean_squared_error(y_train_pred_lgbm,y_train))]

TypeError: ignored

In [None]:
comparison_df['Test_RMSE'] = [math.sqrt(mean_squared_error(y_pred_test,y_test)),
                              math.sqrt(mean_squared_error(y_pred_test_rf,y_test)),
                              math.sqrt(mean_squared_error(y_test_pred_lgbm,y_test))]

In [55]:

comparison_df

[0.9132782268502603, 0.9508591528079314, 0.8854440634180716]

In [56]:
c_df = comparison_df.to_csv(file_path+'comparison_df.csv')

AttributeError: ignored