In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
from tqdm import tqdm
from prettytable import PrettyTable

from downcast import reduce
import warnings
warnings.filterwarnings("ignore")

from sklearn import linear_model
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor

import os
# Working directory that holds the dataset files. Set the WALMART_DATA_DIR
# environment variable to run this notebook on another machine; when it is
# unset the original local path is used, so existing behaviour is unchanged.
os.chdir(os.environ.get("WALMART_DATA_DIR", "C:\\Users\\91958\\Desktop\\Datasets\\Wallmart Dataset"))

In [8]:
# Load the encoded sales frame from the current working directory.
# NOTE(review): unpickling can execute arbitrary code - only load a pickle
# file that comes from a trusted source.
final = pd.read_pickle('final_Encoded.pkl')

In [9]:
# Preview the first two rows of the loaded frame.
final.head(2)

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,quantity sold,date,wm_yr_wk,...,id_label,item_id_label,dept_id_label,cat_id_label,store_id_label,state_id_label,event_name_1_label,event_type_1_label,event_name_2_label,event_type_2_label
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1400,0,2014-11-28,11443,...,14370,1437,3,1,0,0,30,4,0,0
1,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,d_1400,0,2014-11-28,11443,...,14380,1438,3,1,0,0,30,4,0,0


In [10]:
# Replace the current index with a fresh 0..n-1 index; drop=True discards the
# old index instead of keeping it as a column.
final = final.reset_index(drop=True)

In [11]:
# Preview the first two rows again after the index reset.
final.head(2)

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,quantity sold,date,wm_yr_wk,...,id_label,item_id_label,dept_id_label,cat_id_label,store_id_label,state_id_label,event_name_1_label,event_type_1_label,event_name_2_label,event_type_2_label
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1400,0,2014-11-28,11443,...,14370,1437,3,1,0,0,30,4,0,0
1,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,d_1400,0,2014-11-28,11443,...,14380,1438,3,1,0,0,30,4,0,0


In [12]:
# List every column name so the columns to drop can be chosen.
final.columns

Index(['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'd',
       'quantity sold', 'date', 'wm_yr_wk', 'weekday', 'wday', 'month', 'year',
       'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2',
       'snap_CA', 'snap_TX', 'snap_WI', 'sell_price', 'day', 'if_weekend',
       'if_month_season', 'if_christmas', 'id_label', 'item_id_label',
       'dept_id_label', 'cat_id_label', 'store_id_label', 'state_id_label',
       'event_name_1_label', 'event_type_1_label', 'event_name_2_label',
       'event_type_2_label'],
      dtype='object')

### DataFrame Cleaning
- The categorical features are already encoded, so the original (un-encoded) columns can be dropped.
- `weekday` can be removed because `wday` is already present.
- `date` can also be removed because the `month` and `year` columns are already present.

In [13]:
# Columns excluded from modelling: the raw id/category/event columns (their
# *_label encodings are kept), plus 'd', 'date' and 'weekday'.
unused_columns = [
    'id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'd',
    'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2',
    'date', 'weekday',
]

# `final` itself is left untouched; the reduced frame is bound to `final_`.
final_ = final.drop(columns=unused_columns)

## Data Splitting - train, CV, test

##### We will split the data by time (day number):
- Training : day 1400 - 1885
- Validation : day 1886 - 1913 
- Test : day 1914- 1941

In [14]:
# Time-based split of the frame on the `day` column:
#   train      -> day <= 1885
#   validation -> 1885 < day < 1914
#   test       -> day >= 1914
# At this point the splits still contain the target column 'quantity sold'.
X_train = final_[final_['day'].le(1885)]
X_CV = final_[final_['day'].gt(1885) & final_['day'].lt(1914)]
X_test = final_[final_['day'].ge(1914)]

# Targets (dependent variable) are read from the same row subsets, so each
# target series shares its index with the matching predictor frame.
Y_train, Y_CV, Y_test = (
    split['quantity sold'] for split in (X_train, X_CV, X_test)
)

In [15]:
# Remove the target column ('quantity sold') from the predictor frames.
#
# Each frame is rebound to a new frame instead of being mutated with
# inplace=True: the splits are slices of `final_`, so an in-place drop
# triggers SettingWithCopy behaviour and makes the cell fail with a KeyError
# when it is run a second time. errors='ignore' keeps the cell idempotent.
X_train, X_CV, X_test = (
    split.drop(columns=['quantity sold'], errors='ignore')
    for split in tqdm([X_train, X_CV, X_test])
)

# Kept for backward compatibility: the list now refers to the cleaned frames.
X_variable = [X_train, X_CV, X_test]


100%|██████████| 3/3 [00:00<00:00,  3.90it/s]


In [16]:
# Report the row counts of the three target (dependent variable) splits.
# The labels name the Y_* series because those are the shapes being printed.
print(f"Shape of the Dependent Variable Splits are Y_train: {Y_train.shape}, Y_CV: {Y_CV.shape}, Y_test: {Y_test.shape}")

Shape of the Dependent Variable Splits are X_train: (14818140,), X_CV: (853720,), X_test: (853720,)


In [17]:
# Preview the first five target values of the training split.
Y_train.head(5)

0    0
1    0
2    0
3    3
4    0
Name: quantity sold, dtype: int16

### Metric Calculation Function

In [18]:
# Function for : RMSE

def RMSE_calc(pred,actual):
    '''
    Compute the Root Mean Squared Error between two equally sized inputs.

    Parameters
    ----------
    pred : array-like
        Predicted values (list, NumPy array or pandas Series).
    actual : array-like
        Observed values, compared with `pred` position by position.

    Returns
    -------
    numpy.float64
        sqrt(mean((pred - actual) ** 2)). The metric is symmetric, so the
        order of the two arguments does not change the result.
    '''
    # Convert both inputs to plain float64 arrays before subtracting:
    #  - small integer dtypes (e.g. int16) would silently wrap when squared,
    #  - float32 inputs lose precision when averaged over millions of rows,
    #  - two pandas Series would otherwise be aligned on their index labels
    #    rather than compared by position,
    #  - plain Python lists do not support element-wise subtraction.
    pred_values = np.asarray(pred, dtype=np.float64)
    actual_values = np.asarray(actual, dtype=np.float64)
    return np.sqrt(np.mean((pred_values - actual_values) ** 2))


### First Cut Solution - No Feature Engineering/ No HyperParameter Tuning 

###### a. Simple Linear Regression

In [99]:
%%time 

# Baseline: ordinary least-squares regression on the unscaled predictors,
# using all available cores (n_jobs=-1).
model1 = LinearRegression(n_jobs = -1)
model1.fit(X_train, Y_train)

Wall time: 16.1 s


LinearRegression(n_jobs=-1)

In [100]:
# Validation RMSE for model1 (LinearRegression) on the unscaled validation
# predictors. cv_predict / RMSE_cv are reused (overwritten) by later cells.
cv_predict = model1.predict(X_CV)
RMSE_cv = RMSE_calc(Y_CV,cv_predict)
print('RMSE for validation data using Simple Linear regression:',RMSE_cv)


RMSE for validation data using Simple Linear regression: 3.536608


In [101]:
# Test RMSE for model1 (plain LinearRegression) on the unscaled test
# predictors. The message names the model actually evaluated here; it
# previously carried the "L1 regularization" label of the Lasso section.
test_predict = model1.predict(X_test)
RMSE_test = RMSE_calc(Y_test,test_predict)
print('RMSE of test data using Simple Linear regression:',RMSE_test)

RMSE of test data using Linear regression with L1 regularization: 3.580427


##### b. Linear Regression with L1 Regularization

In [102]:
%%time 

# Lasso (L1-regularised linear regression) with alpha=0.1 on the unscaled
# predictors.
model2 = linear_model.Lasso(alpha = 0.1)
model2.fit(X_train,Y_train)

Wall time: 6min 8s


Lasso(alpha=0.1)

In [103]:
# Validation RMSE for model2 (Lasso) on the unscaled validation predictors.
cv_predict = model2.predict(X_CV)
RMSE_cv = RMSE_calc(Y_CV,cv_predict)
print('RMSE for validation data using Linear regression with L1 regularization:',RMSE_cv)

RMSE for validation data using Linear regression with L1 regularization: 3.5420496


In [104]:
# Test RMSE for model2 (Lasso) on the unscaled test predictors.
test_predict = model2.predict(X_test)
RMSE_test = RMSE_calc(Y_test,test_predict)
print('RMSE of test data using Linear regression with L1 regularization:',RMSE_test)

RMSE of test data using Linear regression with L1 regularization: 3.5848174


##### c. Linear Regression with L2 Regularization

In [105]:
%%time 

# Ridge (L2-regularised linear regression) with alpha=0.1 on the unscaled
# predictors.
model3 = linear_model.Ridge(alpha=0.1)
model3.fit(X_train,Y_train)

Wall time: 2.46 s


Ridge(alpha=0.1)

In [106]:
# Validation RMSE for model3 (Ridge) on the unscaled validation predictors.
cv_predict = model3.predict(X_CV)
RMSE_cv = RMSE_calc(Y_CV,cv_predict)
print('RMSE for validation data using Linear regression with L2 regularization :',RMSE_cv)

RMSE for validation data using Linear regression with L2 regularization : 3.5365868


In [107]:
# Test RMSE for model3 (Ridge) on the unscaled test predictors.
test_predict =model3.predict(X_test)
RMSE_test = RMSE_calc(Y_test,test_predict)
print('RMSE for test data using Linear regression with L2 regularization :',RMSE_test)

RMSE for test data using Linear regression with L2 regularization : 3.5806003


#### d. DecisionTreeRegressor

In [108]:
%%time 

# Decision tree regressor with default settings on the unscaled predictors;
# random_state=0 fixes the estimator's random seed.
model4= DecisionTreeRegressor(random_state=0)
model4.fit(X_train,Y_train)

Wall time: 2min 23s


DecisionTreeRegressor(random_state=0)

In [109]:
# Validation RMSE for model4 (DecisionTreeRegressor) on the unscaled
# validation predictors.
cv_predict = model4.predict(X_CV)
RMSE_cv = RMSE_calc(Y_CV,cv_predict)
print('RMSE for validation data using DecisionTree Regressor is:',RMSE_cv)

RMSE for validation data using DecisionTree Regressor is: 3.09388726459688


In [110]:
# Test RMSE for model4 (DecisionTreeRegressor) on the unscaled test predictors.
test_predict =model4.predict(X_test)
RMSE_test = RMSE_calc(Y_test,test_predict)
print('RMSE for test data using DicisionTree Regressor:',RMSE_test)

RMSE for test data using DicisionTree Regressor: 3.2821219736274387


#### e. LightGBMRegressor

In [111]:
%%time 

# LightGBM regressor with default hyperparameters on the unscaled predictors,
# using all available cores (n_jobs=-1).
model5 = LGBMRegressor(n_jobs=- 1)
model5.fit(X_train,Y_train)

Wall time: 32.4 s


LGBMRegressor()

In [112]:
# Validation RMSE for model5 (LGBMRegressor) on the unscaled validation
# predictors.
cv_predict = model5.predict(X_CV)
RMSE_cv = RMSE_calc(Y_CV,cv_predict)
print('RMSE for validation data using LGBM Regressor is:',RMSE_cv)

RMSE for validation data using LGBM Regressor is: 2.797847521523142


In [113]:
# Test RMSE for model5 (LGBMRegressor) on the unscaled test predictors.
test_predict =model5.predict(X_test)
RMSE_test = RMSE_calc(Y_test,test_predict)
print('RMSE for test data using LGBM Regressor is:',RMSE_test)

RMSE for test data using LGBM Regressor is: 2.8360141322390047


### Using StandardScaler - On Independent Variables

In [114]:
# Learn the scaling statistics from the training predictors only, then apply
# the same fitted transformation to all three splits so that no statistics
# are taken from the validation or test data.
ss = StandardScaler().fit(X_train)
X_train_scaled = ss.transform(X_train)
X_cv_scaled, X_test_scaled = (ss.transform(part) for part in (X_CV, X_test))

#### Simple Linear Regression

In [134]:
%%time 

# Re-fit the existing model1 (LinearRegression) on the scaled training data.
# NOTE(review): this overwrites the earlier unscaled fit in place, so the
# unscaled predict cells above must not be re-run after this cell without
# re-fitting model1 on X_train first.
model1.fit(X_train_scaled, Y_train)

Wall time: 5.55 s


LinearRegression(n_jobs=-1)

In [135]:
# Validation RMSE for model1 (LinearRegression) on the scaled validation
# predictors.
cv_predict = model1.predict(X_cv_scaled)
RMSE_cv = RMSE_calc(Y_CV,cv_predict)
print('RMSE for Scaled validation data using Simple Linear regression:',RMSE_cv)

RMSE for Scaled validation data using Simple Linear regression: 3.5365963


In [136]:
# Test RMSE for model1 (LinearRegression) on the scaled test predictors.
test_predict =model1.predict(X_test_scaled)
RMSE_test = RMSE_calc(Y_test,test_predict)
print('RMSE for Scaled test data using Simple Linear regression:',RMSE_test)

RMSE for Scaled test data using Simple Linear regression: 3.5803967


####  Linear Regression with L1 Regularization

In [137]:
# Re-fit the existing model2 (Lasso, alpha=0.1) on the scaled training data.
# NOTE(review): this overwrites the earlier unscaled fit in place.
model2.fit(X_train_scaled,Y_train)

Lasso(alpha=0.1)

In [138]:
# Validation RMSE for model2 (Lasso) on the scaled validation predictors.
cv_predict = model2.predict(X_cv_scaled)
RMSE_cv = RMSE_calc(Y_CV,cv_predict)
print('RMSE for Scaled validation data using Simple Linear regression with L1 regularization:',RMSE_cv)

RMSE for Scaled validation data using Simple Linear regression with L1 regularization: 3.546668


In [140]:
# Test RMSE for model2 (Lasso) on the scaled test predictors.
test_predict =model2.predict(X_test_scaled)
RMSE_test = RMSE_calc(Y_test,test_predict)
print('RMSE for Scaled test data using Simple Linear regression with L1 regularization:',RMSE_test)

RMSE for Scaled test data using Simple Linear regression with L1 regularization: 3.5928266


#### Linear Regression with L2 Regularization

In [121]:
# Re-fit the existing model3 (Ridge, alpha=0.1) on the scaled training data.
# NOTE(review): this overwrites the earlier unscaled fit in place.
model3.fit(X_train_scaled,Y_train)

Ridge(alpha=0.1)

In [122]:
# Validation RMSE for model3 (Ridge) on the scaled validation predictors.
cv_predict = model3.predict(X_cv_scaled)
RMSE_cv = RMSE_calc(Y_CV,cv_predict)
print('RMSE for Scaled validation data using Simple Linear regression with L2 regularization:',RMSE_cv)

RMSE for Scaled validation data using Simple Linear regression with L2 regularization: 3.5366046


In [141]:
# Test RMSE for model3 (Ridge) on the scaled test predictors.
test_predict =model3.predict(X_test_scaled)
RMSE_test = RMSE_calc(Y_test,test_predict)
print('RMSE for Scaled test data using Simple Linear regression with L2 regularization:',RMSE_test)

RMSE for Scaled test data using Simple Linear regression with L2 regularization: 3.580418


#### DecisionTreeRegressor

In [124]:
# Re-fit the existing model4 (DecisionTreeRegressor, random_state=0) on the
# scaled training data.
# NOTE(review): this overwrites the earlier unscaled fit in place.
model4.fit(X_train_scaled,Y_train)

DecisionTreeRegressor(random_state=0)

In [125]:
# Validation RMSE for model4 (DecisionTreeRegressor) on the scaled validation
# predictors.
cv_predict = model4.predict(X_cv_scaled)
RMSE_cv = RMSE_calc(Y_CV,cv_predict)
print('RMSE for Scaled validation data using DicisionTree Regressor:',RMSE_cv)

RMSE for Scaled validation data using DicisionTree Regressor: 3.1124010421839765


In [142]:
# Test RMSE for model4 (DecisionTreeRegressor) on the scaled test predictors.
test_predict =model4.predict(X_test_scaled)
RMSE_test = RMSE_calc(Y_test,test_predict)
print('RMSE for Scaled test data using DicisionTree Regressor:',RMSE_test)

RMSE for Scaled test data using DicisionTree Regressor: 3.2864545682685127


#### LightGBMRegressor

In [127]:
# Re-fit the existing model5 (LGBMRegressor) on the scaled training data.
# NOTE(review): this overwrites the earlier unscaled fit in place.
model5.fit(X_train_scaled,Y_train)

LGBMRegressor()

In [128]:
# Validation RMSE for model5 (LGBMRegressor) on the scaled validation
# predictors. The message names the model actually evaluated here; it
# previously carried the label of the Ridge (L2) linear-regression cell.
cv_predict = model5.predict(X_cv_scaled)
RMSE_cv = RMSE_calc(Y_CV,cv_predict)
print('RMSE for Scaled validation data using LGBM Regressor:',RMSE_cv)

RMSE for Scaled validation data using Simple Linear regression with L2 regularization:: 2.7696777706782143


In [143]:
# Test RMSE for model5 (LGBMRegressor) on the scaled test predictors.
# The message names the model actually evaluated here; it previously carried
# the label of the Ridge (L2) linear-regression cell.
test_predict = model5.predict(X_test_scaled)
RMSE_test = RMSE_calc(Y_test,test_predict)
print('RMSE for Scaled test data using LGBM Regressor:',RMSE_test)

RMSE for Scaled test data using Simple Linear regression with L2 regularization:: 2.8153165733559438


In [6]:
# Summary table of the first-cut results. The RMSE figures are hard-coded
# strings, so they must be updated by hand whenever the model cells are re-run.
myTable = PrettyTable(['Technique Info', 'Model', 'RMSE on Test Data', 'RMSE: After Standardization'])

# One entry per table row, in display order: two header-style rows, a
# section title, then one row per model (unscaled RMSE, scaled RMSE).
for cells in (
    ['Without FE & No Hyperparameter Tuning', ' ', ' ', ' '],
    [' ', ' ', ' ', ' '],
    ['1. Using Date Based Features Only', ' ', ' ', ' '],
    ['1.1', 'Simple Linear Regresion', '3.5804', '3.5803'],
    ['1.2', 'LR with L1 Regularization', '3.5848', '3.5928'],
    ['1.3', 'LR with L2 Regularization', '3.5806', '3.5804'],
    ['1.4', 'Decision Tree', '3.2821', '3.2864'],
    ['1.5', 'Light GBM', '2.8360', '2.8153'],
):
    myTable.add_row(cells)

print(myTable)

+---------------------------------------+---------------------------+-------------------+-----------------------------+
|             Technique Info            |           Model           | RMSE on Test Data | RMSE: After Standardization |
+---------------------------------------+---------------------------+-------------------+-----------------------------+
| Without FE & No Hyperparameter Tuning |                           |                   |                             |
|                                       |                           |                   |                             |
|   1. Using Date Based Features Only   |                           |                   |                             |
|                  1.1                  |  Simple Linear Regresion  |       3.5804      |            3.5803           |
|                  1.2                  | LR with L1 Regularization |       3.5848      |            3.5928           |
|                  1.3                  

#### Key Observations:
- Without hyperparameter tuning or any feature engineering, the RMSE score is very high.
- Standardizing the data has no major impact on the RMSE score.
- The LightGBM model performed comparatively well.
- Our next task is therefore to derive meaningful features through feature engineering and to optimize our models, which should help improve the RMSE score.