In [97]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import  OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.metrics import r2_score
from datetime import datetime

import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt

In [98]:
dataset = pd.read_csv('src/walmart_clean.csv')
dataset.head()

Unnamed: 0,Store,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment,Year,Month,Day,Weekday
0,store 6,1572117.54,N,59.61,3.045,214.777523,6.858,2011.0,2.0,18.0,4.0
1,store 13,1807545.43,N,42.38,3.435,128.616064,7.47,2011.0,3.0,25.0,4.0
2,store 11,1244390.03,N,84.57,,214.556497,7.346,,,,
3,store 6,1644470.66,N,78.89,2.759,212.412888,7.092,2010.0,5.0,28.0,4.0
4,store 4,1857533.7,N,,2.756,126.160226,7.896,2010.0,5.0,28.0,4.0


In [99]:
# Separate target variable Y from features X
print("Separating labels from features...")
features_list = ['Store','Holiday_Flag', 'Temperature', 'Fuel_Price', 'CPI', 'Unemployment', 'Year', 'Month', 'Day', 'Weekday']
target_variable = 'Weekly_Sales'

X = dataset.loc[:,features_list]
Y = dataset.loc[:,target_variable]

Separating labels from features...


In [100]:
# Automatically detect names of numeric/categorical columns
numeric_features = []
categorical_features = []
for i,t in X.dtypes.iteritems():
    if ('float' in str(t)) or ('int' in str(t)) :
        numeric_features.append(i)
    else :
        categorical_features.append(i)

print('Found numeric features ', numeric_features)
print('Found categorical features ', categorical_features)

Found numeric features  ['Temperature', 'Fuel_Price', 'CPI', 'Unemployment', 'Year', 'Month', 'Day', 'Weekday']
Found categorical features  ['Store', 'Holiday_Flag']



iteritems is deprecated and will be removed in a future version. Use .items instead.



In [101]:
# Divide dataset Train set & Test set 
print("Dividing into train and test sets...")
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
print("...Done.")
print()

Dividing into train and test sets...
...Done.



In [102]:
# Create pipeline for numeric and categorical features 
numeric_transformer = Pipeline(
    steps=[
        ('imputer', KNNImputer(n_neighbors=10)),
        ('scaler',StandardScaler())
    ])
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder',OneHotEncoder(drop='first'))
    ])



In [103]:
# Use ColumnTransformer to make a preprocessor object that describes all the treatments to be done
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [104]:
# Preprocessings on train set
print("Performing preprocessings on train set...")
print(X_train.head())
X_train = preprocessor.fit_transform(X_train)
print('...Done.')
print(X_train[0:5]) # MUST use this syntax because X_train is a numpy array and not a pandas DataFrame anymore
print()

# Preprocessings on test set
print("Performing preprocessings on test set...")
print(X_test.head()) 
X_test = preprocessor.transform(X_test) # Don't fit again !! The test set is used for validating decisions
# we made based on the training set, therefore we can only apply transformations that were parametered using the training set.
# Otherwise this creates what is called a leak from the test set which will introduce a bias in all your results.
print('...Done.')
print(X_test[0:5,:]) # MUST use this syntax because X_test is a numpy array and not a pandas DataFrame anymore
print()

Performing preprocessings on train set...
        Store Holiday_Flag  Temperature  Fuel_Price         CPI  Unemployment  \
105  store 13            N        52.50       3.529  130.826194         6.104   
33    store 5            N        60.71       3.297  218.569962         6.300   
30    store 3            N        80.19       3.467  219.741491         7.567   
56   store 10            N        43.43       3.287  127.191774         8.744   
48   store 13            N        63.60       3.648  129.518333         6.877   

       Year  Month   Day  Weekday  
105  2012.0    3.0  16.0      4.0  
33   2011.0   11.0  11.0      4.0  
30   2011.0    9.0  23.0      4.0  
56      NaN    NaN   NaN      NaN  
48   2011.0    9.0  23.0      4.0  
...Done.
[[-0.46104336  0.43311104 -1.15804529 -1.27967381  1.43987291 -1.09615562
   0.12122151  0.22475151  0.          0.          1.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0

In [105]:
# Perform grid search
print("Grid search...")
regressor = Ridge()
# Grid of values to be tested
params = {
    'alpha': [0.09, 0.1, 0.11, ] # 0 corresponds to no regularization
}
gridsearch_R = GridSearchCV(regressor, param_grid = params, cv = 10) # cv : the number of folds to be used for CV
gridsearch_R.fit(X_train, Y_train)
print("...Done.")
print("Best hyperparameters : ", gridsearch_R.best_params_)
Y_train_pred = gridsearch_R.predict(X_train) # Predictions on training set
Y_test_pred = gridsearch_R.predict(X_test) # Prédictions on test set 
print("R2 score on training set : ", r2_score(Y_train, Y_train_pred))
print("R2 score on test set : ", r2_score(Y_test, Y_test_pred))
print ("Cross_Val_Score MEAN on training set :", cross_val_score(regressor,X_train, Y_train, cv=10).mean())
print ("Cross_Val_Score Std on training set :", cross_val_score(regressor,X_train, Y_train, cv=10).std())

Grid search...
...Done.
Best hyperparameters :  {'alpha': 0.09}
R2 score on training set :  0.9732040250521322
R2 score on test set :  0.9030764816179001
Cross_Val_Score MEAN on training set : 0.86186746827263
Cross_Val_Score Std on training set : 0.08168062070351048


In [106]:
# Perform grid search
print("Grid search...")
regressor = Lasso()
# Grid of values to be tested
params = {
    'alpha': [600, 605, 610] # 0 corresponds to no regularization
}
gridsearch_L = GridSearchCV(regressor, param_grid = params, cv = 10) # cv : the number of folds to be used for CV
gridsearch_L.fit(X_train, Y_train)
print("...Done.")
print("Best hyperparameters : ", gridsearch_L.best_params_)
Y_train_pred = gridsearch_L.predict(X_train) # Predictions on training set
Y_test_pred = gridsearch_L.predict(X_test) # Prédictions on test set 
print("R2 score on training set : ", r2_score(Y_train, Y_train_pred))
print("R2 score on test set : ", r2_score(Y_test, Y_test_pred))
print ("Cross_Val_Score MEAN on training set :", cross_val_score(regressor,X_train, Y_train, cv=10).mean())
print ("Cross_Val_Score Std on training set :", cross_val_score(regressor,X_train, Y_train, cv=10).std())


Grid search...
...Done.
Best hyperparameters :  {'alpha': 605}
R2 score on training set :  0.9740679921486383
R2 score on test set :  0.9024450313867066
Cross_Val_Score MEAN on training set : 0.9464128182453759
Cross_Val_Score Std on training set : 0.017481572632161175


In [107]:
gridsearch_L.best_estimator_.coef_

array([  -22363.16176544,   -17979.44278826,    63169.76625609,
         -50637.46805123,   -13075.06033988,    17084.30534052,
         -37667.8866321 ,    -6090.6817543 ,   417724.37840239,
         -16314.55978536,   509286.95117899,   580417.90053542,
        -758133.46588691, -1052378.04065539,  -599505.89890982,
        -316749.82725769,       -0.        ,   194517.77154466,
         358947.3921567 , -1206022.9159412 ,   658185.62511532,
       -1314548.55097603,    39838.87217704,  -923043.39598008,
        -776692.01064939, -1176031.35289801,   -69993.90010449])

In [108]:
preprocessor.feature_names_in_

array(['Store', 'Holiday_Flag', 'Temperature', 'Fuel_Price', 'CPI',
       'Unemployment', 'Year', 'Month', 'Day', 'Weekday'], dtype=object)

In [109]:
len(dataset['Store'].unique())

19

In [110]:
features_list_preproc = []
features_list_preproc.extend(['CPI', 'Temperature', 'Unemployment', 'Fuel_Price', 'Year', 'Month', 'Day', 'Weekday'])
features_list_preproc.extend([f'Holiday_Flag_{i}' for i in range(1,2)])
features_list_preproc.extend([f'Store_{i}' for i in range(1,19)])

In [111]:
import plotly.express as px

podium_coef = pd.DataFrame(gridsearch_L.best_estimator_.coef_, index=features_list_preproc, columns=['coeff_value'])
podium_coef['coeff_value']=round(podium_coef['coeff_value'],2)

fig = px.bar(podium_coef, title='Coefficients importance by features')
fig.show()

In [112]:
baseline = ['Unemployment', 'CPI', 'Temperature', 'Holiday_Flag', 'Store'] #'Fuel_Price'
X = dataset[baseline]
numeric_features = ['Unemployment', 'CPI', 'Temperature', ] #'Fuel_Price'
categorical_features = ['Holiday_Flag', 'Store']

preprocessor = ColumnTransformer(
    transformers=[
        ('num',numeric_transformer,numeric_features),
        ('cat',categorical_transformer,categorical_features)
    ])

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
X_train=preprocessor.fit_transform(X_train)
X_test=preprocessor.transform(X_test)

In [113]:
regressor=Lasso()
params={
    'alpha':[580, 590, 600]
}

gridsearch=GridSearchCV(regressor,param_grid=params,cv=5)

In [114]:
gridsearch.fit(X_train,Y_train)
print("Best hyperparameters : ", gridsearch.best_params_)
regressor.fit(X_train, Y_train)
Y_pred = regressor.predict(X_train)
Y_test_pred = regressor.predict(X_test)
print (f'R2_score sur le train set : {r2_score(Y_train, Y_pred)}')
print (f'R2_score sur le test set : {r2_score(Y_test, Y_test_pred)}')
print (f'Cross_Val_Score MEAN sur le train set : {cross_val_score(regressor,X_train, Y_train, cv=10).mean()}')
print (f'Cross_Val_Score Std sur le train set : {cross_val_score(regressor,X_train, Y_train, cv=10).std()}')

Best hyperparameters :  {'alpha': 590}
R2_score sur le train set : 0.9684652403090865
R2_score sur le test set : 0.9029605248388254
Cross_Val_Score MEAN sur le train set : 0.9451757155132195
Cross_Val_Score Std sur le train set : 0.03610491738701365


We have fighted overfitting, the final score is around twice the standard deviation (R2_score on the train set is'nt the better but more closer of the R2_score on the test set)