# Import library

In [27]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import  OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import r2_score
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) # to avoid deprecation warnings

In [2]:
walmart_df = pd.read_csv('./data/walmart_store_cleaned.csv')

In [3]:
walmart_df.dtypes

Store             int64
Weekly_Sales    float64
Holiday_Flag    float64
Temperature     float64
Fuel_Price      float64
CPI             float64
Unemployment    float64
dtype: object

In [4]:
walmart_df['Store'] = walmart_df['Store'].astype(object)
walmart_df['Holiday_Flag'] = walmart_df['Holiday_Flag'].astype(object)

### Separate target variable Y from features X

In [5]:
target_name = 'Weekly_Sales'

print("Separating labels from features...")
Y = walmart_df.loc[:,target_name]
X = walmart_df.loc[:,[c for c in walmart_df.columns if c!=target_name]]


X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state =0)

X_train = X_train.values
X_test = X_test.values
Y_train = Y_train.tolist()
Y_test = Y_test.tolist()

Separating labels from features...


In [7]:
#Identify numerical features = Temperature, Fuel_Price, CPI, Unemployment
numeric_features = [2,3,4,5]
numerical_transformer = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
])

In [8]:
#Identify categorical features = Holiday_Flag, Store
categorical_features = [0,1]
categorical_transformer = Pipeline(
    steps = [
        ('imputer_cat', SimpleImputer(strategy = 'most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown = 'ignore'))
    ])

### Pre-processor

In [9]:
preprocessor = ColumnTransformer(
    transformers = [
        ('num', numerical_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [10]:
walmart_df.dtypes

Store            object
Weekly_Sales    float64
Holiday_Flag     object
Temperature     float64
Fuel_Price      float64
CPI             float64
Unemployment    float64
dtype: object

In [11]:
#Conduct a pre-process on train set 

print('Train set')
print(X_train[0:5,:])
X_train = preprocessor.fit_transform(X_train)
print('-------------------------------------')
print(X_train[0:5,:])
print()


Train set
[[16 0.0 61.79 2.7110000000000003 189.5231276 6.867999999999999]
 [5 0.0 69.17 3.594 224.0192873 5.4220000000000015]
 [19 0.0 33.26 3.789 133.9587419 7.771]
 [8 0.0 82.92 3.554 219.07019680000005 6.425]
 [1 0.0 74.78 2.854 210.3374261 7.808]]
-------------------------------------
  (0, 0)	0.042603619566292404
  (0, 1)	-1.2684064129831014
  (0, 2)	0.20507787897090277
  (0, 3)	-0.5553454246885398
  (0, 18)	1.0
  (0, 23)	1.0
  (1, 0)	0.45927689938857913
  (1, 1)	0.5805882929947872
  (1, 2)	1.092603852656994
  (1, 3)	-2.1273444147693765
  (1, 8)	1.0
  (1, 23)	1.0
  (2, 0)	-1.5681943036491326
  (2, 1)	0.988916682197153
  (2, 2)	-1.2244967736725922
  (2, 3)	0.4263385919387067
  (2, 21)	1.0
  (2, 23)	1.0
  (3, 0)	1.2355990670791537
  (3, 1)	0.49682862341481476
  (3, 2)	0.9652723930663473
  (3, 3)	-1.0369467750383399
  (3, 11)	1.0
  (3, 23)	1.0
  (4, 0)	0.7760163438063336
  (4, 1)	-0.9689655942347007
  (4, 2)	0.740593446601042
  (4, 3)	0.46656263248710983
  (4, 4)	1.0
  (4, 23)	1.0



In [12]:
#Conduct a pre-process on test set

print('Test set')
print(X_test[0:5,:])
X_test = preprocessor.transform(X_test)
print('-------------------------------------')
print(X_test[0:5,:])
print()

Test set
[[6 0.0 78.89 2.759 212.412888 7.0920000000000005]
 [7 0.0 38.26 2.725 189.7048215 8.963]
 [11 0.0 52.77 3.51 223.9170153 6.832999999999998]
 [10 0.0 57.62 3.882 130.6457931 7.545]
 [3 0.0 82.7 3.346 225.3068615 6.664]]
-------------------------------------
  (0, 0)	1.008066097203298
  (0, 1)	-1.1678948094871355
  (0, 2)	0.7939914584280194
  (0, 3)	-0.3118269089360434
  (0, 9)	1.0
  (0, 23)	1.0
  (1, 0)	-1.2858953335798327
  (1, 1)	-1.2390905286301115
  (1, 2)	0.2097525458588324
  (1, 3)	1.722204979335912
  (1, 10)	1.0
  (1, 23)	1.0
  (2, 0)	-0.4666637224387243
  (2, 1)	0.40469298687684513
  (2, 2)	1.0899725726197316
  (2, 3)	-0.5933951927748682
  (2, 14)	1.0
  (2, 23)	1.0
  (3, 0)	-0.19283372147150377
  (3, 1)	1.1836579139705887
  (3, 2)	-1.30973316331895
  (3, 3)	0.18064580372413577
  (3, 13)	1.0
  (3, 23)	1.0
  (4, 0)	1.2231779123961046
  (4, 1)	0.061278341598959125
  (4, 2)	1.1257308892745825
  (4, 3)	-0.777121215820276
  (4, 6)	1.0
  (4, 23)	1.0



### Train the model 

In [68]:
model = LinearRegression()

print("Training model...")
model.fit(X_train, Y_train) # Training is always done on train set !!
print("...Done.")

Training model...
...Done.


In [69]:
# Print scores LinearRegression Test set

Y_train_pred = model.predict(X_train)
Y_test_pred = model.predict(X_test)
score = model.score(X_test, Y_test)
score

0.9369566144771145

In [78]:
#Score train test
#Compute r2 score difference on linear reg
r2_train_lineartrain = r2_score(Y_train, Y_train_pred)
r2_train_lineartest = r2_score(Y_test, Y_test_pred)

linear_diff = r2_train_lineartrain - r2_train_lineartest

print(linear_diff)

0.04562302530375528


In [71]:
#Feature Importance

feature_importance = list(zip(X.columns.to_list(), model.coef_))
display(feature_importance)


[('Store', -2278.113621522603),
 ('Holiday_Flag', -75460.94596528573),
 ('Temperature', 801273.689885493),
 ('Fuel_Price', 38911.54302437009),
 ('CPI', -423238.2681601989),
 ('Unemployment', -161228.4052975872)]

In [None]:
#We are clearly close to an overfit, the value is quite high, lets' try to reduce the overfit 

### Overfit solutions
Regarding an overfit, as we have a linear regression we can run a regularization:

- Ridge
- Lasso

Lasso works well in the case if we have a high number of features (this isn't the case here) as Lasso shrinks the least important features coefficient to zero

In [72]:
model_2 = Ridge()
#We'll tune in GridSearch to try to get the best possible parameters here in our attempt to reduce overfit...
parameters = {'alpha':[0,0.1,0.3,0.5]}
gridsearch_param_ridge = GridSearchCV(model_2, param_grid = parameters, cv = 3, verbose = 0)

gridsearch_param_ridge.fit(X_train, Y_train)
gridsearch_param_ridge.best_params_

#Instanciate our estimator to get the score according to documentation 
opti_ridge = gridsearch_param_ridge.best_estimator_
print(f'Best parameter was:{opti_ridge}')
print('We have a score on training set at', opti_ridge.score(X_train, Y_train))
print('We have a score on testing set at', opti_ridge.score(X_test, Y_test))
print('Our best R2 score is', gridsearch_param_ridge.best_score_)

Best parameter was:Ridge(alpha=0)
We have a score on training set at 0.9825795636153728
We have a score on testing set at 0.9369387293118862
Our best R2 score is 0.8607855025280138


In [79]:
#Compute r2 score diff on ridge linear reg

Y_train_predridge = gridsearch_param_ridge.predict(X_train)
r2_score_ridgetrain = r2_score(Y_train, Y_train_predridge)
Y_test_predridge = gridsearch_param_ridge.predict(X_test)
r2_score_ridgetest = r2_score(Y_test, Y_test_predridge)

linear_diff_ridge = r2_score_ridgetrain - r2_score_ridgetest
print(linear_diff_ridge)

0.045640834303486644


In [74]:
#Same architecture 
model_3 = Lasso()

parameters = {'alpha':[0,50,100,150,300]}
gridsearch_param_lasso = GridSearchCV(model_3, param_grid=parameters, cv=4, verbose=0)

gridsearch_param_lasso.fit(X_train, Y_train)

gridsearch_param_lasso.best_params_

opti_lasso = gridsearch_param_lasso.best_estimator_

print(f'Best parameter was:{opti_lasso}')
print('We have a score on training set at', opti_lasso.score(X_train, Y_train))
print('We have a score on testing set at', opti_lasso.score(X_test, Y_test))
print('Our best R2 score is', gridsearch_param_lasso.best_score_)

Best parameter was:Lasso(alpha=100)
We have a score on training set at 0.9815642539679015
We have a score on testing set at 0.9400147979244405
Our best R2 score is 0.8631428547350799


  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.sparse_enet_coordinate_descent(
  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.sparse_enet_coordinate_descent(
  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.sparse_enet_coordinate_descent(
  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


In [76]:
#Compute r2 score diff on lasso linear reg
Y_train_predlasso = gridsearch_param_lasso.predict(X_train)
r2_score_lassotrain = r2_score(Y_train, Y_train_predlasso)
Y_test_predlasso = gridsearch_param_lasso.predict(X_test)
r2_score_lassotest = r2_score(Y_test, Y_test_predlasso)

linear_diff_lasso = r2_score_lassotrain - r2_score_lassotest
print(linear_diff_lasso)

0.04154945604346094


In [80]:
print('Linear difference', linear_diff)
print('Ridge difference', linear_diff_ridge)
print('Lasso difference', linear_diff_lasso)

Linear difference 0.04562302530375528
Ridge difference 0.045640834303486644
Lasso difference 0.04154945604346094


### Conclusion 

- Lasso seems to be working better here in comparison to Ridge, it's a slight improvement 0.0035
- We could try to get more features with the date maybe ? 
- We might need more data here to assess our problem 90 rows might not be enough