In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import  OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score
from datetime import datetime

import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt

In [2]:
# Load the cleaned Walmart sales extract and preview its first five rows.
dataset = pd.read_csv(filepath_or_buffer='src/walmart_clean.csv')
dataset.head(n=5)

Unnamed: 0,Store,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment,Year,Month,Day,Weekday
0,store 6,1572117.54,N,59.61,3.045,214.777523,6.858,2011.0,2.0,18.0,4.0
1,store 13,1807545.43,N,42.38,3.435,128.616064,7.47,2011.0,3.0,25.0,4.0
2,store 11,1244390.03,N,84.57,,214.556497,7.346,,,,
3,store 6,1644470.66,N,78.89,2.759,212.412888,7.092,2010.0,5.0,28.0,4.0
4,store 4,1857533.7,N,,2.756,126.160226,7.896,2010.0,5.0,28.0,4.0


In [3]:
# Share of missing values per column (in percent), followed by the row count.
n_rows = dataset.shape[0]
missing_pct = 100 * dataset.isnull().sum() / n_rows

print("Percentage of missing values: ")
display(missing_pct)
print('Number of lines remaining : ', n_rows)

Percentage of missing values: 


Store            0.000000
Weekly_Sales     0.000000
Holiday_Flag     0.000000
Temperature      9.401709
Fuel_Price       9.401709
CPI              7.692308
Unemployment     0.000000
Year            12.820513
Month           12.820513
Day             12.820513
Weekday         12.820513
dtype: float64

Number of lines remaining :  117


In [4]:
# Summary statistics over every column; include='all' also covers the
# non-numeric columns (count / unique / top / freq).
data_desc = dataset.describe(include='all')

print("Basics statistics: ")
display(data_desc)

Basics statistics: 


Unnamed: 0,Store,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment,Year,Month,Day,Weekday
count,117,117.0,117,106.0,106.0,108.0,117.0,102.0,102.0,102.0,102.0
unique,19,,2,,,,,,,,
top,store 3,,N,,,,,,,,
freq,11,,109,,,,,,,,
mean,,1268911.0,,60.581792,3.338991,179.750523,7.399427,2010.882353,6.666667,16.058824,3.735294
std,,653256.5,,17.935627,0.475925,39.674311,0.994117,0.82407,3.342891,8.672049,1.311989
min,,268929.0,,18.79,2.548,126.1266,5.143,2010.0,1.0,1.0,0.0
25%,,570611.2,,45.3425,2.85075,132.610242,6.664,2010.0,4.0,8.0,4.0
50%,,1420405.0,,61.45,3.4975,197.500965,7.368,2011.0,6.0,16.5,4.0
75%,,1807545.0,,75.485,3.73775,214.892987,8.099,2012.0,9.75,24.0,4.0


In [7]:
# Split the frame into the explanatory variables (X) and the value to predict (Y).
print("Separating labels from features...")

target_variable = 'Weekly_Sales'
features_list = [
    'Store',
    'Holiday_Flag',
    'Temperature',
    'Fuel_Price',
    'CPI',
    'Unemployment',
    'Year',
    'Month',
    'Day',
    'Weekday',
]

# All rows are kept; only the columns are selected.
X = dataset.loc[:, features_list]
Y = dataset.loc[:, target_variable]

Separating labels from features...


In [8]:
# Automatically detect names of numeric/categorical columns.
# A column counts as numeric when its dtype name contains 'float' or 'int';
# every other dtype is treated as categorical.
numeric_features = []
categorical_features = []
# Series.items() replaces Series.iteritems(), which was deprecated in
# pandas 1.5 and removed in pandas 2.0.
for column_name, column_dtype in X.dtypes.items():
    dtype_name = str(column_dtype)
    if ('float' in dtype_name) or ('int' in dtype_name):
        numeric_features.append(column_name)
    else:
        categorical_features.append(column_name)

print('Found numeric features ', numeric_features)
print('Found categorical features ', categorical_features)

Found numeric features  ['Temperature', 'Fuel_Price', 'CPI', 'Unemployment', 'Year', 'Month', 'Day', 'Weekday']
Found categorical features  ['Store', 'Holiday_Flag']


  for i,t in X.dtypes.iteritems():


In [9]:
# Hold out 20% of the rows as a test set; random_state is fixed so the
# split is reproducible from one run to the next.
print("Dividing into train and test sets...")
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)
print("...Done.")
print()

Dividing into train and test sets...
...Done.



In [10]:
# Per-type preprocessing pipelines.
# Numeric columns: fill gaps with a 10-nearest-neighbours imputer, then standardize.
numeric_steps = [
    ('imputer', KNNImputer(n_neighbors=10)),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode, dropping the first level of each feature.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(drop='first')),
]
categorical_transformer = Pipeline(steps=categorical_steps)



In [11]:
# Route each group of columns to its own pipeline: (name, transformer, columns).
column_routes = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_routes)

In [12]:
# --- Train set: learn the preprocessing parameters and apply them ---
print("Performing preprocessings on train set...")
print(X_train.head())
# NOTE: X_train is rebound to the transformed output here, so this cell
# cannot be re-run without first re-running the train/test split.
X_train = preprocessor.fit_transform(X_train)
print('...Done.')
# The transformed output is no longer a DataFrame, so slice it positionally.
print(X_train[:5])
print()

# --- Test set: apply the already-fitted preprocessing only ---
print("Performing preprocessings on test set...")
print(X_test.head())
# transform(), not fit_transform(): the test set validates decisions made on
# the training set, so it may only go through transformations whose
# parameters were learned from the training data. Fitting again here would
# leak test-set information and bias every downstream result.
X_test = preprocessor.transform(X_test)
print('...Done.')
# Same positional slicing as above, for the same reason.
print(X_test[:5, :])
print()

Performing preprocessings on train set...
        Store Holiday_Flag  Temperature  Fuel_Price         CPI  Unemployment  \
105  store 13            N        52.50       3.529  130.826194         6.104   
33    store 5            N        60.71       3.297  218.569962         6.300   
30    store 3            N        80.19       3.467  219.741491         7.567   
56   store 10            N        43.43       3.287  127.191774         8.744   
48   store 13            N        63.60       3.648  129.518333         6.877   

       Year  Month   Day  Weekday  
105  2012.0    3.0  16.0      4.0  
33   2011.0   11.0  11.0      4.0  
30   2011.0    9.0  23.0      4.0  
56      NaN    NaN   NaN      NaN  
48   2011.0    9.0  23.0      4.0  
...Done.
[[-0.46104336  0.43311104 -1.15804529 -1.27967381  1.43987291 -1.09615562
   0.12122151  0.22475151  0.          0.          1.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0

In [13]:
# Fit an ordinary least-squares linear regression on the preprocessed
# training data; fit() returns the estimator itself, so it can be chained.
print("Train model...")
regressor = LinearRegression().fit(X_train, Y_train)
print("...Done.")

Train model...
...Done.


In [15]:
# Evaluate the fitted regressor: R2 on the train and test sets, plus a
# 10-fold cross-validation on the training set.
Y_train_pred = regressor.predict(X_train)  # Predictions on training set
Y_test_pred = regressor.predict(X_test)  # Predictions on test set
print("R2 score on training set : ", r2_score(Y_train, Y_train_pred))
print("R2 score on test set : ", r2_score(Y_test, Y_test_pred))

# Run the cross-validation once and reuse the per-fold scores for both
# statistics, instead of refitting the ten folds a second time for the std.
# NOTE(review): X_train was already imputed/scaled using the whole training
# set, so the folds are not fully independent of each other; wrapping the
# preprocessor and the regressor in a single Pipeline would avoid this.
cv_scores = cross_val_score(regressor, X_train, Y_train, cv=10)
print ("Cross_Val_Score MEAN on training set :", cv_scores.mean())
print ("Cross_Val_Score Std on training set :", cv_scores.std())

R2 score on training set :  0.9752004575307224
R2 score on test set :  0.8971930534373868
Cross_Val_Score MEAN on training set : 0.9464071348907446
Cross_Val_Score Std on training set : 0.017467095738451895


The model overfits slightly: the R2 score is about 0.975 on the training set versus about 0.897 on the test set.