In [15]:
# Dependencies
import pandas as pd
import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import f1_score
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import cross_val_score
import warnings
warnings.filterwarnings("ignore")

In [16]:
# Load the raw classification dataset from disk and preview the first rows.
# The file is decoded as ISO-8859-1 rather than pandas' default UTF-8.
file_csv = "data/classify.csv"
classify_df = pd.read_csv(filepath_or_buffer=file_csv, encoding="ISO-8859-1")
classify_df.head()

Unnamed: 0,ID,Dept,IsHoliday,Weekly_Sales,Type,Size,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,Month
0,0,1,False,24924.5,A,151315,42.31,2.572,,,,,,211.096358,8.106,2
1,1,1,True,46039.49,A,151315,38.51,2.548,,,,,,211.24217,8.106,2
2,2,1,False,41595.55,A,151315,39.93,2.514,,,,,,211.289143,8.106,2
3,3,1,False,19403.54,A,151315,46.63,2.561,,,,,,211.319643,8.106,2
4,4,1,False,21827.9,A,151315,46.5,2.625,,,,,,211.350143,8.106,3


In [17]:
# Replace missing values with 0 in the markdown and weekly-sales columns.
# All columns are filled in one vectorized call instead of one copy-pasted
# assignment per column; the resulting frame is identical.
zero_fill_cols = [
    'MarkDown1',
    'MarkDown2',
    'MarkDown3',
    'MarkDown4',
    'MarkDown5',
    'Weekly_Sales',
]
classify_df[zero_fill_cols] = classify_df[zero_fill_cols].fillna(0)
# NOTE(review): CPI and Unemployment are not filled here and still contain
# missing values (describe() reports fewer non-null rows for them than the
# frame length) - confirm they are handled before any model is fitted.
classify_df.head()

Unnamed: 0,ID,Dept,IsHoliday,Weekly_Sales,Type,Size,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,Month
0,0,1,False,24924.5,A,151315,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,2
1,1,1,True,46039.49,A,151315,38.51,2.548,0.0,0.0,0.0,0.0,0.0,211.24217,8.106,2
2,2,1,False,41595.55,A,151315,39.93,2.514,0.0,0.0,0.0,0.0,0.0,211.289143,8.106,2
3,3,1,False,19403.54,A,151315,46.63,2.561,0.0,0.0,0.0,0.0,0.0,211.319643,8.106,2
4,4,1,False,21827.9,A,151315,46.5,2.625,0.0,0.0,0.0,0.0,0.0,211.350143,8.106,3


In [18]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns; a count below the row total indicates missing values in that column.
classify_df.describe()

Unnamed: 0,ID,Dept,Weekly_Sales,Size,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,Month
count,536634.0,536634.0,536634.0,536634.0,536634.0,536634.0,536634.0,536634.0,536634.0,536634.0,536634.0,498472.0,498472.0,536634.0
mean,268316.5,44.277301,12554.588392,136678.55096,58.771762,3.40831,3681.287709,1292.745138,838.971725,1489.918846,2147.337557,172.090481,7.791888,6.295203
std,154913.036515,30.527358,21171.249456,61007.711799,18.678716,0.430861,7593.883,5714.136207,7860.456173,4865.593575,9789.902903,39.542149,1.865076,3.333808
min,0.0,1.0,-4988.94,34875.0,-7.29,2.472,-2781.45,-265.76,-179.26,0.0,-185.17,126.064,3.684,1.0
25%,134158.25,18.0,49.8525,93638.0,45.25,3.041,0.0,0.0,0.0,0.0,0.0,132.521867,6.623,3.0
50%,268316.5,37.0,4118.755,140167.0,60.06,3.523,0.0,0.0,0.0,0.0,0.0,182.44242,7.795,6.0
75%,402474.75,74.0,15497.4175,202505.0,73.23,3.744,5046.74,118.73,29.14,906.45,2852.19,213.748126,8.549,9.0
max,536633.0,99.0,693099.36,219622.0,101.95,4.468,103184.98,104519.54,149483.31,67474.85,771448.1,228.976456,14.313,12.0


In [19]:
# Number of rows in the dataset
classify_df.shape[0]

536634

In [20]:
# Column dtypes, shown as the cell's output value rather than via print()
# so the notebook renders it as a regular result.
classify_df.dtypes

ID                int64
Dept              int64
IsHoliday          bool
Weekly_Sales    float64
Type             object
Size              int64
Temperature     float64
Fuel_Price      float64
MarkDown1       float64
MarkDown2       float64
MarkDown3       float64
MarkDown4       float64
MarkDown5       float64
CPI             float64
Unemployment    float64
Month             int64
dtype: object


In [21]:
# One-hot encode the categorical Type column: it is replaced by one
# indicator column per distinct value (prefixed "Type_").
# NOTE(review): this rebinds classify_df, so the cell is not re-runnable on
# its own - after the first run the Type column no longer exists.
classify_df = pd.get_dummies(data=classify_df, columns=["Type"])
classify_df.head()

Unnamed: 0,ID,Dept,IsHoliday,Weekly_Sales,Size,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,Month,Type_A,Type_B,Type_C
0,0,1,False,24924.5,151315,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,2,1,0,0
1,1,1,True,46039.49,151315,38.51,2.548,0.0,0.0,0.0,0.0,0.0,211.24217,8.106,2,1,0,0
2,2,1,False,41595.55,151315,39.93,2.514,0.0,0.0,0.0,0.0,0.0,211.289143,8.106,2,1,0,0
3,3,1,False,19403.54,151315,46.63,2.561,0.0,0.0,0.0,0.0,0.0,211.319643,8.106,2,1,0,0
4,4,1,False,21827.9,151315,46.5,2.625,0.0,0.0,0.0,0.0,0.0,211.350143,8.106,3,1,0,0


In [22]:
# Convert Dept from an integer code to a pandas categorical dtype so it is
# treated as a label rather than a numeric quantity.
classify_df['Dept'] = classify_df['Dept'].astype('category')

In [23]:
# One-hot encode Dept: the column is replaced by one indicator column per
# department (prefixed "Dept_").
# NOTE(review): this rebinds classify_df, so the cell is not re-runnable on
# its own - after the first run the Dept column no longer exists.
classify_df = pd.get_dummies(data=classify_df, columns=["Dept"])
classify_df.head()

Unnamed: 0,ID,IsHoliday,Weekly_Sales,Size,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,...,Dept_90,Dept_91,Dept_92,Dept_93,Dept_94,Dept_95,Dept_96,Dept_97,Dept_98,Dept_99
0,0,False,24924.5,151315,42.31,2.572,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,1,True,46039.49,151315,38.51,2.548,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,2,False,41595.55,151315,39.93,2.514,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,3,False,19403.54,151315,46.63,2.561,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,4,False,21827.9,151315,46.5,2.625,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
# Feature matrix: every column except the IsHoliday target.
# NOTE(review): X still includes the ID column, which looks like a row
# identifier rather than a real feature - confirm whether it should be dropped.
# NOTE(review): CPI and Unemployment appear to still contain missing values
# at this point - verify they are imputed or removed before fitting a model.
X = classify_df.drop(columns=['IsHoliday'])
X.head()

Unnamed: 0,ID,Weekly_Sales,Size,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,...,Dept_90,Dept_91,Dept_92,Dept_93,Dept_94,Dept_95,Dept_96,Dept_97,Dept_98,Dept_99
0,0,24924.5,151315,42.31,2.572,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,1,46039.49,151315,38.51,2.548,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,2,41595.55,151315,39.93,2.514,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,3,19403.54,151315,46.63,2.561,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,4,21827.9,151315,46.5,2.625,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
# Target: the boolean IsHoliday flag, kept as a single-column DataFrame
y = classify_df[['IsHoliday']].copy()
y.head()

Unnamed: 0,IsHoliday
0,False
1,True
2,False
3,False
4,False


In [29]:
# Class balance of the target. The classes are heavily imbalanced (far fewer
# True rows than False), so plain accuracy will be a misleading metric.
y['IsHoliday'].value_counts()

False    498045
True      38589
Name: IsHoliday, dtype: int64