In [5]:
# Dependencies
import pandas as pd
import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import f1_score
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import cross_val_score
import warnings
warnings.filterwarnings("ignore")

In [6]:
# Read CSV into dataframe and preview
file_csv = "data/classify.csv"
classify_df = pd.read_csv(file_csv, encoding="ISO-8859-1")
classify_df.head()

Unnamed: 0,ID,Dept,IsHoliday,Weekly_Sales,Type,Size,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,Month
0,0,1,False,24924.5,A,151315,42.31,2.572,,,,,,211.096358,8.106,2
1,1,1,True,46039.49,A,151315,38.51,2.548,,,,,,211.24217,8.106,2
2,2,1,False,41595.55,A,151315,39.93,2.514,,,,,,211.289143,8.106,2
3,3,1,False,19403.54,A,151315,46.63,2.561,,,,,,211.319643,8.106,2
4,4,1,False,21827.9,A,151315,46.5,2.625,,,,,,211.350143,8.106,3


In [7]:
#Replace missing values with 0
classify_df['MarkDown1'] = classify_df['MarkDown1'].fillna(0)
classify_df['MarkDown2'] = classify_df['MarkDown2'].fillna(0)
classify_df['MarkDown3'] = classify_df['MarkDown3'].fillna(0)
classify_df['MarkDown4'] = classify_df['MarkDown4'].fillna(0)
classify_df['MarkDown5'] = classify_df['MarkDown5'].fillna(0)
classify_df['Weekly_Sales'] = classify_df['Weekly_Sales'].fillna(0)
classify_df.head()
len(classify_df)

498472

In [8]:
classify_df.describe()

Unnamed: 0,ID,Dept,Weekly_Sales,Size,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,Month
count,498472.0,498472.0,498472.0,498472.0,498472.0,498472.0,498472.0,498472.0,498472.0,498472.0,498472.0,498472.0,498472.0,498472.0
mean,252172.834647,44.265054,13515.742082,136680.329298,57.711545,3.392078,3543.488498,1346.858058,888.047395,1402.296842,2040.249308,172.090481,7.791888,6.32381
std,148613.893748,30.51391,21669.012168,61003.311799,18.707997,0.440961,7712.59739,5921.247069,8153.322054,4941.193873,10087.061742,39.542149,1.865076,3.44979
min,0.0,1.0,-4988.94,34875.0,-7.29,2.472,-2781.45,-265.76,-179.26,0.0,-185.17,126.064,3.684,1.0
25%,124617.75,18.0,388.0,93638.0,43.85,3.001,0.0,0.0,0.0,0.0,0.0,132.521867,6.623,3.0
50%,249235.5,37.0,5088.24,140167.0,58.74,3.501,0.0,0.0,0.0,0.0,0.0,182.44242,7.795,6.0
75%,373853.25,74.0,16901.76,202505.0,71.93,3.743,4657.77,58.08,17.92,687.56,2686.11,213.748126,8.549,9.0
max,536620.0,99.0,693099.36,219622.0,100.14,4.468,103184.98,104519.54,149483.31,67474.85,771448.1,228.976456,14.313,12.0


In [9]:
len(classify_df)

498472

In [10]:
print(classify_df.dtypes)

ID                int64
Dept              int64
IsHoliday          bool
Weekly_Sales    float64
Type             object
Size              int64
Temperature     float64
Fuel_Price      float64
MarkDown1       float64
MarkDown2       float64
MarkDown3       float64
MarkDown4       float64
MarkDown5       float64
CPI             float64
Unemployment    float64
Month             int64
dtype: object


In [11]:
#Convert Dept to object
classify_df['Dept'] = classify_df['Dept'].astype('category')

In [12]:
classify_df = classify_df.reset_index()

In [13]:
X = classify_df.loc[:, classify_df.columns != 'IsHoliday']
X.head()

Unnamed: 0,index,ID,Dept,Weekly_Sales,Type,Size,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,Month
0,0,0,1,24924.5,A,151315,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,2
1,1,1,1,46039.49,A,151315,38.51,2.548,0.0,0.0,0.0,0.0,0.0,211.24217,8.106,2
2,2,2,1,41595.55,A,151315,39.93,2.514,0.0,0.0,0.0,0.0,0.0,211.289143,8.106,2
3,3,3,1,19403.54,A,151315,46.63,2.561,0.0,0.0,0.0,0.0,0.0,211.319643,8.106,2
4,4,4,1,21827.9,A,151315,46.5,2.625,0.0,0.0,0.0,0.0,0.0,211.350143,8.106,3


In [14]:
y = classify_df['IsHoliday'].to_frame()
y.head()

Unnamed: 0,IsHoliday
0,False
1,True
2,False
3,False
4,False


In [15]:
y['IsHoliday'].value_counts()

False    459883
True      38589
Name: IsHoliday, dtype: int64

In [16]:
X_encoded = pd.get_dummies(X)
X_encoded.head()

Unnamed: 0,index,ID,Weekly_Sales,Size,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,...,Dept_93,Dept_94,Dept_95,Dept_96,Dept_97,Dept_98,Dept_99,Type_A,Type_B,Type_C
0,0,0,24924.5,151315,42.31,2.572,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0
1,1,1,46039.49,151315,38.51,2.548,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0
2,2,2,41595.55,151315,39.93,2.514,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0
3,3,3,19403.54,151315,46.63,2.561,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0
4,4,4,21827.9,151315,46.5,2.625,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0


In [17]:
scaler = StandardScaler()
X_std = scaler.fit_transform(X_encoded)

In [18]:
le = preprocessing.LabelEncoder()
y_encoded = le.fit_transform(y.values.ravel())

In [19]:
X_train, X_test, y_train, y_test = train_test_split(
    X_std,
    y_encoded,
    test_size=0.2,
    stratify=y_encoded,
    random_state=42    
)

In [20]:
clf = LogisticRegression()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
metrics.f1_score(y_test, y_pred)

0.4085914085914086

In [22]:
param_grid = [{'C': np.logspace(-3, 3, 10)}]

grid_search = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=param_grid,
    cv=StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=42),
    scoring='f1',
    n_jobs=-1
)

scores = cross_val_score(
    estimator=grid_search,
    X=X_std,
    y=y_encoded,
    cv=StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=0),
    scoring='f1',
    n_jobs=-1
)

In [23]:
scores

array([0.41621138, 0.40564621, 0.40858283, 0.41573927, 0.41598727,
       0.40747776, 0.40032106, 0.40549759, 0.40461384, 0.4107125 ])

In [24]:
scores.mean()

0.40907897226229856

In [25]:
grid_search.fit(X_std, y_encoded)
grid_search.best_params_

{'C': 0.1}

In [28]:
final_clf = LogisticRegression(C=0.1)
final_clf.fit(X_std, y_encoded);

In [30]:
feature_ranks = pd.DataFrame(final_clf.coef_, index=['parameter value'])
feature_ranks.columns = X_encoded.columns
feature_ranks.sort_values('parameter value', axis=1, ascending=False).T.head()

Unnamed: 0,parameter value
MarkDown3,1.114481
MarkDown1,0.524534
Month,0.49921
MarkDown2,0.423139
Unemployment,0.159726


In [31]:
feature_ranks.sort_values('parameter value', axis=1, ascending=True).T.head()

Unnamed: 0,parameter value
MarkDown5,-1.834897
Temperature,-0.587386
Fuel_Price,-0.303047
Size,-0.275486
ID,-0.192285
