In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import accuracy_score
import pickle

## data preparation

In [3]:
df = pd.read_csv('../data/dayofweek.csv')
y = df['dayofweek'].values
x = df.drop(columns='dayofweek')

## Using train_test_split with parameters test_size=0.2, random_state=21 get X_train, y_train, X_test, y_test. Use the additional parameter stratify

In [4]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=21, stratify=y)

## Use stratified K-fold cross-validation with 10 splits to evaluate the accuracy of the model

In [14]:
def cross_validation(model, x, y, kind='short'):
    skf = StratifiedKFold(n_splits=10)
    scores = []
    for train_index, test_index in skf.split(x, y):
        x_train, x_test = x.iloc[train_index], x.iloc[test_index]
        y_train, y_test = y[train_index], y[test_index]

        model.fit(x_train, y_train)
        predict_train = model.predict(x_train)
        predict_test = model.predict(x_test)

        score_train = accuracy_score(y_train, predict_train)
        score_test = accuracy_score(y_test, predict_test)
        scores.append(score_test)
        if kind == 'long':
            print(f'train - {score_train:.5f} | test - {score_test:.5f} ')

    print(f'Average accuracy on cross-validation is {np.mean(scores):.5f}')
    print(f'Std is {np.std(scores):.5f}')


In [15]:
%%time
lr = LogisticRegression(random_state=21, fit_intercept=False)
cross_validation(lr, x_train, y_train, 'long')

train - 0.62902 | test - 0.59259 
train - 0.64633 | test - 0.62963 
train - 0.63479 | test - 0.56296 
train - 0.65622 | test - 0.61481 
train - 0.63397 | test - 0.57778 
train - 0.64056 | test - 0.59259 
train - 0.64138 | test - 0.65926 
train - 0.65952 | test - 0.56296 
train - 0.64333 | test - 0.59701 
train - 0.63674 | test - 0.62687 
Average accuracy on cross-validation is 0.60165
Std is 0.02943
CPU times: user 1.36 s, sys: 241 ms, total: 1.6 s
Wall time: 461 ms


## In the cells below try different values of penalty: none, l1, l2 – you can change the values of solver too.

In [16]:
dic = {
    'none' : ['newton-cg', 'lbfgs'],
    'l2' : ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'l1' : ['liblinear', 'saga']
}
for penalty in ['none', 'l2', 'l1']:
    s = dic[penalty]
    for solver in s:
        print(f'Solver = {solver}:')
        print(f'Penalty = {penalty}:')
        lr = LogisticRegression(penalty=penalty, solver = solver, random_state=21, fit_intercept=False, max_iter = 10000)
        cross_validation(lr, x_train, y_train)
        print("--------------------")


Solver = newton-cg:
Penalty = none:
Average accuracy on cross-validation is 0.62462
Std is 0.03379
--------------------
Solver = lbfgs:
Penalty = none:
Average accuracy on cross-validation is 0.62462
Std is 0.03379
--------------------
Solver = newton-cg:
Penalty = l2:
Average accuracy on cross-validation is 0.60165
Std is 0.02943
--------------------
Solver = lbfgs:
Penalty = l2:
Average accuracy on cross-validation is 0.60165
Std is 0.02943
--------------------
Solver = liblinear:
Penalty = l2:
Average accuracy on cross-validation is 0.58160
Std is 0.02532
--------------------
Solver = sag:
Penalty = l2:
Average accuracy on cross-validation is 0.60165
Std is 0.02943
--------------------
Solver = saga:
Penalty = l2:
Average accuracy on cross-validation is 0.60165
Std is 0.02943
--------------------
Solver = liblinear:
Penalty = l1:
Average accuracy on cross-validation is 0.58903
Std is 0.03129
--------------------
Solver = saga:
Penalty = l1:
Average accuracy on cross-validation is 0.

## SVM regularization
## Train a baseline model with the only parameters probability=True, kernel='linear', random_state=21.
## Use stratified K-fold cross-validation with 10 splits to evaluate the accuracy of the model.
## The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [17]:
%%time
svm = SVC(probability=True, kernel='linear', random_state=21)
cross_validation(lr, x_train, y_train, 'long')

train - 0.63726 | test - 0.58519 
train - 0.64221 | test - 0.61481 
train - 0.62984 | test - 0.55556 
train - 0.64386 | test - 0.60000 
train - 0.63232 | test - 0.57778 
train - 0.63644 | test - 0.57778 
train - 0.63644 | test - 0.65926 
train - 0.65622 | test - 0.57778 
train - 0.64580 | test - 0.58955 
train - 0.63839 | test - 0.62687 
Average accuracy on cross-validation is 0.59646
Std is 0.02848
CPU times: user 5.84 s, sys: 447 ms, total: 6.29 s
Wall time: 4.28 s


## In the cells below try different values of the parameter C.

In [18]:
for C in range(1, 20, 2):
    print(f'C = {C}:')
    svc = SVC(probability=True, kernel='linear', random_state=21, C=C)
    cross_validation(svc, x_train, y_train)
    print("--------------------")

C = 1:
Average accuracy on cross-validation is 0.65871
Std is 0.04359
--------------------
C = 3:
Average accuracy on cross-validation is 0.67949
Std is 0.04227
--------------------
C = 5:
Average accuracy on cross-validation is 0.69952
Std is 0.04542
--------------------
C = 7:
Average accuracy on cross-validation is 0.70694
Std is 0.04569
--------------------
C = 9:
Average accuracy on cross-validation is 0.72475
Std is 0.04357
--------------------
C = 11:
Average accuracy on cross-validation is 0.73216
Std is 0.04472
--------------------
C = 13:
Average accuracy on cross-validation is 0.73364
Std is 0.04452
--------------------
C = 15:
Average accuracy on cross-validation is 0.73512
Std is 0.04510
--------------------
C = 17:
Average accuracy on cross-validation is 0.73883
Std is 0.04161
--------------------
C = 19:
Average accuracy on cross-validation is 0.73883
Std is 0.04099
--------------------


## Tree
## Train a baseline model with the only parameter max_depth=10 and random_state=21.
## Use stratified K-fold cross-validation with 10 splits to evaluate the accuracy of the model.
## The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.


In [19]:
%%time
tree = DecisionTreeClassifier(max_depth=10, random_state=21)
cross_validation(lr, x_train, y_train, 'long')

train - 0.63726 | test - 0.58519 
train - 0.64221 | test - 0.61481 
train - 0.62984 | test - 0.55556 
train - 0.64386 | test - 0.60000 
train - 0.63232 | test - 0.57778 
train - 0.63644 | test - 0.57778 
train - 0.63644 | test - 0.65926 
train - 0.65622 | test - 0.57778 
train - 0.64580 | test - 0.58955 
train - 0.63839 | test - 0.62687 
Average accuracy on cross-validation is 0.59646
Std is 0.02848
CPU times: user 5.75 s, sys: 340 ms, total: 6.09 s
Wall time: 4.13 s


## In the cells below try different values of the parameter max_depth.


In [20]:
for max_depth in range(10, 30, 2):
    print(f'max_depth={max_depth}:')
    tree = DecisionTreeClassifier(max_depth=max_depth, random_state=21)
    cross_validation(tree, x_train, y_train)
    print("--------------------")

max_depth=10:
Average accuracy on cross-validation is 0.72849
Std is 0.03801
--------------------
max_depth=12:
Average accuracy on cross-validation is 0.80709
Std is 0.03865
--------------------
max_depth=14:
Average accuracy on cross-validation is 0.85310
Std is 0.02351
--------------------
max_depth=16:
Average accuracy on cross-validation is 0.86497
Std is 0.02553
--------------------
max_depth=18:
Average accuracy on cross-validation is 0.87981
Std is 0.02046
--------------------
max_depth=20:
Average accuracy on cross-validation is 0.88278
Std is 0.01660
--------------------
max_depth=22:
Average accuracy on cross-validation is 0.88277
Std is 0.02022
--------------------
max_depth=24:
Average accuracy on cross-validation is 0.88574
Std is 0.02237
--------------------
max_depth=26:
Average accuracy on cross-validation is 0.88574
Std is 0.01737
--------------------
max_depth=28:
Average accuracy on cross-validation is 0.88352
Std is 0.02076
--------------------


## Random forest
## Train a baseline model with the only parameters n_estimators=50, max_depth=14, random_state=21.
## Use stratified K-fold cross-validation with 10 splits to evaluate the accuracy of the model.
## The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [21]:
%%time
random_forest = RandomForestClassifier(n_estimators=50, max_depth=14, random_state=21)
cross_validation(random_forest, x_train, y_train, 'long')

train - 0.96290 | test - 0.86667 
train - 0.96702 | test - 0.91111 
train - 0.96620 | test - 0.83704 
train - 0.97362 | test - 0.90370 
train - 0.96455 | test - 0.91111 
train - 0.96867 | test - 0.85926 
train - 0.96867 | test - 0.91111 
train - 0.96702 | test - 0.87407 
train - 0.97117 | test - 0.90299 
train - 0.96376 | test - 0.86567 
Average accuracy on cross-validation is 0.88427
Std is 0.02551
CPU times: user 907 ms, sys: 8.79 ms, total: 915 ms
Wall time: 914 ms


## In the new cells try different values of the parameters max_depth and n_estimators.
## As a bonus, play with other regularization parameters trying to find the best combination.

In [None]:
max_depth = [i for i in range(15, 40, 5)]
n_estimators = [i for i in range(50, 200, 20)]
for max in max_depth:
    for n in n_estimators:
        print(f'max_depth = {max}:')
        print(f'n_estimators = {n}:')
        random_forest = RandomForestClassifier(max_depth=max, random_state=21, n_estimators=n)
        cross_validation(random_forest, x_train, y_train)
        print("--------------------")

max_depth = 15:
n_estimators = 50:
Average accuracy on cross-validation is 0.89316
Std is 0.01917
--------------------
max_depth = 15:
n_estimators = 70:
Average accuracy on cross-validation is 0.89316
Std is 0.01741
--------------------
max_depth = 15:
n_estimators = 90:
Average accuracy on cross-validation is 0.89242
Std is 0.01640
--------------------
max_depth = 15:
n_estimators = 110:
Average accuracy on cross-validation is 0.89464
Std is 0.01598
--------------------
max_depth = 15:
n_estimators = 130:
Average accuracy on cross-validation is 0.89389
Std is 0.01742
--------------------
max_depth = 15:
n_estimators = 150:
Average accuracy on cross-validation is 0.89463
Std is 0.02237
--------------------
max_depth = 15:
n_estimators = 170:
Average accuracy on cross-validation is 0.89315
Std is 0.01867
--------------------
max_depth = 15:
n_estimators = 190:
Average accuracy on cross-validation is 0.89389
Std is 0.01677
--------------------
max_depth = 20:
n_estimators = 50:
Average 

## Predictions
## Choose the best model and use it to make predictions for the test dataset.
## Calculate the final accuracy.
## Analyze: for which weekday your model makes the most errors (in % of the total number of samples of that class in your test dataset).
## Save the model

In [None]:
random_forest = RandomForestClassifier(max_depth=25, random_state=21, n_estimators=130).fit(x_train, y_train)
pred = random_forest.predict(x_test)
print(f'Final accuracy is {random_forest.score(x_test, y_test)}')

In [None]:
with open('../data/random_forest.pkl','wb') as f:
    pickle.dump(random_forest, f)