# Day 09. Exercise 00
# Regularization

## 0. Imports

In [45]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import joblib

## 1. Preprocessing

1. Read the file `dayofweek.csv` that you used on the previous day into a dataframe.
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test`. Use the additional parameter `stratify`.

In [2]:
# Load the dataset and separate the `dayofweek` column (used as the target
# in the following cells) from the remaining feature columns.
df = pd.read_csv('../../data/dayofweek.csv')
y = df['dayofweek']
X = df.drop(columns=['dayofweek'])

In [3]:
# Hold out 20% of the rows for the final evaluation; the split is stratified on y
# and seeded so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=21,
)

## 2. Logreg regularization

### a. Default regularization

1. Train a baseline model with only the parameters `random_state=21`, `fit_intercept=False`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.


The result of the code where you trained and evaluated the baseline model should be exactly like this (use `%%time` to get the info about how long it took to run the cell):

```
train -  0.62902   |   valid -  0.59259
train -  0.64633   |   valid -  0.62963
train -  0.63479   |   valid -  0.56296
train -  0.65622   |   valid -  0.61481
train -  0.63397   |   valid -  0.57778
train -  0.64056   |   valid -  0.59259
train -  0.64138   |   valid -  0.65926
train -  0.65952   |   valid -  0.56296
train -  0.64333   |   valid -  0.59701
train -  0.63674   |   valid -  0.62687
Average accuracy on crossval is 0.60165
Std is 0.02943
```

In [4]:
%%time
# Baseline logistic regression (default penalty, no intercept) evaluated with
# 10-fold cross-validation; train and validation accuracy are printed per fold.
base = LogisticRegression(fit_intercept=False, random_state=21)
cv_scores = cross_validate(base, X_train, y_train, cv=10, return_train_score=True)

fmt = "{:.5f}".format
valid_scores = cv_scores['test_score']
for fold_train, fold_valid in zip(cv_scores['train_score'], valid_scores):
    print("train -", fmt(fold_train), "| valid -", fmt(fold_valid))

# Summary over the validation folds.
print("Average accuracy on crossval is {:.5f}".format(np.mean(valid_scores)))
print("Std is {:.5f}".format(np.std(valid_scores)))

train - 0.62819 | valid - 0.59259
train - 0.64716 | valid - 0.62963
train - 0.63479 | valid - 0.57037
train - 0.65540 | valid - 0.61481
train - 0.63314 | valid - 0.57778
train - 0.64056 | valid - 0.59259
train - 0.64221 | valid - 0.65926
train - 0.65952 | valid - 0.56296
train - 0.64333 | valid - 0.59701
train - 0.63591 | valid - 0.62687
Average accuracy on crossval is 0.60239
Std is 0.02852
CPU times: user 87.1 ms, sys: 7.33 ms, total: 94.4 ms
Wall time: 108 ms


### b. Optimizing regularization parameters

1. In the cells below try different values of penalty: `none`, `l1`, `l2` – you can change the values of solver too.

In [5]:
# Three logistic-regression variants that differ in the penalty term.
# The no-penalty and L1 variants name their solver explicitly; the L2 variant
# keeps the default solver.
none_reg = LogisticRegression(
    penalty=None,
    solver='newton-cg',
    fit_intercept=False,
    random_state=21,
)
l1_reg = LogisticRegression(
    penalty='l1',
    solver='liblinear',
    fit_intercept=False,
    random_state=21,
)
l2_reg = LogisticRegression(
    penalty='l2',
    fit_intercept=False,
    random_state=21,
)

In [6]:
# 10-fold cross-validation of the unpenalized logistic regression.
cv_scores = cross_validate(none_reg, X_train, y_train, cv=10, return_train_score=True)

fmt = "{:.5f}".format
valid_scores = cv_scores['test_score']
for fold_train, fold_valid in zip(cv_scores['train_score'], valid_scores):
    print("train -", fmt(fold_train), "| valid -", fmt(fold_valid))

# Summary over the validation folds.
print("Average accuracy on crossval is {:.5f}".format(np.mean(valid_scores)))
print("Std is {:.5f}".format(np.std(valid_scores)))

train - 0.66694 | valid - 0.63704
train - 0.65787 | valid - 0.65926
train - 0.66612 | valid - 0.57778
train - 0.66529 | valid - 0.62963
train - 0.66694 | valid - 0.62222
train - 0.65952 | valid - 0.57778
train - 0.65045 | valid - 0.69630
train - 0.68425 | valid - 0.61481
train - 0.66474 | valid - 0.62687
train - 0.65651 | valid - 0.60448
Average accuracy on crossval is 0.62462
Std is 0.03379


In [7]:
# 10-fold cross-validation of the L1-penalized logistic regression.
cv_scores = cross_validate(l1_reg, X_train, y_train, cv=10, return_train_score=True)

fmt = "{:.5f}".format
valid_scores = cv_scores['test_score']
for fold_train, fold_valid in zip(cv_scores['train_score'], valid_scores):
    print("train -", fmt(fold_train), "| valid -", fmt(fold_valid))

# Summary over the validation folds.
print("Average accuracy on crossval is {:.5f}".format(np.mean(valid_scores)))
print("Std is {:.5f}".format(np.std(valid_scores)))

train - 0.61830 | valid - 0.54815
train - 0.62737 | valid - 0.62222
train - 0.60511 | valid - 0.54074
train - 0.63644 | valid - 0.62222
train - 0.62407 | valid - 0.55556
train - 0.62325 | valid - 0.58519
train - 0.61253 | valid - 0.63704
train - 0.64716 | valid - 0.58519
train - 0.63015 | valid - 0.59701
train - 0.61367 | valid - 0.59701
Average accuracy on crossval is 0.58903
Std is 0.03129


In [8]:
# 10-fold cross-validation of the L2-penalized logistic regression.
cv_scores = cross_validate(l2_reg, X_train, y_train, cv=10, return_train_score=True)

fmt = "{:.5f}".format
valid_scores = cv_scores['test_score']
for fold_train, fold_valid in zip(cv_scores['train_score'], valid_scores):
    print("train -", fmt(fold_train), "| valid -", fmt(fold_valid))

# Summary over the validation folds.
print("Average accuracy on crossval is {:.5f}".format(np.mean(valid_scores)))
print("Std is {:.5f}".format(np.std(valid_scores)))

train - 0.62819 | valid - 0.59259
train - 0.64716 | valid - 0.62963
train - 0.63479 | valid - 0.57037
train - 0.65540 | valid - 0.61481
train - 0.63314 | valid - 0.57778
train - 0.64056 | valid - 0.59259
train - 0.64221 | valid - 0.65926
train - 0.65952 | valid - 0.56296
train - 0.64333 | valid - 0.59701
train - 0.63591 | valid - 0.62687
Average accuracy on crossval is 0.60239
Std is 0.02852


## 3. SVM regularization

### a. Default regularization

1. Train a baseline model with only the parameters `probability=True`, `kernel='linear'`, `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [19]:
# Baseline SVM: linear kernel with probability estimates enabled.
base_svm = SVC(
    kernel='linear',
    probability=True,
    random_state=21,
)

In [10]:
# 10-fold cross-validation of the baseline linear SVM.
cv_scores = cross_validate(base_svm, X_train, y_train, cv=10, return_train_score=True)

fmt = "{:.5f}".format
valid_scores = cv_scores['test_score']
for fold_train, fold_valid in zip(cv_scores['train_score'], valid_scores):
    print("train -", fmt(fold_train), "| valid -", fmt(fold_valid))

# Summary over the validation folds.
print("Average accuracy on crossval is {:.5f}".format(np.mean(valid_scores)))
print("Std is {:.5f}".format(np.std(valid_scores)))

train - 0.70486 | valid - 0.65926
train - 0.69662 | valid - 0.75556
train - 0.69415 | valid - 0.62222
train - 0.70239 | valid - 0.65185
train - 0.69085 | valid - 0.65185
train - 0.68920 | valid - 0.64444
train - 0.69250 | valid - 0.72593
train - 0.70074 | valid - 0.62222
train - 0.69605 | valid - 0.61940
train - 0.71087 | valid - 0.63433
Average accuracy on crossval is 0.65871
Std is 0.04359


### b. Optimizing regularization parameters

1. In the cells below try different values of the parameter `C`.

In [12]:
# Sweep integer values of C from 1 to 19 and print the mean 10-fold
# validation accuracy for each value.
for c_value in range(1, 20):
    svm = SVC(kernel='linear', C=c_value, probability=True, random_state=21)
    fold_scores = cross_validate(svm, X_train, y_train, cv=10)['test_score']
    print(c_value, np.mean(fold_scores))

1 0.6587064676616916
2 0.6683637368711995
3 0.6794914317302376
4 0.6935986733001658
5 0.6995245992260919
6 0.6987949143173022
7 0.7069375345494748
8 0.7195577667219458
9 0.7247484798231068
10 0.7277114427860697
11 0.7321614151464898
12 0.7336373687119956
13 0.7336373687119956
14 0.7336373687119956
15 0.7351188501934771
16 0.7410613598673301
17 0.7388336097291323
18 0.7388280818131564
19 0.7388280818131564


## 4. Tree

### a. Default regularization

1. Train a baseline model with only the parameters `max_depth=10` and `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [44]:
# Baseline decision tree (depth capped at 10) evaluated with 10-fold
# cross-validation; train and validation accuracy are printed per fold.
base_tree = DecisionTreeClassifier(random_state=21, max_depth=10)
cv_scores = cross_validate(base_tree, X_train, y_train, cv=10, return_train_score=True)

fmt = "{:.5f}".format
valid_scores = cv_scores['test_score']
for fold_train, fold_valid in zip(cv_scores['train_score'], valid_scores):
    print("train -", fmt(fold_train), "| valid -", fmt(fold_valid))

# Summary over the validation folds.
print("Average accuracy on crossval is {:.5f}".format(np.mean(valid_scores)))
print("Std is {:.5f}".format(np.std(valid_scores)))

train - 0.81039 | valid - 0.74074
train - 0.77741 | valid - 0.74074
train - 0.83347 | valid - 0.70370
train - 0.79720 | valid - 0.76296
train - 0.82440 | valid - 0.75556
train - 0.80379 | valid - 0.68889
train - 0.80709 | valid - 0.76296
train - 0.80132 | valid - 0.65926
train - 0.80807 | valid - 0.75373
train - 0.80478 | valid - 0.68657
Average accuracy on crossval is 0.72551
Std is 0.03562


### b. Optimizing regularization parameters

1. In the cells below try different values of the parameter `max_depth`.
2. As a bonus, play with other regularization parameters trying to find the best combination.

In [None]:
# Sweep max_depth from 1 to 19 and print the mean 10-fold validation
# accuracy of a decision tree for each depth.
for depth in range(1, 20):
    tree = DecisionTreeClassifier(max_depth=depth, random_state=21)
    fold_scores = cross_validate(tree, X_train, y_train, cv=10)['test_score']
    print(depth, float(np.mean(fold_scores)))

1 0.3553178551686015
2 0.42948590381426205
3 0.4613985627418463
4 0.5088944168048645
5 0.5430127142067441
6 0.5956992813709232
7 0.6498894416804866
8 0.6609839690436705
9 0.7032504145936983
10 0.7255113322277501
11 0.7699889441680486
12 0.8041348811498065
13 0.8300884466556109
14 0.8523548922056385
15 0.8545881702598119
16 0.8657103372028745
17 0.8738750690989496
18 0.8805527915975677
19 0.8805417357656165


## 5. Random forest

### a. Default regularization

1. Train a baseline model with only the parameters `n_estimators=50`, `max_depth=14`, `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [20]:
# Baseline random forest (50 trees, depth capped at 14) evaluated with 10-fold
# cross-validation; train and validation accuracy are printed per fold.
base_forest = RandomForestClassifier(
    n_estimators=50,
    max_depth=14,
    random_state=21,
)
cv_scores = cross_validate(base_forest, X_train, y_train, cv=10, return_train_score=True)

fmt = "{:.5f}".format
valid_scores = cv_scores['test_score']
for fold_train, fold_valid in zip(cv_scores['train_score'], valid_scores):
    print("train -", fmt(fold_train), "| valid -", fmt(fold_valid))

# Summary over the validation folds.
print("Average accuracy on crossval is {:.5f}".format(np.mean(valid_scores)))
print("Std is {:.5f}".format(np.std(valid_scores)))

train - 0.96455 | valid - 0.88148
train - 0.96208 | valid - 0.91852
train - 0.96785 | valid - 0.86667
train - 0.96455 | valid - 0.89630
train - 0.96538 | valid - 0.91111
train - 0.96538 | valid - 0.88148
train - 0.97115 | valid - 0.91852
train - 0.96867 | valid - 0.85185
train - 0.97364 | valid - 0.88060
train - 0.97941 | valid - 0.86567
Average accuracy on crossval is 0.88722
Std is 0.02204


### b. Optimizing regularization parameters

1. In the new cells try different values of the parameters `max_depth` and `n_estimators`.
2. As a bonus, play with other regularization parameters trying to find the best combination.

In [21]:
# Sweep max_depth from 1 to 24 (50 trees each) and print the mean 10-fold
# validation accuracy of the forest for each depth.
for depth in range(1, 25):
    forest = RandomForestClassifier(max_depth=depth, n_estimators=50, random_state=21)
    fold_scores = cross_validate(forest, X_train, y_train, cv=10)['test_score']
    print(depth, float(np.mean(fold_scores)))

1 0.40279712548369256
2 0.46214483139856277
3 0.4859038142620232
4 0.5170757324488668
5 0.576384742951907
6 0.6335102266445551
7 0.6787672747374239
8 0.7210503040353787
9 0.7641238253178552
10 0.7952294085129906
11 0.8323548922056384
12 0.8597844112769486
13 0.8679380873410725
14 0.8872194582642343
15 0.8901824212271974
16 0.8916749585406301
17 0.900580431177446
18 0.9072692095080155
19 0.9087506909894969
20 0.9087396351575455
21 0.9079933665008291
22 0.910978441127695
23 0.9117025981205085
24 0.9080099502487562


In [24]:
# With max_depth fixed at 23, sweep n_estimators from 50 to 650 in steps of 50
# and print the mean 10-fold validation accuracy for each forest size.
for n_trees in range(50, 700, 50):
    forest = RandomForestClassifier(max_depth=23, n_estimators=n_trees, random_state=21)
    fold_scores = cross_validate(forest, X_train, y_train, cv=10)['test_score']
    print(n_trees, float(np.mean(fold_scores)))

50 0.9117025981205085
100 0.91171365395246
150 0.9131951354339414
200 0.9124543946932008
250 0.9117025981205085
300 0.9117081260364841
350 0.9139358761746822
400 0.9139358761746822
450 0.9139414040906578
500 0.9139414040906578
550 0.9131951354339414
600 0.9124543946932006
650 0.9131951354339414


## 6. Predictions

1. Choose the best model and use it to make predictions for the test dataset.
2. Calculate the final accuracy.
3. Analyze: determine the weekday for which your model makes the most errors (in % of the total number of samples of that class in your test dataset).
4. Save the model.

In [27]:
# Fit the chosen random forest on the whole training split, predict the
# held-out test split, and display the resulting test accuracy.
best_model = RandomForestClassifier(
    n_estimators=350,
    max_depth=23,
    random_state=21,
)
test_pred = best_model.fit(X_train, y_train).predict(X_test)
accuracy_score(y_true=y_test, y_pred=test_pred)

0.9319526627218935

In [43]:
# Per-class error rate on the test set: for each true class, the percentage of
# its test samples that the model assigned to some other class.
# Row/column order of the confusion matrix is pinned to the model's own class
# labels so that each printed label is the real class value, not a row index.
class_labels = best_model.classes_
cm = confusion_matrix(y_true=y_test, y_pred=test_pred, labels=class_labels)

class_totals = cm.sum(axis=1)               # number of test samples per true class
class_errors = class_totals - np.diag(cm)   # samples off the diagonal = misclassified

for label, n_errors, n_total in zip(class_labels, class_errors, class_totals):
    # A class with no test samples has no defined error rate; report NaN
    # instead of dividing by zero.
    frac = n_errors / n_total * 100 if n_total else float("nan")
    print(label, frac)

0 25.925925925925924
1 7.2727272727272725
2 6.666666666666667
3 2.5
4 14.285714285714285
5 7.4074074074074066
6 1.4084507042253522


In [47]:
# Persist the fitted model to a file named "model" (path is relative to the
# current working directory); the call returns the list of written filenames.
joblib.dump(value=best_model, filename="model")

['model']