# Day 09. Exercise 00
# Regularization

## 0. Imports

In [37]:
import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold, train_test_split, cross_validate
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

import joblib

## 1. Preprocessing

1. Read the file `dayofweek.csv` that you used on the previous day into a dataframe.
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test`. Use the additional parameter `stratify`.

In [38]:
# Shared settings: the hold-out fraction for train_test_split and the seed
# passed as random_state to the split and to every estimator in this notebook.
TEST_SIZE: float = 0.2
RANDOM_STATE: int = 21

In [39]:
# Load the prepared feature table; the `dayofweek` column is the target
# that is split off from the features in the next cell.
df = pd.read_csv("../data/dayofweek.csv")
# Bare expression so the notebook renders the frame as the cell output.
df

Unnamed: 0,dayofweek,numTrials,hour,uid_user_0,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,...,labname_lab02,labname_lab03,labname_lab03s,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1
0,4,-0.788667,-2.562352,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,4,-0.756764,-2.562352,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,4,-0.724861,-2.562352,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,4,-0.692958,-2.562352,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,4,-0.661055,-2.562352,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1681,3,-0.533442,0.945382,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1682,3,-0.629151,0.945382,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1683,3,-0.597248,0.945382,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1684,3,-0.565345,0.945382,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [40]:
# Target is the weekday label; every other column is a feature.
y = df["dayofweek"]
X = df.drop(columns="dayofweek")

# Stratify on the target so each weekday keeps its share in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=y,
)

## 2. Logreg regularization

### a. Default regularization

1. Train a baseline model with only the parameters `random_state=21`, `fit_intercept=False`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.


The result of the code where you trained and evaluated the baseline model should be exactly like this (use `%%time` to get the info about how long it took to run the cell):

```
train -  0.62902   |   valid -  0.59259
train -  0.64633   |   valid -  0.62963
train -  0.63479   |   valid -  0.56296
train -  0.65622   |   valid -  0.61481
train -  0.63397   |   valid -  0.57778
train -  0.64056   |   valid -  0.59259
train -  0.64138   |   valid -  0.65926
train -  0.65952   |   valid -  0.56296
train -  0.64333   |   valid -  0.59701
train -  0.63674   |   valid -  0.62687
Average accuracy on crossval is 0.60165
Std is 0.02943
```

In [41]:
# Baseline logistic regression: library defaults apart from the two required parameters.
logreg_model = LogisticRegression(
    fit_intercept=False,
    random_state=RANDOM_STATE,
)

In [42]:
def cross_val_scores(model, X, y, n_splits=10):
    """Cross-validate ``model`` with stratified K-fold and print the accuracies.

    For every fold one line with the train and validation accuracy is
    printed, followed by the mean and the standard deviation (``np.std``)
    of the validation accuracies.

    Parameters
    ----------
    model : estimator handed unchanged to ``cross_validate``.
    X : feature matrix.
    y : target labels (also used for stratification of the folds).
    n_splits : number of stratified folds, 10 by default.

    Returns
    -------
    None. The function only prints.
    """
    cv_results = cross_validate(
        model,
        X,
        y,
        cv=StratifiedKFold(n_splits=n_splits),
        scoring="accuracy",
        return_train_score=True,
    )
    # cross_validate stores the held-out fold scores under "test_score".
    fold_valid = cv_results["test_score"]

    for fold_train_acc, fold_valid_acc in zip(cv_results["train_score"], fold_valid):
        print(f"train  -  {fold_train_acc:.5f}\t|\tvalid  -  {fold_valid_acc:.5f}")

    print(f"Average accuracy on crossval is {np.mean(fold_valid):.5f}")
    print(f"Std is {np.std(fold_valid):.5f}")

In [43]:
%%time 

# Evaluate the baseline logistic regression with 10 stratified folds.
cross_val_scores(model=logreg_model, X=X_train, y=y_train, n_splits=10)

train  -  0.62819	|	valid  -  0.59259
train  -  0.64716	|	valid  -  0.62963
train  -  0.63479	|	valid  -  0.57037
train  -  0.65540	|	valid  -  0.61481
train  -  0.63314	|	valid  -  0.57778
train  -  0.64056	|	valid  -  0.59259
train  -  0.64221	|	valid  -  0.65926
train  -  0.65952	|	valid  -  0.56296
train  -  0.64333	|	valid  -  0.59701
train  -  0.63591	|	valid  -  0.62687
Average accuracy on crossval is 0.60239
Std is 0.02852
CPU times: user 3.19 s, sys: 0 ns, total: 3.19 s
Wall time: 430 ms


### b. Optimizing regularization parameters

1. In the cells below try different values of penalty: `none`, `l1`, `l2` – you can change the values of solver too.

In [44]:
%%time

# Logistic regression without any penalty term.
logreg_model = LogisticRegression(
    penalty=None,
    solver="lbfgs",
    max_iter=1000,
    fit_intercept=False,
    random_state=RANDOM_STATE,
)
cross_val_scores(model=logreg_model, X=X_train, y=y_train, n_splits=10)

train  -  0.66612	|	valid  -  0.63704
train  -  0.65870	|	valid  -  0.65926
train  -  0.66694	|	valid  -  0.57778
train  -  0.66529	|	valid  -  0.62963
train  -  0.66694	|	valid  -  0.62222
train  -  0.65870	|	valid  -  0.57778
train  -  0.64963	|	valid  -  0.69630
train  -  0.68425	|	valid  -  0.61481
train  -  0.66474	|	valid  -  0.62687
train  -  0.65733	|	valid  -  0.60448
Average accuracy on crossval is 0.62462
Std is 0.03379
CPU times: user 7.33 s, sys: 39.8 ms, total: 7.37 s
Wall time: 929 ms


In [45]:
%%time

# L1-penalised logistic regression, fitted with the saga solver.
logreg_model = LogisticRegression(
    penalty="l1",
    solver="saga",
    max_iter=1000,
    fit_intercept=False,
    random_state=RANDOM_STATE,
)
cross_val_scores(model=logreg_model, X=X_train, y=y_train, n_splits=10)

train  -  0.63726	|	valid  -  0.58519
train  -  0.64221	|	valid  -  0.61481
train  -  0.62984	|	valid  -  0.55556
train  -  0.64386	|	valid  -  0.60000
train  -  0.63232	|	valid  -  0.57778
train  -  0.63644	|	valid  -  0.57778
train  -  0.63644	|	valid  -  0.65926
train  -  0.65622	|	valid  -  0.57778
train  -  0.64580	|	valid  -  0.58955
train  -  0.63839	|	valid  -  0.62687
Average accuracy on crossval is 0.59646
Std is 0.02848
CPU times: user 5.15 s, sys: 98 μs, total: 5.15 s
Wall time: 4.36 s


In [46]:
%%time

# L2-penalised logistic regression with lbfgs.
logreg_model = LogisticRegression(
    penalty="l2",
    solver="lbfgs",
    max_iter=1000,
    fit_intercept=False,
    random_state=RANDOM_STATE,
)
cross_val_scores(model=logreg_model, X=X_train, y=y_train, n_splits=10)

train  -  0.62819	|	valid  -  0.59259
train  -  0.64716	|	valid  -  0.62963
train  -  0.63479	|	valid  -  0.57037
train  -  0.65540	|	valid  -  0.61481
train  -  0.63314	|	valid  -  0.57778
train  -  0.64056	|	valid  -  0.59259
train  -  0.64221	|	valid  -  0.65926
train  -  0.65952	|	valid  -  0.56296
train  -  0.64333	|	valid  -  0.59701
train  -  0.63591	|	valid  -  0.62687
Average accuracy on crossval is 0.60239
Std is 0.02852
CPU times: user 3.04 s, sys: 0 ns, total: 3.04 s
Wall time: 426 ms


## 3. SVM regularization

### a. Default regularization

1. Train a baseline model with only the parameters `probability=True`, `kernel='linear'`, `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [47]:
%%time 

# Baseline linear SVM: only the required parameters, C left at its default.
svc_model = SVC(
    kernel="linear",
    probability=True,
    random_state=RANDOM_STATE,
)
cross_val_scores(model=svc_model, X=X_train, y=y_train, n_splits=10)

train  -  0.70486	|	valid  -  0.65926
train  -  0.69662	|	valid  -  0.75556
train  -  0.69415	|	valid  -  0.62222
train  -  0.70239	|	valid  -  0.65185
train  -  0.69085	|	valid  -  0.65185
train  -  0.68920	|	valid  -  0.64444
train  -  0.69250	|	valid  -  0.72593
train  -  0.70074	|	valid  -  0.62222
train  -  0.69605	|	valid  -  0.61940
train  -  0.71087	|	valid  -  0.63433
Average accuracy on crossval is 0.65871
Std is 0.04359
CPU times: user 3.79 s, sys: 9.98 ms, total: 3.8 s
Wall time: 3.13 s


### b. Optimizing regularization parameters

1. In the cells below try different values of the parameter `C`.

In [48]:
%%time 

# Candidate values for the SVM parameter C; the following cells index into this list.
C = [0.001, 0.01, 0.1, 1, 10, 100]

# First candidate: C[0] = 0.001.
svc_model = SVC(
    C=C[0],
    kernel="linear",
    probability=True,
    random_state=RANDOM_STATE,
)
cross_val_scores(model=svc_model, X=X_train, y=y_train, n_splits=10)

train  -  0.23495	|	valid  -  0.22963
train  -  0.23495	|	valid  -  0.22963
train  -  0.23495	|	valid  -  0.22963
train  -  0.23495	|	valid  -  0.22963
train  -  0.23413	|	valid  -  0.23704
train  -  0.23413	|	valid  -  0.23704
train  -  0.23413	|	valid  -  0.23704
train  -  0.23413	|	valid  -  0.23704
train  -  0.23394	|	valid  -  0.23881
train  -  0.23394	|	valid  -  0.23881
Average accuracy on crossval is 0.23443
Std is 0.00397
CPU times: user 3.79 s, sys: 0 ns, total: 3.79 s
Wall time: 3.8 s


In [49]:
%%time 

# C[1] = 0.01.
svc_model = SVC(
    C=C[1],
    kernel="linear",
    probability=True,
    random_state=RANDOM_STATE,
)
cross_val_scores(model=svc_model, X=X_train, y=y_train, n_splits=10)

train  -  0.37923	|	valid  -  0.40000
train  -  0.37923	|	valid  -  0.40000
train  -  0.38417	|	valid  -  0.35556
train  -  0.35449	|	valid  -  0.36296
train  -  0.38252	|	valid  -  0.37037
train  -  0.38087	|	valid  -  0.38519
train  -  0.37923	|	valid  -  0.40000
train  -  0.38252	|	valid  -  0.37037
train  -  0.38468	|	valid  -  0.35075
train  -  0.38386	|	valid  -  0.35821
Average accuracy on crossval is 0.37534
Std is 0.01848
CPU times: user 3.81 s, sys: 0 ns, total: 3.81 s
Wall time: 3.81 s


In [50]:
%%time 

# C[2] = 0.1.
svc_model = SVC(
    C=C[2],
    kernel="linear",
    probability=True,
    random_state=RANDOM_STATE,
)
cross_val_scores(model=svc_model, X=X_train, y=y_train, n_splits=10)

train  -  0.58120	|	valid  -  0.55556
train  -  0.57543	|	valid  -  0.56296
train  -  0.57378	|	valid  -  0.57037
train  -  0.59275	|	valid  -  0.57037
train  -  0.58120	|	valid  -  0.54815
train  -  0.57955	|	valid  -  0.54815
train  -  0.57296	|	valid  -  0.61481
train  -  0.59192	|	valid  -  0.54815
train  -  0.59967	|	valid  -  0.52985
train  -  0.57825	|	valid  -  0.57463
Average accuracy on crossval is 0.56230
Std is 0.02177
CPU times: user 3.23 s, sys: 2.98 ms, total: 3.24 s
Wall time: 3.24 s


In [51]:
%%time 

# C[3] = 1.
svc_model = SVC(
    C=C[3],
    kernel="linear",
    probability=True,
    random_state=RANDOM_STATE,
)
cross_val_scores(model=svc_model, X=X_train, y=y_train, n_splits=10)

train  -  0.70486	|	valid  -  0.65926
train  -  0.69662	|	valid  -  0.75556
train  -  0.69415	|	valid  -  0.62222
train  -  0.70239	|	valid  -  0.65185
train  -  0.69085	|	valid  -  0.65185
train  -  0.68920	|	valid  -  0.64444
train  -  0.69250	|	valid  -  0.72593
train  -  0.70074	|	valid  -  0.62222
train  -  0.69605	|	valid  -  0.61940
train  -  0.71087	|	valid  -  0.63433
Average accuracy on crossval is 0.65871
Std is 0.04359
CPU times: user 3.09 s, sys: 21 μs, total: 3.09 s
Wall time: 3.09 s


In [52]:
%%time 

# C[4] = 10.
svc_model = SVC(
    C=C[4],
    kernel="linear",
    probability=True,
    random_state=RANDOM_STATE,
)
cross_val_scores(model=svc_model, X=X_train, y=y_train, n_splits=10)

train  -  0.75021	|	valid  -  0.72593
train  -  0.77741	|	valid  -  0.82963
train  -  0.78566	|	valid  -  0.68148
train  -  0.76834	|	valid  -  0.73333
train  -  0.75185	|	valid  -  0.77778
train  -  0.75598	|	valid  -  0.68889
train  -  0.76257	|	valid  -  0.74074
train  -  0.77411	|	valid  -  0.68889
train  -  0.78254	|	valid  -  0.71642
train  -  0.78418	|	valid  -  0.69403
Average accuracy on crossval is 0.72771
Std is 0.04417
CPU times: user 4.53 s, sys: 0 ns, total: 4.53 s
Wall time: 4.53 s


In [53]:
%%time 
  
# C[5] = 100, the largest candidate.
svc_model = SVC(
    C=C[5],
    kernel="linear",
    probability=True,
    random_state=RANDOM_STATE,
)
cross_val_scores(model=svc_model, X=X_train, y=y_train, n_splits=10)

train  -  0.78401	|	valid  -  0.74815
train  -  0.79720	|	valid  -  0.84444
train  -  0.80956	|	valid  -  0.72593
train  -  0.79060	|	valid  -  0.76296
train  -  0.79060	|	valid  -  0.77778
train  -  0.79637	|	valid  -  0.74815
train  -  0.78401	|	valid  -  0.77037
train  -  0.80462	|	valid  -  0.73333
train  -  0.79819	|	valid  -  0.70896
train  -  0.79901	|	valid  -  0.73881
Average accuracy on crossval is 0.75589
Std is 0.03550
CPU times: user 15.4 s, sys: 0 ns, total: 15.4 s
Wall time: 15.4 s


## 4. Tree

### a. Default regularization

1. Train a baseline model with only the parameters `max_depth=10` and `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [54]:
%%time 

# Baseline decision tree limited to depth 10.
dt = DecisionTreeClassifier(
    random_state=RANDOM_STATE,
    max_depth=10,
)
cross_val_scores(model=dt, X=X_train, y=y_train, n_splits=10)

train  -  0.81039	|	valid  -  0.74074
train  -  0.77741	|	valid  -  0.74074
train  -  0.83347	|	valid  -  0.70370
train  -  0.79720	|	valid  -  0.76296
train  -  0.82440	|	valid  -  0.75556
train  -  0.80379	|	valid  -  0.68889
train  -  0.80709	|	valid  -  0.76296
train  -  0.80132	|	valid  -  0.65926
train  -  0.80807	|	valid  -  0.75373
train  -  0.80478	|	valid  -  0.68657
Average accuracy on crossval is 0.72551
Std is 0.03562
CPU times: user 105 ms, sys: 0 ns, total: 105 ms
Wall time: 104 ms


### b. Optimizing regularization parameters

1. In the cells below try different values of the parameter `max_depth`.
2. As a bonus, play with other regularization parameters trying to find the best combination.

In [55]:
%%time

# Depth limits to compare (None = unlimited); the following cells index into this list.
max_depth_values = [2, 4, 6, 8, 12, 15, 20, None]

# max_depth_values[0] -> max_depth=2.
dt = DecisionTreeClassifier(
    random_state=RANDOM_STATE,
    max_depth=max_depth_values[0],
)
cross_val_scores(model=dt, X=X_train, y=y_train, n_splits=10)

train  -  0.43034	|	valid  -  0.47407
train  -  0.43281	|	valid  -  0.45185
train  -  0.44023	|	valid  -  0.38519
train  -  0.43034	|	valid  -  0.47407
train  -  0.43776	|	valid  -  0.40741
train  -  0.43364	|	valid  -  0.44444
train  -  0.43364	|	valid  -  0.44444
train  -  0.43776	|	valid  -  0.40741
train  -  0.41763	|	valid  -  0.38806
train  -  0.43657	|	valid  -  0.41791
Average accuracy on crossval is 0.42949
Std is 0.03116
CPU times: user 86.4 ms, sys: 0 ns, total: 86.4 ms
Wall time: 85.3 ms


In [56]:
%%time

# max_depth_values[1] -> max_depth=4.
dt = DecisionTreeClassifier(
    random_state=RANDOM_STATE,
    max_depth=max_depth_values[1],
)
cross_val_scores(model=dt, X=X_train, y=y_train, n_splits=10)

train  -  0.53998	|	valid  -  0.48148
train  -  0.52679	|	valid  -  0.52593
train  -  0.54493	|	valid  -  0.47407
train  -  0.54163	|	valid  -  0.52593
train  -  0.54493	|	valid  -  0.57037
train  -  0.52679	|	valid  -  0.51111
train  -  0.52844	|	valid  -  0.50370
train  -  0.53669	|	valid  -  0.48889
train  -  0.53624	|	valid  -  0.54478
train  -  0.54613	|	valid  -  0.46269
Average accuracy on crossval is 0.50889
Std is 0.03190
CPU times: user 94.7 ms, sys: 0 ns, total: 94.7 ms
Wall time: 94.1 ms


In [57]:
%%time

# max_depth_values[2] -> max_depth=6.
dt = DecisionTreeClassifier(
    random_state=RANDOM_STATE,
    max_depth=max_depth_values[2],
)
cross_val_scores(model=dt, X=X_train, y=y_train, n_splits=10)

train  -  0.64880	|	valid  -  0.57778
train  -  0.62077	|	valid  -  0.60000
train  -  0.64468	|	valid  -  0.60000
train  -  0.65293	|	valid  -  0.62222
train  -  0.66447	|	valid  -  0.62963
train  -  0.62572	|	valid  -  0.54074
train  -  0.65293	|	valid  -  0.62963
train  -  0.64551	|	valid  -  0.56296
train  -  0.64580	|	valid  -  0.62687
train  -  0.65239	|	valid  -  0.56716
Average accuracy on crossval is 0.59570
Std is 0.03040
CPU times: user 97.9 ms, sys: 0 ns, total: 97.9 ms
Wall time: 97 ms


In [58]:
%%time

# max_depth_values[3] -> max_depth=8.
dt = DecisionTreeClassifier(
    random_state=RANDOM_STATE,
    max_depth=max_depth_values[3],
)
cross_val_scores(model=dt, X=X_train, y=y_train, n_splits=10)

train  -  0.74031	|	valid  -  0.68148
train  -  0.71146	|	valid  -  0.68148
train  -  0.73289	|	valid  -  0.59259
train  -  0.72135	|	valid  -  0.67407
train  -  0.74856	|	valid  -  0.68148
train  -  0.72795	|	valid  -  0.60000
train  -  0.73454	|	valid  -  0.71852
train  -  0.73372	|	valid  -  0.65185
train  -  0.73394	|	valid  -  0.67910
train  -  0.74053	|	valid  -  0.64925
Average accuracy on crossval is 0.66098
Std is 0.03700
CPU times: user 101 ms, sys: 0 ns, total: 101 ms
Wall time: 101 ms


In [59]:
%%time

# max_depth_values[4] -> max_depth=12.
dt = DecisionTreeClassifier(
    random_state=RANDOM_STATE,
    max_depth=max_depth_values[4],
)
cross_val_scores(model=dt, X=X_train, y=y_train, n_splits=10)

train  -  0.90272	|	valid  -  0.82963
train  -  0.85985	|	valid  -  0.83704
train  -  0.90932	|	valid  -  0.82222
train  -  0.89035	|	valid  -  0.85185
train  -  0.90932	|	valid  -  0.82963
train  -  0.86232	|	valid  -  0.71852
train  -  0.86562	|	valid  -  0.81481
train  -  0.88706	|	valid  -  0.75556
train  -  0.86820	|	valid  -  0.80597
train  -  0.87150	|	valid  -  0.77612
Average accuracy on crossval is 0.80413
Std is 0.03947
CPU times: user 109 ms, sys: 0 ns, total: 109 ms
Wall time: 108 ms


In [60]:
%%time

# max_depth_values[5] -> max_depth=15.
dt = DecisionTreeClassifier(
    random_state=RANDOM_STATE,
    max_depth=max_depth_values[5],
)
cross_val_scores(model=dt, X=X_train, y=y_train, n_splits=10)

train  -  0.95796	|	valid  -  0.82222
train  -  0.93075	|	valid  -  0.83704
train  -  0.95631	|	valid  -  0.83704
train  -  0.95301	|	valid  -  0.86667
train  -  0.95136	|	valid  -  0.88889
train  -  0.94724	|	valid  -  0.82222
train  -  0.95466	|	valid  -  0.90370
train  -  0.94971	|	valid  -  0.87407
train  -  0.95305	|	valid  -  0.83582
train  -  0.94316	|	valid  -  0.85821
Average accuracy on crossval is 0.85459
Std is 0.02682
CPU times: user 110 ms, sys: 0 ns, total: 110 ms
Wall time: 109 ms


In [61]:
%%time

# max_depth_values[6] -> max_depth=20.
dt = DecisionTreeClassifier(
    random_state=RANDOM_STATE,
    max_depth=max_depth_values[6],
)
cross_val_scores(model=dt, X=X_train, y=y_train, n_splits=10)

train  -  0.98846	|	valid  -  0.86667
train  -  0.99011	|	valid  -  0.91111
train  -  0.98681	|	valid  -  0.85926
train  -  0.98763	|	valid  -  0.91111
train  -  0.98928	|	valid  -  0.88148
train  -  0.98186	|	valid  -  0.85926
train  -  0.98846	|	valid  -  0.91852
train  -  0.99176	|	valid  -  0.89630
train  -  0.99094	|	valid  -  0.88060
train  -  0.98847	|	valid  -  0.88060
Average accuracy on crossval is 0.88649
Std is 0.02075
CPU times: user 110 ms, sys: 0 ns, total: 110 ms
Wall time: 110 ms


In [62]:
%%time

# max_depth_values[7] -> max_depth=None, i.e. no depth limit.
dt = DecisionTreeClassifier(
    random_state=RANDOM_STATE,
    max_depth=max_depth_values[7],
)
cross_val_scores(model=dt, X=X_train, y=y_train, n_splits=10)

train  -  1.00000	|	valid  -  0.85926
train  -  1.00000	|	valid  -  0.91852
train  -  1.00000	|	valid  -  0.86667
train  -  1.00000	|	valid  -  0.91111
train  -  1.00000	|	valid  -  0.88148
train  -  1.00000	|	valid  -  0.85185
train  -  1.00000	|	valid  -  0.92593
train  -  1.00000	|	valid  -  0.88148
train  -  1.00000	|	valid  -  0.88060
train  -  1.00000	|	valid  -  0.88060
Average accuracy on crossval is 0.88575
Std is 0.02374
CPU times: user 111 ms, sys: 7 μs, total: 111 ms
Wall time: 109 ms


## 5. Random forest

### a. Default regularization

1. Train a baseline model with only the parameters `n_estimators=50`, `max_depth=14`, `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [63]:
%%time 

# Baseline random forest: 50 trees, each limited to depth 14.
rfc = RandomForestClassifier(
    random_state=RANDOM_STATE,
    n_estimators=50,
    max_depth=14,
)
cross_val_scores(model=rfc, X=X_train, y=y_train, n_splits=10)

train  -  0.96455	|	valid  -  0.88148
train  -  0.96208	|	valid  -  0.91852
train  -  0.96785	|	valid  -  0.86667
train  -  0.96455	|	valid  -  0.89630
train  -  0.96538	|	valid  -  0.91111
train  -  0.96538	|	valid  -  0.88148
train  -  0.97115	|	valid  -  0.91852
train  -  0.96867	|	valid  -  0.85185
train  -  0.97364	|	valid  -  0.88060
train  -  0.97941	|	valid  -  0.86567
Average accuracy on crossval is 0.88722
Std is 0.02204
CPU times: user 1.01 s, sys: 4.8 ms, total: 1.01 s
Wall time: 1.01 s


### b. Optimizing regularization parameters

1. In the new cells try different values of the parameters `max_depth` and `n_estimators`.
2. As a bonus, play with other regularization parameters trying to find the best combination.

In [64]:
%%time 

# Candidate forest sizes and depth limits (None = unlimited);
# the following cells index into these two lists.
n_estimators = [50, 100, 200]
max_depth = [None, 5, 10, 15]

# 50 trees, unlimited depth.
rfc = RandomForestClassifier(
    random_state=RANDOM_STATE,
    n_estimators=n_estimators[0],
    max_depth=max_depth[0],
)
cross_val_scores(model=rfc, X=X_train, y=y_train, n_splits=10)

train  -  1.00000	|	valid  -  0.89630
train  -  1.00000	|	valid  -  0.94815
train  -  1.00000	|	valid  -  0.90370
train  -  1.00000	|	valid  -  0.93333
train  -  1.00000	|	valid  -  0.91111
train  -  1.00000	|	valid  -  0.89630
train  -  1.00000	|	valid  -  0.91852
train  -  1.00000	|	valid  -  0.90370
train  -  1.00000	|	valid  -  0.93284
train  -  0.99918	|	valid  -  0.89552
Average accuracy on crossval is 0.91395
Std is 0.01762
CPU times: user 1.09 s, sys: 4.19 ms, total: 1.09 s
Wall time: 1.09 s


In [65]:
%%time 

# 100 trees, max_depth=5.
rfc = RandomForestClassifier(
    random_state=RANDOM_STATE,
    n_estimators=n_estimators[1],
    max_depth=max_depth[1],
)
cross_val_scores(model=rfc, X=X_train, y=y_train, n_splits=10)

train  -  0.61171	|	valid  -  0.57037
train  -  0.57708	|	valid  -  0.57037
train  -  0.58120	|	valid  -  0.56296
train  -  0.58450	|	valid  -  0.58519
train  -  0.60016	|	valid  -  0.55556
train  -  0.60346	|	valid  -  0.57778
train  -  0.60429	|	valid  -  0.61481
train  -  0.59522	|	valid  -  0.55556
train  -  0.62685	|	valid  -  0.59701
train  -  0.60544	|	valid  -  0.57463
Average accuracy on crossval is 0.57642
Std is 0.01762
CPU times: user 1.58 s, sys: 9.99 ms, total: 1.59 s
Wall time: 1.58 s


In [66]:
%%time 

# 200 trees, max_depth=10.
rfc = RandomForestClassifier(
    random_state=RANDOM_STATE,
    n_estimators=n_estimators[2],
    max_depth=max_depth[2],
)
cross_val_scores(model=rfc, X=X_train, y=y_train, n_splits=10)

train  -  0.87387	|	valid  -  0.77037
train  -  0.89860	|	valid  -  0.87407
train  -  0.90767	|	valid  -  0.81481
train  -  0.90437	|	valid  -  0.83704
train  -  0.88706	|	valid  -  0.85926
train  -  0.88788	|	valid  -  0.77778
train  -  0.87799	|	valid  -  0.82963
train  -  0.88129	|	valid  -  0.74074
train  -  0.88468	|	valid  -  0.81343
train  -  0.88138	|	valid  -  0.79104
Average accuracy on crossval is 0.81082
Std is 0.03930
CPU times: user 3.36 s, sys: 0 ns, total: 3.36 s
Wall time: 3.35 s


In [67]:
%%time 

# 200 trees, max_depth=15.
rfc = RandomForestClassifier(
    random_state=RANDOM_STATE,
    n_estimators=n_estimators[2],
    max_depth=max_depth[3],
)
cross_val_scores(model=rfc, X=X_train, y=y_train, n_splits=10)

train  -  0.98021	|	valid  -  0.89630
train  -  0.98763	|	valid  -  0.92593
train  -  0.98186	|	valid  -  0.89630
train  -  0.98434	|	valid  -  0.88889
train  -  0.98351	|	valid  -  0.90370
train  -  0.98351	|	valid  -  0.88148
train  -  0.98599	|	valid  -  0.91852
train  -  0.98599	|	valid  -  0.88148
train  -  0.98188	|	valid  -  0.88060
train  -  0.98353	|	valid  -  0.87313
Average accuracy on crossval is 0.89463
Std is 0.01636
CPU times: user 3.79 s, sys: 0 ns, total: 3.79 s
Wall time: 3.78 s


## 6. Predictions

1. Choose the best model and use it to make predictions for the test dataset.
2. Calculate the final accuracy.
3. Analyze: for which weekday does your model make the most errors (as a percentage of the total number of samples of that class in your test dataset)?
4. Save the model.

In [68]:
# Final model: the configuration with the highest cross-validated accuracy
# above (random forest, 50 trees, unlimited depth), refitted on the whole
# training split and scored once on the held-out test split.
rfc = RandomForestClassifier(n_estimators=50, max_depth=None, random_state=RANDOM_STATE)
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
# Label typo fixed: "Accurace" -> "Accuracy".
print(f"Accuracy score on test set: {test_accuracy}")

Accurace score on test set: 0.9289940828402367


In [69]:
# Per-weekday error rate on the test set, as a percentage of that weekday's
# test samples.
#
# Both label vectors are converted to plain NumPy arrays first: `y_pred` is an
# ndarray while `y_test` is an index-labelled pandas Series, and mixing the two
# in boolean indexing and comparisons only works through implicit positional
# coercion. With arrays every mask is unambiguously positional.
y_true = np.asarray(y_test)
y_hat = np.asarray(y_pred)

errors_per_class = {}
for label in np.unique(y_true):
    in_class = y_true == label
    # Share of this weekday's samples that received a different label.
    errors_per_class[label] = np.mean(y_hat[in_class] != label) * 100

for weekday, error_pct in errors_per_class.items():
    print(f"Weekday {weekday}: {error_pct:.2f}% errors")

# State the answer to the task explicitly: the weekday with the highest error rate.
worst_weekday = max(errors_per_class, key=errors_per_class.get)
print(
    f"Most errors: weekday {worst_weekday} "
    f"({errors_per_class[worst_weekday]:.2f}% of its test samples)"
)

Weekday 0: 25.93% errors
Weekday 1: 5.45% errors
Weekday 2: 6.67% errors
Weekday 3: 3.75% errors
Weekday 4: 14.29% errors
Weekday 5: 9.26% errors
Weekday 6: 1.41% errors


In [71]:
# Persist the fitted final random forest to disk; joblib.dump returns the
# list of written file names, which the notebook shows as the cell output.
joblib.dump(rfc, "../data/ex00_rfc_model.joblib")

['../data/ex00_rfc_model.joblib']