# Day 09. Exercise 00
# Regularization

## 0. Imports

In [104]:
import pandas as pd
import numpy as np

from sklearn.model_selection import (
  train_test_split
)

from sklearn.linear_model import (
  LogisticRegression,
)

from sklearn.tree import (
  DecisionTreeClassifier,
)

from sklearn.ensemble import (
  RandomForestClassifier,
)

from sklearn.svm import (
  SVC
)

from sklearn.metrics import (
  accuracy_score,
  confusion_matrix,
)

from sklearn.model_selection import (
  cross_validate,
  StratifiedKFold,
)


import joblib


## 1. Preprocessing

1. Read the file `dayofweek.csv` that you used in the previous day to a dataframe.
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test`. Use the additional parameter `stratify`.

In [105]:
df  = pd.read_csv(
  '../../datasets/dayofweek.csv'
)
df

Unnamed: 0,numTrials,hour,dayofweek,uid_user_0,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,...,labname_lab02,labname_lab03,labname_lab03s,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1
0,-0.788667,-2.562352,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,-0.756764,-2.562352,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,-0.724861,-2.562352,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,-0.692958,-2.562352,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,-0.661055,-2.562352,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1681,-0.533442,0.945382,3,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1682,-0.629151,0.945382,3,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1683,-0.597248,0.945382,3,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1684,-0.565345,0.945382,3,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [106]:
X = df.drop('dayofweek', axis=1)
y = df['dayofweek']

In [107]:
random_state = 21

In [108]:
X_train, X_test, y_train, y_test = train_test_split(
  X,
  y,
  test_size=0.2,
  random_state=random_state,
  stratify=y
)

## 2. Logreg regularization

### a. Default regularization

1. Train a baseline model with the only parameters `random_state=21`, `fit_intercept=False`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model


The result of the code where you trained and evaluated the baseline model should be exactly like this (use `%%time` to get the info about how long it took to run the cell):

```
train -  0.62902   |   valid -  0.59259
train -  0.64633   |   valid -  0.62963
train -  0.63479   |   valid -  0.56296
train -  0.65622   |   valid -  0.61481
train -  0.63397   |   valid -  0.57778
train -  0.64056   |   valid -  0.59259
train -  0.64138   |   valid -  0.65926
train -  0.65952   |   valid -  0.56296
train -  0.64333   |   valid -  0.59701
train -  0.63674   |   valid -  0.62687
Average accuracy on crossval is 0.60165
Std is 0.02943
```

In [109]:
def cross_val_print(model, X, y, n_splits):
  kf = StratifiedKFold(
    n_splits=n_splits,
  )

  scores = cross_validate(
    model,
    X,
    y,
    cv=kf,
    return_train_score=True,
  ) 

  for train, test in zip(scores['train_score'], scores['test_score']):
    print(f'train - {train:.5f}\t| valid - {test:.5f}')

  print(f'Average accuracy on crossval is - {np.mean(scores["test_score"]):.5f}')
  print(f'Std is {np.std(scores["test_score"]):.5f}')

In [110]:
%%time
logreg = LogisticRegression(
  random_state=random_state,
  fit_intercept=False,
)

cross_val_print(logreg, X_train, y_train, 10)

train - 0.62902	| valid - 0.59259
train - 0.64633	| valid - 0.62963
train - 0.63479	| valid - 0.56296
train - 0.65622	| valid - 0.61481
train - 0.63397	| valid - 0.57778
train - 0.64056	| valid - 0.59259
train - 0.64138	| valid - 0.65926
train - 0.65952	| valid - 0.56296
train - 0.64333	| valid - 0.59701
train - 0.63674	| valid - 0.62687
Average accuracy on crossval is - 0.60165
Std is 0.02943
CPU times: user 3.72 s, sys: 12 s, total: 15.7 s
Wall time: 993 ms


### b. Optimizing regularization parameters

1. In the cells below try different values of penalty: `none`, `l1`, `l2` – you can change the values of solver too.

In [111]:
%%time
logreg = LogisticRegression(
  random_state=random_state,
  fit_intercept=False,
  penalty='l1',
  solver='liblinear',
  max_iter=1000
)

cross_val_print(logreg, X_train, y_train, 10)

train - 0.61830	| valid - 0.54815
train - 0.62737	| valid - 0.62222
train - 0.60511	| valid - 0.54074
train - 0.63644	| valid - 0.62222
train - 0.62407	| valid - 0.55556
train - 0.62325	| valid - 0.58519
train - 0.61253	| valid - 0.63704
train - 0.64716	| valid - 0.58519
train - 0.63015	| valid - 0.59701
train - 0.61367	| valid - 0.59701
Average accuracy on crossval is - 0.58903
Std is 0.03129
CPU times: user 880 ms, sys: 3.11 s, total: 3.99 s
Wall time: 249 ms


In [112]:
%%time
logreg = LogisticRegression(
  random_state=random_state,
  fit_intercept=False,
  penalty='l2',
  solver='liblinear',
  max_iter=1000
)

cross_val_print(logreg, X_train, y_train, 10)

train - 0.61006	| valid - 0.56296
train - 0.61665	| valid - 0.61481
train - 0.61336	| valid - 0.59259
train - 0.62902	| valid - 0.60741
train - 0.60923	| valid - 0.55556
train - 0.61500	| valid - 0.57778
train - 0.61665	| valid - 0.61481
train - 0.64056	| valid - 0.53333
train - 0.62109	| valid - 0.58209
train - 0.61120	| valid - 0.57463
Average accuracy on crossval is - 0.58160
Std is 0.02532
CPU times: user 655 ms, sys: 2.19 s, total: 2.85 s
Wall time: 179 ms


In [113]:
%%time
logreg = LogisticRegression(
  random_state=random_state,
  fit_intercept=False,
  penalty='none',
  solver='lbfgs',
  max_iter=1000
)

cross_val_print(logreg, X_train, y_train, 10)

train - 0.66694	| valid - 0.63704
train - 0.65787	| valid - 0.65926
train - 0.66694	| valid - 0.57778
train - 0.66529	| valid - 0.62963
train - 0.66694	| valid - 0.62222
train - 0.65952	| valid - 0.57778
train - 0.65045	| valid - 0.69630
train - 0.68425	| valid - 0.61481
train - 0.66474	| valid - 0.62687
train - 0.65651	| valid - 0.60448
Average accuracy on crossval is - 0.62462
Std is 0.03379
CPU times: user 15.9 s, sys: 49.5 s, total: 1min 5s
Wall time: 4.06 s


## 3. SVM regularization

### a. Default regularization

1. Train a baseline model with the only parameters `probability=True`, `kernel='linear'`, `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [114]:
%%time
svc = SVC(
  random_state=random_state,
  probability=True,
  kernel='linear',
)

cross_val_print(svc, X_train, y_train, 10)

train - 0.70486	| valid - 0.65926
train - 0.69662	| valid - 0.75556
train - 0.69415	| valid - 0.62222
train - 0.70239	| valid - 0.65185
train - 0.69085	| valid - 0.65185
train - 0.68920	| valid - 0.64444
train - 0.69250	| valid - 0.72593
train - 0.70074	| valid - 0.62222
train - 0.69605	| valid - 0.61940
train - 0.71087	| valid - 0.63433
Average accuracy on crossval is - 0.65871
Std is 0.04359
CPU times: user 3.11 s, sys: 918 ms, total: 4.03 s
Wall time: 2.85 s


### b. Optimizing regularization parameters

1. In the cells below try different values of the parameter `C`.

In [115]:
%%time
svc = SVC(
  random_state=random_state,
  probability=True,
  kernel='linear',
  C=0.1,
)

cross_val_print(svc, X_train, y_train, 10)

train - 0.58120	| valid - 0.55556
train - 0.57543	| valid - 0.56296
train - 0.57378	| valid - 0.57037
train - 0.59275	| valid - 0.57037
train - 0.58120	| valid - 0.54815
train - 0.57955	| valid - 0.54815
train - 0.57296	| valid - 0.61481
train - 0.59192	| valid - 0.54815
train - 0.59967	| valid - 0.52985
train - 0.57825	| valid - 0.57463
Average accuracy on crossval is - 0.56230
Std is 0.02177
CPU times: user 3.06 s, sys: 26.1 ms, total: 3.09 s
Wall time: 3.05 s


In [116]:
%%time
svc = SVC(
  random_state=random_state,
  probability=True,
  kernel='linear',
  C=10,
)

cross_val_print(svc, X_train, y_train, 10)

train - 0.75021	| valid - 0.72593
train - 0.77741	| valid - 0.82963
train - 0.78566	| valid - 0.68148
train - 0.76834	| valid - 0.73333
train - 0.75185	| valid - 0.77778
train - 0.75598	| valid - 0.68889
train - 0.76257	| valid - 0.74074
train - 0.77411	| valid - 0.68889
train - 0.78254	| valid - 0.71642
train - 0.78418	| valid - 0.69403
Average accuracy on crossval is - 0.72771
Std is 0.04417
CPU times: user 3.94 s, sys: 0 ns, total: 3.94 s
Wall time: 3.98 s


In [117]:
%%time
svc = SVC(
  random_state=random_state,
  probability=True,
  kernel='linear',
  C=100,
)

cross_val_print(svc, X_train, y_train, 10)

train - 0.78401	| valid - 0.74815
train - 0.79720	| valid - 0.84444
train - 0.80956	| valid - 0.72593
train - 0.79060	| valid - 0.76296
train - 0.79060	| valid - 0.77778
train - 0.79637	| valid - 0.74815
train - 0.78401	| valid - 0.77037
train - 0.80462	| valid - 0.73333
train - 0.79819	| valid - 0.70896
train - 0.79901	| valid - 0.73881
Average accuracy on crossval is - 0.75589
Std is 0.03550
CPU times: user 13.5 s, sys: 0 ns, total: 13.5 s
Wall time: 13.6 s


## 4. Tree

### a. Default regularization

1. Train a baseline model with the only parameter `max_depth=10` and `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [118]:
%%time
tree = DecisionTreeClassifier(
  random_state=random_state,
  max_depth=10
)

cross_val_print(tree, X_train, y_train, 10)

train - 0.81039	| valid - 0.74074
train - 0.77741	| valid - 0.74074
train - 0.83347	| valid - 0.70370
train - 0.79720	| valid - 0.76296
train - 0.82440	| valid - 0.75556
train - 0.80379	| valid - 0.68889
train - 0.80709	| valid - 0.76296
train - 0.80132	| valid - 0.65926
train - 0.80807	| valid - 0.75373
train - 0.80478	| valid - 0.68657
Average accuracy on crossval is - 0.72551
Std is 0.03562
CPU times: user 77.1 ms, sys: 0 ns, total: 77.1 ms
Wall time: 75.5 ms


### b. Optimizing regularization parameters

1. In the cells below try different values of the parameter `max_depth`.
2. As a bonus, play with other regularization parameters trying to find the best combination.

In [119]:
%%time
tree = DecisionTreeClassifier(
  random_state=random_state,
  max_depth=1
)

cross_val_print(tree, X_train, y_train, 10)

train - 0.35367	| valid - 0.37037
train - 0.35449	| valid - 0.36296
train - 0.35614	| valid - 0.34815
train - 0.35449	| valid - 0.36296
train - 0.35532	| valid - 0.35556
train - 0.35367	| valid - 0.37037
train - 0.35532	| valid - 0.35556
train - 0.35614	| valid - 0.34815
train - 0.35667	| valid - 0.34328
train - 0.35750	| valid - 0.33582
Average accuracy on crossval is - 0.35532
Std is 0.01094
CPU times: user 62.5 ms, sys: 0 ns, total: 62.5 ms
Wall time: 60.7 ms


In [120]:
%%time
tree = DecisionTreeClassifier(
  random_state=random_state,
  max_depth=2
)

cross_val_print(tree, X_train, y_train, 10)

train - 0.43034	| valid - 0.47407
train - 0.43281	| valid - 0.45185
train - 0.44023	| valid - 0.38519
train - 0.43034	| valid - 0.47407
train - 0.43776	| valid - 0.40741
train - 0.43364	| valid - 0.44444
train - 0.43364	| valid - 0.44444
train - 0.43776	| valid - 0.40741
train - 0.41763	| valid - 0.38806
train - 0.43657	| valid - 0.41791
Average accuracy on crossval is - 0.42949
Std is 0.03116
CPU times: user 62.3 ms, sys: 0 ns, total: 62.3 ms
Wall time: 60.8 ms


In [121]:
%%time
tree = DecisionTreeClassifier(
  random_state=random_state,
  max_depth=4
)

cross_val_print(tree, X_train, y_train, 10)

train - 0.53998	| valid - 0.48148
train - 0.52679	| valid - 0.52593
train - 0.54493	| valid - 0.47407
train - 0.54163	| valid - 0.52593
train - 0.54493	| valid - 0.57037
train - 0.52679	| valid - 0.51111
train - 0.52844	| valid - 0.50370
train - 0.53669	| valid - 0.48889
train - 0.53624	| valid - 0.54478
train - 0.54613	| valid - 0.46269
Average accuracy on crossval is - 0.50889
Std is 0.03190
CPU times: user 78.6 ms, sys: 0 ns, total: 78.6 ms
Wall time: 77.1 ms


In [122]:
%%time
tree = DecisionTreeClassifier(
  random_state=random_state,
  max_depth=10,
  min_samples_leaf=5
)

cross_val_print(tree, X_train, y_train, 10)

train - 0.75515	| valid - 0.71111
train - 0.71476	| valid - 0.66667
train - 0.78153	| valid - 0.66667
train - 0.75268	| valid - 0.73333
train - 0.76752	| valid - 0.73333
train - 0.75268	| valid - 0.67407
train - 0.74608	| valid - 0.70370
train - 0.74279	| valid - 0.61481
train - 0.75206	| valid - 0.68657
train - 0.74465	| valid - 0.64179
Average accuracy on crossval is - 0.68321
Std is 0.03641
CPU times: user 77.2 ms, sys: 0 ns, total: 77.2 ms
Wall time: 75 ms


In [123]:
%%time
tree = DecisionTreeClassifier(
  random_state=random_state,
  max_depth=20
)

cross_val_print(tree, X_train, y_train, 10)

train - 0.98846	| valid - 0.86667
train - 0.99011	| valid - 0.91111
train - 0.98681	| valid - 0.85926
train - 0.98763	| valid - 0.91111
train - 0.98928	| valid - 0.88148
train - 0.98186	| valid - 0.85926
train - 0.98846	| valid - 0.91852
train - 0.99176	| valid - 0.89630
train - 0.99094	| valid - 0.88060
train - 0.98847	| valid - 0.88060
Average accuracy on crossval is - 0.88649
Std is 0.02075
CPU times: user 79.9 ms, sys: 0 ns, total: 79.9 ms
Wall time: 77.7 ms


## 5. Random forest

### a. Default regularization

1. Train a baseline model with the only parameters `n_estimators=50`, `max_depth=14`, `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [124]:
%%time
base_forest = RandomForestClassifier(
  random_state=random_state,
  n_estimators=50,
  max_depth=14,
)

cross_val_print(base_forest, X_train, y_train, 10)

train - 0.96455	| valid - 0.88148
train - 0.96208	| valid - 0.91852
train - 0.96785	| valid - 0.86667
train - 0.96455	| valid - 0.89630
train - 0.96538	| valid - 0.91111
train - 0.96538	| valid - 0.88148
train - 0.97115	| valid - 0.91852
train - 0.96867	| valid - 0.85185
train - 0.97364	| valid - 0.88060
train - 0.97941	| valid - 0.86567
Average accuracy on crossval is - 0.88722
Std is 0.02204
CPU times: user 745 ms, sys: 0 ns, total: 745 ms
Wall time: 739 ms


### b. Optimizing regularization parameters

1. In the new cells try different values of the parameters `max_depth` and `n_estimators`.
2. As a bonus, play with other regularization parameters trying to find the best combination.

In [125]:
%%time
forest = RandomForestClassifier(
  random_state=random_state,
  n_estimators=10,
  max_depth=10,
)

cross_val_print(forest, X_train, y_train, 10)

train - 0.82193	| valid - 0.77778
train - 0.80709	| valid - 0.79259
train - 0.87552	| valid - 0.77037
train - 0.87552	| valid - 0.80741
train - 0.84254	| valid - 0.83704
train - 0.83182	| valid - 0.73333
train - 0.84336	| valid - 0.77778
train - 0.84336	| valid - 0.75556
train - 0.86244	| valid - 0.77612
train - 0.85667	| valid - 0.77612
Average accuracy on crossval is - 0.78041
Std is 0.02659
CPU times: user 187 ms, sys: 0 ns, total: 187 ms
Wall time: 185 ms


In [126]:
%%time
forest = RandomForestClassifier(
  random_state=random_state,
  n_estimators=20,
  max_depth=1,
)

cross_val_print(forest, X_train, y_train, 10)

train - 0.39571	| valid - 0.40741
train - 0.42127	| valid - 0.42963
train - 0.44106	| valid - 0.37037
train - 0.41632	| valid - 0.46667
train - 0.43693	| valid - 0.42222
train - 0.42292	| valid - 0.40000
train - 0.41880	| valid - 0.44444
train - 0.42292	| valid - 0.40000
train - 0.38715	| valid - 0.39552
train - 0.42092	| valid - 0.40299
Average accuracy on crossval is - 0.41392
Std is 0.02610
CPU times: user 256 ms, sys: 0 ns, total: 256 ms
Wall time: 253 ms


In [127]:
%%time
forest = RandomForestClassifier(
  random_state=random_state,
  n_estimators=25,
  max_depth=14,
)

cross_val_print(forest, X_train, y_train, 10)

train - 0.96373	| valid - 0.88148
train - 0.94312	| valid - 0.89630
train - 0.95960	| valid - 0.84444
train - 0.95796	| valid - 0.88889
train - 0.96620	| valid - 0.90370
train - 0.96373	| valid - 0.85926
train - 0.96373	| valid - 0.90370
train - 0.95960	| valid - 0.85926
train - 0.96211	| valid - 0.88806
train - 0.96293	| valid - 0.87313
Average accuracy on crossval is - 0.87982
Std is 0.01925
CPU times: user 409 ms, sys: 0 ns, total: 409 ms
Wall time: 405 ms


In [128]:
%%time
forest = RandomForestClassifier(
  random_state=random_state,
  n_estimators=5,
  max_depth=100,
)

cross_val_print(forest, X_train, y_train, 10)

train - 0.98681	| valid - 0.86667
train - 0.98186	| valid - 0.93333
train - 0.98763	| valid - 0.88148
train - 0.98763	| valid - 0.86667
train - 0.98516	| valid - 0.91852
train - 0.98763	| valid - 0.88889
train - 0.98846	| valid - 0.91111
train - 0.98186	| valid - 0.85926
train - 0.98105	| valid - 0.90299
train - 0.98435	| valid - 0.88060
Average accuracy on crossval is - 0.89095
Std is 0.02344
CPU times: user 148 ms, sys: 0 ns, total: 148 ms
Wall time: 146 ms


## 6. Predictions

1. Choose the best model and use it to make predictions for the test dataset.
2. Calculate the final accuracy.
3. Analyze: for which weekday your model makes the most errors (in % of the total number of samples of that class in your test dataset).
4. Save the model.

In [129]:
forest.fit(X_train, y_train)
predict = forest.predict(X_test)
accuracy_score(y_test, predict)

0.908284023668639

In [150]:
predict_df = pd.DataFrame(
  {
  'dayofweek': y_test,
  'predict': predict,
  'diff' : y_test != predict
  }
)
predict_df

Unnamed: 0,dayofweek,predict,diff
1087,1,1,False
16,5,5,False
563,6,6,False
1381,3,3,False
1199,2,2,False
...,...,...,...
1411,3,3,False
1079,1,1,False
1222,2,2,False
1064,1,1,False


In [152]:
errors = predict_df.groupby('dayofweek').sum()
errors['errors'] = errors['diff'] / errors['predict'] * 100
errors.drop(['diff', 'predict'], axis=1).sort_values('errors', ascending=False)

Unnamed: 0_level_0,errors
dayofweek,Unnamed: 1_level_1
0,21.428571
1,9.677419
2,5.172414
4,3.529412
5,2.651515
3,1.746725
6,0.471698


In [132]:
joblib.dump(forest, 'model.joblib')

['model.joblib']