# Day 09. Exercise 00
# Regularization

## 0. Imports

In [3]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

## 1. Preprocessing

1. Read the file `dayofweek.csv` that you used in the previous day to a dataframe.
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test`. Use the additional parameter `stratify`.

In [4]:
df = pd.read_csv("../data/dayofweek.csv")
X = df.drop(columns='dayofweek')
y = df['dayofweek']

In [5]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=21,
    stratify=y
)

## 2. Logreg regularization

### a. Default regularization

1. Train a baseline model with the only parameters `random_state=21`, `fit_intercept=False`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model


The result of the code where you trained and evaluated the baseline model should be exactly like this (use `%%time` to get the info about how long it took to run the cell):

```
train -  0.62902   |   valid -  0.59259
train -  0.64633   |   valid -  0.62963
train -  0.63479   |   valid -  0.56296
train -  0.65622   |   valid -  0.61481
train -  0.63397   |   valid -  0.57778
train -  0.64056   |   valid -  0.59259
train -  0.64138   |   valid -  0.65926
train -  0.65952   |   valid -  0.56296
train -  0.64333   |   valid -  0.59701
train -  0.63674   |   valid -  0.62687
Average accuracy on crossval is 0.60165
Std is 0.02943
```

In [6]:
%%time
logreg_model = LogisticRegression(random_state=21, fit_intercept=False)

CPU times: user 9 μs, sys: 5 μs, total: 14 μs
Wall time: 13.8 μs


In [7]:
%%time
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=21)

accuracies = []
for train_idx, valid_idx in skf.split(X, y):
    X_train_fold, X_valid_fold = X.iloc[train_idx], X.iloc[valid_idx]
    y_train_fold, y_valid_fold = y.iloc[train_idx], y.iloc[valid_idx]
    
    logreg_model.fit(X_train_fold, y_train_fold)

    y_train_pred = logreg_model.predict(X_train_fold)
    y_valid_pred = logreg_model.predict(X_valid_fold)

    acc_train = accuracy_score(y_train_fold, y_train_pred)
    acc_valid = accuracy_score(y_valid_fold, y_valid_pred)
    accuracies.append(acc_valid)

    print(f"train -  {acc_train:.5f}   |   valid -  {acc_valid:.5f}")

print(f"Average accuracy on crossval is {np.mean(accuracies):.5f}")
print(f"Std is {np.std(accuracies):.5f}")

train -  0.63546   |   valid -  0.65089
train -  0.65326   |   valid -  0.60947
train -  0.63942   |   valid -  0.63314
train -  0.63283   |   valid -  0.57988
train -  0.65590   |   valid -  0.57988
train -  0.64535   |   valid -  0.62130
train -  0.63834   |   valid -  0.60714
train -  0.63702   |   valid -  0.59524
train -  0.64295   |   valid -  0.68452
train -  0.63900   |   valid -  0.56548
Average accuracy on crossval is 0.61269
Std is 0.03441
CPU times: user 92.7 ms, sys: 3.81 ms, total: 96.5 ms
Wall time: 98.6 ms


### b. Optimizing regularization parameters

1. In the cells below try different values of penalty: `none`, `l1`, `l2` – you can change the values of solver too.

In [8]:
%%time
penalty = None
solver = 'saga'

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=21)

accuracies = []
for train_idx, valid_idx in skf.split(X, y):
    X_train_fold, X_valid_fold = X.iloc[train_idx], X.iloc[valid_idx]
    y_train_fold, y_valid_fold = y.iloc[train_idx], y.iloc[valid_idx]
    
    logreg_model = LogisticRegression(
        penalty=penalty,
        solver=solver,
        fit_intercept=True,
        random_state=21,
        max_iter=10000
    )
    logreg_model.fit(X_train_fold, y_train_fold)

    y_train_pred = logreg_model.predict(X_train_fold)
    y_valid_pred = logreg_model.predict(X_valid_fold)

    acc_train = accuracy_score(y_train_fold, y_train_pred)
    acc_valid = accuracy_score(y_valid_fold, y_valid_pred)
    accuracies.append(acc_valid)

    print(f"train -  {acc_train:.5f}   |   valid -  {acc_valid:.5f}")

print(f"Average accuracy on crossval is {np.mean(accuracies):.5f}")
print(f"Std is {np.std(accuracies):.5f}")

train -  0.66777   |   valid -  0.68639
train -  0.66051   |   valid -  0.64497
train -  0.66183   |   valid -  0.68639
train -  0.67106   |   valid -  0.59763
train -  0.67436   |   valid -  0.58580
train -  0.66051   |   valid -  0.63314
train -  0.67062   |   valid -  0.61905
train -  0.65547   |   valid -  0.61310
train -  0.65744   |   valid -  0.67857
train -  0.66469   |   valid -  0.58333
Average accuracy on crossval is 0.63284
Std is 0.03802
CPU times: user 20.3 s, sys: 269 ms, total: 20.6 s
Wall time: 20.8 s


In [9]:
%%time
penalty = 'l1'
solver = 'liblinear'

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=21)

accuracies = []
for train_idx, valid_idx in skf.split(X, y):
    X_train_fold, X_valid_fold = X.iloc[train_idx], X.iloc[valid_idx]
    y_train_fold, y_valid_fold = y.iloc[train_idx], y.iloc[valid_idx]
    
    logreg_model = LogisticRegression(
        penalty=penalty,
        solver=solver,
        fit_intercept=True,
        random_state=21,
        max_iter=10000
    )
    logreg_model.fit(X_train_fold, y_train_fold)

    y_train_pred = logreg_model.predict(X_train_fold)
    y_valid_pred = logreg_model.predict(X_valid_fold)

    acc_train = accuracy_score(y_train_fold, y_train_pred)
    acc_valid = accuracy_score(y_valid_fold, y_valid_pred)
    accuracies.append(acc_valid)

    print(f"train -  {acc_train:.5f}   |   valid -  {acc_valid:.5f}")

print(f"Average accuracy on crossval is {np.mean(accuracies):.5f}")
print(f"Std is {np.std(accuracies):.5f}")

train -  0.63546   |   valid -  0.63905
train -  0.63217   |   valid -  0.60947
train -  0.63217   |   valid -  0.65089
train -  0.62887   |   valid -  0.58580
train -  0.64140   |   valid -  0.56213
train -  0.62755   |   valid -  0.59763
train -  0.62582   |   valid -  0.59524
train -  0.61792   |   valid -  0.54762
train -  0.61792   |   valid -  0.63690
train -  0.62978   |   valid -  0.56548
Average accuracy on crossval is 0.59902
Std is 0.03340
CPU times: user 308 ms, sys: 4.92 ms, total: 313 ms
Wall time: 313 ms


In [10]:
%%time
penalty = 'l2'
solver = 'lbfgs'

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=21)

accuracies = []
for train_idx, valid_idx in skf.split(X, y):
    X_train_fold, X_valid_fold = X.iloc[train_idx], X.iloc[valid_idx]
    y_train_fold, y_valid_fold = y.iloc[train_idx], y.iloc[valid_idx]
    
    logreg_model = LogisticRegression(
        penalty=penalty,
        solver=solver,
        fit_intercept=True,
        random_state=21,
        max_iter=10000
    )
    logreg_model.fit(X_train_fold, y_train_fold)

    y_train_pred = logreg_model.predict(X_train_fold)
    y_valid_pred = logreg_model.predict(X_valid_fold)

    acc_train = accuracy_score(y_train_fold, y_train_pred)
    acc_valid = accuracy_score(y_valid_fold, y_valid_pred)
    accuracies.append(acc_valid)

    print(f"train -  {acc_train:.5f}   |   valid -  {acc_valid:.5f}")

print(f"Average accuracy on crossval is {np.mean(accuracies):.5f}")
print(f"Std is {np.std(accuracies):.5f}")

train -  0.63744   |   valid -  0.65680
train -  0.65524   |   valid -  0.62722
train -  0.64074   |   valid -  0.63905
train -  0.63678   |   valid -  0.59172
train -  0.65260   |   valid -  0.58580
train -  0.64601   |   valid -  0.61538
train -  0.64493   |   valid -  0.61310
train -  0.64097   |   valid -  0.58929
train -  0.64427   |   valid -  0.68452
train -  0.64097   |   valid -  0.56548
Average accuracy on crossval is 0.61684
Std is 0.03433
CPU times: user 125 ms, sys: 3.41 ms, total: 129 ms
Wall time: 129 ms


## 3. SVM regularization

### a. Default regularization

1. Train a baseline model with the only parameters `probability=True`, `kernel='linear'`, `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [11]:
svm_model = SVC(probability=True, random_state=21, kernel='linear')
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=21)
accuracies = []

for train_idx, valid_idx in skf.split(X, y):
    X_train_fold, X_valid_fold = X.iloc[train_idx], X.iloc[valid_idx]
    y_train_fold, y_valid_fold = y.iloc[train_idx], y.iloc[valid_idx]

    svm_model.fit(X_train_fold, y_train_fold)

    y_train_pred = svm_model.predict(X_train_fold)
    y_valid_pred = svm_model.predict(X_valid_fold)

    acc_train = accuracy_score(y_train_fold, y_train_pred)
    acc_valid = accuracy_score(y_valid_fold, y_valid_pred)
    accuracies.append(acc_valid)

    print(f"train -  {acc_train:.5f}   |   valid -  {acc_valid:.5f}")

print(f"Average accuracy on crossval is {np.mean(accuracies):.5f}")
print(f"Std is {np.std(accuracies):.5f}")

train -  0.70138   |   valid -  0.71598
train -  0.69677   |   valid -  0.68639
train -  0.70402   |   valid -  0.71006
train -  0.69941   |   valid -  0.63905
train -  0.71127   |   valid -  0.62130
train -  0.70336   |   valid -  0.69822
train -  0.69038   |   valid -  0.67857
train -  0.70487   |   valid -  0.69048
train -  0.69895   |   valid -  0.71429
train -  0.70026   |   valid -  0.61905
Average accuracy on crossval is 0.67734
Std is 0.03553


### b. Optimizing regularization parameters

1. In the cells below try different values of the parameter `C`.

In [12]:
svm_model = SVC(probability=True, random_state=21, kernel='linear', C=1.0)
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=21)
accuracies = []

for train_idx, valid_idx in skf.split(X, y):
    X_train_fold, X_valid_fold = X.iloc[train_idx], X.iloc[valid_idx]
    y_train_fold, y_valid_fold = y.iloc[train_idx], y.iloc[valid_idx]

    svm_model.fit(X_train_fold, y_train_fold)

    y_train_pred = svm_model.predict(X_train_fold)
    y_valid_pred = svm_model.predict(X_valid_fold)

    acc_train = accuracy_score(y_train_fold, y_train_pred)
    acc_valid = accuracy_score(y_valid_fold, y_valid_pred)
    accuracies.append(acc_valid)

    print(f"train -  {acc_train:.5f}   |   valid -  {acc_valid:.5f}")

print(f"Average accuracy on crossval is {np.mean(accuracies):.5f}")
print(f"Std is {np.std(accuracies):.5f}")

train -  0.70138   |   valid -  0.71598
train -  0.69677   |   valid -  0.68639
train -  0.70402   |   valid -  0.71006
train -  0.69941   |   valid -  0.63905
train -  0.71127   |   valid -  0.62130
train -  0.70336   |   valid -  0.69822
train -  0.69038   |   valid -  0.67857
train -  0.70487   |   valid -  0.69048
train -  0.69895   |   valid -  0.71429
train -  0.70026   |   valid -  0.61905
Average accuracy on crossval is 0.67734
Std is 0.03553


In [13]:
svm_model = SVC(probability=True, random_state=21, kernel='linear', C=10.0)
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=21)
accuracies = []

for train_idx, valid_idx in skf.split(X, y):
    X_train_fold, X_valid_fold = X.iloc[train_idx], X.iloc[valid_idx]
    y_train_fold, y_valid_fold = y.iloc[train_idx], y.iloc[valid_idx]

    svm_model.fit(X_train_fold, y_train_fold)

    y_train_pred = svm_model.predict(X_train_fold)
    y_valid_pred = svm_model.predict(X_valid_fold)

    acc_train = accuracy_score(y_train_fold, y_train_pred)
    acc_valid = accuracy_score(y_valid_fold, y_valid_pred)
    accuracies.append(acc_valid)

    print(f"train -  {acc_train:.5f}   |   valid -  {acc_valid:.5f}")

print(f"Average accuracy on crossval is {np.mean(accuracies):.5f}")
print(f"Std is {np.std(accuracies):.5f}")

train -  0.77521   |   valid -  0.75740
train -  0.77587   |   valid -  0.73964
train -  0.77983   |   valid -  0.75740
train -  0.78049   |   valid -  0.75148
train -  0.78510   |   valid -  0.69822
train -  0.76664   |   valid -  0.75740
train -  0.78195   |   valid -  0.77381
train -  0.78195   |   valid -  0.75595
train -  0.77800   |   valid -  0.78571
train -  0.76680   |   valid -  0.71429
Average accuracy on crossval is 0.74913
Std is 0.02470


In [14]:
svm_model = SVC(probability=True, random_state=21, kernel='linear', C=100.0)
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=21)
accuracies = []

for train_idx, valid_idx in skf.split(X, y):
    X_train_fold, X_valid_fold = X.iloc[train_idx], X.iloc[valid_idx]
    y_train_fold, y_valid_fold = y.iloc[train_idx], y.iloc[valid_idx]

    svm_model.fit(X_train_fold, y_train_fold)

    y_train_pred = svm_model.predict(X_train_fold)
    y_valid_pred = svm_model.predict(X_valid_fold)

    acc_train = accuracy_score(y_train_fold, y_train_pred)
    acc_valid = accuracy_score(y_valid_fold, y_valid_pred)
    accuracies.append(acc_valid)

    print(f"train -  {acc_train:.5f}   |   valid -  {acc_valid:.5f}")

print(f"Average accuracy on crossval is {np.mean(accuracies):.5f}")
print(f"Std is {np.std(accuracies):.5f}")

train -  0.78840   |   valid -  0.75740
train -  0.79038   |   valid -  0.75740
train -  0.79895   |   valid -  0.76923
train -  0.79301   |   valid -  0.76331
train -  0.81147   |   valid -  0.71598
train -  0.78642   |   valid -  0.77515
train -  0.79051   |   valid -  0.77381
train -  0.80237   |   valid -  0.77976
train -  0.79974   |   valid -  0.76786
train -  0.80896   |   valid -  0.73214
Average accuracy on crossval is 0.75920
Std is 0.01921


## 4. Tree

### a. Default regularization

1. Train a baseline model with the only parameter `max_depth=10` and `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [15]:
tree_model = DecisionTreeClassifier(max_depth=10, random_state=21)
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=21)
accuracies = []

for train_idx, valid_idx in skf.split(X, y):
    X_train_fold, X_valid_fold = X.iloc[train_idx], X.iloc[valid_idx]
    y_train_fold, y_valid_fold = y.iloc[train_idx], y.iloc[valid_idx]

    tree_model.fit(X_train_fold, y_train_fold)

    y_train_pred = tree_model.predict(X_train_fold)
    y_valid_pred = tree_model.predict(X_valid_fold)

    acc_train = accuracy_score(y_train_fold, y_train_pred)
    acc_valid = accuracy_score(y_valid_fold, y_valid_pred)
    accuracies.append(acc_valid)

    print(f"train -  {acc_train:.5f}   |   valid -  {acc_valid:.5f}")

print(f"Average accuracy on crossval is {np.mean(accuracies):.5f}")
print(f"Std is {np.std(accuracies):.5f}")

train -  0.82004   |   valid -  0.79290
train -  0.82663   |   valid -  0.69822
train -  0.82927   |   valid -  0.76331
train -  0.81806   |   valid -  0.71598
train -  0.82268   |   valid -  0.74556
train -  0.80554   |   valid -  0.77515
train -  0.83333   |   valid -  0.75595
train -  0.81555   |   valid -  0.76786
train -  0.81225   |   valid -  0.77381
train -  0.81752   |   valid -  0.69048
Average accuracy on crossval is 0.74792
Std is 0.03306


### b. Optimizing regularization parameters

1. In the cells below try different values of the parameter `max_depth`.
2. As a bonus, play with other regularization parameters trying to find the best combination.

In [16]:
tree_model = DecisionTreeClassifier(max_depth=5, random_state=21, min_samples_split=5)
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=21)
accuracies = []

for train_idx, valid_idx in skf.split(X, y):
    X_train_fold, X_valid_fold = X.iloc[train_idx], X.iloc[valid_idx]
    y_train_fold, y_valid_fold = y.iloc[train_idx], y.iloc[valid_idx]

    tree_model.fit(X_train_fold, y_train_fold)

    y_train_pred = tree_model.predict(X_train_fold)
    y_valid_pred = tree_model.predict(X_valid_fold)

    acc_train = accuracy_score(y_train_fold, y_train_pred)
    acc_valid = accuracy_score(y_valid_fold, y_valid_pred)
    accuracies.append(acc_valid)

    print(f"train -  {acc_train:.5f}   |   valid -  {acc_valid:.5f}")

print(f"Average accuracy on crossval is {np.mean(accuracies):.5f}")
print(f"Std is {np.std(accuracies):.5f}")

train -  0.62096   |   valid -  0.62130
train -  0.61042   |   valid -  0.55030
train -  0.62426   |   valid -  0.59763
train -  0.60976   |   valid -  0.59172
train -  0.61305   |   valid -  0.53846
train -  0.60382   |   valid -  0.56213
train -  0.61199   |   valid -  0.61905
train -  0.61199   |   valid -  0.59524
train -  0.60738   |   valid -  0.64286
train -  0.62187   |   valid -  0.55952
Average accuracy on crossval is 0.58782
Std is 0.03255


In [17]:
tree_model = DecisionTreeClassifier(max_depth=30, random_state=21, min_samples_split=5)
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=21)
accuracies = []

for train_idx, valid_idx in skf.split(X, y):
    X_train_fold, X_valid_fold = X.iloc[train_idx], X.iloc[valid_idx]
    y_train_fold, y_valid_fold = y.iloc[train_idx], y.iloc[valid_idx]

    tree_model.fit(X_train_fold, y_train_fold)

    y_train_pred = tree_model.predict(X_train_fold)
    y_valid_pred = tree_model.predict(X_valid_fold)

    acc_train = accuracy_score(y_train_fold, y_train_pred)
    acc_valid = accuracy_score(y_valid_fold, y_valid_pred)
    accuracies.append(acc_valid)

    print(f"train -  {acc_train:.5f}   |   valid -  {acc_valid:.5f}")

print(f"Average accuracy on crossval is {np.mean(accuracies):.5f}")
print(f"Std is {np.std(accuracies):.5f}")

train -  0.98088   |   valid -  0.88166
train -  0.97495   |   valid -  0.82840
train -  0.97956   |   valid -  0.92308
train -  0.97693   |   valid -  0.88166
train -  0.97693   |   valid -  0.89941
train -  0.97561   |   valid -  0.92899
train -  0.97892   |   valid -  0.89881
train -  0.98155   |   valid -  0.88690
train -  0.97563   |   valid -  0.89286
train -  0.98221   |   valid -  0.84524
Average accuracy on crossval is 0.88670
Std is 0.02937


In [18]:
tree_model = DecisionTreeClassifier(max_depth=40, random_state=21, min_samples_split=10)
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=21)
accuracies = []

for train_idx, valid_idx in skf.split(X, y):
    X_train_fold, X_valid_fold = X.iloc[train_idx], X.iloc[valid_idx]
    y_train_fold, y_valid_fold = y.iloc[train_idx], y.iloc[valid_idx]

    tree_model.fit(X_train_fold, y_train_fold)

    y_train_pred = tree_model.predict(X_train_fold)
    y_valid_pred = tree_model.predict(X_valid_fold)

    acc_train = accuracy_score(y_train_fold, y_train_pred)
    acc_valid = accuracy_score(y_valid_fold, y_valid_pred)
    accuracies.append(acc_valid)

    print(f"train -  {acc_train:.5f}   |   valid -  {acc_valid:.5f}")

print(f"Average accuracy on crossval is {np.mean(accuracies):.5f}")
print(f"Std is {np.std(accuracies):.5f}")

train -  0.95649   |   valid -  0.86982
train -  0.94990   |   valid -  0.78107
train -  0.95122   |   valid -  0.91124
train -  0.94595   |   valid -  0.87574
train -  0.95320   |   valid -  0.86391
train -  0.94199   |   valid -  0.85207
train -  0.95125   |   valid -  0.90476
train -  0.94862   |   valid -  0.85119
train -  0.94928   |   valid -  0.90476
train -  0.95850   |   valid -  0.82143
Average accuracy on crossval is 0.86360
Std is 0.03837


In [19]:
tree_model = DecisionTreeClassifier(max_depth=50, random_state=21, min_samples_split=3, min_samples_leaf=3)
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=21)
accuracies = []

for train_idx, valid_idx in skf.split(X, y):
    X_train_fold, X_valid_fold = X.iloc[train_idx], X.iloc[valid_idx]
    y_train_fold, y_valid_fold = y.iloc[train_idx], y.iloc[valid_idx]

    tree_model.fit(X_train_fold, y_train_fold)

    y_train_pred = tree_model.predict(X_train_fold)
    y_valid_pred = tree_model.predict(X_valid_fold)

    acc_train = accuracy_score(y_train_fold, y_train_pred)
    acc_valid = accuracy_score(y_valid_fold, y_valid_pred)
    accuracies.append(acc_valid)

    print(f"train -  {acc_train:.5f}   |   valid -  {acc_valid:.5f}")

print(f"Average accuracy on crossval is {np.mean(accuracies):.5f}")
print(f"Std is {np.std(accuracies):.5f}")

train -  0.94265   |   valid -  0.86391
train -  0.94792   |   valid -  0.79290
train -  0.93935   |   valid -  0.88757
train -  0.93540   |   valid -  0.84024
train -  0.93606   |   valid -  0.85799
train -  0.93474   |   valid -  0.87574
train -  0.93874   |   valid -  0.88095
train -  0.94137   |   valid -  0.83333
train -  0.94005   |   valid -  0.88690
train -  0.93610   |   valid -  0.81548
Average accuracy on crossval is 0.85350
Std is 0.03058


In [20]:
tree_model = DecisionTreeClassifier(max_depth=50, random_state=21, min_samples_split=3, min_samples_leaf=3, max_features=10)
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=21)
accuracies = []

for train_idx, valid_idx in skf.split(X, y):
    X_train_fold, X_valid_fold = X.iloc[train_idx], X.iloc[valid_idx]
    y_train_fold, y_valid_fold = y.iloc[train_idx], y.iloc[valid_idx]

    tree_model.fit(X_train_fold, y_train_fold)

    y_train_pred = tree_model.predict(X_train_fold)
    y_valid_pred = tree_model.predict(X_valid_fold)

    acc_train = accuracy_score(y_train_fold, y_train_pred)
    acc_valid = accuracy_score(y_valid_fold, y_valid_pred)
    accuracies.append(acc_valid)

    print(f"train -  {acc_train:.5f}   |   valid -  {acc_valid:.5f}")

print(f"Average accuracy on crossval is {np.mean(accuracies):.5f}")
print(f"Std is {np.std(accuracies):.5f}")

train -  0.87343   |   valid -  0.78107
train -  0.87014   |   valid -  0.75148
train -  0.88794   |   valid -  0.81657
train -  0.85695   |   valid -  0.76923
train -  0.88662   |   valid -  0.81065
train -  0.83982   |   valid -  0.76923
train -  0.85573   |   valid -  0.77381
train -  0.87879   |   valid -  0.79167
train -  0.90184   |   valid -  0.84524
train -  0.86495   |   valid -  0.80952
Average accuracy on crossval is 0.79185
Std is 0.02687


## 5. Random forest

### a. Default regularization

1. Train a baseline model with the only parameters `n_estimators=50`, `max_depth=14`, `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [21]:
forest_model = RandomForestClassifier(n_estimators=50, max_depth=14, random_state=21)
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=21)
accuracies = []

for train_idx, valid_idx in skf.split(X, y):
    X_train_fold, X_valid_fold = X.iloc[train_idx], X.iloc[valid_idx]
    y_train_fold, y_valid_fold = y.iloc[train_idx], y.iloc[valid_idx]

    forest_model.fit(X_train_fold, y_train_fold)

    y_train_pred = forest_model.predict(X_train_fold)
    y_valid_pred = forest_model.predict(X_valid_fold)

    acc_train = accuracy_score(y_train_fold, y_train_pred)
    acc_valid = accuracy_score(y_valid_fold, y_valid_pred)
    accuracies.append(acc_valid)

    print(f"train -  {acc_train:.5f}   |   valid -  {acc_valid:.5f}")

print(f"Average accuracy on crossval is {np.mean(accuracies):.5f}")
print(f"Std is {np.std(accuracies):.5f}")

train -  0.97034   |   valid -  0.90533
train -  0.96704   |   valid -  0.87574
train -  0.96902   |   valid -  0.91124
train -  0.97429   |   valid -  0.89349
train -  0.96243   |   valid -  0.86982
train -  0.96638   |   valid -  0.94083
train -  0.97036   |   valid -  0.92262
train -  0.97036   |   valid -  0.91667
train -  0.96838   |   valid -  0.89881
train -  0.97563   |   valid -  0.88690
Average accuracy on crossval is 0.90214
Std is 0.02069


### b. Optimizing regularization parameters

1. In the new cells try different values of the parameters `max_depth` and `n_estimators`.
2. As a bonus, play with other regularization parameters trying to find the best combination.

In [22]:
forest_model = RandomForestClassifier(n_estimators=100, max_depth=30, random_state=21)
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=21)
accuracies = []

for train_idx, valid_idx in skf.split(X, y):
    X_train_fold, X_valid_fold = X.iloc[train_idx], X.iloc[valid_idx]
    y_train_fold, y_valid_fold = y.iloc[train_idx], y.iloc[valid_idx]

    forest_model.fit(X_train_fold, y_train_fold)

    y_train_pred = forest_model.predict(X_train_fold)
    y_valid_pred = forest_model.predict(X_valid_fold)

    acc_train = accuracy_score(y_train_fold, y_train_pred)
    acc_valid = accuracy_score(y_valid_fold, y_valid_pred)
    accuracies.append(acc_valid)

    print(f"train -  {acc_train:.5f}   |   valid -  {acc_valid:.5f}")

print(f"Average accuracy on crossval is {np.mean(accuracies):.5f}")
print(f"Std is {np.std(accuracies):.5f}")

train -  1.00000   |   valid -  0.92899
train -  1.00000   |   valid -  0.91716
train -  1.00000   |   valid -  0.94675
train -  1.00000   |   valid -  0.93491
train -  1.00000   |   valid -  0.92308
train -  1.00000   |   valid -  0.94675
train -  1.00000   |   valid -  0.94643
train -  1.00000   |   valid -  0.92262
train -  1.00000   |   valid -  0.93452
train -  1.00000   |   valid -  0.92262
Average accuracy on crossval is 0.93238
Std is 0.01068


In [23]:
forest_model = RandomForestClassifier(n_estimators=10, max_depth=3, random_state=21)
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=21)
accuracies = []

for train_idx, valid_idx in skf.split(X, y):
    X_train_fold, X_valid_fold = X.iloc[train_idx], X.iloc[valid_idx]
    y_train_fold, y_valid_fold = y.iloc[train_idx], y.iloc[valid_idx]

    forest_model.fit(X_train_fold, y_train_fold)

    y_train_pred = forest_model.predict(X_train_fold)
    y_valid_pred = forest_model.predict(X_valid_fold)

    acc_train = accuracy_score(y_train_fold, y_train_pred)
    acc_valid = accuracy_score(y_valid_fold, y_valid_pred)
    accuracies.append(acc_valid)

    print(f"train -  {acc_train:.5f}   |   valid -  {acc_valid:.5f}")

print(f"Average accuracy on crossval is {np.mean(accuracies):.5f}")
print(f"Std is {np.std(accuracies):.5f}")

train -  0.47858   |   valid -  0.47929
train -  0.47462   |   valid -  0.43195
train -  0.47660   |   valid -  0.46154
train -  0.50890   |   valid -  0.48521
train -  0.48583   |   valid -  0.43787
train -  0.48187   |   valid -  0.46746
train -  0.46838   |   valid -  0.48810
train -  0.47760   |   valid -  0.48810
train -  0.46245   |   valid -  0.45238
train -  0.48946   |   valid -  0.51786
Average accuracy on crossval is 0.47097
Std is 0.02474


In [24]:
forest_model = RandomForestClassifier(n_estimators=150, max_depth=30, random_state=21, bootstrap=True)
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=21)
accuracies = []

for train_idx, valid_idx in skf.split(X, y):
    X_train_fold, X_valid_fold = X.iloc[train_idx], X.iloc[valid_idx]
    y_train_fold, y_valid_fold = y.iloc[train_idx], y.iloc[valid_idx]

    forest_model.fit(X_train_fold, y_train_fold)

    y_train_pred = forest_model.predict(X_train_fold)
    y_valid_pred = forest_model.predict(X_valid_fold)

    acc_train = accuracy_score(y_train_fold, y_train_pred)
    acc_valid = accuracy_score(y_valid_fold, y_valid_pred)
    accuracies.append(acc_valid)

    print(f"train -  {acc_train:.5f}   |   valid -  {acc_valid:.5f}")

print(f"Average accuracy on crossval is {np.mean(accuracies):.5f}")
print(f"Std is {np.std(accuracies):.5f}")

train -  1.00000   |   valid -  0.92899
train -  1.00000   |   valid -  0.92308
train -  1.00000   |   valid -  0.94675
train -  1.00000   |   valid -  0.93491
train -  1.00000   |   valid -  0.92308
train -  1.00000   |   valid -  0.94675
train -  1.00000   |   valid -  0.95238
train -  1.00000   |   valid -  0.92262
train -  1.00000   |   valid -  0.93452
train -  1.00000   |   valid -  0.92262
Average accuracy on crossval is 0.93357
Std is 0.01088


## 6. Predictions

1. Choose the best model and use it to make predictions for the test dataset.
2. Calculate the final accuracy.
3. Analyze: for which weekday your model makes the most errors (in % of the total number of samples of that class in your test dataset).
4. Save the model.

In [25]:
forest_model = RandomForestClassifier(n_estimators=100, max_depth=30, random_state=21)
forest_model.fit(X_train, y_train)
y_pred = forest_model.predict(X_test)

In [26]:
acc = accuracy_score(y_test, y_pred)
acc

0.9378698224852071

In [27]:
errors = pd.DataFrame({
    'true': y_test,
    'predicted': y_pred
})
errors['is_error'] = errors['true'] != errors['predicted']
errors.head()

Unnamed: 0,true,predicted,is_error
1087,1,1,False
16,5,5,False
563,6,6,False
1381,3,3,False
1199,2,2,False


In [28]:
error_stats = (
    errors.groupby('true')['is_error'].agg(['sum', 'count']).assign(error_pct=lambda x: 100 * x['sum'] / x['count']).sort_values(by='error_pct', ascending=False)
)
error_stats

Unnamed: 0_level_0,sum,count,error_pct
true,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,7,27,25.925926
4,3,21,14.285714
2,2,30,6.666667
5,3,54,5.555556
1,3,55,5.454545
3,2,80,2.5
6,1,71,1.408451


ANSWER: больше всего ошибок в понедельник. На 2 месте пятница

In [29]:
joblib.dump(forest_model, 'forest_model.joblib')

['forest_model.joblib']