# Day 09. Exercise 00
# Regularization

## 0. Imports

In [68]:
import warnings
import joblib as jb
import pandas as pd
import numpy as np
from itertools import product
from sklearn.svm import SVC
from sklearn.exceptions import ConvergenceWarning
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

## 1. Preprocessing

1. Read the file `dayofweek.csv` that you used in the previous day to a dataframe.
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test`. Use the additional parameter `stratify`.

In [69]:
df = pd.read_csv("./data/dayofweek.csv", header=0, sep=',')
df.drop(axis=1, columns=["Unnamed: 0"], inplace=True)
df

Unnamed: 0,uid_user_0,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,uid_user_15,uid_user_16,uid_user_17,...,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1,numTrials,hour,dayofweek
0,-0.034462,-0.167478,-0.209673,-0.054538,-0.173011,-0.192095,-0.291448,-0.100924,-0.139094,-0.143461,...,-0.14771,-0.343565,-0.256397,-0.389409,-0.171184,-0.193748,0.879131,-0.788667,-2.562352,4
1,-0.034462,-0.167478,-0.209673,-0.054538,-0.173011,-0.192095,-0.291448,-0.100924,-0.139094,-0.143461,...,-0.14771,-0.343565,-0.256397,-0.389409,-0.171184,-0.193748,0.879131,-0.756764,-2.562352,4
2,-0.034462,-0.167478,-0.209673,-0.054538,-0.173011,-0.192095,-0.291448,-0.100924,-0.139094,-0.143461,...,-0.14771,-0.343565,-0.256397,-0.389409,-0.171184,-0.193748,0.879131,-0.724861,-2.562352,4
3,-0.034462,-0.167478,-0.209673,-0.054538,-0.173011,-0.192095,-0.291448,-0.100924,-0.139094,-0.143461,...,-0.14771,-0.343565,-0.256397,-0.389409,-0.171184,-0.193748,0.879131,-0.692958,-2.562352,4
4,-0.034462,-0.167478,-0.209673,-0.054538,-0.173011,-0.192095,-0.291448,-0.100924,-0.139094,-0.143461,...,-0.14771,-0.343565,-0.256397,-0.389409,-0.171184,-0.193748,0.879131,-0.661055,-2.562352,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1681,-0.034462,-0.167478,-0.209673,-0.054538,-0.173011,-0.192095,-0.291448,-0.100924,-0.139094,-0.143461,...,-0.14771,-0.343565,-0.256397,-0.389409,-0.171184,5.161332,-1.137487,-0.533442,0.945382,3
1682,-0.034462,5.970944,-0.209673,-0.054538,-0.173011,-0.192095,-0.291448,-0.100924,-0.139094,-0.143461,...,-0.14771,-0.343565,-0.256397,-0.389409,-0.171184,5.161332,-1.137487,-0.629151,0.945382,3
1683,-0.034462,5.970944,-0.209673,-0.054538,-0.173011,-0.192095,-0.291448,-0.100924,-0.139094,-0.143461,...,-0.14771,-0.343565,-0.256397,-0.389409,-0.171184,5.161332,-1.137487,-0.597248,0.945382,3
1684,-0.034462,5.970944,-0.209673,-0.054538,-0.173011,-0.192095,-0.291448,-0.100924,-0.139094,-0.143461,...,-0.14771,-0.343565,-0.256397,-0.389409,-0.171184,5.161332,-1.137487,-0.565345,0.945382,3


In [70]:
X_train, X_test, Y_train, Y_test = train_test_split(df.iloc[:, :(-1)], df.iloc[:, (-1)], test_size=0.2, random_state=21, stratify=df.iloc[:, (-1)])
X_train.reset_index(drop=True, inplace=True)
X_test.reset_index(drop=True, inplace=True)
Y_train.reset_index(drop=True, inplace=True)
Y_test.reset_index(drop=True, inplace=True)
per_train, per_test = (Y_train.value_counts().values * 100) / len(Y_train), (Y_test.value_counts().values * 100) / len(Y_test)

print(*[f'{per_train[i]:.2f}% {per_test[i]:.2f}%' for i in range(len(per_train))], sep='\n')

23.44% 23.67%
21.14% 21.01%
16.25% 16.27%
16.10% 15.98%
8.83% 8.88%
8.09% 7.99%
6.16% 6.21%


## 2. Logreg regularization

### a. Default regularization

1. Train a baseline model with the only parameters `random_state=21`, `fit_intercept=False`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model


The result of the code where you trained and evaluated the baseline model should be exactly like this (use `%%time` to get the info about how long it took to run the cell):

```
train -  0.62902   |   valid -  0.59259
train -  0.64633   |   valid -  0.62963
train -  0.63479   |   valid -  0.56296
train -  0.65622   |   valid -  0.61481
train -  0.63397   |   valid -  0.57778
train -  0.64056   |   valid -  0.59259
train -  0.64138   |   valid -  0.65926
train -  0.65952   |   valid -  0.56296
train -  0.64333   |   valid -  0.59701
train -  0.63674   |   valid -  0.62687
Average accuracy on crossval is 0.60165
Std is 0.02943
```

In [71]:
def check_model(model, n_splits, output_par=0):
    k_fold = KFold(n_splits=n_splits, shuffle=True, random_state=21)
    test_scores = list()
    
    for (x, y) in k_fold.split(df.iloc[:, :(-1)], df.iloc[:, (-1)]):
        Xy_train, Xy_test = df.iloc[x], df.iloc[y]

        model.fit(Xy_train.iloc[:, :(-1)], Xy_train.iloc[:, (-1)])

        train_predicts = model.predict(Xy_train.iloc[:, :(-1)])
        test_predicts = model.predict(Xy_test.iloc[:, :(-1)])
        train_score, test_score = accuracy_score(Xy_train.iloc[:, (-1)], train_predicts), accuracy_score(Xy_test.iloc[:, (-1)], test_predicts)

        if output_par:
            print(f"train -  {train_score:.5f}   |   valid -  {test_score:.5f}")
        test_scores.append(test_score)

    test_scores = np.array(test_scores)

    if output_par:
        print(f"Average accuracy on crossval is {test_scores.mean():.5f}")
        print(f"Std is {test_scores.std():.5f}")

In [72]:
%%timeit
model = LogisticRegression(random_state=21, fit_intercept=False)
check_model(model, 10)

The slowest run took 23.64 times longer than the fastest. This could mean that an intermediate result is being cached.
3.69 s ± 3.77 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [73]:
model = LogisticRegression(random_state=21, fit_intercept=False)
check_model(model, 10, 1)

train -  0.65590   |   valid -  0.73964
train -  0.66579   |   valid -  0.62722
train -  0.65722   |   valid -  0.62722
train -  0.65788   |   valid -  0.57396
train -  0.66183   |   valid -  0.66272
train -  0.66249   |   valid -  0.60947
train -  0.65481   |   valid -  0.63095
train -  0.65283   |   valid -  0.61310
train -  0.65613   |   valid -  0.60119
train -  0.66601   |   valid -  0.63690
Average accuracy on crossval is 0.63224
Std is 0.04217


### b. Optimizing regularization parameters

1. In the cells below try different values of penalty: `none`, `l1`, `l2` – you can change the values of solver too.

In [74]:
warnings.filterwarnings("ignore", category=ConvergenceWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

In [75]:
%%timeit
model = LogisticRegression(random_state=21, fit_intercept=False, penalty="l2", solver="saga")
check_model(model, 10)

1.28 s ± 151 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [76]:
model = LogisticRegression(random_state=21, fit_intercept=False, penalty="l2", solver="saga")
check_model(model, 10, 1)

train -  0.65194   |   valid -  0.72781
train -  0.66447   |   valid -  0.61538
train -  0.66249   |   valid -  0.63314
train -  0.66579   |   valid -  0.57396
train -  0.66381   |   valid -  0.66864
train -  0.66579   |   valid -  0.61538
train -  0.66140   |   valid -  0.62500
train -  0.65547   |   valid -  0.61905
train -  0.65876   |   valid -  0.60714
train -  0.66206   |   valid -  0.63095
Average accuracy on crossval is 0.63165
Std is 0.03909


In [77]:
%%timeit
model = LogisticRegression(random_state=21, fit_intercept=False, penalty="l2", solver="sag")
check_model(model, 10)

1.49 s ± 763 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [78]:
model = LogisticRegression(random_state=21, fit_intercept=False, penalty="l2", solver="sag")
check_model(model, 10, 1)

train -  0.65590   |   valid -  0.73964
train -  0.66513   |   valid -  0.63314
train -  0.66249   |   valid -  0.62722
train -  0.66183   |   valid -  0.57988
train -  0.66249   |   valid -  0.66864
train -  0.66645   |   valid -  0.61538
train -  0.65942   |   valid -  0.62500
train -  0.65547   |   valid -  0.60714
train -  0.66008   |   valid -  0.60119
train -  0.66271   |   valid -  0.63095
Average accuracy on crossval is 0.63282
Std is 0.04190


In [79]:
%%timeit
model = LogisticRegression(random_state=21, fit_intercept=False, penalty="l2", solver="lbfgs")
check_model(model, 10)

178 ms ± 4.2 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [80]:
model = LogisticRegression(random_state=21, fit_intercept=False, penalty="l2", solver="lbfgs")
check_model(model, 10, 1)

train -  0.65590   |   valid -  0.73964
train -  0.66579   |   valid -  0.62722
train -  0.65722   |   valid -  0.62722
train -  0.65788   |   valid -  0.57396
train -  0.66183   |   valid -  0.66272
train -  0.66249   |   valid -  0.60947
train -  0.65481   |   valid -  0.63095
train -  0.65283   |   valid -  0.61310
train -  0.65613   |   valid -  0.60119
train -  0.66601   |   valid -  0.63690
Average accuracy on crossval is 0.63224
Std is 0.04217


In [81]:
%%timeit
model = LogisticRegression(random_state=21, fit_intercept=False, penalty="l2", solver="newton-cg")
check_model(model, 10)

196 ms ± 5.18 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [82]:
model = LogisticRegression(random_state=21, fit_intercept=False, penalty="l2", solver="newton-cg")
check_model(model, 10, 1)

train -  0.65590   |   valid -  0.73964
train -  0.66579   |   valid -  0.62722
train -  0.65722   |   valid -  0.62722
train -  0.65788   |   valid -  0.57396
train -  0.66117   |   valid -  0.66272
train -  0.66315   |   valid -  0.60947
train -  0.65547   |   valid -  0.63690
train -  0.65349   |   valid -  0.61310
train -  0.65613   |   valid -  0.60119
train -  0.66535   |   valid -  0.63690
Average accuracy on crossval is 0.63283
Std is 0.04219


In [83]:
%%timeit
model = LogisticRegression(random_state=21, fit_intercept=False, penalty="l2", solver="liblinear")
check_model(model, 10)

411 ms ± 13.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [84]:
model = LogisticRegression(random_state=21, fit_intercept=False, penalty="l2", solver="liblinear")
check_model(model, 10, 1)

train -  0.58141   |   valid -  0.65680
train -  0.62162   |   valid -  0.56213
train -  0.58537   |   valid -  0.58580
train -  0.60646   |   valid -  0.51479
train -  0.57943   |   valid -  0.53254
train -  0.58866   |   valid -  0.60355
train -  0.59684   |   valid -  0.55952
train -  0.59750   |   valid -  0.53571
train -  0.58827   |   valid -  0.59524
train -  0.57839   |   valid -  0.54762
Average accuracy on crossval is 0.56937
Std is 0.03980


In [85]:
%%timeit
model = LogisticRegression(random_state=21, fit_intercept=False, penalty="l1", solver="saga")
check_model(model, 10)

1.8 s ± 30.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [86]:
model = LogisticRegression(random_state=21, fit_intercept=False, penalty="l1", solver="saga")
check_model(model, 10, 1)

train -  0.65194   |   valid -  0.73373
train -  0.66381   |   valid -  0.62722
train -  0.66183   |   valid -  0.62722
train -  0.65985   |   valid -  0.57396
train -  0.66249   |   valid -  0.66864
train -  0.66447   |   valid -  0.61538
train -  0.65744   |   valid -  0.60714
train -  0.65481   |   valid -  0.61310
train -  0.65679   |   valid -  0.60119
train -  0.65876   |   valid -  0.63690
Average accuracy on crossval is 0.63045
Std is 0.04160


In [87]:
%%timeit
model = LogisticRegression(random_state=21, fit_intercept=False, penalty="l1", solver="liblinear")
check_model(model, 10)

8.45 s ± 3.15 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [88]:
model = LogisticRegression(random_state=21, fit_intercept=False, penalty="l1", solver="liblinear")
check_model(model, 10, 1)

train -  0.53724   |   valid -  0.59172
train -  0.63678   |   valid -  0.59172
train -  0.55570   |   valid -  0.57396
train -  0.56493   |   valid -  0.48521
train -  0.56691   |   valid -  0.51479
train -  0.53988   |   valid -  0.53254
train -  0.54282   |   valid -  0.51786
train -  0.63636   |   valid -  0.61905
train -  0.54677   |   valid -  0.52381
train -  0.54414   |   valid -  0.50000
Average accuracy on crossval is 0.54507
Std is 0.04306


In [89]:
%%timeit
model = LogisticRegression(random_state=21, fit_intercept=False, penalty=None, solver="saga")
check_model(model, 10)

3.12 s ± 188 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [90]:
model = LogisticRegression(random_state=21, fit_intercept=False, penalty=None, solver="saga")
check_model(model, 10, 1)

train -  0.65326   |   valid -  0.73373
train -  0.66645   |   valid -  0.63314
train -  0.66315   |   valid -  0.63314
train -  0.66711   |   valid -  0.57988
train -  0.66579   |   valid -  0.67456
train -  0.66777   |   valid -  0.61538
train -  0.66206   |   valid -  0.63095
train -  0.65679   |   valid -  0.60714
train -  0.66008   |   valid -  0.60119
train -  0.66403   |   valid -  0.63095
Average accuracy on crossval is 0.63401
Std is 0.04085


In [91]:
%%timeit
model = LogisticRegression(random_state=21, fit_intercept=False, penalty=None, solver="sag")
check_model(model, 10)

2.56 s ± 185 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [92]:
model = LogisticRegression(random_state=21, fit_intercept=False, penalty=None, solver="sag")
check_model(model, 10, 1)

train -  0.65854   |   valid -  0.74556
train -  0.66645   |   valid -  0.63314
train -  0.66381   |   valid -  0.62130
train -  0.66249   |   valid -  0.57988
train -  0.66513   |   valid -  0.67456
train -  0.66777   |   valid -  0.61538
train -  0.66206   |   valid -  0.63095
train -  0.65613   |   valid -  0.61310
train -  0.66403   |   valid -  0.60119
train -  0.66601   |   valid -  0.63095
Average accuracy on crossval is 0.63460
Std is 0.04365


In [93]:
%%timeit
model = LogisticRegression(random_state=21, fit_intercept=False, penalty=None, solver="lbfgs")
check_model(model, 10)

3.99 s ± 632 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [94]:
model = LogisticRegression(random_state=21, fit_intercept=False, penalty=None, solver="lbfgs")
check_model(model, 10, 1)

train -  0.65722   |   valid -  0.75148
train -  0.66249   |   valid -  0.62722
train -  0.65656   |   valid -  0.62130
train -  0.66249   |   valid -  0.57396
train -  0.66777   |   valid -  0.66272
train -  0.66974   |   valid -  0.61538
train -  0.65876   |   valid -  0.62500
train -  0.65613   |   valid -  0.62500
train -  0.65876   |   valid -  0.60714
train -  0.67325   |   valid -  0.64286
Average accuracy on crossval is 0.63521
Std is 0.04442


In [95]:
%%timeit
model = LogisticRegression(random_state=21, fit_intercept=False, penalty=None, solver="newton-cg")
check_model(model, 10)

759 ms ± 12.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [96]:
model = LogisticRegression(random_state=21, fit_intercept=False, penalty=None, solver="newton-cg")
check_model(model, 10, 1)

train -  0.65854   |   valid -  0.75148
train -  0.66249   |   valid -  0.62722
train -  0.65722   |   valid -  0.62130
train -  0.66183   |   valid -  0.57396
train -  0.66711   |   valid -  0.66272
train -  0.66974   |   valid -  0.61538
train -  0.65744   |   valid -  0.61905
train -  0.65547   |   valid -  0.63095
train -  0.65876   |   valid -  0.60714
train -  0.67260   |   valid -  0.64286
Average accuracy on crossval is 0.63521
Std is 0.04450


## 3. SVM regularization

### a. Default regularization

1. Train a baseline model with the only parameters `probability=True`, `kernel='linear'`, `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [97]:
model = SVC(probability=True, kernel="linear", random_state=21)
check_model(model, 10, 1)

train -  0.76730   |   valid -  0.80473
train -  0.77983   |   valid -  0.72189
train -  0.77390   |   valid -  0.76923
train -  0.78115   |   valid -  0.71006
train -  0.76599   |   valid -  0.81065
train -  0.76730   |   valid -  0.77515
train -  0.77668   |   valid -  0.71429
train -  0.77075   |   valid -  0.70833
train -  0.78327   |   valid -  0.73810
train -  0.78129   |   valid -  0.70833
Average accuracy on crossval is 0.74608
Std is 0.03847


### b. Optimizing regularization parameters

1. In the cells below try different values of the parameter `C`.

In [98]:
%%timeit
model = SVC(probability=True, kernel="linear", random_state=21, C=50, max_iter=1000)
check_model(model, 10)

3.34 s ± 202 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [99]:
model = SVC(probability=True, kernel="linear", random_state=21, C=50, max_iter=1000)
check_model(model, 10, 1)

train -  0.33751   |   valid -  0.33136
train -  0.37047   |   valid -  0.30769
train -  0.29598   |   valid -  0.31361
train -  0.36322   |   valid -  0.30178
train -  0.44891   |   valid -  0.43195
train -  0.28609   |   valid -  0.31361
train -  0.35112   |   valid -  0.36310
train -  0.39723   |   valid -  0.38690
train -  0.41700   |   valid -  0.36310
train -  0.36759   |   valid -  0.33333
Average accuracy on crossval is 0.34464
Std is 0.03940


In [100]:
%%timeit
model = SVC(probability=True, kernel="linear", random_state=21, C=100, max_iter=1000)
check_model(model, 10)

3.31 s ± 78.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [101]:
model = SVC(probability=True, kernel="linear", random_state=21, C=100, max_iter=1000)
check_model(model, 10, 1)

train -  0.38036   |   valid -  0.40828
train -  0.31707   |   valid -  0.24260
train -  0.27093   |   valid -  0.28994
train -  0.33158   |   valid -  0.34911
train -  0.37179   |   valid -  0.41420
train -  0.29334   |   valid -  0.27219
train -  0.36825   |   valid -  0.29167
train -  0.31686   |   valid -  0.25595
train -  0.32806   |   valid -  0.30357
train -  0.41238   |   valid -  0.33929
Average accuracy on crossval is 0.31668
Std is 0.05668


In [102]:
%%timeit
model = SVC(probability=True, kernel="linear", random_state=21, C=0.1, max_iter=1000)
check_model(model, 10)

3.23 s ± 460 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [103]:
model = SVC(probability=True, kernel="linear", random_state=21, C=0.1, max_iter=1000)
check_model(model, 10, 1)

train -  0.68688   |   valid -  0.75148
train -  0.71523   |   valid -  0.64497
train -  0.70138   |   valid -  0.71006
train -  0.72775   |   valid -  0.62130
train -  0.70007   |   valid -  0.70414
train -  0.70929   |   valid -  0.71598
train -  0.69697   |   valid -  0.64286
train -  0.70883   |   valid -  0.63095
train -  0.69236   |   valid -  0.70833
train -  0.70751   |   valid -  0.62500
Average accuracy on crossval is 0.67551
Std is 0.04469


## 4. Tree

### a. Default regularization

1. Train a baseline model with the only parameter `max_depth=10` and `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [104]:
model = DecisionTreeClassifier(max_depth=10, random_state=21)
check_model(model, 10, 1)

train -  0.80883   |   valid -  0.78107
train -  0.82202   |   valid -  0.76331
train -  0.81740   |   valid -  0.73964
train -  0.82334   |   valid -  0.75740
train -  0.81279   |   valid -  0.78698
train -  0.82136   |   valid -  0.82249
train -  0.81423   |   valid -  0.72619
train -  0.81950   |   valid -  0.72024
train -  0.82411   |   valid -  0.76190
train -  0.82543   |   valid -  0.75595
Average accuracy on crossval is 0.76152
Std is 0.02869


### b. Optimizing regularization parameters

1. In the cells below try different values of the parameter `max_depth`.
2. As a bonus, play with other regularization parameters trying to find the best combination.

In [105]:
model = DecisionTreeClassifier(max_depth=12, random_state=21)
check_model(model, 10, 1)

train -  0.88662   |   valid -  0.86391
train -  0.88530   |   valid -  0.82249
train -  0.90112   |   valid -  0.81657
train -  0.89057   |   valid -  0.79290
train -  0.89651   |   valid -  0.85207
train -  0.89255   |   valid -  0.86982
train -  0.89328   |   valid -  0.81548
train -  0.90053   |   valid -  0.80357
train -  0.89987   |   valid -  0.82143
train -  0.88603   |   valid -  0.77381
Average accuracy on crossval is 0.82320
Std is 0.02918


In [106]:
model = DecisionTreeClassifier(random_state=21, max_depth=7)
check_model(model, 10, 1)

train -  0.69875   |   valid -  0.72781
train -  0.70798   |   valid -  0.68639
train -  0.71457   |   valid -  0.63905
train -  0.71918   |   valid -  0.65680
train -  0.70864   |   valid -  0.66272
train -  0.70534   |   valid -  0.70414
train -  0.71476   |   valid -  0.65476
train -  0.71739   |   valid -  0.63690
train -  0.70553   |   valid -  0.66071
train -  0.70883   |   valid -  0.64881
Average accuracy on crossval is 0.66781
Std is 0.02789


In [107]:
max_value, params = -1, [0, 0, 0]
for (a, b, c) in product(range(2, 21), repeat=3):
    model = DecisionTreeClassifier(random_state=21, max_depth=a, min_samples_split=b, min_samples_leaf=c)

    k_fold = KFold(n_splits=5, shuffle=True, random_state=21)
    test_scores = list()
    
    for (x, y) in k_fold.split(df.iloc[:, :(-1)], df.iloc[:, (-1)]):
        Xy_train, Xy_test = df.iloc[x], df.iloc[y]

        model.fit(Xy_train.iloc[:, :(-1)], Xy_train.iloc[:, (-1)])

        test_predicts = model.predict(Xy_test.iloc[:, :(-1)])
        test_score = accuracy_score(Xy_test.iloc[:, (-1)], test_predicts)
        test_scores.append(test_score)

    value = np.array(test_scores).mean()
    if value > max_value:
        params = [a, b, c]
        max_value = value

print(*params)
print(max_value)

20 6 2
0.8517163274981124


## 5. Random forest

### a. Default regularization

1. Train a baseline model with the only parameters `n_estimators=50`, `max_depth=14`, `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [108]:
model = RandomForestClassifier(n_estimators=50, max_depth=14, random_state=21)
check_model(model, 10, 1)

train -  0.96902   |   valid -  0.91716
train -  0.96572   |   valid -  0.94675
train -  0.96309   |   valid -  0.88757
train -  0.96572   |   valid -  0.83432
train -  0.97100   |   valid -  0.92308
train -  0.96770   |   valid -  0.92308
train -  0.97299   |   valid -  0.88690
train -  0.96772   |   valid -  0.85714
train -  0.96904   |   valid -  0.89881
train -  0.96706   |   valid -  0.88690
Average accuracy on crossval is 0.89617
Std is 0.03170


### b. Optimizing regularization parameters

1. In the new cells try different values of the parameters `max_depth` and `n_estimators`.
2. As a bonus, play with other regularization parameters trying to find the best combination.

In [109]:
model = RandomForestClassifier(n_estimators=55, max_depth=25, random_state=21)
check_model(model, 10, 1)

train -  1.00000   |   valid -  0.93491
train -  1.00000   |   valid -  0.94675
train -  1.00000   |   valid -  0.92899
train -  0.99934   |   valid -  0.89941
train -  0.99934   |   valid -  0.95266
train -  0.99868   |   valid -  0.96450
train -  1.00000   |   valid -  0.92262
train -  1.00000   |   valid -  0.91071
train -  0.99868   |   valid -  0.94048
train -  1.00000   |   valid -  0.91667
Average accuracy on crossval is 0.93177
Std is 0.01905


In [110]:
model = RandomForestClassifier(n_estimators=40, max_depth=20, random_state=21)
check_model(model, 10, 1)

train -  0.99802   |   valid -  0.94083
train -  0.99670   |   valid -  0.94675
train -  0.99934   |   valid -  0.92308
train -  0.99736   |   valid -  0.89349
train -  0.99539   |   valid -  0.92899
train -  0.99407   |   valid -  0.94675
train -  0.99605   |   valid -  0.89881
train -  0.99341   |   valid -  0.88095
train -  0.99802   |   valid -  0.92857
train -  0.99605   |   valid -  0.92262
Average accuracy on crossval is 0.92108
Std is 0.02168


In [111]:
model = RandomForestClassifier(n_estimators=100, max_depth=30, random_state=21)
check_model(model, 10, 1)

train -  1.00000   |   valid -  0.94675
train -  1.00000   |   valid -  0.94675
train -  1.00000   |   valid -  0.92899
train -  1.00000   |   valid -  0.89941
train -  1.00000   |   valid -  0.95858
train -  1.00000   |   valid -  0.97041
train -  1.00000   |   valid -  0.92262
train -  1.00000   |   valid -  0.89286
train -  1.00000   |   valid -  0.94048
train -  1.00000   |   valid -  0.92262
Average accuracy on crossval is 0.93295
Std is 0.02337


## 6. Predictions

1. Choose the best model and use it to make predictions for the test dataset.
2. Calculate the final accuracy.
3. Analyze: for which weekday your model makes the most errors (in % of the total number of samples of that class in your test dataset).
4. Save the model.

In [112]:
model = RandomForestClassifier(n_estimators=100, max_depth=30, random_state=21)
model.fit(X_train, Y_train)

predicts = model.predict(X_test)

In [113]:
final_accuracy = accuracy_score(Y_test, predicts)
final_accuracy

0.9349112426035503

In [114]:
is_right = pd.Series(data=[1 if Y_test.iloc[i] == predicts[i] else 0 for i in range(len(predicts))], name="is_right")
prediction = pd.concat([Y_test, is_right], axis=1, join="inner")
counted_data = prediction.groupby(by="dayofweek")["is_right"].count().reset_index(name="total_amount")

grouped_data = prediction.groupby(by="dayofweek")["is_right"].value_counts().reset_index(name="mistakes")
grouped_data = grouped_data[grouped_data["is_right"] == 0]
grouped_data.reset_index(drop=True, inplace=True)

final_data = pd.merge(grouped_data, counted_data, how="inner", on="dayofweek").drop(axis=1, columns=["is_right"])
final_data.set_index("dayofweek", inplace=True)

percents = (final_data["mistakes"] * 100) / final_data["total_amount"]
percents

dayofweek
0    25.925926
1     7.272727
2     6.666667
3     2.500000
4    14.285714
5     5.555556
6     1.408451
dtype: float64

For Monday my model makes most errors

In [115]:
jb.dump(model, "./best_model.joblib")

['./best_model.joblib']

# Day 09. Exercise 01
# Gridsearch

## 0. Imports

In [15]:
import pandas as pd
import warnings
import itertools
from tqdm.notebook import tqdm
from sklearn.exceptions import ConvergenceWarning
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

## 1. Preprocessing

1. Read the file [`day-of-week-not-scaled.csv`](https://drive.google.com/file/d/1AlGvsJDSzPT_70caausx8bFuupIEZkfh/view?usp=sharing). It is similar to the one from the previous exercise, but this time we did not scale continuous features (we are not going to use logreg anymore).
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test`. Use the additional parameter `stratify`.

In [16]:
data = pd.read_csv("./data/day-of-week-not-scaled.csv", header=0).drop(columns=["Unnamed: 0"], axis=1)
X_train, X_test, Y_train, Y_test = train_test_split(data.iloc[:, :(-1)], data.iloc[:, (-1)], test_size=0.2, random_state=21, stratify=data.iloc[:, (-1)])
X_train.reset_index(drop=True, inplace=True)
X_test.reset_index(drop=True, inplace=True)
Y_train.reset_index(drop=True, inplace=True)
Y_test.reset_index(drop=True, inplace=True)

data.head(10)

Unnamed: 0,uid_user_0,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,uid_user_15,uid_user_16,uid_user_17,...,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1,numTrials,hour,dayofweek
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,5,4
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,2,5,4
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,3,5,4
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,4,5,4
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,5,5,4
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,6,5,4
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,7,5,4
7,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,11,4
8,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,2,11,4
9,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,3,11,4


## 2. SVM gridsearch

1. Using `GridSearchCV` try different parameters of kernel (`linear`, `rbf`, `sigmoid`), C (`0.01`, `0.1`, `1`, `1.5`, `5`, `10`), gamma (`scale`, `auto`), class_weight (`balanced`, `None`) use `random_state=21` and `probability=True` and get the best combination of them in terms of accuracy.
2. Create a dataframe from the results of the gridsearch and sort it ascendingly by the `rank_test_score`. Check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [17]:
warnings.filterwarnings("ignore", category=ConvergenceWarning)

model = SVC(random_state=21, probability=True, max_iter=1000)
param_grid = {
    'kernel': ['linear', 'rbf', 'sigmoid'],
    'C': [0.01, 0.1, 1, 1.5, 5, 10],
    'gamma': ['scale', 'auto'],
    'class_weight': ['balanced', None]
}

SKF = StratifiedKFold(n_splits=10, shuffle=True, random_state=21)

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=SKF, scoring="accuracy", n_jobs=-1, refit=False)

grid_search.fit(data.iloc[:, :(-1)], data.iloc[:, -1])

print(grid_search.best_params_)

warnings.filterwarnings("default", category=ConvergenceWarning)

{'C': 10, 'class_weight': None, 'gamma': 'auto', 'kernel': 'rbf'}


In [18]:
SVM_results = pd.DataFrame(grid_search.cv_results_)[["param_class_weight", "param_C", "param_gamma", "param_kernel", "mean_test_score", "rank_test_score"]]
SVM_results.sort_values(by="rank_test_score", inplace=True)
SVM_results

Unnamed: 0,param_class_weight,param_C,param_gamma,param_kernel,mean_test_score,rank_test_score
70,,10.0,auto,rbf,0.906865,1
64,balanced,10.0,auto,rbf,0.897989,2
58,,5.0,auto,rbf,0.861200,3
52,balanced,5.0,auto,rbf,0.848760,4
40,balanced,1.5,auto,rbf,0.687423,5
...,...,...,...,...,...,...
53,balanced,5.0,auto,sigmoid,0.124588,68
17,balanced,0.1,auto,sigmoid,0.121503,69
41,balanced,1.5,auto,sigmoid,0.107315,70
29,balanced,1.0,auto,sigmoid,0.103163,71


Yes, there's a huge difference between various combinations

## 3. Decision tree

1. Using `GridSearchCV` try different parameters of `max_depth` (from `1` to `49`), `class_weight` (`balanced`, `None`) and `criterion` (`entropy` and `gini`) and get the best combination of them in terms of accuracy. Use `random_state=21`.
2. Create a dataframe from the results of the gridsearch and sort it ascendingly by the `rank_test_score`, check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [19]:
model = DecisionTreeClassifier(random_state=21)

param_grid = {
    "max_depth": range(1, 50),
    "class_weight": ["balanced", None],
    "criterion": ["entropy", "gini"]
}

SKF = StratifiedKFold(n_splits=10, shuffle=True, random_state=21)

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=SKF, scoring="accuracy", n_jobs=-1, refit=False)

grid_search.fit(data.iloc[:, :(-1)], data.iloc[:, -1])

print(grid_search.best_params_)

{'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 26}


In [20]:
DecisionTree_results = pd.DataFrame(grid_search.cv_results_).sort_values(by="rank_test_score")
DecisionTree_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_class_weight,param_criterion,param_max_depth,params,split0_test_score,split1_test_score,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
80,0.009698,0.000429,0.002588,0.000100,balanced,gini,32,"{'class_weight': 'balanced', 'criterion': 'gin...",0.911243,0.875740,...,0.881657,0.905325,0.946746,0.934524,0.904762,0.928571,0.857143,0.908654,0.028374,1
81,0.009278,0.000336,0.003038,0.001215,balanced,gini,33,"{'class_weight': 'balanced', 'criterion': 'gin...",0.911243,0.875740,...,0.881657,0.905325,0.946746,0.934524,0.904762,0.928571,0.857143,0.908654,0.028374,1
82,0.008701,0.000184,0.002374,0.000066,balanced,gini,34,"{'class_weight': 'balanced', 'criterion': 'gin...",0.911243,0.875740,...,0.881657,0.905325,0.946746,0.934524,0.904762,0.928571,0.857143,0.908654,0.028374,1
83,0.010385,0.002039,0.002862,0.001234,balanced,gini,35,"{'class_weight': 'balanced', 'criterion': 'gin...",0.911243,0.875740,...,0.881657,0.905325,0.946746,0.934524,0.904762,0.928571,0.857143,0.908654,0.028374,1
84,0.012018,0.003681,0.003334,0.001669,balanced,gini,36,"{'class_weight': 'balanced', 'criterion': 'gin...",0.911243,0.875740,...,0.881657,0.905325,0.946746,0.934524,0.904762,0.928571,0.857143,0.908654,0.028374,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1,0.004809,0.000654,0.001845,0.000261,balanced,entropy,2,"{'class_weight': 'balanced', 'criterion': 'ent...",0.349112,0.390533,...,0.420118,0.408284,0.378698,0.327381,0.339286,0.404762,0.446429,0.387289,0.036328,191
98,0.003671,0.000200,0.002216,0.000095,,entropy,1,"{'class_weight': None, 'criterion': 'entropy',...",0.366864,0.337278,...,0.366864,0.331361,0.396450,0.351190,0.351190,0.363095,0.363095,0.357650,0.017312,193
147,0.006425,0.003062,0.004089,0.002958,,gini,1,"{'class_weight': None, 'criterion': 'gini', 'm...",0.366864,0.337278,...,0.366864,0.331361,0.396450,0.351190,0.351190,0.363095,0.363095,0.357650,0.017312,193
0,0.008338,0.002781,0.002976,0.000398,balanced,entropy,1,"{'class_weight': 'balanced', 'criterion': 'ent...",0.260355,0.248521,...,0.355030,0.313609,0.254438,0.255952,0.285714,0.309524,0.154762,0.276927,0.053385,195


Yes, there's a huge difference between various combinations`

## 4. Random forest

1. Using `GridSearchCV` try different parameters of `n_estimators` (`5`, `10`, `50`, `100`), `max_depth` (from `1` to `49`), `class_weight` (`balanced`, `None`) and `criterion` (`entropy` and `gini`) and get the best combination of them in terms of accuracy. Use random_state=21.
2. Create a dataframe from the results of the gridsearch and sort it ascendengly by the `rank_test_score`, check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [21]:
model = RandomForestClassifier(random_state=21)

param_grid = {
    "n_estimators": [5, 10, 50, 100],
    "max_depth": range(1, 50),
    "class_weight": ["balanced", None],
    "criterion": ["entropy", "gini"]
}

SKF = StratifiedKFold(n_splits=10, shuffle=True, random_state=21)

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=SKF, refit=False, n_jobs=-1, scoring="accuracy")

grid_search.fit(data.iloc[:, :(-1)], data.iloc[:, -1])

print(grid_search.best_params_)

{'class_weight': None, 'criterion': 'entropy', 'max_depth': 32, 'n_estimators': 50}


In [22]:
RandomForest_results = pd.DataFrame(grid_search.cv_results_).sort_values(by="rank_test_score")
RandomForest_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_class_weight,param_criterion,param_max_depth,param_n_estimators,params,split0_test_score,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
518,0.153008,0.003200,0.006435,0.000505,,entropy,32,50,"{'class_weight': None, 'criterion': 'entropy',...",0.940828,...,0.934911,0.923077,0.952663,0.952381,0.934524,0.928571,0.910714,0.934749,0.012211,1
106,0.173429,0.007489,0.008407,0.003078,balanced,entropy,27,50,"{'class_weight': 'balanced', 'criterion': 'ent...",0.940828,...,0.928994,0.917160,0.940828,0.958333,0.934524,0.934524,0.922619,0.934172,0.010722,2
742,0.235288,0.040044,0.010427,0.004687,,gini,39,50,"{'class_weight': None, 'criterion': 'gini', 'm...",0.928994,...,0.928994,0.911243,0.952663,0.958333,0.934524,0.934524,0.922619,0.934172,0.013856,2
103,0.383515,0.056269,0.010592,0.000896,balanced,entropy,26,100,"{'class_weight': 'balanced', 'criterion': 'ent...",0.934911,...,0.934911,0.923077,0.952663,0.958333,0.928571,0.928571,0.910714,0.934158,0.013168,4
310,0.176303,0.025820,0.008156,0.002699,balanced,gini,29,50,"{'class_weight': 'balanced', 'criterion': 'gin...",0.934911,...,0.928994,0.928994,0.952663,0.952381,0.934524,0.922619,0.916667,0.934158,0.012037,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,0.014467,0.001070,0.003124,0.000275,balanced,gini,2,5,"{'class_weight': 'balanced', 'criterion': 'gin...",0.384615,...,0.295858,0.343195,0.473373,0.363095,0.351190,0.375000,0.392857,0.345847,0.067995,780
197,0.026335,0.009138,0.003525,0.001321,balanced,gini,1,10,"{'class_weight': 'balanced', 'criterion': 'gin...",0.337278,...,0.402367,0.213018,0.443787,0.380952,0.327381,0.404762,0.250000,0.333943,0.070405,781
1,0.022219,0.000834,0.003133,0.000435,balanced,entropy,1,10,"{'class_weight': 'balanced', 'criterion': 'ent...",0.301775,...,0.248521,0.189349,0.449704,0.357143,0.255952,0.357143,0.267857,0.305466,0.068676,782
0,0.014403,0.001468,0.003018,0.000388,balanced,entropy,1,5,"{'class_weight': 'balanced', 'criterion': 'ent...",0.272189,...,0.396450,0.165680,0.467456,0.327381,0.190476,0.375000,0.244048,0.285288,0.098075,783


Yes, there's a huge difference between various combinations

## 5. Progress bar

Gridsearch can be a quite long process and you may find yourself wondering when it will end.
1. Create a manual gridsearch for the same parameters values of random forest iterating through the list of the possible values and calculating `cross_val_score` for each combination. Try to increase `n_jobs`. The value `cv` for `cross_val_score` is 5.
2. Track the progress using the library `tqdm.notebook`.
3. Create a dataframe from the results of the gridsearch with the columns corresponding to the names of the parameters and `mean_accuracy` and `std_accuracy`.
4. Sort it descendingly by the `mean_accuracy`, check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [23]:
param_grid = {
    "n_estimators": [5, 10, 50, 100],
    "max_depth": range(1, 50),
    "class_weight": ["balanced", None],
    "criterion": ["entropy", "gini"]
}

keys = param_grid.keys()
values = param_grid.values()

param_combinations = [dict(zip(keys, v)) for v in itertools.product(*values)]

SKF = StratifiedKFold(n_splits=5, shuffle=True, random_state=21)

dict_for_df = {
    "n_estimators": list(),
    "max_depth": list(),
    "class_weight": list(),
    "criterion": list(),
    "mean_accuracy": list(),
    "std_accuracy": list()
}

best_score, best_params = -1, None
progress_bar = tqdm(total=len(param_combinations), desc="Progress:")

for params in param_combinations:
    curr_model = RandomForestClassifier(random_state=21, n_jobs=-1, **params)
    scores = cross_val_score(curr_model, data.iloc[:, :(-1)], data.iloc[:, -1], cv=SKF, n_jobs=-1, scoring="accuracy")
    mean_score = scores.mean()
    
    if mean_score > best_score:
        best_score = mean_score
        best_params = params

    dict_for_df["n_estimators"].append(params["n_estimators"])
    dict_for_df["max_depth"].append(params["max_depth"])
    dict_for_df["class_weight"].append(params["class_weight"])
    dict_for_df["criterion"].append(params["criterion"])
    dict_for_df["mean_accuracy"].append(mean_score)
    dict_for_df["std_accuracy"].append(scores.std())
    
    progress_bar.update(1)

progress_bar.close()

Progress::   0%|          | 0/784 [00:00<?, ?it/s]

In [24]:
results = pd.DataFrame(dict_for_df).sort_values(by="mean_accuracy", ascending=False)
results

Unnamed: 0,n_estimators,max_depth,class_weight,criterion,mean_accuracy,std_accuracy
519,50,32,,gini,0.927642,0.012516
563,50,43,,gini,0.926459,0.009409
551,50,40,,gini,0.926459,0.009409
571,50,45,,gini,0.926459,0.009409
587,50,49,,gini,0.926459,0.009409
...,...,...,...,...,...,...
590,100,1,,entropy,0.382565,0.014829
197,10,1,balanced,gini,0.382011,0.063859
198,10,1,,entropy,0.376656,0.024160
196,10,1,balanced,entropy,0.352347,0.057324


Yes, there's a huge difference between various combinations

## 6. Predictions

1. Choose the best model and use it to make predictions for the test dataset.
2. Calculate the final accuracy.

In [25]:
## class_weight=None criterion=entropy max_depth=32 n_estimators=50
model = RandomForestClassifier(random_state=21, n_estimators=50, max_depth=32, criterion="entropy", class_weight=None)
model.fit(X_train, Y_train)

0,1,2
,n_estimators,50
,criterion,'entropy'
,max_depth,32
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [26]:
predicts = model.predict(X_test)
final_accuracy = accuracy_score(Y_test, predicts)
final_accuracy

0.9260355029585798

# Day 09. Exercise 02
# Metrics

## 0. Imports

In [1]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score, roc_auc_score

## 1. Preprocessing

1. Create the same dataframe as in the previous exercise.
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test`. Use the additional parameter `stratify`.

In [2]:
data = pd.read_csv("./data/day-of-week-not-scaled.csv", header=0, index_col="Unnamed: 0")
X_train, X_test, y_train, y_test = train_test_split(data.iloc[:, :(-1)], data.iloc[:, -1], test_size=0.2, random_state=21, stratify=data.iloc[:, -1])
X_train.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)
X_test.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)

data.head(5)

Unnamed: 0,uid_user_0,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,uid_user_15,uid_user_16,uid_user_17,...,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1,numTrials,hour,dayofweek
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,5,4
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,2,5,4
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,3,5,4
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,4,5,4
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,5,5,4


## 2. SVM

1. Use the best parameters from the previous exercise and train the model of SVM.
2. You need to calculate `accuracy`, `precision`, `recall`, `ROC AUC`.

 - `precision` and `recall` should be calculated for each class (use `average='weighted'`)
 - `ROC AUC` should be calculated for each class against any other class (all possible pairwise combinations) and then weighted average should be applied for the final metric
 - the code in the cell should display the result as below:

```
accuracy is 0.88757
precision is 0.89267
recall is 0.88757
roc_auc is 0.97878
```

In [3]:
## {'C': 10, 'class_weight': None, 'gamma': 'auto', 'kernel': 'rbf'}
model = SVC(random_state=21, probability=True, C=10, class_weight=None, gamma="auto", kernel="rbf")
model.fit(X_train, y_train)

predicts = model.predict(X_test)

In [4]:
accuracy = accuracy_score(y_test, predicts)
precision = precision_score(y_test, predicts, average="weighted")
recall = recall_score(y_test, predicts, average="weighted")
ROC_AUC = roc_auc_score(y_test, model.predict_proba(X_test), average="weighted", multi_class="ovo")

print(f"accuracy is {accuracy:.5f}\nprecision is {precision:.5f}\nrecall is {recall:.5f}\nroc_auc is {ROC_AUC:.5f}")

accuracy is 0.88757
precision is 0.89267
recall is 0.88757
roc_auc is 0.97878


## 3. Decision tree

1. The same task for decision tree

In [5]:
## {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 26}
model = DecisionTreeClassifier(random_state=21, class_weight="balanced", max_depth=26, criterion="gini")
model.fit(X_train, y_train)

predicts = model.predict(X_test)

In [6]:
accuracy = accuracy_score(y_test, predicts)
precision = precision_score(y_test, predicts, average="weighted")
recall = recall_score(y_test, predicts, average="weighted")
roc_auc = roc_auc_score(y_test, model.predict_proba(X_test), average="weighted", multi_class="ovo")

print(f"accuracy is {accuracy:.5f}\nprecision is {precision:.5f}\nrecall is {recall:.5f}\nroc_auc is {ROC_AUC:.5f}")

accuracy is 0.89349
precision is 0.89551
recall is 0.89349
roc_auc is 0.97878


## 4. Random forest

1. The same task for random forest.

In [7]:
## {'class_weight': None, 'criterion': 'entropy', 'max_depth': 32, 'n_estimators': 50}
model = RandomForestClassifier(random_state=21, class_weight=None, criterion="entropy", max_depth=32, n_estimators=50)
model.fit(X_train, y_train)

predicts = model.predict(X_test)

In [8]:
accuracy = accuracy_score(y_test, predicts)
precision = precision_score(y_test, predicts, average="weighted")
recall = recall_score(y_test, predicts, average="weighted")
roc_auc = roc_auc_score(y_test, model.predict_proba(X_test), average="weighted", multi_class="ovo")

print(f"accuracy is {accuracy:.5f}\nprecision is {precision:.5f}\nrecall is {recall:.5f}\nroc_auc is {ROC_AUC:.5f}")

accuracy is 0.92604
precision is 0.92763
recall is 0.92604
roc_auc is 0.97878


## 5. Predictions

1. Choose the best model.
2. Analyze: for which `weekday` your model makes the most errors (in % of the total number of samples of that class in your full dataset), for which `labname` and for which `users`.
3. Save the model.

In [9]:
predicts = pd.Series(data=model.predict(data.iloc[:, :(-1)]), name="predict")
dayofweek = data["dayofweek"].copy()

is_right = (predicts == dayofweek).apply(lambda x: 1 if x else 0)
analyze_1 = pd.concat([dayofweek, is_right], axis=1).rename(columns={0: "is_right"})

counted_data_1 = analyze_1.groupby(by="dayofweek")["is_right"].count()
grouped_data_1 = analyze_1.groupby(by="dayofweek")["is_right"].value_counts().reset_index(name="amount")

grouped_data_1 = grouped_data_1[grouped_data_1["is_right"] == 0]

percents = grouped_data_1["amount"].values * 100 / counted_data_1.values

print(*[f"{i}: {percents[i]:.3f}" for i in range(7)], sep="\n")

0: 4.412
1: 1.460
2: 1.342
3: 0.758
4: 2.885
5: 1.845
6: 0.562


**For Monday model makes the most errors**

In [10]:
data["is_right"] = (predicts == data["dayofweek"]).apply(lambda x: 1 if x else 0)

lab_names = ['labname_code_rvw', 'labname_lab02', 'labname_lab03', 'labname_lab03s', 'labname_lab05s',
             'labname_laba04', 'labname_laba04s', 'labname_laba05', 'labname_laba06', 'labname_laba06s', 'labname_project1']

for name in lab_names:
    cols = data[[name, "is_right"]]
    curr_lab = cols[cols[name] == 1]

    percents = len(curr_lab[curr_lab["is_right"] == 0]) * 100 / len(curr_lab)

    print(f"{name}: {percents:.3f}")

labname_code_rvw: 1.220
labname_lab02: 0.000
labname_lab03: 100.000
labname_lab03s: 0.000
labname_lab05s: 2.778
labname_laba04: 3.933
labname_laba04s: 1.923
labname_laba05: 0.450
labname_laba06: 2.083
labname_laba06s: 1.639
labname_project1: 1.052


**For labname_lab03 model makes the most errors**

In [11]:
user_names = ['uid_user_0', 'uid_user_1', 'uid_user_10', 'uid_user_11', 'uid_user_12',
       'uid_user_13', 'uid_user_14', 'uid_user_15', 'uid_user_16',
       'uid_user_17', 'uid_user_18', 'uid_user_19', 'uid_user_2',
       'uid_user_20', 'uid_user_21', 'uid_user_22', 'uid_user_23',
       'uid_user_24', 'uid_user_25', 'uid_user_26', 'uid_user_27',
       'uid_user_28', 'uid_user_29', 'uid_user_3', 'uid_user_30',
       'uid_user_31', 'uid_user_4', 'uid_user_6', 'uid_user_7', 'uid_user_8']

for name in user_names:
    cols = data[[name, "is_right"]]
    curr_user = cols[cols[name] == 1]

    percents = len(curr_user[curr_user["is_right"] == 0]) * 100 / len(curr_user)

    print(f"{name}: {percents:.3f}")

uid_user_0: 0.000
uid_user_1: 0.000
uid_user_10: 1.408
uid_user_11: 0.000
uid_user_12: 0.000
uid_user_13: 3.333
uid_user_14: 0.758
uid_user_15: 0.000
uid_user_16: 3.125
uid_user_17: 0.000
uid_user_18: 2.857
uid_user_19: 3.297
uid_user_2: 2.479
uid_user_20: 0.000
uid_user_21: 0.000
uid_user_22: 0.000
uid_user_23: 0.000
uid_user_24: 1.786
uid_user_25: 1.667
uid_user_26: 0.000
uid_user_27: 4.348
uid_user_28: 0.000
uid_user_29: 1.562
uid_user_3: 2.817
uid_user_30: 2.564
uid_user_31: 2.667
uid_user_4: 1.064
uid_user_6: 8.333
uid_user_7: 0.000
uid_user_8: 0.000


**For uid_user_6 model makes the most errors**

In [12]:
joblib.dump(model, "./best_model.joblib")

['./best_model.joblib']

## 6. Function

1. Write a function that takes a list of different models and a corresponding list of parameters (dicts) and returns a dict that contains all the 4 metrics for each model.

In [13]:
def all_metrics(models, params):
    l = len(models)

    if l != len(params):
        return None

    metrics = list()
    
    for i in range(l):
        curr_model, curr_metrics = models[i](**params[i]), dict()

        curr_model.fit(X_train, y_train)
        predicts = curr_model.predict(X_test)

        curr_metrics["accuracy"] = accuracy_score(y_test, predicts)
        curr_metrics["precision"] = precision_score(y_test, predicts, average="weighted")
        curr_metrics["recall"] = recall_score(y_test, predicts, average="weighted")
        curr_metrics["roc_auc"] = float(roc_auc_score(y_test, model.predict_proba(X_test), average="weighted", multi_class="ovo"))

        metrics.append(curr_metrics)

    return metrics

In [14]:
all_metrics([SVC, DecisionTreeClassifier, RandomForestClassifier], [
    {'random_state': 21, 'probability': True, 'C': 10, 'class_weight': None, 'gamma': 'auto', 'kernel': 'rbf'},
    {'random_state': 21, 'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 26},
    {'random_state': 21, 'class_weight': None, 'criterion': 'entropy', 'max_depth': 32, 'n_estimators': 50}
])

[{'accuracy': 0.8875739644970414,
  'precision': 0.8926729169690374,
  'recall': 0.8875739644970414,
  'roc_auc': 0.9870419785805531},
 {'accuracy': 0.893491124260355,
  'precision': 0.8955093578395273,
  'recall': 0.893491124260355,
  'roc_auc': 0.9870419785805531},
 {'accuracy': 0.9260355029585798,
  'precision': 0.9276258795834271,
  'recall': 0.9260355029585798,
  'roc_auc': 0.9870419785805531}]

# Day 09. Exercise 03
# Ensembles

## 0. Imports

In [66]:
import pandas as pd
import numpy as np
import joblib as jb
import warnings
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.exceptions import ConvergenceWarning
from itertools import product
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, BaggingClassifier, StackingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score

## 1. Preprocessing

1. Create the same dataframe as in the previous exercise.
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test` and then get `X_train`, `y_train`, `X_valid`, `y_valid` from the previous `X_train`, `y_train`. Use the additional parameter `stratify`.

In [67]:
data = pd.read_csv("./data/day-of-week-not-scaled.csv", header=0, index_col="Unnamed: 0")
data.head(5)

Unnamed: 0,uid_user_0,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,uid_user_15,uid_user_16,uid_user_17,...,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1,numTrials,hour,dayofweek
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,5,4
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,2,5,4
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,3,5,4
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,4,5,4
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,5,5,4


In [68]:
X_train_1, X_test, y_train_1, y_test = train_test_split(data.iloc[:, :(-1)], data.iloc[:, -1], test_size=0.2, random_state=21, stratify=data.iloc[:, -1])

X_train_1.reset_index(drop=True, inplace=True)
y_train_1.reset_index(drop=True, inplace=True)
X_test.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)

X_train, X_valid, y_train, y_valid = train_test_split(X_train_1, y_train_1, test_size=0.2, random_state=21, stratify=y_train_1)
X_train.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)
X_valid.reset_index(drop=True, inplace=True)
y_valid.reset_index(drop=True, inplace=True)

## 2. Individual classifiers

1. Train SVM, decision tree and random forest again with the best parameters that you got from the 01 exercise with `random_state=21` for all of them.
2. Evaluate `accuracy`, `precision`, and `recall` for them on the validation set.
3. The result of each cell of the section should look like this:

```
accuracy is 0.87778
precision is 0.88162
recall is 0.87778
```

In [69]:
svc_params = {'C': 10, 'class_weight': None, 'gamma': 'auto', 'kernel': 'rbf', 'random_state': 21, 'probability': True, 'max_iter': 600}
dt_params = {'random_state': 21, 'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 26}
rf_params = {'random_state': 21, 'class_weight': None, 'criterion': 'entropy', 'max_depth': 32, 'n_estimators': 50}

In [70]:
warnings.filterwarnings("ignore", category=ConvergenceWarning)

In [71]:
svc_model = SVC(**svc_params)
svc_model.fit(X_train, y_train)

dt_model = DecisionTreeClassifier(**dt_params)
dt_model.fit(X_train, y_train)

rf_model = RandomForestClassifier(**rf_params)
rf_model.fit(X_train, y_train)

predicts_1, predicts_2, predicts_3 = svc_model.predict(X_valid), dt_model.predict(X_valid), rf_model.predict(X_valid)

In [72]:
def three_metrics(target_values, predict_values):
    accuracy = accuracy_score(target_values, predict_values)
    precision = precision_score(target_values, predict_values, average="weighted")
    recall = recall_score(target_values, predict_values, average="weighted")

    print(f"accuracy is {accuracy:.5f}\nprecision is {precision:.5f}\nrecall is {recall:.5f}")

In [73]:
three_metrics(y_valid, predicts_1)

accuracy is 0.88148
precision is 0.88577
recall is 0.88148


In [74]:
three_metrics(y_valid, predicts_2)

accuracy is 0.85926
precision is 0.86568
recall is 0.85926


In [75]:
three_metrics(y_valid, predicts_3)

accuracy is 0.89630
precision is 0.89601
recall is 0.89630


## 3. Voting classifiers

1. Using `VotingClassifier` and the three models that you have just trained, calculate the `accuracy`, `precision`, and `recall` on the validation set.
2. Play with the other parameteres.
3. Calculate the `accuracy`, `precision` and `recall` on the test set for the model with the best weights in terms of accuracy (if there are several of them with equal values, choose the one with the higher precision).

In [76]:
classifier = VotingClassifier(estimators=[('model 1', svc_model), ('model 2', dt_model), ('model 3', rf_model)], n_jobs=-1)
classifier.fit(X_train, y_train)

predicts = classifier.predict(X_valid)
three_metrics(y_valid, predicts)

accuracy is 0.89630
precision is 0.89558
recall is 0.89630


In [77]:
## let's find the best weights
classifier = VotingClassifier(estimators=[('model 1', svc_model), ('model 2', dt_model), ('model 3', rf_model)], n_jobs=-1)
param_grid = {
    "voting": ["hard", "soft"],
    "weights": list(filter(lambda x: sum(x) != 0, list(product(np.arange(0, 1.1, 0.2), repeat=3))))
}

SKF = StratifiedKFold(n_splits=5, shuffle=True, random_state=21)

grid_search = GridSearchCV(estimator=classifier, param_grid=param_grid, cv=SKF, scoring="accuracy", n_jobs=-1)
grid_search.fit(data.iloc[:, :(-1)], data.iloc[:, -1])

top = pd.DataFrame(grid_search.cv_results_)
top = top[["params", "mean_test_score", "rank_test_score"]]
top.sort_values(by="mean_test_score", ascending=False, inplace=True)

top

Unnamed: 0,params,mean_test_score,rank_test_score
291,"{'voting': 'soft', 'weights': (0.4, 0.0, 1.0)}",0.928827,1
295,"{'voting': 'soft', 'weights': (0.4, 0.2, 0.600...",0.928237,2
297,"{'voting': 'soft', 'weights': (0.4, 0.2, 1.0)}",0.928237,2
259,"{'voting': 'soft', 'weights': (0.2, 0.2, 0.600...",0.927649,4
260,"{'voting': 'soft', 'weights': (0.2, 0.2, 0.8)}",0.927647,5
...,...,...,...
167,"{'voting': 'hard', 'weights': (0.8, 0.8, 0.0)}",0.889683,426
83,"{'voting': 'hard', 'weights': (0.4, 0.4, 0.0)}",0.889683,426
209,"{'voting': 'hard', 'weights': (1.0, 1.0, 0.0)}",0.889683,426
125,"{'voting': 'hard', 'weights': (0.6000000000000...",0.889683,426


In [78]:
model = VotingClassifier([('model 1', svc_model), ('model 2', dt_model), ('model 3', rf_model)], voting='soft', weights=[0.4, 0.0, 1.0], n_jobs=-1)
model.fit(X_train, y_train)

predicts = model.predict(X_test)

three_metrics(y_test, predicts)

accuracy is 0.91716
precision is 0.91913
recall is 0.91716


## 4. Bagging classifiers

1. Using `BaggingClassifier` and `SVM` with the best parameters create an ensemble, try different values of the `n_estimators`, use `random_state=21`.
2. Play with the other parameters.
3. Calculate the `accuracy`, `precision`, and `recall` for the model with the best parameters (in terms of accuracy) on the test set (if there are several of them with equal values, choose the one with the higher precision)

In [79]:
warnings.filterwarnings("ignore", category=ConvergenceWarning)

svc_model = SVC(**svc_params)

BC = BaggingClassifier(estimator=svc_model, n_jobs=-1, random_state=21)

param_grid = {
    "max_samples": [0.6, 0.7, 0.8],
    "n_estimators": range(10, 101, 7),
}

SKF = StratifiedKFold(n_splits=5, shuffle=True, random_state=21)

grid_search = GridSearchCV(estimator=BC, cv=SKF, param_grid=param_grid, scoring="accuracy", n_jobs=(-1))

grid_search.fit(data.iloc[:, :(-1)], data.iloc[:, -1])

top = pd.DataFrame(grid_search.cv_results_)
top = top[["params", "mean_test_score", "rank_test_score"]]
top.sort_values(by="mean_test_score", ascending=False, inplace=True)

top

Unnamed: 0,params,mean_test_score,rank_test_score
29,"{'max_samples': 0.8, 'n_estimators': 31}",0.894427,1
31,"{'max_samples': 0.8, 'n_estimators': 45}",0.892647,2
27,"{'max_samples': 0.8, 'n_estimators': 17}",0.892058,3
37,"{'max_samples': 0.8, 'n_estimators': 87}",0.892057,4
32,"{'max_samples': 0.8, 'n_estimators': 52}",0.892055,5
30,"{'max_samples': 0.8, 'n_estimators': 38}",0.89146,6
38,"{'max_samples': 0.8, 'n_estimators': 94}",0.89087,7
34,"{'max_samples': 0.8, 'n_estimators': 66}",0.890278,8
28,"{'max_samples': 0.8, 'n_estimators': 24}",0.889686,9
36,"{'max_samples': 0.8, 'n_estimators': 80}",0.889684,10


In [80]:
model = BaggingClassifier(estimator=svc_model, n_jobs=-1, random_state=21, max_samples=0.8, n_estimators=32)
model.fit(X_train, y_train)

three_metrics(y_test, model.predict(X_test))

accuracy is 0.85207
precision is 0.85701
recall is 0.85207


## 5. Stacking classifiers

1. To achieve reproducibility in this case you will have to create an object of cross-validation generator: `StratifiedKFold(n_splits=n, shuffle=True, random_state=21)`, where `n` you will try to optimize (the details are below).
2. Using `StackingClassifier` and the three models that you have recently trained, calculate the `accuracy`, `precision` and `recall` on the validation set, try different values of `n_splits` `[2, 3, 4, 5, 6, 7]` in the cross-validation generator and parameter `passthrough` in the classifier itself,
3. Calculate the `accuracy`, `precision`, and `recall` for the model with the best parameters (in terms of accuracy) on the test set (if there are several of them with equal values, choose the one with the higher precision). Use `final_estimator=LogisticRegression(solver='liblinear')`.

In [81]:
svc_params = {'C': 10, 'class_weight': None, 'gamma': 'auto', 'kernel': 'rbf', 'random_state': 21, 'probability': True, 'max_iter': 600}
dt_params = {'random_state': 21, 'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 26}
rf_params = {'random_state': 21, 'class_weight': None, 'criterion': 'entropy', 'max_depth': 32, 'n_estimators': 50}

svc_model = SVC(**svc_params)
dt_model = DecisionTreeClassifier(**dt_params)
rf_model = RandomForestClassifier(**rf_params)

In [82]:
warnings.filterwarnings("ignore", category=FutureWarning)

for n in range(2, 8):
    SKF = StratifiedKFold(n_splits=n, shuffle=True, random_state=21)

    for value in [True, False]:
        SC = StackingClassifier(estimators=[('model 1', svc_model), ('model 2', dt_model), ('model 3', rf_model)], final_estimator=LogisticRegression(solver='liblinear'), cv=SKF, passthrough=value, n_jobs=(-1))
        SC.fit(X_train, y_train)
        
        three_metrics(y_valid, SC.predict(X_valid))
        
        print(f"\nparams: n={n}, passthrough={value}", end='\n\n')
    

accuracy is 0.90370
precision is 0.90559
recall is 0.90370

params: n=2, passthrough=True

accuracy is 0.90370
precision is 0.90539
recall is 0.90370

params: n=2, passthrough=False

accuracy is 0.90370
precision is 0.90500
recall is 0.90370

params: n=3, passthrough=True

accuracy is 0.90000
precision is 0.90145
recall is 0.90000

params: n=3, passthrough=False

accuracy is 0.90741
precision is 0.91042
recall is 0.90741

params: n=4, passthrough=True

accuracy is 0.90370
precision is 0.90621
recall is 0.90370

params: n=4, passthrough=False

accuracy is 0.90741
precision is 0.90978
recall is 0.90741

params: n=5, passthrough=True

accuracy is 0.90370
precision is 0.90521
recall is 0.90370

params: n=5, passthrough=False

accuracy is 0.90370
precision is 0.90412
recall is 0.90370

params: n=6, passthrough=True

accuracy is 0.90370
precision is 0.90462
recall is 0.90370

params: n=6, passthrough=False

accuracy is 0.89630
precision is 0.89716
recall is 0.89630

params: n=7, passthrough=

In [83]:
## params: n=4, passthrough=True
SKF = StratifiedKFold(n_splits=4, shuffle=True, random_state=21)
SC = StackingClassifier(estimators=[('model 1', svc_model), ('model 2', dt_model), ('model 3', rf_model)], final_estimator=LogisticRegression(solver='liblinear'), cv=SKF, passthrough=True, n_jobs=(-1))
SC.fit(X_train, y_train)

0,1,2
,estimators,"[('model 1', ...), ('model 2', ...), ...]"
,final_estimator,LogisticRegre...r='liblinear')
,cv,StratifiedKFo... shuffle=True)
,stack_method,'auto'
,n_jobs,-1
,passthrough,True
,verbose,0

0,1,2
,C,10
,kernel,'rbf'
,degree,3
,gamma,'auto'
,coef0,0.0
,shrinking,True
,probability,True
,tol,0.001
,cache_size,200
,class_weight,

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,26
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,21
,max_leaf_nodes,
,min_impurity_decrease,0.0

0,1,2
,n_estimators,50
,criterion,'entropy'
,max_depth,32
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'liblinear'
,max_iter,100


In [84]:
predicts = SC.predict(X_test)
three_metrics(y_test, predicts)

accuracy is 0.92012
precision is 0.92087
recall is 0.92012


## 6. Predictions

1. Choose the best model in terms of accuracy (if there are several of them with equal values, choose the one with the higher precision).
2. Analyze: for which weekday your model makes the most errors (in % of the total number of samples of that class in your full dataset), for which labname and for which users.
3. Save the model.

In [85]:
model = VotingClassifier([('model 1', svc_model), ('model 2', dt_model), ('model 3', rf_model)], voting='soft', weights=[0.4, 0.0, 1.0], n_jobs=-1)
model.fit(X_train, y_train)

0,1,2
,estimators,"[('model 1', ...), ('model 2', ...), ...]"
,voting,'soft'
,weights,"[0.4, 0.0, ...]"
,n_jobs,-1
,flatten_transform,True
,verbose,False

0,1,2
,C,10
,kernel,'rbf'
,degree,3
,gamma,'auto'
,coef0,0.0
,shrinking,True
,probability,True
,tol,0.001
,cache_size,200
,class_weight,

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,26
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,21
,max_leaf_nodes,
,min_impurity_decrease,0.0

0,1,2
,n_estimators,50
,criterion,'entropy'
,max_depth,32
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [86]:
predicts = pd.Series(data=model.predict(data.iloc[:, :(-1)]), name="predict")
dayofweek = data["dayofweek"].copy()

is_right = (predicts == dayofweek).apply(lambda x: 1 if x else 0)
analyze_1 = pd.concat([dayofweek, is_right], axis=1).rename(columns={0: "is_right"})

counted_data_1 = analyze_1.groupby(by="dayofweek")["is_right"].count()
grouped_data_1 = analyze_1.groupby(by="dayofweek")["is_right"].value_counts().reset_index(name="amount")

grouped_data_1 = grouped_data_1[grouped_data_1["is_right"] == 0]

percents = grouped_data_1["amount"].values * 100 / counted_data_1.values

print(*[f"{i}: {percents[i]:.3f}" for i in range(7)], sep="\n")

0: 8.088
1: 3.285
2: 4.698
3: 1.010
4: 2.885
5: 4.059
6: 2.528


**For Monday model makes the most errors**

In [87]:
data["is_right"] = (predicts == data["dayofweek"]).apply(lambda x: 1 if x else 0)

lab_names = ['labname_code_rvw', 'labname_lab02', 'labname_lab03', 'labname_lab03s', 'labname_lab05s',
             'labname_laba04', 'labname_laba04s', 'labname_laba05', 'labname_laba06', 'labname_laba06s', 'labname_project1']

for name in lab_names:
    cols = data[[name, "is_right"]]
    curr_lab = cols[cols[name] == 1]

    percents = len(curr_lab[curr_lab["is_right"] == 0]) * 100 / len(curr_lab)

    print(f"{name}: {percents:.3f}")

labname_code_rvw: 4.878
labname_lab02: 0.000
labname_lab03: 100.000
labname_lab03s: 0.000
labname_lab05s: 8.333
labname_laba04: 4.494
labname_laba04s: 7.692
labname_laba05: 1.351
labname_laba06: 6.250
labname_laba06s: 4.918
labname_project1: 2.208


**For labname_lab03 model makes the most errors**

In [88]:
user_names = ['uid_user_0', 'uid_user_1', 'uid_user_10', 'uid_user_11', 'uid_user_12',
       'uid_user_13', 'uid_user_14', 'uid_user_15', 'uid_user_16',
       'uid_user_17', 'uid_user_18', 'uid_user_19', 'uid_user_2',
       'uid_user_20', 'uid_user_21', 'uid_user_22', 'uid_user_23',
       'uid_user_24', 'uid_user_25', 'uid_user_26', 'uid_user_27',
       'uid_user_28', 'uid_user_29', 'uid_user_3', 'uid_user_30',
       'uid_user_31', 'uid_user_4', 'uid_user_6', 'uid_user_7', 'uid_user_8']

for name in user_names:
    cols = data[[name, "is_right"]]
    curr_user = cols[cols[name] == 1]

    percents = len(curr_user[curr_user["is_right"] == 0]) * 100 / len(curr_user)

    print(f"{name}: {percents:.3f}")

uid_user_0: 0.000
uid_user_1: 2.174
uid_user_10: 1.408
uid_user_11: 0.000
uid_user_12: 0.000
uid_user_13: 1.667
uid_user_14: 4.545
uid_user_15: 5.882
uid_user_16: 6.250
uid_user_17: 2.941
uid_user_18: 2.857
uid_user_19: 3.297
uid_user_2: 5.785
uid_user_20: 0.000
uid_user_21: 0.000
uid_user_22: 28.571
uid_user_23: 25.000
uid_user_24: 3.571
uid_user_25: 5.833
uid_user_26: 1.111
uid_user_27: 4.348
uid_user_28: 0.000
uid_user_29: 4.688
uid_user_3: 2.817
uid_user_30: 5.128
uid_user_31: 4.000
uid_user_4: 1.064
uid_user_6: 25.000
uid_user_7: 0.000
uid_user_8: 2.128


**For uid_user_22 model makes most errors**

In [89]:
jb.dump(model, "./best_model.joblib")

['./best_model.joblib']

# Day 09. Exercise 04
# Pipelines and OOP

## 0. Imports

In [37]:
import pandas as pd
import joblib as jb
import numpy as np
import warnings
from sklearn.pipeline import Pipeline
from sklearn.exceptions import ConvergenceWarning
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold

## 1. Preprocessing pipeline

Create three custom transformers, the first two out of which will be used within a [Pipeline](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html).

1. `FeatureExtractor()` class:
 - Takes a dataframe with `uid`, `labname`, `numTrials`, `timestamp` from the file [`checker_submits.csv`](https://drive.google.com/file/d/14voc4fNJZiLEFaZyd8nEG-lQt5JjatYw/view?usp=sharing).
 - Extracts `hour` from `timestamp`.
 - Extracts `weekday` from `timestamp` (numbers).
 - Drops the `timestamp` column.
 - Returns the new dataframe.


2. `MyOneHotEncoder()` class:
 - Takes the dataframe from the result of the previous transformation and the name of the target column.
 - Identifies all the categorical features and transforms them with `OneHotEncoder()`. If the target column is categorical too, then the transformation should not apply to it.
 - Drops the initial categorical features.
 - Returns the dataframe with the features and the series with the target column.


3. `TrainValidationTest()` class:
 - Takes `X` and `y`.
 - Returns `X_train`, `X_valid`, `X_test`, `y_train`, `y_valid`, `y_test` (`test_size=0.2`, `random_state=21`, `stratified`).


In [38]:
class FeatureExtractor:

    def fit(self, data, y=None):
        return self

    def transform(self, data):
        data["hour"] = data["timestamp"].dt.hour
        data["dayofweek"] = data["timestamp"].dt.weekday
        
        return data.drop(axis=1, columns=["timestamp"])


class MyOneHotEncoder:

    def __init__(self, target_name):
        self.target_name = target_name
        self.one_hot_encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")

    def fit(self, data, y=None):
        cat_cols = []
        for column in data.columns:
            if column != self.target_name and (data[column].dtype == "object" or data[column].dtype == "category"):
                cat_cols.append(column)
        self.cat_cols = cat_cols
        if len(self.cat_cols) > 0:
            self.one_hot_encoder.fit(data[self.cat_cols])
        return self

    def transform(self, data):
        if len(self.cat_cols) == 0:
            return data
        transformed_data = self.one_hot_encoder.transform(data[self.cat_cols])
        new_column_names = self.one_hot_encoder.get_feature_names_out()
        target_column = data[self.target_name].copy()
        
        self.cat_cols.append(self.target_name)
        data.drop(axis=1, columns=self.cat_cols, inplace=True)
        
        new_data = pd.DataFrame(data=transformed_data, columns=new_column_names).astype(int)

        return (pd.concat([new_data, data], axis=1), target_column)


class TrainValidationTest:

    def __init__(self, X, y):
        self.X = X
        self.y = y

    def transform(self):
        X_train, X_test, y_train, y_test = train_test_split(self.X, self.y, test_size=0.2, random_state=21, stratify=self.y)
        y_test.reset_index(drop=True, inplace=True)
        X_test.reset_index(drop=True, inplace=True)
        
        X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=21, stratify=y_train)
        X_train.reset_index(drop=True, inplace=True)
        y_train.reset_index(drop=True, inplace=True)
        X_valid.reset_index(drop=True, inplace=True)
        y_valid.reset_index(drop=True, inplace=True)
        
        return (X_train, X_valid, X_test, y_train, y_valid, y_test)

## 2. Model selection pipeline

`ModelSelection()` class

 - Takes a list of `GridSearchCV` instances and a dict where the keys are the indexes from that list and the values are the names of the models, the example is below in the reverse order (from high-level to low-level perspective):

```
ModelSelection(grids, grid_dict)

grids = [gs_svm, gs_tree, gs_rf]

gs_svm = GridSearchCV(estimator=svm, param_grid=svm_params, scoring='accuracy', cv=2, n_jobs=jobs), where jobs you can specify by yourself

svm_params = [{'kernel':('linear', 'rbf', 'sigmoid'), 'C':[0.01, 0.1, 1, 1.5, 5, 10], 'gamma': ['scale', 'auto'], 'class_weight':('balanced', None), 'random_state':[21], 'probability':[True]}]
```

 - Method `choose()` takes `X_train`, `y_train`, `X_valid`, `y_valid` and returns the name of the best classifier among all the models on the validation set
 - Method `best_results()` returns a dataframe with the columns `model`, `params`, `valid_score` where the rows are the best models within each class of models.

```
model	params	valid_score
0	SVM	{'C': 10, 'class_weight': None, 'gamma': 'auto...	0.772727
1	Decision Tree	{'class_weight': 'balanced', 'criterion': 'gin...	0.801484
2	Random Forest	{'class_weight': None, 'criterion': 'entropy',...	0.855288
```

 - When you iterate through the parameters of a model class, print the name of that class and show the progress using `tqdm.notebook`, in the end of the cycle print the best model of that class.

```
Estimator: SVM
100%
125/125 [01:32<00:00, 1.36it/s]
Best params: {'C': 10, 'class_weight': None, 'gamma': 'auto', 'kernel': 'rbf', 'probability': True, 'random_state': 21}
Best training accuracy: 0.773
Validation set accuracy score for best params: 0.878 

Estimator: Decision Tree
100%
57/57 [01:07<00:00, 1.22it/s]
Best params: {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 21, 'random_state': 21}
Best training accuracy: 0.801
Validation set accuracy score for best params: 0.867 

Estimator: Random Forest
100%
284/284 [06:47<00:00, 1.13s/it]
Best params: {'class_weight': None, 'criterion': 'entropy', 'max_depth': 22, 'n_estimators': 50, 'random_state': 21}
Best training accuracy: 0.855
Validation set accuracy score for best params: 0.907 

Classifier with best validation set accuracy: Random Forest
```

In [39]:
class ModelSelection:

    def __init__(self, grids, grid_dict):
        self.grids = grids
        self.grid_dict = grid_dict
        self.best_params = []
        self.val_accuracy = []
        
    def choose(self, X_train, y_train, X_valid, y_valid):
        
        for (index, grid) in enumerate(self.grids):
            print(f"Estimator: {self.grid_dict[index]}")
            grid.fit(X_train, y_train)

            curr_params = grid.best_params_
            curr_score = grid.best_score_
            curr_estimator = grid.best_estimator_
            val_accuracy = accuracy_score(y_valid, curr_estimator.predict(X_valid))

            self.best_params.append(curr_params)
            self.val_accuracy.append(val_accuracy)

            print(f"Best params: {curr_params}")
            print(f"Best training accuracy: {curr_score:.3f}")
            print(f"Validation set accuracy score for best params: {val_accuracy:.3f}\n")

        winner_name = self.grid_dict[self.val_accuracy.index(max(self.val_accuracy))]
        print(f"Classifier with best validation set accuracy: {winner_name}")

        return winner_name

    def best_results(self):
        model_names = list(self.grid_dict.values())
        
        return pd.DataFrame(data=[[model_names[i], self.best_params[i], self.val_accuracy[i]] for i in range(len(self.best_params))], columns=["model", "params", "valid_score"])

## 3. Finalization

`Finalize()` class
 - Takes an estimator.
 - Method `final_score()` takes `X_train`, `y_train`, `X_test`, `y_test` and returns the accuracy of the model as in the example below:
```
final.final_score(X_train, y_train, X_test, y_test)
Accuracy of the final model is 0.908284023668639
```
 - Method `save_model()` takes a path, saves the model to this path and prints that the model was successfully saved.

In [40]:
class Finalize:

    def __init__(self, estimator):
        self.estimator = estimator

    def final_score(self, X_train, y_train, X_test, y_test):
        self.estimator.fit(X_train, y_train)

        final_accuracy = accuracy_score(y_test, self.estimator.predict(X_test))
        print(f"Accuracy of the final model is {final_accuracy}")
        
        return final_accuracy

    def save_model(self, path):
        jb.dump(self.estimator, path)

## 4. Main program

1. Load the data from the file (****name of file****).
2. Create the preprocessing pipeline that consists of two custom transformers: `FeatureExtractor()` and `MyOneHotEncoder()`:
```
preprocessing = Pipeline([('feature_extractor', FeatureExtractor()), ('onehot_encoder', MyOneHotEncoder('dayofweek'))])
```
3. Use that pipeline and its method `fit_transform()` on the initial dataset.
```
data = preprocessing.fit_transform(df)
```
4. Get `X_train`, `X_valid`, `X_test`, `y_train`, `y_valid`, `y_test` using `TrainValidationTest()` and the result of the pipeline.
5. Create an instance of `ModelSelection()`, use the method `choose()` applying it to the models that you want and parameters that you want, get the dataframe of the best results.
6. create an instance of `Finalize()` with your best model, use method `final_score()` and save the model in the format: `name_of_the_model_{accuracy on test dataset}.sav`.

That is it, congrats!

In [41]:
df = pd.read_csv("./data/checker_submits.csv", header=0, parse_dates=["timestamp"], sep=',')
preprocessing = Pipeline([('feature_extractor', FeatureExtractor()), ('onehot_encoder', MyOneHotEncoder('dayofweek'))])
data = preprocessing.fit_transform(df)

In [42]:
TVT = TrainValidationTest(data[0], data[1])
X_train, X_valid, X_test, y_train, y_valid, y_test = TVT.transform()

In [43]:
# svm_params = {'C': [0.1, 1, 10, 100], 'kernel': ['rbf', 'linear', 'sigmoid'], 'gamma': ['auto', 'scale'], 'class_weight': [None, 'balanced']}
# dt_params = { 'criterion': ['gini', 'entropy'], 'max_depth': range(3, 31, 3), 'class_weight': [None, 'balanced']}
# rf_params = {'n_estimators': range(10, 101, 10), 'criterion': ['gini', 'entropy'], 'max_depth': range(4, 30, 2), 'class_weight': [None, 'balanced']}
svm_params = {'C': [0.1, 1, 10, 100], 'kernel': ['rbf', 'linear'], 'max_iter': [1000]}
dt_params = {'criterion': ['gini', 'entropy'], 'max_depth': range(3, 15, 3)}
rf_params = {'n_estimators': range(10, 101, 20), 'max_depth': range(4, 30, 10)}

SKF = StratifiedKFold(n_splits=5, shuffle=True, random_state=21)

GS_svm = GridSearchCV(estimator=SVC(random_state=21, probability=True), param_grid=svm_params, scoring="accuracy", cv=SKF, n_jobs=(-1))
GS_dt = GridSearchCV(estimator=DecisionTreeClassifier(random_state=21), param_grid=dt_params, scoring="accuracy", cv=SKF, n_jobs=(-1))
GS_rf = GridSearchCV(estimator=RandomForestClassifier(random_state=21), param_grid=rf_params, scoring="accuracy", cv=SKF, n_jobs=(-1))

grids = [GS_svm, GS_dt, GS_rf]
grid_dict = {0: "SVM", 1: "DecisionTree", 2: "RandomForest"}

In [44]:
warnings.filterwarnings("ignore", category=ConvergenceWarning)

In [45]:
MS = ModelSelection(grids, grid_dict)
answer = MS.choose(X_train, y_train, X_valid, y_valid)

Estimator: SVM
Best params: {'C': 100, 'kernel': 'rbf', 'max_iter': 1000}
Best training accuracy: 0.664
Validation set accuracy score for best params: 0.693

Estimator: DecisionTree
Best params: {'criterion': 'entropy', 'max_depth': 12}
Best training accuracy: 0.781
Validation set accuracy score for best params: 0.815

Estimator: RandomForest
Best params: {'max_depth': 24, 'n_estimators': 70}
Best training accuracy: 0.892
Validation set accuracy score for best params: 0.889

Classifier with best validation set accuracy: RandomForest


In [46]:
print(MS.best_results())

          model                                         params  valid_score
0           SVM  {'C': 100, 'kernel': 'rbf', 'max_iter': 1000}     0.692593
1  DecisionTree      {'criterion': 'entropy', 'max_depth': 12}     0.814815
2  RandomForest          {'max_depth': 24, 'n_estimators': 70}     0.888889


In [47]:
best_model = RandomForestClassifier(random_state=21, max_depth=24, n_estimators=70)

In [48]:
obj = Finalize(best_model)
final_acc = obj.final_score(X_train, y_train, X_test, y_test)

Accuracy of the final model is 0.9112426035502958


In [49]:
obj.save_model("./RandomForestClassifier_{0.9112426035502958}.sav")