# Day 09. Exercise 01
# Gridsearch

## 0. Imports

In [6]:
import pandas as pd
import numpy as np

from sklearn.model_selection import (
  train_test_split
)

from sklearn.tree import (
  DecisionTreeClassifier,
)

from sklearn.svm import (
  SVC
)

from sklearn.metrics import (
  accuracy_score,
)


from sklearn.ensemble import (
  RandomForestClassifier,
)

from sklearn.model_selection import (
  cross_val_score,
  GridSearchCV
)

from tqdm.notebook import tqdm

## 1. Preprocessing

1. Read the file [`day-of-week-not-scaled.csv`](https://drive.google.com/file/d/1AlGvsJDSzPT_70caausx8bFuupIEZkfh/view?usp=sharing). It is similar to the one from the previous exercise, but this time we did not scale continuous features (we are not going to use logreg anymore). Don't forget to enrich the table with the 'dayofweek' column from the previous day's .csv-file.
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test`. Use the additional parameter `stratify`.

In [7]:
df = pd.read_csv('../../datasets/day-of-week-not-scaled.csv')
df['dayofweek'] = pd.read_csv('../../datasets/dayofweek.csv')['dayofweek']
df

Unnamed: 0,numTrials,hour,uid_user_0,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,uid_user_15,...,labname_lab03,labname_lab03s,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1,dayofweek
0,1,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
1,2,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
2,3,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
3,4,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
4,5,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1681,9,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3
1682,6,20,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3
1683,7,20,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3
1684,8,20,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3


In [8]:
X_train, X_test, y_train, y_test = train_test_split(
  df.drop('dayofweek', axis=1),
  df['dayofweek'],
  test_size=0.2,
  stratify=df['dayofweek'],
  random_state=21
)

## 2. SVM gridsearch

1. Using `GridSearchCV` try different parameters of kernel (`linear`, `rbf`, `sigmoid`), C (`0.01`, `0.1`, `1`, `1.5`, `5`, `10`), gamma (`scale`, `auto`), class_weight (`balanced`, `None`) use `random_state=21` and `probability=True` and get the best combination of them in terms of accuracy.
2. Create a dataframe from the results of the gridsearch and sort it ascendingly by the `rank_test_score`. Check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [9]:
svc_params = dict(
  kernel=['linear', 'rbf', 'sigmoid'],
  C=[0.01, 0.1, 1, 1.5, 5, 10],
  gamma=['scale', 'auto'],
  class_weight=['balanced', None],
  random_state=[21],
  probability=[True]
)

In [10]:
svc_clf = GridSearchCV(
  estimator=SVC(),
  param_grid=svc_params,
  n_jobs=-1
)

svc_clf.fit(X_train, y_train)

GridSearchCV(estimator=SVC(), n_jobs=-1,
             param_grid={'C': [0.01, 0.1, 1, 1.5, 5, 10],
                         'class_weight': ['balanced', None],
                         'gamma': ['scale', 'auto'],
                         'kernel': ['linear', 'rbf', 'sigmoid'],
                         'probability': [True], 'random_state': [21]})

In [11]:
svc_results = pd.DataFrame(svc_clf.cv_results_).sort_values('rank_test_score')
svc_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_class_weight,param_gamma,param_kernel,param_probability,param_random_state,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
70,0.632883,0.022435,0.015996,0.001192,10,,auto,rbf,True,21,"{'C': 10, 'class_weight': None, 'gamma': 'auto...",0.900000,0.848148,0.885185,0.884758,0.862454,0.876109,0.018419,1
64,0.690212,0.040839,0.018685,0.002938,10,balanced,auto,rbf,True,21,"{'C': 10, 'class_weight': 'balanced', 'gamma':...",0.877778,0.851852,0.862963,0.873606,0.851301,0.863500,0.010870,2
58,0.720491,0.197030,0.018645,0.003238,5,,auto,rbf,True,21,"{'C': 5, 'class_weight': None, 'gamma': 'auto'...",0.825926,0.811111,0.818519,0.821561,0.802974,0.816018,0.008116,3
52,0.637936,0.039193,0.017269,0.000502,5,balanced,auto,rbf,True,21,"{'C': 5, 'class_weight': 'balanced', 'gamma': ...",0.844444,0.785185,0.792593,0.817844,0.802974,0.808608,0.021007,4
63,55.082689,2.740062,0.013364,0.002393,10,balanced,auto,linear,True,21,"{'C': 10, 'class_weight': 'balanced', 'gamma':...",0.729630,0.700000,0.755556,0.754647,0.665428,0.721052,0.034438,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53,0.893359,0.029525,0.025161,0.001081,5,balanced,auto,sigmoid,True,21,"{'C': 5, 'class_weight': 'balanced', 'gamma': ...",0.144444,0.148148,0.137037,0.126394,0.092937,0.129792,0.019869,68
65,0.822642,0.027097,0.027817,0.005294,10,balanced,auto,sigmoid,True,21,"{'C': 10, 'class_weight': 'balanced', 'gamma':...",0.122222,0.140741,0.129630,0.100372,0.085502,0.115693,0.020052,69
41,0.914885,0.027520,0.025563,0.001134,1.5,balanced,auto,sigmoid,True,21,"{'C': 1.5, 'class_weight': 'balanced', 'gamma'...",0.066667,0.085185,0.081481,0.078067,0.085502,0.079380,0.006913,70
17,0.886887,0.038906,0.026882,0.007762,0.1,balanced,auto,sigmoid,True,21,"{'C': 0.1, 'class_weight': 'balanced', 'gamma'...",0.062963,0.066667,0.062963,0.059480,0.059480,0.062310,0.002678,71


In [30]:
svc_params = svc_clf.best_params_
%store svc_params
svc_params

Stored 'svc_params' (dict)


{'C': 10,
 'class_weight': None,
 'gamma': 'auto',
 'kernel': 'rbf',
 'probability': True,
 'random_state': 21}

## 3. Decision tree

1. Using `GridSearchCV` try different parameters of `max_depth` (from `1` to `49`), `class_weight` (`balanced`, `None`) and `criterion` (`entropy` and `gini`) and get the best combination of them in terms of accuracy. Use `random_state=21`.
2. Create a dataframe from the results of the gridsearch and sort it ascendingly by the `rank_test_score`, check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [13]:
tree_params = dict(
  max_depth=range(1, 49),
  class_weight=['balanced', None],
  criterion=['gini', 'entropy'],
  random_state=[21]
)

In [14]:
tree_clf = GridSearchCV(
  estimator=DecisionTreeClassifier(),
  param_grid=tree_params,
  n_jobs=-1
).fit(X_train, y_train)

In [15]:
tree_results = pd.DataFrame(tree_clf.cv_results_).sort_values('rank_test_score')
tree_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_class_weight,param_criterion,param_max_depth,param_random_state,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
20,0.005890,0.000467,0.001913,0.000374,balanced,gini,21,21,"{'class_weight': 'balanced', 'criterion': 'gin...",0.888889,0.859259,0.903704,0.884758,0.832714,0.873865,0.025066,1
24,0.005154,0.000347,0.002046,0.000581,balanced,gini,25,21,"{'class_weight': 'balanced', 'criterion': 'gin...",0.888889,0.874074,0.903704,0.873606,0.828996,0.873854,0.025018,2
21,0.007225,0.001481,0.002974,0.000770,balanced,gini,22,21,"{'class_weight': 'balanced', 'criterion': 'gin...",0.885185,0.862963,0.903704,0.881041,0.828996,0.872378,0.025263,3
27,0.006609,0.001384,0.002481,0.000567,balanced,gini,28,21,"{'class_weight': 'balanced', 'criterion': 'gin...",0.888889,0.866667,0.903704,0.873606,0.828996,0.872372,0.025179,4
30,0.006355,0.000594,0.002742,0.000528,balanced,gini,31,21,"{'class_weight': 'balanced', 'criterion': 'gin...",0.888889,0.866667,0.903704,0.873606,0.828996,0.872372,0.025179,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2,0.007504,0.001262,0.002955,0.000507,balanced,gini,3,21,"{'class_weight': 'balanced', 'criterion': 'gin...",0.388889,0.303704,0.403704,0.427509,0.345725,0.373906,0.044064,188
96,0.004368,0.000695,0.002323,0.000477,,gini,1,21,"{'class_weight': None, 'criterion': 'gini', 'm...",0.370370,0.351852,0.359259,0.353160,0.342007,0.355330,0.009338,189
144,0.006013,0.001905,0.002882,0.000218,,entropy,1,21,"{'class_weight': None, 'criterion': 'entropy',...",0.370370,0.351852,0.359259,0.353160,0.342007,0.355330,0.009338,189
48,0.004775,0.000496,0.002506,0.000493,balanced,entropy,1,21,"{'class_weight': 'balanced', 'criterion': 'ent...",0.262963,0.318519,0.266667,0.323420,0.260223,0.286358,0.028376,191


In [28]:
tree_params = tree_clf.best_params_
%store tree_params
tree_params

Stored 'tree_params' (dict)


{'class_weight': 'balanced',
 'criterion': 'gini',
 'max_depth': 21,
 'random_state': 21}

## 4. Random forest

1. Using `GridSearchCV` try different parameters of `n_estimators` (`5`, `10`, `50`, `100`), `max_depth` (from `1` to `49`), `class_weight` (`balanced`, `None`) and `criterion` (`entropy` and `gini`) and get the best combination of them in terms of accuracy. Use random_state=21.
2. Create a dataframe from the results of the gridsearch and sort it ascendengly by the `rank_test_score`, check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [17]:
forest_params = dict(
  n_estimators=[5, 10, 50, 100],
  max_depth=range(1, 49),
  class_weight=['balanced', None],
  criterion=['entropy','gini'],
  random_state=[21]
)

In [18]:
forest_clf = GridSearchCV(
  estimator=RandomForestClassifier(),
  param_grid=forest_params,
  n_jobs=-1,
)
forest_clf.fit(X_train, y_train)

GridSearchCV(estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'class_weight': ['balanced', None],
                         'criterion': ['entropy', 'gini'],
                         'max_depth': range(1, 49),
                         'n_estimators': [5, 10, 50, 100],
                         'random_state': [21]})

In [19]:
forest_results = pd.DataFrame(forest_clf.cv_results_).sort_values('rank_test_score')
forest_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_class_weight,param_criterion,param_max_depth,param_n_estimators,param_random_state,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
95,0.317607,0.021225,0.013736,0.001542,balanced,entropy,24,100,21,"{'class_weight': 'balanced', 'criterion': 'ent...",0.922222,0.900000,0.903704,0.910781,0.884758,0.904293,0.012361,1
686,0.160557,0.022530,0.010970,0.002200,,gini,28,50,21,"{'class_weight': None, 'criterion': 'gini', 'm...",0.922222,0.900000,0.907407,0.903346,0.888476,0.904290,0.010961,2
115,0.324300,0.038714,0.014182,0.000683,balanced,entropy,29,100,21,"{'class_weight': 'balanced', 'criterion': 'ent...",0.922222,0.900000,0.907407,0.907063,0.884758,0.904290,0.012156,2
310,0.147709,0.008184,0.010676,0.000683,balanced,gini,30,50,21,"{'class_weight': 'balanced', 'criterion': 'gin...",0.922222,0.903704,0.900000,0.907063,0.884758,0.903549,0.012056,4
699,0.276456,0.005293,0.013258,0.000668,,gini,31,100,21,"{'class_weight': None, 'criterion': 'gini', 'm...",0.918519,0.911111,0.900000,0.910781,0.877323,0.903547,0.014380,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
384,0.014778,0.001583,0.003786,0.000475,,entropy,1,5,21,"{'class_weight': None, 'criterion': 'entropy',...",0.355556,0.366667,0.374074,0.345725,0.327138,0.353832,0.016467,764
4,0.021727,0.003043,0.005968,0.002103,balanced,entropy,2,5,21,"{'class_weight': 'balanced', 'criterion': 'ent...",0.318519,0.366667,0.381481,0.353160,0.345725,0.353110,0.021165,765
196,0.018314,0.003646,0.004207,0.000864,balanced,gini,2,5,21,"{'class_weight': 'balanced', 'criterion': 'gin...",0.311111,0.377778,0.377778,0.353160,0.312268,0.346419,0.029749,766
192,0.016086,0.000500,0.004188,0.000943,balanced,gini,1,5,21,"{'class_weight': 'balanced', 'criterion': 'gin...",0.262963,0.292593,0.285185,0.282528,0.293680,0.283390,0.011062,767


In [29]:
forest_params = forest_clf.best_params_
%store forest_params
forest_params

Stored 'forest_params' (dict)


{'class_weight': 'balanced',
 'criterion': 'entropy',
 'max_depth': 24,
 'n_estimators': 100,
 'random_state': 21}

## 5. Progress bar

Gridsearch can be a quite long process and you may find yourself wondering when it will end.
1. Create a manual gridsearch for the same parameters values of random forest iterating through the list of the possible values and calculating `cross_val_score` for each combination. Try to increase `n_jobs`. The value `cv` for `cross_val_score` is 5.
2. Track the progress using the library `tqdm.notebook`.
3. Create a dataframe from the results of the gridsearch with the columns corresponding to the names of the parameters and `mean_accuracy` and `std_accuracy`.
4. Sort it descendingly by the `mean_accuracy`, check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [21]:
from itertools import product
k, v = zip(*forest_params.items())
params = [dict(zip(k, v)) for v in product(*v)]
results = []
for param in tqdm(params, desc="Поиск лучших параметров"):
  row = param
  model = RandomForestClassifier(**param)
  scores = cross_val_score(model, X_train, y_train, n_jobs=-1, cv=5)
  row['mean_accuracy'] = np.mean(scores)
  row['std_accuracy'] = np.std(scores)
  results.append(row)


Поиск лучших параметров:   0%|          | 0/768 [00:00<?, ?it/s]

In [22]:
manual_gs_results = pd.DataFrame(results).sort_values('mean_accuracy', ascending=False)
manual_gs_results


Unnamed: 0,n_estimators,max_depth,class_weight,criterion,random_state,mean_accuracy,std_accuracy
668,100,24,balanced,entropy,21,0.904293,0.012361
495,50,28,,gini,21,0.904290,0.010961
688,100,29,balanced,entropy,21,0.904290,0.012156
501,50,30,balanced,gini,21,0.903549,0.012056
699,100,31,,gini,21,0.903547,0.014380
...,...,...,...,...,...,...,...
2,5,1,,entropy,21,0.353832,0.016467
4,5,2,balanced,entropy,21,0.353110,0.021165
5,5,2,balanced,gini,21,0.346419,0.029749
1,5,1,balanced,gini,21,0.283390,0.011062


## 6. Predictions

1. Choose the best model and use it to make predictions for the test dataset.
2. Calculate the final accuracy.

In [23]:
predict = forest_clf.best_estimator_.predict(X_test)
predict

array([1, 5, 6, 3, 2, 1, 5, 5, 6, 6, 5, 6, 3, 0, 3, 3, 5, 3, 3, 6, 3, 4,
       3, 3, 6, 1, 5, 1, 1, 6, 6, 5, 6, 6, 3, 5, 6, 0, 2, 6, 6, 5, 5, 1,
       3, 1, 6, 3, 2, 1, 1, 5, 5, 4, 6, 6, 1, 6, 0, 3, 1, 3, 1, 1, 1, 4,
       2, 0, 3, 6, 5, 6, 5, 2, 5, 5, 4, 0, 5, 6, 1, 1, 1, 3, 6, 5, 5, 6,
       5, 3, 6, 3, 4, 2, 3, 6, 3, 6, 4, 3, 3, 0, 2, 1, 6, 0, 0, 4, 3, 3,
       5, 1, 6, 0, 6, 5, 5, 1, 6, 3, 5, 3, 3, 0, 0, 6, 4, 6, 5, 6, 3, 5,
       6, 3, 2, 6, 5, 6, 5, 4, 6, 1, 6, 6, 5, 3, 1, 1, 3, 1, 1, 3, 1, 0,
       3, 3, 5, 6, 2, 5, 0, 6, 2, 3, 5, 5, 3, 0, 3, 3, 3, 6, 3, 5, 0, 3,
       5, 3, 3, 5, 3, 1, 5, 2, 3, 0, 3, 1, 2, 3, 0, 2, 0, 6, 5, 5, 2, 2,
       1, 6, 6, 5, 5, 6, 3, 6, 2, 5, 2, 6, 1, 3, 1, 6, 4, 4, 1, 3, 5, 3,
       2, 5, 1, 0, 3, 3, 1, 1, 1, 6, 6, 6, 6, 4, 6, 2, 1, 4, 6, 0, 1, 3,
       5, 6, 6, 3, 5, 2, 5, 6, 3, 2, 3, 4, 5, 6, 3, 6, 6, 1, 1, 3, 1, 5,
       6, 6, 6, 4, 3, 3, 2, 3, 6, 3, 1, 6, 1, 3, 3, 3, 5, 1, 0, 3, 2, 6,
       2, 6, 6, 1, 5, 4, 3, 6, 0, 3, 6, 1, 3, 2, 6,

In [24]:
accuracy_score(y_test, predict)

0.9260355029585798