# Day 09. Exercise 01
# Gridsearch

## 0. Imports

In [57]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from tqdm.notebook import tqdm 

## 1. Preprocessing

1. Read the file [`day-of-week-not-scaled.csv`](https://drive.google.com/file/d/1AlGvsJDSzPT_70caausx8bFuupIEZkfh/view?usp=sharing). It is similar to the one from the previous exercise, but this time we did not scale continuous features (we are not going to use logreg anymore).
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test`. Use the additional parameter `stratify`.

In [2]:
X = pd.read_csv("../../data/day-of-week-not-scaled.csv")
y = pd.read_csv("../../data/dayofweek.csv")["dayofweek"]

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21, stratify=y)

## 2. SVM gridsearch

1. Using `GridSearchCV` try different parameters of kernel (`linear`, `rbf`, `sigmoid`), C (`0.01`, `0.1`, `1`, `1.5`, `5`, `10`), gamma (`scale`, `auto`), class_weight (`balanced`, `None`) use `random_state=21` and `probability=True` and get the best combination of them in terms of accuracy.
2. Create a dataframe from the results of the gridsearch and sort it ascendingly by the `rank_test_score`. Check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [4]:
gs = GridSearchCV(SVC(random_state=21, probability=True),
                  param_grid={'kernel': ['linear', 'rbf', 'sigmoid'],'C':[0.01, 0.1, 1, 1.5, 5, 10], 'gamma':['scale', 'auto'], 'class_weight':['balanced', None]}, 
                  scoring='accuracy',
                  n_jobs=-1)

In [5]:
gs.fit(X_train, y_train)

In [6]:
gs.best_params_

{'C': 10, 'class_weight': None, 'gamma': 'auto', 'kernel': 'rbf'}

In [7]:
gridSearchResSVM = pd.DataFrame(gs.cv_results_)
gridSearchResSVM.sort_values(by='rank_test_score')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_class_weight,param_gamma,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
70,0.465335,0.010299,0.039977,0.011578,10.0,,auto,rbf,"{'C': 10, 'class_weight': None, 'gamma': 'auto...",0.900000,0.848148,0.885185,0.884758,0.862454,0.876109,0.018419,1
64,0.445037,0.038820,0.037072,0.004523,10.0,balanced,auto,rbf,"{'C': 10, 'class_weight': 'balanced', 'gamma':...",0.877778,0.851852,0.862963,0.873606,0.851301,0.863500,0.010870,2
58,0.411628,0.009387,0.034285,0.006136,5.0,,auto,rbf,"{'C': 5, 'class_weight': None, 'gamma': 'auto'...",0.825926,0.811111,0.818519,0.821561,0.802974,0.816018,0.008116,3
52,0.461144,0.034619,0.034124,0.006412,5.0,balanced,auto,rbf,"{'C': 5, 'class_weight': 'balanced', 'gamma': ...",0.844444,0.785185,0.792593,0.817844,0.799257,0.807865,0.021257,4
63,34.432375,3.605016,0.013002,0.001307,10.0,balanced,auto,linear,"{'C': 10, 'class_weight': 'balanced', 'gamma':...",0.729630,0.700000,0.755556,0.754647,0.665428,0.721052,0.034438,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53,0.436910,0.018637,0.017820,0.004697,5.0,balanced,auto,sigmoid,"{'C': 5, 'class_weight': 'balanced', 'gamma': ...",0.144444,0.148148,0.137037,0.126394,0.092937,0.129792,0.019869,68
65,0.404421,0.019765,0.014160,0.002370,10.0,balanced,auto,sigmoid,"{'C': 10, 'class_weight': 'balanced', 'gamma':...",0.122222,0.140741,0.129630,0.100372,0.085502,0.115693,0.020052,69
41,0.519550,0.029659,0.013502,0.003759,1.5,balanced,auto,sigmoid,"{'C': 1.5, 'class_weight': 'balanced', 'gamma'...",0.066667,0.085185,0.081481,0.078067,0.085502,0.079380,0.006913,70
17,0.401514,0.029285,0.013721,0.004751,0.1,balanced,auto,sigmoid,"{'C': 0.1, 'class_weight': 'balanced', 'gamma'...",0.062963,0.066667,0.062963,0.059480,0.059480,0.062310,0.002678,71


## 3. Decision tree

1. Using `GridSearchCV` try different parameters of `max_depth` (from `1` to `49`), `class_weight` (`balanced`, `None`) and `criterion` (`entropy` and `gini`) and get the best combination of them in terms of accuracy. Use `random_state=21`.
2. Create a dataframe from the results of the gridsearch and sort it ascendingly by the `rank_test_score`, check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [8]:
gs = GridSearchCV(DecisionTreeClassifier(random_state=21), 
                  param_grid={'max_depth': range(1, 50), 'class_weight':['balanced', None], 'criterion':['entropy', 'gini']})

In [9]:
gs.fit(X_train, y_train)

In [10]:
gs.best_params_

{'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 23}

In [11]:
gridSearchResTree = pd.DataFrame(gs.cv_results_)
gridSearchResTree.sort_values(by=['rank_test_score', 'param_max_depth'])

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_class_weight,param_criterion,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
71,0.002709,0.000169,0.000561,0.000032,balanced,gini,23,"{'class_weight': 'balanced', 'criterion': 'gin...",0.888889,0.866667,0.903704,0.877323,0.832714,0.873859,0.023972,1
72,0.002768,0.000046,0.000587,0.000059,balanced,gini,24,"{'class_weight': 'balanced', 'criterion': 'gin...",0.888889,0.866667,0.903704,0.877323,0.832714,0.873859,0.023972,1
73,0.002697,0.000105,0.000535,0.000010,balanced,gini,25,"{'class_weight': 'balanced', 'criterion': 'gin...",0.888889,0.866667,0.903704,0.877323,0.832714,0.873859,0.023972,1
75,0.002682,0.000173,0.000604,0.000064,balanced,gini,27,"{'class_weight': 'balanced', 'criterion': 'gin...",0.888889,0.866667,0.903704,0.877323,0.832714,0.873859,0.023972,1
76,0.002658,0.000141,0.000575,0.000040,balanced,gini,28,"{'class_weight': 'balanced', 'criterion': 'gin...",0.888889,0.866667,0.903704,0.877323,0.832714,0.873859,0.023972,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51,0.001429,0.000050,0.000518,0.000047,balanced,gini,3,"{'class_weight': 'balanced', 'criterion': 'gin...",0.388889,0.303704,0.403704,0.427509,0.345725,0.373906,0.044064,192
98,0.000987,0.000043,0.000504,0.000019,,entropy,1,"{'class_weight': None, 'criterion': 'entropy',...",0.370370,0.351852,0.359259,0.353160,0.342007,0.355330,0.009338,193
147,0.000977,0.000025,0.000509,0.000012,,gini,1,"{'class_weight': None, 'criterion': 'gini', 'm...",0.370370,0.351852,0.359259,0.353160,0.342007,0.355330,0.009338,193
0,0.003009,0.002487,0.000826,0.000313,balanced,entropy,1,"{'class_weight': 'balanced', 'criterion': 'ent...",0.262963,0.318519,0.266667,0.323420,0.260223,0.286358,0.028376,195


## 4. Random forest

1. Using `GridSearchCV` try different parameters of `n_estimators` (`5`, `10`, `50`, `100`), `max_depth` (from `1` to `49`), `class_weight` (`balanced`, `None`) and `criterion` (`entropy` and `gini`) and get the best combination of them in terms of accuracy. Use random_state=21.
2. Create a dataframe from the results of the gridsearch and sort it ascendengly by the `rank_test_score`, check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [12]:
gs = GridSearchCV(RandomForestClassifier(random_state=21), 
                  param_grid={'n_estimators':[5, 10, 50, 100],'max_depth': range(1, 50), 'class_weight':['balanced', None], 'criterion':['entropy', 'gini']})

In [13]:
gs.fit(X_train, y_train)

In [14]:
gs.best_params_

{'class_weight': None,
 'criterion': 'gini',
 'max_depth': 28,
 'n_estimators': 50}

In [15]:
bestParams = gs.best_params_

In [16]:
gridSearchResForest = pd.DataFrame(gs.cv_results_)
gridSearchResForest.sort_values(by='rank_test_score')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_class_weight,param_criterion,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
698,0.040601,0.000175,0.002505,0.000028,,gini,28,50,"{'class_weight': None, 'criterion': 'gini', 'm...",0.922222,0.900000,0.907407,0.903346,0.888476,0.904290,0.010961,1
711,0.080446,0.000469,0.004511,0.000059,,gini,31,100,"{'class_weight': None, 'criterion': 'gini', 'm...",0.922222,0.911111,0.900000,0.910781,0.877323,0.904287,0.015204,2
374,0.042051,0.000262,0.002619,0.000046,balanced,gini,45,50,"{'class_weight': 'balanced', 'criterion': 'gin...",0.922222,0.907407,0.896296,0.907063,0.884758,0.903549,0.012503,3
390,0.042734,0.000440,0.002686,0.000088,balanced,gini,49,50,"{'class_weight': 'balanced', 'criterion': 'gin...",0.922222,0.907407,0.896296,0.907063,0.884758,0.903549,0.012503,3
386,0.042599,0.000912,0.002762,0.000229,balanced,gini,48,50,"{'class_weight': 'balanced', 'criterion': 'gin...",0.922222,0.907407,0.896296,0.907063,0.884758,0.903549,0.012503,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
392,0.003305,0.000077,0.000703,0.000023,,entropy,1,5,"{'class_weight': None, 'criterion': 'entropy',...",0.355556,0.366667,0.374074,0.345725,0.327138,0.353832,0.016467,780
4,0.003873,0.000076,0.000841,0.000127,balanced,entropy,2,5,"{'class_weight': 'balanced', 'criterion': 'ent...",0.318519,0.366667,0.381481,0.353160,0.345725,0.353110,0.021165,781
200,0.004217,0.000170,0.001033,0.000194,balanced,gini,2,5,"{'class_weight': 'balanced', 'criterion': 'gin...",0.311111,0.377778,0.377778,0.353160,0.312268,0.346419,0.029749,782
196,0.003463,0.000087,0.000725,0.000025,balanced,gini,1,5,"{'class_weight': 'balanced', 'criterion': 'gin...",0.262963,0.292593,0.285185,0.282528,0.293680,0.283390,0.011062,783


## 5. Progress bar

Gridsearch can be a quite long process and you may find yourself wondering when it will end.
1. Create a manual gridsearch for the same parameters values of random forest iterating through the list of the possible values and calculating `cross_val_score` for each combination. Try to increase `n_jobs`. The value `cv` for `cross_val_score` is 5.
2. Track the progress using the library `tqdm.notebook`.
3. Create a dataframe from the results of the gridsearch with the columns corresponding to the names of the parameters and `mean_accuracy` and `std_accuracy`.
4. Sort it descendingly by the `mean_accuracy`, check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [62]:
import itertools
import math
grid = {'n_estimators':[5, 10, 50, 100],'max_depth': range(1, 50), 'class_weight':['balanced', None], 'criterion':['entropy', 'gini']}
grid_len = math.prod([len(i) for i in grid.values()])
result = pd.DataFrame(columns=[*grid.keys(), 'mean_accuracy', 'std_accuracy'])

for pr in tqdm(itertools.product(*grid.values()), total=grid_len):
    params = {k:v for k, v in zip(grid.keys(), pr)}
    model = RandomForestClassifier(random_state=21, **params, n_jobs=-1)
    cv_scores = cross_val_score(model, X_train, y_train, cv=5)
    mean_accuracy = np.mean(cv_scores)
    std_accuracy = np.std(cv_scores)
    result = pd.concat([result, pd.DataFrame({k:[v] for k,v in zip(result.columns, [*pr, mean_accuracy, std_accuracy])})])
    

  0%|          | 0/784 [00:00<?, ?it/s]

  result = pd.concat([result, pd.DataFrame({k:[v] for k,v in zip(result.columns, [*pr, mean_accuracy, std_accuracy])})])


In [64]:
result.sort_values(by='mean_accuracy', ascending=False)

Unnamed: 0,n_estimators,max_depth,class_weight,criterion,mean_accuracy,std_accuracy
0,50,28,,gini,0.904290,0.010961
0,100,31,,gini,0.904287,0.015204
0,50,45,balanced,gini,0.903549,0.012503
0,50,43,balanced,gini,0.903549,0.012503
0,50,44,balanced,gini,0.903549,0.012503
...,...,...,...,...,...,...
0,5,1,,entropy,0.353832,0.016467
0,5,2,balanced,entropy,0.353110,0.021165
0,5,2,balanced,gini,0.346419,0.029749
0,5,1,balanced,gini,0.283390,0.011062


## 6. Predictions

1. Choose the best model and use it to make predictions for the test dataset.
2. Calculate the final accuracy.

In [18]:
bestModel = RandomForestClassifier(random_state=21, **bestParams)
bestModel.fit(X_train, y_train)
accuracy_score(y_test, bestModel.predict(X_test))

0.9289940828402367