# Day 09. Exercise 01
# Gridsearch

## 0. Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,GridSearchCV,ParameterGrid
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

from tqdm.notebook import tqdm 

## 1. Preprocessing

1. Read the file [`day-of-week-not-scaled.csv`](https://drive.google.com/file/d/1AlGvsJDSzPT_70caausx8bFuupIEZkfh/view?usp=sharing). It is similar to the one from the previous exercise, but this time we did not scale continuous features (we are not going to use logreg anymore). Don't forget to enrich the table with the 'dayofweek' column from the previous day's .csv-file.
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test`. Use the additional parameter `stratify`.

In [2]:
df_x = pd.read_csv('../data/day-of-week-not-scaled.csv')
df_y = pd.read_csv('../data/dayofweek.csv')

In [3]:
y = df_y['dayofweek'].values
X = df_x.values


In [4]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,stratify=y,random_state=21)


## 2. SVM gridsearch

1. Using `GridSearchCV` try different parameters of kernel (`linear`, `rbf`, `sigmoid`), C (`0.01`, `0.1`, `1`, `1.5`, `5`, `10`), gamma (`scale`, `auto`), class_weight (`balanced`, `None`) use `random_state=21` and `probability=True` and get the best combination of them in terms of accuracy.
2. Create a dataframe from the results of the gridsearch and sort it ascendingly by the `rank_test_score`. Check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [5]:
svc = SVC()
param_grid = {'kernel':['linear','rbf','sigmoid'],
              'C' :[0.01,0.1,1,1.5,5,10],
              'gamma':['scale','auto'],
              'class_weight':['balanced',None],
              'random_state': [21],
              'probability':[True]}

svc_cv = GridSearchCV(
    svc,
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=4
)


In [6]:
svc_cv.fit(X_train,y_train)

In [7]:
print("Best parametrs: ", svc_cv.best_params_)
print("Best cross_validated accuracy: ",svc_cv.best_score_)


Best parametrs:  {'C': 10, 'class_weight': None, 'gamma': 'auto', 'kernel': 'rbf', 'probability': True, 'random_state': 21}
Best cross_validated accuracy:  0.8761090458488228


In [8]:
best_svc = svc_cv.best_estimator_
y_pred = best_svc.predict(X_test)
test_accuracy = best_svc.score(X_test,y_test)

print(f"SVC accuracy:",test_accuracy)

SVC accuracy: 0.8875739644970414


In [9]:
results =pd.DataFrame(svc_cv.cv_results_)
rank_test_score = results.sort_values(by = 'rank_test_score',ascending=True)


rank_test_score[['params','mean_test_score','rank_test_score']].head(10)

Unnamed: 0,params,mean_test_score,rank_test_score
70,"{'C': 10, 'class_weight': None, 'gamma': 'auto...",0.876109,1
64,"{'C': 10, 'class_weight': 'balanced', 'gamma':...",0.8635,2
58,"{'C': 5, 'class_weight': None, 'gamma': 'auto'...",0.816018,3
52,"{'C': 5, 'class_weight': 'balanced', 'gamma': ...",0.808608,4
60,"{'C': 10, 'class_weight': 'balanced', 'gamma':...",0.721052,5
63,"{'C': 10, 'class_weight': 'balanced', 'gamma':...",0.721052,5
66,"{'C': 10, 'class_weight': None, 'gamma': 'scal...",0.719587,7
69,"{'C': 10, 'class_weight': None, 'gamma': 'auto...",0.719587,7
51,"{'C': 5, 'class_weight': 'balanced', 'gamma': ...",0.706234,9
48,"{'C': 5, 'class_weight': 'balanced', 'gamma': ...",0.706234,9


## 3. Decision tree

1. Using `GridSearchCV` try different parameters of `max_depth` (from `1` to `49`), `class_weight` (`balanced`, `None`) and `criterion` (`entropy` and `gini`) and get the best combination of them in terms of accuracy. Use `random_state=21`.
2. Create a dataframe from the results of the gridsearch and sort it ascendingly by the `rank_test_score`, check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [10]:
dt = DecisionTreeClassifier()
param_grids = {'max_depth':list(range(1,50)),'class_weight':['balanced',None],'criterion':['entropy','gini'],'random_state':[21]}
dt_cv = GridSearchCV(dt,
                     param_grid = param_grids,
                     scoring='accuracy',
                     cv=5)


In [11]:
dt_cv.fit(X_train,y_train)

In [12]:
print("Best parametrs: ",dt_cv.best_params_)
print("Best cross-validated accuracy: ",dt_cv.best_score_)

Best parametrs:  {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 22, 'random_state': 21}
Best cross-validated accuracy:  0.8731212997384002


In [13]:
results = pd.DataFrame(dt_cv.cv_results_)
results = results.sort_values(by='rank_test_score',ascending=True)
results[['params','mean_test_score','rank_test_score']].head(10)

Unnamed: 0,params,mean_test_score,rank_test_score
70,"{'class_weight': 'balanced', 'criterion': 'gin...",0.873121,1
69,"{'class_weight': 'balanced', 'criterion': 'gin...",0.873121,2
80,"{'class_weight': 'balanced', 'criterion': 'gin...",0.873116,3
81,"{'class_weight': 'balanced', 'criterion': 'gin...",0.873116,3
96,"{'class_weight': 'balanced', 'criterion': 'gin...",0.873116,3
97,"{'class_weight': 'balanced', 'criterion': 'gin...",0.873116,3
82,"{'class_weight': 'balanced', 'criterion': 'gin...",0.873116,3
83,"{'class_weight': 'balanced', 'criterion': 'gin...",0.873116,3
87,"{'class_weight': 'balanced', 'criterion': 'gin...",0.873116,3
86,"{'class_weight': 'balanced', 'criterion': 'gin...",0.873116,3


In [14]:
best_dt = dt_cv.best_estimator_
y_pred = best_dt.predict(X_test)
test_accuracy = accuracy_score(y_pred,y_test)
test_accuracy2 = best_dt.score(X_test,y_test)
print("Decision tree accuracy: ",test_accuracy)
print("Decision tree accuracy score: ",test_accuracy2)

Decision tree accuracy:  0.8905325443786982
Decision tree accuracy score:  0.8905325443786982


## 4. Random forest

1. Using `GridSearchCV` try different parameters of `n_estimators` (`5`, `10`, `50`, `100`), `max_depth` (from `1` to `49`), `class_weight` (`balanced`, `None`) and `criterion` (`entropy` and `gini`) and get the best combination of them in terms of accuracy. Use random_state=21.
2. Create a dataframe from the results of the gridsearch and sort it ascendengly by the `rank_test_score`, check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [15]:
rf = RandomForestClassifier()
param_grid = {'n_estimators':[5,10,50,100],
              'max_depth':list(range(1,50)),
              'class_weight':['balanced',None],
              'criterion':['entropy','gini'],
              'random_state':[21]}

rf_cv = GridSearchCV(rf,
                     param_grid,
                     scoring='accuracy',
                     n_jobs=4
                    )

In [16]:
rf_cv.fit(X_train, y_train)

In [17]:
print("Best parametrs: ",rf_cv.best_params_)
print("Best cross-validated accuracy: ",rf_cv.best_score_)

Best parametrs:  {'class_weight': None, 'criterion': 'gini', 'max_depth': 28, 'n_estimators': 50, 'random_state': 21}
Best cross-validated accuracy:  0.9042902381935839


In [18]:
best_rf = rf_cv.best_estimator_
rf_accuracy = best_rf.score(X_test,y_test)
print("Best Random Forest accuracy score: ",rf_accuracy) 


Best Random Forest accuracy score:  0.9289940828402367


In [19]:
results = pd.DataFrame(rf_cv.cv_results_).sort_values(by='rank_test_score',ascending=True)
results[['params','mean_test_score','rank_test_score']].head(10)

Unnamed: 0,params,mean_test_score,rank_test_score
698,"{'class_weight': None, 'criterion': 'gini', 'm...",0.90429,1
711,"{'class_weight': None, 'criterion': 'gini', 'm...",0.903547,2
314,"{'class_weight': 'balanced', 'criterion': 'gin...",0.902817,3
330,"{'class_weight': 'balanced', 'criterion': 'gin...",0.902809,4
763,"{'class_weight': None, 'criterion': 'gini', 'm...",0.902806,5
759,"{'class_weight': None, 'criterion': 'gini', 'm...",0.902806,5
767,"{'class_weight': None, 'criterion': 'gini', 'm...",0.902806,5
755,"{'class_weight': None, 'criterion': 'gini', 'm...",0.902806,5
735,"{'class_weight': None, 'criterion': 'gini', 'm...",0.902806,5
783,"{'class_weight': None, 'criterion': 'gini', 'm...",0.902806,5


## 5. Progress bar

Gridsearch can be a quite long process and you may find yourself wondering when it will end.
1. Create a manual gridsearch for the same parameters values of random forest iterating through the list of the possible values and calculating `cross_val_score` for each combination. Try to increase `n_jobs`. The value `cv` for `cross_val_score` is 5.
2. Track the progress using the library `tqdm.notebook`.
3. Create a dataframe from the results of the gridsearch with the columns corresponding to the names of the parameters and `mean_accuracy` and `std_accuracy`.
4. Sort it descendingly by the `mean_accuracy`, check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [20]:
rf_manual = RandomForestClassifier()

param_grid = {'n_estimators':[5,15,25,45,75],
              'max_depth':list(range(9,60)),
              'class_weight':['balanced'],
              'criterion' : ['entropy'],
              'random_state':[21]}

# cross_val_score -> cv = 5

In [21]:
rf_manual_cv = GridSearchCV(
    rf_manual,
    param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=4
)

In [22]:
rf_manual_cv.fit(X_train,y_train)

In [23]:
print("Best parametrs: ",rf_manual_cv.best_params_)
print("Best cross-validated accuracy: ",rf_manual_cv.best_score_)

Best parametrs:  {'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 27, 'n_estimators': 75, 'random_state': 21}
Best cross-validated accuracy:  0.9020735233374639


In [24]:
best_rf_manual = rf_manual_cv.best_estimator_
rf_accuracy = best_rf_manual.score(X_test,y_test)
print("Best Random Forest accuracy score: ",rf_accuracy) 


Best Random Forest accuracy score:  0.9289940828402367


In [25]:
param_grid = {
    'n_estimators': [5, 15, 25, 45, 75],
    'max_depth': list(range(9, 60)),
    'class_weight': ['balanced'],
    'criterion': ['entropy'],
    'random_state': [21]
}


param_list = list(ParameterGrid(param_grid))


results = []


for params in tqdm(param_list, desc='Manual Grid Search'):
    model = RandomForestClassifier(**params)
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy', n_jobs=4)
    results.append({'params': params, 'mean_score': scores.mean(), 'std_score': scores.std()})


results.sort(key=lambda x: x['mean_score'], reverse=True)


print("Best parameters:", results[0]['params'])
print("Best cross-validated accuracy:", results[0]['mean_score'])

results_df = pd.DataFrame(results)
results_df.head()


Manual Grid Search:   0%|          | 0/255 [00:00<?, ?it/s]

Best parameters: {'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 27, 'n_estimators': 75, 'random_state': 21}
Best cross-validated accuracy: 0.9020735233374639


Unnamed: 0,params,mean_score,std_score
0,"{'class_weight': 'balanced', 'criterion': 'ent...",0.902074,0.011439
1,"{'class_weight': 'balanced', 'criterion': 'ent...",0.89984,0.012735
2,"{'class_weight': 'balanced', 'criterion': 'ent...",0.899105,0.011146
3,"{'class_weight': 'balanced', 'criterion': 'ent...",0.899102,0.011659
4,"{'class_weight': 'balanced', 'criterion': 'ent...",0.898367,0.012108


In [26]:
rf_df = pd.DataFrame(rf_manual_cv.cv_results_).sort_values(by='mean_test_score',ascending=False)
rf_df = rf_df.rename(columns={'mean_test_score':'mean_accuracy', 'std_test_score':'std_accuracy'})

rf_df[['params','mean_accuracy','std_accuracy']]

Unnamed: 0,params,mean_accuracy,std_accuracy
94,"{'class_weight': 'balanced', 'criterion': 'ent...",0.902074,0.011439
72,"{'class_weight': 'balanced', 'criterion': 'ent...",0.899840,0.012735
119,"{'class_weight': 'balanced', 'criterion': 'ent...",0.899105,0.011146
124,"{'class_weight': 'balanced', 'criterion': 'ent...",0.899102,0.011659
109,"{'class_weight': 'balanced', 'criterion': 'ent...",0.898367,0.012108
...,...,...,...
15,"{'class_weight': 'balanced', 'criterion': 'ent...",0.827881,0.022532
1,"{'class_weight': 'balanced', 'criterion': 'ent...",0.815277,0.039035
10,"{'class_weight': 'balanced', 'criterion': 'ent...",0.815272,0.021221
5,"{'class_weight': 'balanced', 'criterion': 'ent...",0.805653,0.023668


## 6. Predictions

1. Choose the best model and use it to make predictions for the test dataset.
2. Calculate the final accuracy.

In [27]:
best_rf = rf_cv.best_estimator_
rf_accuracy = best_rf.score(X_test,y_test)
print("Best model - Random Forest accuracy : ",rf_accuracy) 


Best model - Random Forest accuracy :  0.9289940828402367
