# Day 09. Exercise 01
# Gridsearch

## 0. Imports

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from tqdm.notebook import tqdm
from sklearn.model_selection import cross_val_score
import itertools
from sklearn.metrics import accuracy_score

## 1. Preprocessing

1. Read the file [`day-of-week-not-scaled.csv`](https://drive.google.com/file/d/1AlGvsJDSzPT_70caausx8bFuupIEZkfh/view?usp=sharing). It is similar to the one from the previous exercise, but this time we did not scale continuous features (we are not going to use logreg anymore).
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test`. Use the additional parameter `stratify`.

In [4]:
df = pd.read_csv('../data/day-of-week-not-scaled.csv')

In [5]:
X = df.drop('dayofweek', axis=1)
y = df['dayofweek']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21, stratify=y)

## 2. SVM gridsearch

1. Using `GridSearchCV` try different parameters of kernel (`linear`, `rbf`, `sigmoid`), C (`0.01`, `0.1`, `1`, `1.5`, `5`, `10`), gamma (`scale`, `auto`), class_weight (`balanced`, `None`) use `random_state=21` and `probability=True` and get the best combination of them in terms of accuracy.
2. Create a dataframe from the results of the gridsearch and sort it ascendingly by the `rank_test_score`. Check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [6]:
param_grid={'C': [0.1, 1, 10], 
            'gamma': ['scale', 'auto'], 
            'kernel': ['linear', 'rbf', 'sigmoid'], 
            'class_weight': ['balanced', None]}

In [7]:
GSCV1 = GridSearchCV(SVC(random_state=21, probability=True), param_grid=param_grid)
GSCV1.fit(X_train, y_train)

KeyboardInterrupt: 

In [70]:
results_df = pd.DataFrame(GSCV1.cv_results_)
results_df = results_df.sort_values(by='rank_test_score')
print(results_df[['param_kernel', 'param_C', 'param_gamma', 'param_class_weight', 'mean_test_score', 'std_test_score', 'rank_test_score']].head(10))

   param_kernel param_C param_gamma param_class_weight  mean_test_score  \
46          rbf     100        auto               None         0.905039   
40          rbf     100        auto           balanced         0.904298   
34          rbf      10        auto               None         0.876109   
28          rbf      10        auto           balanced         0.863500   
39       linear     100        auto           balanced         0.741826   
36       linear     100       scale           balanced         0.741826   
45       linear     100        auto               None         0.740366   
42       linear     100       scale               None         0.740366   
27       linear      10        auto           balanced         0.721052   
24       linear      10       scale           balanced         0.721052   

    std_test_score  rank_test_score  
46        0.017033                1  
40        0.017314                2  
34        0.018419                3  
28        0.010870    

In [71]:
print(GSCV1.best_params_)

{'C': 100, 'class_weight': None, 'gamma': 'auto', 'kernel': 'rbf'}


## 3. Decision tree

1. Using `GridSearchCV` try different parameters of `max_depth` (from `1` to `49`), `class_weight` (`balanced`, `None`) and `criterion` (`entropy` and `gini`) and get the best combination of them in terms of accuracy. Use `random_state=21`.
2. Create a dataframe from the results of the gridsearch and sort it ascendingly by the `rank_test_score`, check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [72]:
param_grid={'max_depth': list(range(1, 50)),
            'criterion': ['gini', 'entropy'],
            'class_weight': ['balanced', None]}

In [73]:
GS_dt = GridSearchCV(DecisionTreeClassifier(random_state=21), param_grid=param_grid)
GS_dt.fit(X_train, y_train)

In [74]:
dt_results_df = pd.DataFrame(GS_dt.cv_results_)
dt_results_df.sort_values(by='rank_test_score', inplace=True)
print(dt_results_df[['param_class_weight', 'param_criterion', 'param_max_depth', 'mean_test_score', 'std_test_score', 'rank_test_score']].head(10))

   param_class_weight param_criterion param_max_depth  mean_test_score  \
21           balanced            gini              22         0.873121   
20           balanced            gini              21         0.873121   
29           balanced            gini              30         0.873116   
32           balanced            gini              33         0.873116   
28           balanced            gini              29         0.873116   
27           balanced            gini              28         0.873116   
26           balanced            gini              27         0.873116   
24           balanced            gini              25         0.873116   
23           balanced            gini              24         0.873116   
22           balanced            gini              23         0.873116   

    std_test_score  rank_test_score  
21        0.023998                1  
20        0.026300                2  
29        0.023911                3  
32        0.023911               

In [75]:
print(GS_dt.best_params_)

{'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 22}


## 4. Random forest

1. Using `GridSearchCV` try different parameters of `n_estimators` (`5`, `10`, `50`, `100`), `max_depth` (from `1` to `49`), `class_weight` (`balanced`, `None`) and `criterion` (`entropy` and `gini`) and get the best combination of them in terms of accuracy. Use random_state=21.
2. Create a dataframe from the results of the gridsearch and sort it ascendengly by the `rank_test_score`, check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [None]:
param_grid={'n_estimators': [5, 10, 50, 100],
            'max_depth': list(range(1, 50)),
            'criterion': ['gini', 'entropy'],
            'class_weight': ['balanced', None]}

In [77]:
GS_rfc = GridSearchCV(RandomForestClassifier(random_state=21), param_grid=param_grid)
GS_rfc.fit(X_train, y_train)

In [78]:
rfc_result = pd.DataFrame(GS_rfc.cv_results_)
rfc_result.sort_values(by='rank_test_score', inplace=True)
print(rfc_result[['param_n_estimators', 'param_max_depth', 'param_criterion', 'param_class_weight', 'mean_test_score', 'std_test_score', 'rank_test_score']].head(10))

    param_n_estimators param_max_depth param_criterion param_class_weight  \
502                 50              28            gini               None   
515                100              31            gini               None   
118                 50              30            gini           balanced   
134                 50              34            gini           balanced   
587                100              49            gini               None   
551                100              40            gini               None   
547                100              39            gini               None   
571                100              45            gini               None   
543                100              38            gini               None   
559                100              42            gini               None   

     mean_test_score  std_test_score  rank_test_score  
502         0.904290        0.010961                1  
515         0.903547        0.014380    

In [79]:
print(GS_rfc.best_params_)

{'class_weight': None, 'criterion': 'gini', 'max_depth': 28, 'n_estimators': 50}


## 5. Progress bar

Gridsearch can be a quite long process and you may find yourself wondering when it will end.
1. Create a manual gridsearch for the same parameters values of random forest iterating through the list of the possible values and calculating `cross_val_score` for each combination. Try to increase `n_jobs`. The value `cv` for `cross_val_score` is 5.
2. Track the progress using the library `tqdm.notebook`.
3. Create a dataframe from the results of the gridsearch with the columns corresponding to the names of the parameters and `mean_accuracy` and `std_accuracy`.
4. Sort it descendingly by the `mean_accuracy`, check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [80]:
param_grid={'n_estimators': [5, 10, 50, 100],
            'max_depth': list(range(1, 50)),
            'criterion': ['gini', 'entropy'],
            'class_weight': ['balanced', None]}

param_combinations = list(itertools.product(*param_grid.values()))

In [81]:
results = []

for params in tqdm(param_combinations, desc="Grid Search Progress"):
    n_estimators, max_depth, criterion, class_weight = params
    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        criterion=criterion,
        class_weight=class_weight,
        n_jobs=-1
    )
    scores = cross_val_score(model, X, y, cv=5, n_jobs=-1)
    results.append({
        'n_estimators': n_estimators,
        'max_depth': max_depth,
        'criterion': criterion,
        'class_weight': class_weight,
        'mean_accuracy': scores.mean(),
        'std_accuracy': scores.std()
    })

Grid Search Progress:   0%|          | 0/784 [00:00<?, ?it/s]

In [82]:
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by='mean_accuracy', ascending=False)
print(results_df.head(10))

     n_estimators  max_depth criterion class_weight  mean_accuracy  \
559            50         42   entropy         None       0.555265   
526            50         34   entropy     balanced       0.554671   
531            50         35   entropy         None       0.552282   
279            10         21   entropy         None       0.549922   
684           100         25      gini     balanced       0.549325   
553            50         41      gini         None       0.546368   
451            50         15   entropy         None       0.546361   
702           100         29   entropy     balanced       0.545179   
439            50         12   entropy         None       0.545165   
631           100         11   entropy         None       0.545162   

     std_accuracy  
559      0.156207  
526      0.150646  
531      0.144817  
279      0.151343  
684      0.159791  
553      0.148372  
451      0.153645  
702      0.170276  
439      0.154590  
631      0.146501  


## 6. Predictions

1. Choose the best model and use it to make predictions for the test dataset.
2. Calculate the final accuracy.

In [83]:
model = RandomForestClassifier(class_weight='balanced', criterion='gini', max_depth=26, n_estimators=50, n_jobs=-1)
model.fit(X_train, y_train)

In [84]:
accuracy = accuracy_score(y_test, model.predict(X_test))

In [85]:
accuracy

0.9319526627218935