# Hyperparameter Tuning using `GridSearchCV()`

In [None]:
# load necessary Python packages
import numpy as np
import pandas as pd
# Show up to 50 columns when displaying wide DataFrames. The fully qualified
# option name is required: the bare pattern 'max_columns' also matches
# 'styler.render.max_columns' on pandas >= 1.4 and raises an OptionError.
pd.set_option('display.max_columns', 50)

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

from sklearn.tree import DecisionTreeClassifier

In [None]:
# Read the balanced LendingClub data, separate the 'not_fully_paid' label
# from the remaining (feature) columns, and hold out 20% of the rows for testing.
df = pd.read_csv('LendingClub_balanced.csv')

y = df['not_fully_paid']
X = df.drop('not_fully_paid', axis=1)

# random_state is fixed so that the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=365, test_size=0.2
)

# Keep independent copies of the two feature frames
# (presumably so later in-place edits do not touch `X` -- TODO confirm).
X_train, X_test = X_train.copy(), X_test.copy()

## Hyperparameters and model performance

In [5]:
# Let's use the "max_depth" parameter in DecisionTreeClassifier as an example:
# fit one tree per depth on the training data and report how each performs
# on the test data. Only max_depth changes between the two runs.
for depth in (2, 10):
    tree = DecisionTreeClassifier(max_depth=depth, random_state=1)
    y_predict = tree.fit(X_train, y_train).predict(X_test)
    accuracy = accuracy_score(y_test, y_predict)
    print(f"\nWhen max_depth={depth}, the accuracy is: {accuracy:.2%}")
    print("The confusion matrix is:")
    print(confusion_matrix(y_test, y_predict))


When max_depth=2, the accuracy is: 57.33%
The confusion matrix is:
[[247  51]
 [211 105]]

When max_depth=10, the accuracy is: 53.09%
The confusion matrix is:
[[205  93]
 [195 121]]


In [6]:
# Next try the "criterion" parameter: compare the two split criteria at a
# fixed tree depth, so that the criterion is the only thing that changes.
max_depth = 10

for criterion in ('gini', 'entropy'):
    tree = DecisionTreeClassifier(criterion=criterion, max_depth=max_depth, random_state=1)
    y_predict = tree.fit(X_train, y_train).predict(X_test)
    accuracy = accuracy_score(y_test, y_predict)
    print(f"\nWhen the split criterion is '{criterion}', the accuracy is: {accuracy:.2%}")
    print("The confusion matrix is:")
    print(confusion_matrix(y_test, y_predict))


When the split criterion is 'gini', the accuracy is: 53.09%
The confusion matrix is:
[[205  93]
 [195 121]]

When the split criterion is 'entropy', the accuracy is: 54.89%
The confusion matrix is:
[[209  89]
 [188 128]]


## Hyperparameter tuning using `GridSearchCV()`

Suppose that we plan to train a `DecisionTreeClassifier`, and we want to decide between the following hyperparameter choices:
+ for the tree splitting criterion, we want to choose between 'gini' and 'entropy'
+ for the max depth of the tree, we want to choose among 2, 4, 6, 8, and 10

In [1]:
# GridSearchCV() is the popular choice for hyper-parameter tuning
from sklearn.model_selection import GridSearchCV

In [11]:
# The learning algorithm whose hyperparameters we want to tune.
clf = DecisionTreeClassifier(random_state=1)

# Candidate values for each hyperparameter: 2 criteria x 5 depths = 10 combinations.
param_grid = {
    'criterion': ["gini", "entropy"],
    'max_depth': np.arange(2, 11, 2),  # same values as [2, 4, 6, 8, 10]
}

# Try every combination with 3-fold cross-validation on the training data.
grid = GridSearchCV(estimator=clf, param_grid=param_grid, cv=3)
grid.fit(X_train, y_train)
print(f"Best parameters are: {grid.best_params_}")
print(f"The cross-validation accuracy is: {round(grid.best_score_, 4)}")

# evaluation: score the best model on the test data
y_predict = grid.best_estimator_.predict(X_test)
# Use the built-in round() (as for best_score_ above) rather than the
# `.round()` method, which exists on NumPy scalars but not on plain floats.
test_accuracy = accuracy_score(y_test, y_predict)
print(f"The testing accuracy is: {round(test_accuracy, 4)}")
print("The confusion matrix is:")
cm = confusion_matrix(y_test, y_predict)
print(cm)

Best parameters are: {'criterion': 'entropy', 'max_depth': 4}
The cross-validation accuracy is: 0.5889
The testing accuracy is: 0.5798
The confusion matrix is:
[[228  70]
 [188 128]]


### Understanding `GridSearchCV()`: the grid search process

Simply put, `GridSearchCV()` tries every possible combination of hyperparameter values, and then selects the best one:

In [12]:
# For learning purposes only: tabulate the result of every hyperparameter
# combination that was tried. In practice, we usually care only about the
# combination that results in the best score.
cv_results = pd.DataFrame(grid.cv_results_)
cv_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.008942,0.000473,0.002917,0.000492,gini,2,"{'criterion': 'gini', 'max_depth': 2}",0.559902,0.602203,0.596083,0.586063,0.018666,3
1,0.011279,0.001507,0.003477,0.000684,gini,4,"{'criterion': 'gini', 'max_depth': 4}",0.579462,0.613219,0.572827,0.588503,0.017686,2
2,0.01409,0.0014,0.003195,0.00041,gini,6,"{'criterion': 'gini', 'max_depth': 6}",0.570905,0.577723,0.585067,0.577898,0.005783,7
3,0.013831,0.001032,0.002463,8.5e-05,gini,8,"{'criterion': 'gini', 'max_depth': 8}",0.557457,0.588739,0.585067,0.577088,0.013962,8
4,0.015453,0.00059,0.007924,0.00709,gini,10,"{'criterion': 'gini', 'max_depth': 10}",0.572127,0.575275,0.555692,0.567698,0.008587,10
5,0.008812,0.001086,0.002704,0.000204,entropy,2,"{'criterion': 'entropy', 'max_depth': 2}",0.557457,0.602203,0.596083,0.585248,0.019809,4
6,0.01113,0.0006,0.002595,0.000151,entropy,4,"{'criterion': 'entropy', 'max_depth': 4}",0.574572,0.614443,0.577723,0.588913,0.018098,1
7,0.013544,0.000266,0.002682,0.000155,entropy,6,"{'criterion': 'entropy', 'max_depth': 6}",0.556235,0.608323,0.588739,0.584432,0.021482,5
8,0.016885,0.001278,0.002353,3.7e-05,entropy,8,"{'criterion': 'entropy', 'max_depth': 8}",0.547677,0.615667,0.578947,0.580764,0.027786,6
9,0.018838,0.000359,0.003183,0.000574,entropy,10,"{'criterion': 'entropy', 'max_depth': 10}",0.555012,0.603427,0.565483,0.574641,0.020799,9


### Understanding `GridSearchCV()`: the need for cross-validation

Repeatedly using the same test dataset to compare the performance of multiple trained models raises the concern of **data leakage**: suppose we tried 100 models (including various hyperparameter choices), and selected the champion model based on test accuracy. This highest test accuracy value may come from two sources:
+ The trained champion model is indeed good
+ We found a model that may or may not be good, but *happens to work well on the test dataset*

To avoid the above data leakage problem, we usually prepare a third data sample, **the validation data**, that is used during hyperparameter tuning. After hyperparameter tuning is done, we then use the *never used before* test data to check its final performance.

Nowadays, instead of manually creating a validation dataset, we often use cross-validation to automate the process:
+ [explanation of cross validation on wikipedia](https://en.wikipedia.org/wiki/Cross-validation_(statistics))

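In short:
+ tuning hyperparameters against the test data leaks information about the test data into the model selection;
+ thus the need for separate validation data;
+ and cross-validation (CV) provides a good solution, because it builds the validation data automatically from the training data.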


## Recapping the steps when using `GridSearchCV()`

As in the code above, to use `GridSearchCV()`, we need to do the following.

*Step 1.* Decide on which learning algorithm to use, e.g.

`clf = DecisionTreeClassifier(random_state=1)`

*Step 2.* Decide on what values to try for each of the chosen hyperparameters, e.g.

```
param_grid = {
    'criterion' : ["gini", "entropy"],
    'max_depth': np.arange(2,11,2),
}    
```

*Step 3.* Then, call `GridSearchCV()` to try all these hyperparameter values for the chosen learning algorithm, e.g.

`grid = GridSearchCV(estimator = clf, param_grid = param_grid, cv=3)`

Additional comments on `GridSearchCV()`:
+ It is similar to the function `train()` in the `caret` package (or `tune()` in the `e1071` package) in R.
+ `scikit-learn` provides a few alternatives to `GridSearchCV()` for hyperparameter tuning -- [see the user guide](https://scikit-learn.org/stable/modules/grid_search.html).



## Using alternative metrics during hyperparameter tuning

In the example below, we switch the performance metric from 'accuracy' to 'ROC AUC':

In [None]:
from sklearn.metrics import roc_auc_score

In [None]:
# Specify the 'scoring' option in GridSearchCV() so that the candidates are
# compared by ROC AUC. `clf` and `param_grid` come from the earlier grid-search cell.
grid = GridSearchCV(estimator=clf, param_grid=param_grid, scoring='roc_auc', cv=3)

grid.fit(X_train, y_train)
print(f"Best parameters are: {grid.best_params_}")
print(f"The cross-validation ROC AUC is: {round(grid.best_score_, 4)}")

# Accordingly, adjust the evaluation code: ROC AUC is computed from predicted
# probabilities, while the confusion matrix still needs predicted labels.
y_predict = grid.best_estimator_.predict(X_test)
# Column 1 of predict_proba is the second class
# (presumably not_fully_paid == 1 -- verify against the data).
y_predict_proba = grid.best_estimator_.predict_proba(X_test)[:, 1]
# Use the built-in round() (as for best_score_ above) rather than the
# `.round()` method, which exists on NumPy scalars but not on plain floats.
test_roc_auc = roc_auc_score(y_test, y_predict_proba)
print(f"The testing ROC AUC is: {round(test_roc_auc, 4)}")
print("The confusion matrix is:")
cm = confusion_matrix(y_test, y_predict)
print(cm)

## Model persistence

After we find our champion trained model, we need to save it (a.k.a., persist it) so that we don't have to go through the training process again in order to use the trained model. The process of saving and reusing a trained model is called **model persistence**, and can be done using either the `pickle` package or the `joblib` package. Below we use the `joblib` package:

In [None]:
from joblib import dump, load

In [None]:
# Write the best estimator found by the grid search to 'clf_best.joblib'.
dump(value=grid.best_estimator_, filename='clf_best.joblib')

In [None]:
# Sanity check: read the saved model back from disk and use it to predict
# the first 11 rows of the test data.
clf_loaded = load('clf_best.joblib')
clf_loaded.predict(X_test.iloc[:11])

Caution: make sure you always pre-process new data exactly in the same way that you pre-processed the training data, before applying a saved model!

## Exercise: Create a kNN classifier, and tune its hyperparameter 'n_neighbors'

## KNN: lecture 5

In [13]:
from sklearn.neighbors import KNeighborsClassifier

In [18]:
# Fit one kNN classifier per candidate 'n_neighbors' value on the training
# data and report how each performs on the test data.
for k in (5, 10):
    knn = KNeighborsClassifier(n_neighbors=k)
    y_predict = knn.fit(X_train, y_train).predict(X_test)
    accuracy = accuracy_score(y_test, y_predict)
    print(f"\nWhen n_neighbors={k}, the accuracy is: {accuracy:.2%}")
    print("The confusion matrix is:")
    print(confusion_matrix(y_test, y_predict))


When n_neighbors=5, the accuracy is: 50.98%
The confusion matrix is:
[[155 143]
 [158 158]]

When n_neighbors=10, the accuracy is: 50.49%
The confusion matrix is:
[[192 106]
 [198 118]]
