# Assignment 1 - CIC-1205

## Exercise 5 - Hyperparameter Tuning

Student: Balthazar Paixão


In [1]:
import numpy as np
import pickle

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.metrics import classification_report, confusion_matrix

In [2]:
def transform_array(arr: np.ndarray) -> np.ndarray:
    """Binarize the first column of a 2-D array.

    Each row is mapped to ``[1]`` when its first element is non-zero and to
    ``[0]`` otherwise.

    Parameters
    ----------
    arr : np.ndarray
        Two-dimensional array (or nested sequence); only column 0 is inspected.

    Returns
    -------
    np.ndarray
        Integer array of shape ``(len(arr), 1)`` holding 0/1 flags. Call
        ``.ravel()`` on the result to obtain a flat label vector.
    """
    values = np.asarray(arr)
    if len(values) == 0:
        # No rows to binarize; keep the column shape so callers can still ravel().
        return np.empty((0, 1), dtype=int)
    # Vectorized comparison on column 0 replaces the per-row Python loop.
    return (values[:, 0] != 0).astype(int).reshape(-1, 1)

In [3]:
# Random seed handed to the gradient-boosting estimator.
SEED = 57

# Dataset location. The folder string keeps its trailing slash because it is
# concatenated directly with the file name when the path is built.
pickle_file = "A652.pickle"
data_folder = "../class-repo/cic1205/data/"

In [4]:
# Full path of the pickled dataset.
a652 = f"{data_folder}{pickle_file}"

# NOTE: pickle.load can execute arbitrary code while deserializing; only use it
# on files from a trusted source.
# The context manager guarantees the file handle is closed once the six
# train/validation/test arrays have been read.
with open(a652, "rb") as file:
    (X_train, y_train, X_val, y_val, X_test, y_test) = pickle.load(file)

print("Shapes: ", X_train.shape, X_val.shape, X_test.shape)

Shapes:  (10012, 24) (2506, 24) (9582, 24)


In [5]:
# Binarize each label set with transform_array and flatten it to 1-D, rebinding
# the same three names. The input tuple is built before any name is reassigned.
y_train, y_val, y_test = (
    transform_array(labels).ravel() for labels in (y_train, y_val, y_test)
)

In [6]:
# Search space: 4 learning rates x 3 ensemble sizes x 3 tree depths = 36 candidates.
params = dict(
    learning_rate=[0.1, 0.05, 0.02, 0.01],
    n_estimators=[50, 100, 200],
    max_depth=[2, 3, 4],
)
GB = GradientBoostingClassifier(random_state=SEED)

# Exhaustive search with 5-fold cross-validation, fitted on the training split only.
grid_search = GridSearchCV(estimator=GB, param_grid=params, cv=5, n_jobs=-2)

grid_search.fit(X_train, y_train)

In [7]:
# Show the hyperparameter combination selected by the fitted grid search.
grid_search.best_params_

{'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 200}

In [9]:
# Refit a fresh classifier with the selected hyperparameters. Using SEED instead
# of a repeated literal keeps this model on the same seed as the estimator that
# was passed to the grid search.
GB = GradientBoostingClassifier(random_state=SEED, **grid_search.best_params_)

GB.fit(X_train, y_train)
y_pred = GB.predict(X_val)

# Report per-class metrics and the confusion matrix on the validation split.
print("\nValidation")
print(f"Shapes: {X_val.shape}, {y_pred.shape}")
print(classification_report(y_val, y_pred))
print(confusion_matrix(y_val, y_pred))


Validation
Shapes: (2506, 24), (2506,)
              precision    recall  f1-score   support

           0       0.96      0.98      0.97      2277
           1       0.76      0.57      0.65       229

    accuracy                           0.94      2506
   macro avg       0.86      0.77      0.81      2506
weighted avg       0.94      0.94      0.94      2506

[[2235   42]
 [  99  130]]


In [10]:
# Predict on the test split with the fitted classifier, then emit the header,
# shapes, per-class report and confusion matrix, one item per line.
y_pred = GB.predict(X_test)
print(
    "\nTest",
    f"Shapes: {X_test.shape}, {y_pred.shape}",
    classification_report(y_test, y_pred),
    confusion_matrix(y_test, y_pred),
    sep="\n",
)


Test
Shapes: (9582, 24), (9582,)
              precision    recall  f1-score   support

           0       0.94      0.99      0.97      8769
           1       0.80      0.35      0.48       813

    accuracy                           0.94      9582
   macro avg       0.87      0.67      0.73      9582
weighted avg       0.93      0.94      0.93      9582

[[8698   71]
 [ 531  282]]
