# Assignment 2 - CIC-1205

## Exercise 2 - Cross Validation

Student: Balthazar Paixão


In [1]:
# Directory that holds the pickled datasets, relative to this notebook.
data_folder = "../class-repo/cic1205/data/atmoseer/"

# One pickle per dataset ID, in the order the sections below index them.
pickle_files = [
    f"{dataset_id}.pickle"
    for dataset_id in ("A602", "A621", "A627", "A636", "A652")
]

In [2]:
import numpy as np
import pickle
import pandas as pd

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.metrics import classification_report, confusion_matrix

import warnings
warnings.filterwarnings("ignore")

In [3]:
# Seed passed as `random_state` to the CV splitter and the gradient-boosting models.
SEED = 57


def transform_array(arr: np.ndarray) -> np.ndarray:
    """Binarize a target array into a column vector of 0/1 labels.

    Each row is mapped to ``1`` when its first entry is non-zero and to ``0``
    otherwise. A 1-D input is treated as one value per row.

    Args:
        arr: Array-like with one row per sample (1-D, or N-D where the first
            entry along every trailing axis is the value to test).

    Returns:
        Integer ``np.ndarray`` of shape ``(n_samples, 1)``. An empty input
        yields shape ``(0, 1)`` so the result can still be concatenated with
        other label columns along axis 0.

    Raises:
        TypeError: If ``arr`` is a 0-d (scalar) array, which has no rows.
    """
    arr = np.asarray(arr)
    if arr.ndim == 0:
        raise TypeError("transform_array expects an array with at least one axis")

    # Equivalent to arr[:, 0, 0, ...]: keep axis 0, take the first entry of every other axis.
    first_entries = arr[(slice(None),) + (0,) * (arr.ndim - 1)]
    return (first_entries != 0).astype(int).reshape(-1, 1)


def count_values_list(lst: "np.ndarray | pd.Series | list") -> dict:
    """Count how many times each distinct value occurs in ``lst``.

    NumPy arrays are binarized before counting (non-zero -> 1, zero -> 0),
    testing the first entry of each row when rows are indexable and the
    element itself otherwise. A ``pd.Series`` or any other iterable is counted
    as-is, without binarization.

    Args:
        lst: Values to count; their elements must be hashable once converted.

    Returns:
        Dict mapping each distinct value to its number of occurrences. Keys
        are in ascending order when they are mutually comparable.
    """
    # Local import keeps this helper self-contained.
    from collections import Counter

    if isinstance(lst, np.ndarray):
        try:
            values = [1 if elem[0] != 0 else 0 for elem in lst]
        except IndexError:
            # 1-D array: elements are scalars and cannot be indexed.
            values = [1 if elem != 0 else 0 for elem in lst]
    elif isinstance(lst, pd.Series):
        values = lst.tolist()
    else:
        # np.array is a factory function, not a type, so it must not be used
        # in isinstance(); plain iterables are simply materialized.
        values = list(lst)

    counts = Counter(values)
    try:
        # Sorted keys give a stable, readable display order.
        return dict(sorted(counts.items()))
    except TypeError:
        # Keys of mixed, non-comparable types: keep first-seen order.
        return dict(counts)

# A602


In [4]:
# Load the pre-split arrays for dataset A602.
a602 = f"{data_folder}{pickle_files[0]}"
# NOTE: pickle.load can execute arbitrary code; only open trusted files.
# The context manager guarantees the file handle is closed after loading.
with open(a602, "rb") as file:
    (X_train, y_train, X_val, y_val, X_test, y_test) = pickle.load(file)

# Binarize every target split: 1 where the first target entry is non-zero, else 0.
y_train = transform_array(y_train)
y_val = transform_array(y_val)
y_test = transform_array(y_test)

# Stack the validation split under the training split (row-wise).
X_train = np.concatenate((X_train, X_val), axis=0)
y_train = np.concatenate((y_train, y_val), axis=0)

print("Shapes: ", X_train.shape, X_test.shape)

Shapes:  (12743, 24) (765, 24)


In [5]:
# Hyper-parameter grid: 2 losses x 4 learning rates x 3 ensemble sizes x 3 depths.
# NOTE(review): newer scikit-learn releases renamed the "deviance" loss to
# "log_loss" (deprecated in 1.1, removed in 1.3) - confirm the installed
# version still accepts it.
params = {
    "loss": ["deviance", "exponential"],
    "learning_rate": [0.1, 0.05, 0.02, 0.01],
    "n_estimators": [50, 100, 200],
    "max_depth": [2, 3, 4],
}

k_folds = 5

# Seeded base estimator and seeded, shuffled stratified folds.
gb = GradientBoostingClassifier(random_state=SEED)
skf = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=SEED)

# Exhaustive search scored by macro-averaged F1, using all available cores.
grid_search = GridSearchCV(
    estimator=gb,
    param_grid=params,
    scoring="f1_macro",
    cv=skf,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train.ravel())

In [6]:
# Summarize the winning configuration found by the grid search.
print("Best params:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)
print("Best estimator:", grid_search.best_estimator_)
print("Best index:", grid_search.best_index_)

Best params: {'learning_rate': 0.05, 'loss': 'exponential', 'max_depth': 2, 'n_estimators': 200}
Best score: 0.7822973288959998
Best estimator: GradientBoostingClassifier(learning_rate=0.05, loss='exponential', max_depth=2,
                           n_estimators=200, random_state=57)
Best index: 29


In [7]:
# Train a fresh model with the selected hyper-parameters on the full training set.
GB = GradientBoostingClassifier(random_state=SEED, **grid_search.best_params_)
GB.fit(X_train, y_train.ravel())

# Score the held-out test split.
y_pred = GB.predict(X_test)

print("Confusion matrix:", confusion_matrix(y_test, y_pred), sep="\n")
print("Classification report:", classification_report(y_test, y_pred), sep="\n")


Confusion matrix:
[[638  35]
 [ 44  48]]
Classification report:
              precision    recall  f1-score   support

           0       0.94      0.95      0.94       673
           1       0.58      0.52      0.55        92

    accuracy                           0.90       765
   macro avg       0.76      0.73      0.75       765
weighted avg       0.89      0.90      0.89       765



# A621


In [8]:
# Load the pre-split arrays for dataset A621.
a621 = f"{data_folder}{pickle_files[1]}"
# NOTE: pickle.load can execute arbitrary code; only open trusted files.
# The context manager guarantees the file handle is closed after loading.
with open(a621, "rb") as file:
    (X_train, y_train, X_val, y_val, X_test, y_test) = pickle.load(file)

# Stack the validation split under the training split (row-wise).
X_train = np.concatenate((X_train, X_val), axis=0)
y_train = np.concatenate((y_train, y_val), axis=0)

print("Shapes: ", X_train.shape, X_test.shape)

Shapes:  (11977, 24) (10080, 24)


In [9]:
# Binarize the targets first, then report the class balance of each split.
y_train = transform_array(y_train)
y_test = transform_array(y_test)

print(f"y_train: {count_values_list(y_train)}")
print(f"y_test:  {count_values_list(y_test)}")

y_train: {0: 10987, 1: 990}
y_test:  {0: 9147, 1: 933}


In [10]:
# Hyper-parameter grid: 2 losses x 4 learning rates x 3 ensemble sizes x 3 depths.
# NOTE(review): newer scikit-learn releases renamed the "deviance" loss to
# "log_loss" (deprecated in 1.1, removed in 1.3) - confirm the installed
# version still accepts it.
params = {
    "loss": ["deviance", "exponential"],
    "learning_rate": [0.1, 0.05, 0.02, 0.01],
    "n_estimators": [50, 100, 200],
    "max_depth": [2, 3, 4],
}

k_folds = 5

# Seeded base estimator and seeded, shuffled stratified folds.
gb = GradientBoostingClassifier(random_state=SEED)
skf = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=SEED)

# Exhaustive search scored by macro-averaged F1, using all available cores.
grid_search = GridSearchCV(
    estimator=gb,
    param_grid=params,
    scoring="f1_macro",
    cv=skf,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train.ravel())

# Summarize the winning configuration.
print("Best params:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)
print("Best estimator:", grid_search.best_estimator_)
print("Best index:", grid_search.best_index_)

Best params: {'learning_rate': 0.1, 'loss': 'exponential', 'max_depth': 2, 'n_estimators': 100}
Best score: 0.7662423958036096
Best estimator: GradientBoostingClassifier(loss='exponential', max_depth=2, random_state=57)
Best index: 10


In [11]:
# Train a fresh model with the selected hyper-parameters on the full training set.
GB = GradientBoostingClassifier(random_state=SEED, **grid_search.best_params_)
GB.fit(X_train, y_train.ravel())

# Score the held-out test split.
y_pred = GB.predict(X_test)

print("Confusion matrix:", confusion_matrix(y_test, y_pred), sep="\n")
print("Classification report:", classification_report(y_test, y_pred), sep="\n")


Confusion matrix:
[[8931  216]
 [ 460  473]]
Classification report:
              precision    recall  f1-score   support

           0       0.95      0.98      0.96      9147
           1       0.69      0.51      0.58       933

    accuracy                           0.93     10080
   macro avg       0.82      0.74      0.77     10080
weighted avg       0.93      0.93      0.93     10080



# A627


In [14]:
# Load the pre-split arrays for dataset A627.
a627 = f"{data_folder}{pickle_files[2]}"
# NOTE: pickle.load can execute arbitrary code; only open trusted files.
# The context manager guarantees the file handle is closed after loading.
with open(a627, "rb") as file:
    (X_train, y_train, X_val, y_val, X_test, y_test) = pickle.load(file)

# Binarize every target split: 1 where the first target entry is non-zero, else 0.
y_train = transform_array(y_train)
y_val = transform_array(y_val)
y_test = transform_array(y_test)

# Stack the validation split under the training split (row-wise).
X_train = np.concatenate((X_train, X_val), axis=0)
y_train = np.concatenate((y_train, y_val), axis=0)

print("Shapes: ", X_train.shape, X_test.shape)

Shapes:  (11917, 24) (10202, 24)


In [15]:
# Hyper-parameter grid: 2 losses x 4 learning rates x 3 ensemble sizes x 3 depths.
# NOTE(review): newer scikit-learn releases renamed the "deviance" loss to
# "log_loss" (deprecated in 1.1, removed in 1.3) - confirm the installed
# version still accepts it.
params = {
    "loss": ["deviance", "exponential"],
    "learning_rate": [0.1, 0.05, 0.02, 0.01],
    "n_estimators": [50, 100, 200],
    "max_depth": [2, 3, 4],
}

k_folds = 5

# Seeded base estimator and seeded, shuffled stratified folds.
gb = GradientBoostingClassifier(random_state=SEED)
skf = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=SEED)

# Exhaustive search scored by macro-averaged F1, using all available cores.
grid_search = GridSearchCV(
    estimator=gb,
    param_grid=params,
    scoring="f1_macro",
    cv=skf,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train.ravel())

# Summarize the winning configuration.
print("Best params:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)
print("Best estimator:", grid_search.best_estimator_)
print("Best index:", grid_search.best_index_)

Best params: {'learning_rate': 0.1, 'loss': 'exponential', 'max_depth': 3, 'n_estimators': 100}
Best score: 0.7711154000906714
Best estimator: GradientBoostingClassifier(loss='exponential', random_state=57)
Best index: 13


In [16]:
# Train a fresh model with the selected hyper-parameters on the full training set.
GB = GradientBoostingClassifier(random_state=SEED, **grid_search.best_params_)
GB.fit(X_train, y_train.ravel())

# Score the held-out test split.
y_pred = GB.predict(X_test)

print("Confusion matrix:", confusion_matrix(y_test, y_pred), sep="\n")
print("Classification report:", classification_report(y_test, y_pred), sep="\n")


Confusion matrix:
[[8991  209]
 [ 453  549]]
Classification report:
              precision    recall  f1-score   support

           0       0.95      0.98      0.96      9200
           1       0.72      0.55      0.62      1002

    accuracy                           0.94     10202
   macro avg       0.84      0.76      0.79     10202
weighted avg       0.93      0.94      0.93     10202



# A636


In [17]:
# Load the pre-split arrays for dataset A636.
a636 = f"{data_folder}{pickle_files[3]}"
# NOTE: pickle.load can execute arbitrary code; only open trusted files.
# The context manager guarantees the file handle is closed after loading.
with open(a636, "rb") as file:
    (X_train, y_train, X_val, y_val, X_test, y_test) = pickle.load(file)

# Binarize every target split: 1 where the first target entry is non-zero, else 0.
y_train = transform_array(y_train)
y_val = transform_array(y_val)
y_test = transform_array(y_test)

# Stack the validation split under the training split (row-wise).
X_train = np.concatenate((X_train, X_val), axis=0)
y_train = np.concatenate((y_train, y_val), axis=0)

print("Shapes: ", X_train.shape, X_test.shape)

Shapes:  (12633, 24) (10129, 24)


In [18]:
# Hyper-parameter grid: 2 losses x 4 learning rates x 3 ensemble sizes x 3 depths.
# NOTE(review): newer scikit-learn releases renamed the "deviance" loss to
# "log_loss" (deprecated in 1.1, removed in 1.3) - confirm the installed
# version still accepts it.
params = {
    "loss": ["deviance", "exponential"],
    "learning_rate": [0.1, 0.05, 0.02, 0.01],
    "n_estimators": [50, 100, 200],
    "max_depth": [2, 3, 4],
}

k_folds = 5

# Seeded base estimator and seeded, shuffled stratified folds.
gb = GradientBoostingClassifier(random_state=SEED)
skf = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=SEED)

# Exhaustive search scored by macro-averaged F1, using all available cores.
grid_search = GridSearchCV(
    estimator=gb,
    param_grid=params,
    scoring="f1_macro",
    cv=skf,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train.ravel())

# Summarize the winning configuration.
print("Best params:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)
print("Best estimator:", grid_search.best_estimator_)
print("Best index:", grid_search.best_index_)

Best params: {'learning_rate': 0.1, 'loss': 'exponential', 'max_depth': 2, 'n_estimators': 50}
Best score: 0.806039467331494
Best estimator: GradientBoostingClassifier(loss='exponential', max_depth=2, n_estimators=50,
                           random_state=57)
Best index: 9


In [19]:
# Train a fresh model with the selected hyper-parameters on the full training set.
GB = GradientBoostingClassifier(random_state=SEED, **grid_search.best_params_)
GB.fit(X_train, y_train.ravel())

# Score the held-out test split.
y_pred = GB.predict(X_test)

print("Confusion matrix:", confusion_matrix(y_test, y_pred), sep="\n")
print("Classification report:", classification_report(y_test, y_pred), sep="\n")


Confusion matrix:
[[8805  212]
 [ 549  563]]
Classification report:
              precision    recall  f1-score   support

           0       0.94      0.98      0.96      9017
           1       0.73      0.51      0.60      1112

    accuracy                           0.92     10129
   macro avg       0.83      0.74      0.78     10129
weighted avg       0.92      0.92      0.92     10129



# A652


In [20]:
# Load the pre-split arrays for dataset A652.
a652 = f"{data_folder}{pickle_files[4]}"
# NOTE: pickle.load can execute arbitrary code; only open trusted files.
# The context manager guarantees the file handle is closed after loading.
with open(a652, "rb") as file:
    (X_train, y_train, X_val, y_val, X_test, y_test) = pickle.load(file)

# Binarize every target split: 1 where the first target entry is non-zero, else 0.
y_train = transform_array(y_train)
y_val = transform_array(y_val)
y_test = transform_array(y_test)

# Stack the validation split under the training split (row-wise).
X_train = np.concatenate((X_train, X_val), axis=0)
y_train = np.concatenate((y_train, y_val), axis=0)

print("Shapes: ", X_train.shape, X_test.shape)

Shapes:  (12518, 24) (9582, 24)


In [21]:
# Hyper-parameter grid: 2 losses x 4 learning rates x 3 ensemble sizes x 3 depths.
# NOTE(review): newer scikit-learn releases renamed the "deviance" loss to
# "log_loss" (deprecated in 1.1, removed in 1.3) - confirm the installed
# version still accepts it.
params = {
    "loss": ["deviance", "exponential"],
    "learning_rate": [0.1, 0.05, 0.02, 0.01],
    "n_estimators": [50, 100, 200],
    "max_depth": [2, 3, 4],
}

k_folds = 5

# Seeded base estimator and seeded, shuffled stratified folds.
gb = GradientBoostingClassifier(random_state=SEED)
skf = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=SEED)

# Exhaustive search scored by macro-averaged F1, using all available cores.
grid_search = GridSearchCV(
    estimator=gb,
    param_grid=params,
    scoring="f1_macro",
    cv=skf,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train.ravel())

# Summarize the winning configuration.
print("Best params:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)
print("Best estimator:", grid_search.best_estimator_)
print("Best index:", grid_search.best_index_)

Best params: {'learning_rate': 0.1, 'loss': 'exponential', 'max_depth': 2, 'n_estimators': 50}
Best score: 0.7996128817180317
Best estimator: GradientBoostingClassifier(loss='exponential', max_depth=2, n_estimators=50,
                           random_state=57)
Best index: 9


In [22]:
# Train a fresh model with the selected hyper-parameters on the full training set.
GB = GradientBoostingClassifier(random_state=SEED, **grid_search.best_params_)
GB.fit(X_train, y_train.ravel())

# Score the held-out test split.
y_pred = GB.predict(X_test)

print("Confusion matrix:", confusion_matrix(y_test, y_pred), sep="\n")
print("Classification report:", classification_report(y_test, y_pred), sep="\n")

Confusion matrix:
[[8701   68]
 [ 534  279]]
Classification report:
              precision    recall  f1-score   support

           0       0.94      0.99      0.97      8769
           1       0.80      0.34      0.48       813

    accuracy                           0.94      9582
   macro avg       0.87      0.67      0.72      9582
weighted avg       0.93      0.94      0.93      9582

