# Assignment 2 - CIC-1205

## Exercise 2 - Cross Validation

Student: Balthazar Paixão


In [None]:
data_folder = "../class-repo/cic1205/data/atmoseer/"
pickle_files = [
    "A602.pickle",
    "A621.pickle",
    "A627.pickle",
    "A636.pickle",
    "A652.pickle",
]

In [45]:
import numpy as np
import pickle
import pandas as pd

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.metrics import classification_report, confusion_matrix

import warnings
warnings.filterwarnings("ignore")

In [46]:
SEED = 57


def transform_array(arr: np.ndarray) -> list:
    elems_list = [[1] if elem[0] != 0 else [0] for elem in arr]
    elems_array = np.array(elems_list)
    return elems_array


def count_values_list(lst: list) -> dict:
    if isinstance(lst, np.ndarray):
        try:
            lst = [1 if elem[0] != 0 else 0 for elem in lst]
        except IndexError:
            lst = [1 if elem != 0 else 0 for elem in lst]
    elif isinstance(lst, np.array):
        lst = lst.tolist()
    elif isinstance(lst, pd.Series):
        lst = lst.tolist()

    elems = set(lst)
    dict_counter = {elem: lst.count(elem) for elem in elems}

    return dict_counter

# A602


In [None]:
a602 = f"{data_folder}{pickle_files[0]}"
file = open(a602, "rb")
(X_train, y_train, X_val, y_val, X_test, y_test) = pickle.load(file)

X_train = np.concatenate((X_train, X_val), axis=0)
y_train = np.concatenate((y_train, y_val), axis=0)

print(f"Shapes: ", X_train.shape, X_test.shape)

In [47]:
params = {
    'loss': ['deviance', 'exponential'],
    "learning_rate": [0.1, 0.05, 0.02, 0.01],
    "n_estimators": [50, 100, 200],
    "max_depth": [2, 3, 4],
}

k_folds = 5


skf = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=SEED)
gb = GradientBoostingClassifier(random_state=SEED)

grid_search = GridSearchCV(gb, params, scoring="f1_macro", cv=skf, n_jobs=-1)
grid_search.fit(X_train, y_train.ravel())

180 fits failed out of a total of 360.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
100 fits failed with the following error:
Traceback (most recent call last):
  File "/home/balthapaixao/Desktop/baltha/projects/portfolio/MSc-codes/cic1205-machine-learning/.venv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/balthapaixao/Desktop/baltha/projects/portfolio/MSc-codes/cic1205-machine-learning/.venv/lib/python3.10/site-packages/sklearn/base.py", line 1145, in wrapper
    estimator._validate_params()
  File "/home/balthapaixao/Desktop/baltha/projects/portfolio/MSc-codes/cic1205-machine-learning/.venv/lib/python3.10/site-package

In [48]:
print(f"Best params: {grid_search.best_params_}")
print(f"Best score: {grid_search.best_score_}")
print(f"Best estimator: {grid_search.best_estimator_}")
print(f"Best index: {grid_search.best_index_}")

Best params: {'learning_rate': 0.1, 'loss': 'exponential', 'max_depth': 2, 'n_estimators': 50}
Best score: 0.7996128817180317
Best estimator: GradientBoostingClassifier(loss='exponential', max_depth=2, n_estimators=50,
                           random_state=57)
Best index: 9


In [None]:
GB = GradientBoostingClassifier(
    **grid_search.best_params_, random_state=SEED
)
GB.fit(X_train, y_train.ravel())

y_pred = GB.predict(X_test)

print(f"Confusion matrix:\n{confusion_matrix(y_test, y_pred)}")
print(f"Classification report:\n{classification_report(y_test, y_pred)}")


# A621


In [None]:
a621 = f"{data_folder}{pickle_files[1]}"
file = open(a621, "rb")
(X_train, y_train, X_val, y_val, X_test, y_test) = pickle.load(file)

X_train = np.concatenate((X_train, X_val), axis=0)
y_train = np.concatenate((y_train, y_val), axis=0)

print(f"Shapes: ", X_train.shape, X_test.shape)

In [None]:
y_train = transform_array(y_train)
print(f"y_train: {count_values_list(y_train)}")
y_test = transform_array(y_test)
print(f"y_test:  {count_values_list(y_test)}")

In [None]:
params = {
    'loss': ['deviance', 'exponential'],
    "learning_rate": [0.1, 0.05, 0.02, 0.01],
    "n_estimators": [50, 100, 200],
    "max_depth": [2, 3, 4],
}

k_folds = 5


skf = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=SEED)
gb = GradientBoostingClassifier(random_state=SEED)

grid_search = GridSearchCV(gb, params, scoring="f1_macro", cv=skf, n_jobs=-1)
grid_search.fit(X_train, y_train.ravel())

print(f"Best params: {grid_search.best_params_}")
print(f"Best score: {grid_search.best_score_}")
print(f"Best estimator: {grid_search.best_estimator_}")
print(f"Best index: {grid_search.best_index_}")

In [None]:
GB = GradientBoostingClassifier(
    **grid_search.best_params_, random_state=SEED
)
GB.fit(X_train, y_train.ravel())

y_pred = GB.predict(X_test)

print(f"Confusion matrix:\n{confusion_matrix(y_test, y_pred)}")
print(f"Classification report:\n{classification_report(y_test, y_pred)}")


# A627


In [None]:
a627 = f"{data_folder}{pickle_files[2]}"
file = open(a627, "rb")
(X_train, y_train, X_val, y_val, X_test, y_test) = pickle.load(file)

X_train = np.concatenate((X_train, X_val), axis=0)
y_train = np.concatenate((y_train, y_val), axis=0)

print(f"Shapes: ", X_train.shape, X_test.shape)

In [None]:
params = {
    'loss': ['deviance', 'exponential'],
    "learning_rate": [0.1, 0.05, 0.02, 0.01],
    "n_estimators": [50, 100, 200],
    "max_depth": [2, 3, 4],
}

k_folds = 5


skf = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=SEED)
gb = GradientBoostingClassifier(random_state=SEED)

grid_search = GridSearchCV(gb, params, scoring="f1_macro", cv=skf, n_jobs=-1)
grid_search.fit(X_train, y_train.ravel())


print(f"Best params: {grid_search.best_params_}")
print(f"Best score: {grid_search.best_score_}")
print(f"Best estimator: {grid_search.best_estimator_}")
print(f"Best index: {grid_search.best_index_}")

In [None]:
GB = GradientBoostingClassifier(
    **grid_search.best_params_, random_state=SEED
)
GB.fit(X_train, y_train.ravel())

y_pred = GB.predict(X_test)

print(f"Confusion matrix:\n{confusion_matrix(y_test, y_pred)}")
print(f"Classification report:\n{classification_report(y_test, y_pred)}")


# A636


In [None]:
a636 = f"{data_folder}{pickle_files[3]}"
file = open(a636, "rb")
(X_train, y_train, X_val, y_val, X_test, y_test) = pickle.load(file)

X_train = np.concatenate((X_train, X_val), axis=0)
y_train = np.concatenate((y_train, y_val), axis=0)

print(f"Shapes: ", X_train.shape, X_test.shape)

In [None]:
params = {
    'loss': ['deviance', 'exponential'],
    "learning_rate": [0.1, 0.05, 0.02, 0.01],
    "n_estimators": [50, 100, 200],
    "max_depth": [2, 3, 4],
}

k_folds = 5


skf = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=SEED)
gb = GradientBoostingClassifier(random_state=SEED)

grid_search = GridSearchCV(gb, params, scoring="f1_macro", cv=skf, n_jobs=-1)
grid_search.fit(X_train, y_train.ravel())


print(f"Best params: {grid_search.best_params_}")
print(f"Best score: {grid_search.best_score_}")
print(f"Best estimator: {grid_search.best_estimator_}")
print(f"Best index: {grid_search.best_index_}")

In [None]:
GB = GradientBoostingClassifier(
    **grid_search.best_params_, random_state=SEED
)
GB.fit(X_train, y_train.ravel())

y_pred = GB.predict(X_test)

print(f"Confusion matrix:\n{confusion_matrix(y_test, y_pred)}")
print(f"Classification report:\n{classification_report(y_test, y_pred)}")


# A652


In [None]:
a652 = f"{data_folder}{pickle_files[4]}"
file = open(a652, "rb")
(X_train, y_train, X_val, y_val, X_test, y_test) = pickle.load(file)

X_train = np.concatenate((X_train, X_val), axis=0)
y_train = np.concatenate((y_train, y_val), axis=0)

print(f"Shapes: ", X_train.shape, X_test.shape)

In [None]:
params = {
    'loss': ['deviance', 'exponential'],
    "learning_rate": [0.1, 0.05, 0.02, 0.01],
    "n_estimators": [50, 100, 200],
    "max_depth": [2, 3, 4],
}

k_folds = 5


skf = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=SEED)
gb = GradientBoostingClassifier(random_state=SEED)

grid_search = GridSearchCV(gb, params, scoring="f1_macro", cv=skf, n_jobs=-1)
grid_search.fit(X_train, y_train.ravel())


print(f"Best params: {grid_search.best_params_}")
print(f"Best score: {grid_search.best_score_}")
print(f"Best estimator: {grid_search.best_estimator_}")
print(f"Best index: {grid_search.best_index_}")

In [None]:
GB = GradientBoostingClassifier(
    **grid_search.best_params_, random_state=SEED
)
GB.fit(X_train, y_train.ravel())

y_pred = GB.predict(X_test)

print(f"Confusion matrix:\n{confusion_matrix(y_test, y_pred)}")
print(f"Classification report:\n{classification_report(y_test, y_pred)}")