# Assignment 2 - CIC-1205

## Exercise 1 - Cross Validation

Student: Balthazar Paixão


In [None]:
import numpy as np
import pickle
import matplotlib.pyplot as plt

from sklearn.ensemble import GradientBoostingClassifier

from sklearn.model_selection import learning_curve, StratifiedKFold
from sklearn.metrics import confusion_matrix, classification_report

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Fixed seed, passed as random_state to the estimators and the CV splitter
# so that repeated runs give the same results.
SEED = 57

# Folder that holds the pickled datasets, and the files to load from it.
# Each section below (A602, A621, ...) indexes into pickle_files by position.
data_folder = "../class-repo/cic1205/data/atmoseer/"
pickle_files = [
    "A602.pickle",
    "A621.pickle",
    "A627.pickle",
    "A636.pickle",
    "A652.pickle",
]


def transform_array(arr: np.ndarray) -> list:
    """
    Map the first column of a 2-D array onto ordinal class codes.

    Bins (value -> code):
    •0 → 0 (NONE)
    •(0, 5] → 1 (WEAK)
    •(5, 25] → 2 (MODERATE)
    •(25, 50] → 3 (STRONG)
    •(50, ∞) → 4 (EXTREME)

    Parameters
    ----------
    arr : array whose rows are indexable; only element 0 of each row is read.

    Returns
    -------
    A new list with one integer code per row (the input is not modified).

    Raises
    ------
    ValueError
        If a value fits no bin (negative or NaN). Previously such rows were
        silently left as raw lists inside the result.
    """
    labels = []
    for idx, row in enumerate(arr.tolist()):
        value = row[0]
        # Branches are ordered, so each one only needs its upper bound
        # once the earlier ones have been ruled out.
        if value == 0:
            label = 0  # 'NONE'
        elif 0 < value <= 5:
            label = 1  # 'WEAK'
        elif 5 < value <= 25:
            label = 2  # 'MODERATE'
        elif 25 < value <= 50:
            label = 3  # 'STRONG'
        elif value > 50:
            label = 4  # 'EXTREME'
        else:
            # Negative numbers and NaN fail every comparison above.
            raise ValueError(
                f"cannot classify value {value!r} at row {idx}: "
                "expected a non-negative number"
            )
        labels.append(label)
    return labels


def count_values_list(lst: list) -> dict:
    """
    Build a frequency table for the elements of ``lst``.

    Returns a dict mapping each distinct element to its number of
    occurrences; keys appear in order of first occurrence.
    """
    tally = {}
    for item in lst:
        if item in tally:
            tally[item] += 1
        else:
            # First time this value is seen.
            tally[item] = 1
    return tally

# A602


In [None]:
a602 = f"{data_folder}{pickle_files[0]}"
# The context manager closes the file handle even if unpickling fails.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open(a602, "rb") as file:
    (X_train, y_train, X_val, y_val, X_test, y_test) = pickle.load(file)

# Replace the raw targets with the ordinal class codes (0-4).
y_train = transform_array(y_train)
y_val = transform_array(y_val)
y_test = transform_array(y_test)

In [None]:
# Fold the validation split into the training data; the cross-validation
# further down builds its own folds.
# Not idempotent: re-running this cell appends the validation rows again.
X_train = np.concatenate([X_train, X_val], axis=0)
y_train = np.concatenate([y_train, y_val], axis=0)

In [None]:
GB = GradientBoostingClassifier(random_state=SEED)

# Shuffled, stratified 5-fold split with a fixed seed for reproducibility.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

train_sizes, train_scores, test_scores = learning_curve(
    GB, X_train, y_train, cv=skf, scoring="neg_root_mean_squared_error", n_jobs=-1
)

# The scorer returns *negated* RMSE (higher is better), so flip the sign to
# plot the actual RMSE that the y-axis label promises. Average over folds.
train_scores_mean = -train_scores.mean(axis=1)
test_scores_mean = -test_scores.mean(axis=1)

fig, ax = plt.subplots()
ax.plot(train_sizes, train_scores_mean, label="Training score")
ax.plot(train_sizes, test_scores_mean, label="Cross-validation score")
ax.set_xlabel("Training set size")
ax.set_ylabel("RMSE")
ax.set_title("Learning curves for Gradient Boosting")
ax.legend()
plt.show()

In [None]:
# Hyper-parameters for the final model.
# NOTE(review): n_iter_no_change turns on early stopping against an internal
# validation split - confirm 100 is intended given the default tree count.
params = dict(n_iter_no_change=100, tol=0.001, random_state=SEED)

# Fit on the current training data, then predict the held-out test set.
GB = GradientBoostingClassifier(**params).fit(X_train, y_train)
y_pred = GB.predict(X_test)

print("Confusion matrix: \n", confusion_matrix(y_test, y_pred))
print("Classification report: \n", classification_report(y_test, y_pred))

# A621


In [None]:
a621 = f"{data_folder}{pickle_files[1]}"
# The context manager closes the file handle even if unpickling fails.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open(a621, "rb") as file:
    (X_train, y_train, X_val, y_val, X_test, y_test) = pickle.load(file)

# Replace the raw targets with the ordinal class codes (0-4).
y_train = transform_array(y_train)
y_val = transform_array(y_val)
y_test = transform_array(y_test)

In [None]:
# Fold the validation split into the training data; the cross-validation
# further down builds its own folds.
# Not idempotent: re-running this cell appends the validation rows again.
X_train = np.concatenate([X_train, X_val], axis=0)
y_train = np.concatenate([y_train, y_val], axis=0)

In [None]:
GB = GradientBoostingClassifier(random_state=SEED)

# Shuffled, stratified 5-fold split with a fixed seed for reproducibility.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

train_sizes, train_scores, test_scores = learning_curve(
    GB, X_train, y_train, cv=skf, scoring="neg_root_mean_squared_error", n_jobs=-1
)

# The scorer returns *negated* RMSE (higher is better), so flip the sign to
# plot the actual RMSE that the y-axis label promises. Average over folds.
train_scores_mean = -train_scores.mean(axis=1)
test_scores_mean = -test_scores.mean(axis=1)

fig, ax = plt.subplots()
ax.plot(train_sizes, train_scores_mean, label="Training score")
ax.plot(train_sizes, test_scores_mean, label="Cross-validation score")
ax.set_xlabel("Training set size")
ax.set_ylabel("RMSE")
ax.set_title("Learning curves for Gradient Boosting")
ax.legend()
plt.show()

In [None]:
# Hyper-parameters for the final model.
# NOTE(review): n_iter_no_change turns on early stopping against an internal
# validation split - confirm 100 is intended given the default tree count.
params = dict(n_iter_no_change=100, tol=0.001, random_state=SEED)

# Fit on the current training data, then predict the held-out test set.
GB = GradientBoostingClassifier(**params).fit(X_train, y_train)
y_pred = GB.predict(X_test)

print("Confusion matrix: \n", confusion_matrix(y_test, y_pred))
print("Classification report: \n", classification_report(y_test, y_pred))

# A627


In [None]:
a627 = f"{data_folder}{pickle_files[2]}"
# The context manager closes the file handle even if unpickling fails.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open(a627, "rb") as file:
    (X_train, y_train, X_val, y_val, X_test, y_test) = pickle.load(file)

# Replace the raw targets with the ordinal class codes (0-4).
y_train = transform_array(y_train)
y_val = transform_array(y_val)
y_test = transform_array(y_test)

In [None]:
# Fold the validation split into the training data; the cross-validation
# further down builds its own folds.
# Not idempotent: re-running this cell appends the validation rows again.
X_train = np.concatenate([X_train, X_val], axis=0)
y_train = np.concatenate([y_train, y_val], axis=0)

In [None]:
GB = GradientBoostingClassifier(random_state=SEED)

# Shuffled, stratified 5-fold split with a fixed seed for reproducibility.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

train_sizes, train_scores, test_scores = learning_curve(
    GB, X_train, y_train, cv=skf, scoring="neg_root_mean_squared_error", n_jobs=-1
)

# The scorer returns *negated* RMSE (higher is better), so flip the sign to
# plot the actual RMSE that the y-axis label promises. Average over folds.
train_scores_mean = -train_scores.mean(axis=1)
test_scores_mean = -test_scores.mean(axis=1)

fig, ax = plt.subplots()
ax.plot(train_sizes, train_scores_mean, label="Training score")
ax.plot(train_sizes, test_scores_mean, label="Cross-validation score")
ax.set_xlabel("Training set size")
ax.set_ylabel("RMSE")
ax.set_title("Learning curves for Gradient Boosting")
ax.legend()
plt.show()

In [None]:
# Hyper-parameters for the final model.
# NOTE(review): n_iter_no_change turns on early stopping against an internal
# validation split - confirm 100 is intended given the default tree count.
params = dict(n_iter_no_change=100, tol=0.001, random_state=SEED)

# Fit on the current training data, then predict the held-out test set.
GB = GradientBoostingClassifier(**params).fit(X_train, y_train)
y_pred = GB.predict(X_test)

print("Confusion matrix: \n", confusion_matrix(y_test, y_pred))
print("Classification report: \n", classification_report(y_test, y_pred))

# A636

In [None]:
a636 = f"{data_folder}{pickle_files[3]}"
# The context manager closes the file handle even if unpickling fails.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open(a636, "rb") as file:
    (X_train, y_train, X_val, y_val, X_test, y_test) = pickle.load(file)

# Replace the raw targets with the ordinal class codes (0-4).
y_train = transform_array(y_train)
y_val = transform_array(y_val)
y_test = transform_array(y_test)

In [None]:
# Fold the validation split into the training data; the cross-validation
# further down builds its own folds.
# Not idempotent: re-running this cell appends the validation rows again.
X_train = np.concatenate([X_train, X_val], axis=0)
y_train = np.concatenate([y_train, y_val], axis=0)

In [None]:
GB = GradientBoostingClassifier(random_state=SEED)

# Shuffled, stratified 5-fold split with a fixed seed for reproducibility.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

train_sizes, train_scores, test_scores = learning_curve(
    GB, X_train, y_train, cv=skf, scoring="neg_root_mean_squared_error", n_jobs=-1
)

# The scorer returns *negated* RMSE (higher is better), so flip the sign to
# plot the actual RMSE that the y-axis label promises. Average over folds.
train_scores_mean = -train_scores.mean(axis=1)
test_scores_mean = -test_scores.mean(axis=1)

fig, ax = plt.subplots()
ax.plot(train_sizes, train_scores_mean, label="Training score")
ax.plot(train_sizes, test_scores_mean, label="Cross-validation score")
ax.set_xlabel("Training set size")
ax.set_ylabel("RMSE")
ax.set_title("Learning curves for Gradient Boosting")
ax.legend()
plt.show()

In [None]:
# Hyper-parameters for the final model.
# NOTE(review): n_iter_no_change turns on early stopping against an internal
# validation split - confirm 100 is intended given the default tree count.
params = dict(n_iter_no_change=100, tol=0.001, random_state=SEED)

# Fit on the current training data, then predict the held-out test set.
GB = GradientBoostingClassifier(**params).fit(X_train, y_train)
y_pred = GB.predict(X_test)

print("Confusion matrix: \n", confusion_matrix(y_test, y_pred))
print("Classification report: \n", classification_report(y_test, y_pred))

# A652


In [None]:
a652 = f"{data_folder}{pickle_files[4]}"
# The context manager closes the file handle even if unpickling fails.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open(a652, "rb") as file:
    (X_train, y_train, X_val, y_val, X_test, y_test) = pickle.load(file)

# Replace the raw targets with the ordinal class codes (0-4).
y_train = transform_array(y_train)
y_val = transform_array(y_val)
y_test = transform_array(y_test)

In [None]:
# Fold the validation split into the training data; the cross-validation
# further down builds its own folds.
# Not idempotent: re-running this cell appends the validation rows again.
X_train = np.concatenate([X_train, X_val], axis=0)
y_train = np.concatenate([y_train, y_val], axis=0)

In [None]:
GB = GradientBoostingClassifier(random_state=SEED)

# Shuffled, stratified 5-fold split with a fixed seed for reproducibility.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

train_sizes, train_scores, test_scores = learning_curve(
    GB, X_train, y_train, cv=skf, scoring="neg_root_mean_squared_error", n_jobs=-1
)

# The scorer returns *negated* RMSE (higher is better), so flip the sign to
# plot the actual RMSE that the y-axis label promises. Average over folds.
train_scores_mean = -train_scores.mean(axis=1)
test_scores_mean = -test_scores.mean(axis=1)

fig, ax = plt.subplots()
ax.plot(train_sizes, train_scores_mean, label="Training score")
ax.plot(train_sizes, test_scores_mean, label="Cross-validation score")
ax.set_xlabel("Training set size")
ax.set_ylabel("RMSE")
ax.set_title("Learning curves for Gradient Boosting")
ax.legend()
plt.show()

In [None]:
# Hyper-parameters for the final model.
# NOTE(review): n_iter_no_change turns on early stopping against an internal
# validation split - confirm 100 is intended given the default tree count.
params = dict(n_iter_no_change=100, tol=0.001, random_state=SEED)

# Fit on the current training data, then predict the held-out test set.
GB = GradientBoostingClassifier(**params).fit(X_train, y_train)
y_pred = GB.predict(X_test)

print("Confusion matrix: \n", confusion_matrix(y_test, y_pred))
print("Classification report: \n", classification_report(y_test, y_pred))