In [12]:
import numpy as np
from sklearn.svm import SVC
import matplotlib.pyplot as plt 
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.decomposition import PCA
import pickle
from sklearn.feature_selection import r_regression, f_regression, VarianceThreshold
from sklearn.linear_model import PoissonRegressor
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, log_loss
from typing import Union

In [13]:
'''
This file contains all the functions required in pre-processing the datasets 
'''
def fetch_datasets() -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    '''
    Loads the four .npy dataset files stored in the datasets folder

    Returns:
    tuple
        The loaded arrays, in the order (x_train, x_test, y_train, y_test)
    '''
    # Paths are relative to the working directory the notebook is run from
    dataset_paths = (
        "./datasets/x_train.npy",
        "./datasets/x_test.npy",
        "./datasets/y_train.npy",
        "./datasets/y_test.npy",
    )
    x_train, x_test, y_train, y_test = map(np.load, dataset_paths)

    return x_train, x_test, y_train, y_test

def find_missing_values(input_array: np.ndarray) -> int:
    '''
    Counts the missing (NaN) values in a dataset

    Parameters:
    input_array: np.ndarray
        The dataset values (must have a dtype that np.isnan accepts)

    Returns:
    int
        The total number of NaN entries across the whole array
    '''
    # Convert the numpy scalar to a plain int so the result matches the annotation
    return int(np.isnan(input_array).sum())

def find_non_unique_features(x_set: np.ndarray, array_name: str) -> list:
    '''
    Finds the constant features, i.e. the columns that hold only one value throughout

    Parameters:
    x_set: np.ndarray
        The 2D dataset that contains features (one column per feature)
    array_name: str
        The name of the array, used in the printed report

    Returns:
    list
        A list of (feature number, unique value count) tuples, one per constant feature.
        Feature numbers are 1-based, so feature 1 is column 0.
    '''
    # Only features with exactly 1 unique value are reported. In the test set there was one feature
    # with 2 unique values, but it is kept because it could still be useful if it has a strong
    # correlation with the target variable.
    num_features = x_set.shape[1]
    unique_value_counts = [len(np.unique(x_set[:, i])) for i in range(num_features)]
    filtered_features = [(i + 1, count) for i, count in enumerate(unique_value_counts) if count == 1]

    if not filtered_features:
        print(f"No features with 1 unique value in {array_name}")
    else:
        # The header is printed once, ahead of the per-feature lines
        print(f"Features with 1 unique value in {array_name}:")
        for feature, count in filtered_features:
            print(f"Feature {feature}: {count} unique value")

    return filtered_features

# Assuming numerical features are not categorical
def find_categorical_features(x_set: np.ndarray, array_name: str) -> list:
    '''
    Lists the columns of a dataset that look categorical

    A column counts as categorical when the value in its first row passes neither
    the is_int nor the is_float check. Only the first row is inspected.

    Parameters:
    x_set: np.ndarray
        The dataset that contains features
    array_name: str
        The name of the array, used in the printed report

    Returns:
    list
        The 0-based column indices of the categorical features
    '''
    categorical_features = [
        column
        for column in range(x_set.shape[1])
        if not (is_int(x_set[0][column]) or is_float(x_set[0][column]))
    ]

    if categorical_features:
        print(f"There are {len(categorical_features)} categorical features")
    else:
        print(f"No categorical features in {array_name}")

    # Open question from the author: if more than one column exists, encode it?
    return categorical_features

In [3]:
'''
This file contains all the functions required in transforming the datasets after pre-processing
'''
def normalise_min_max(train_set: np.ndarray, test_set: np.ndarray) -> tuple:
    '''
    Normalises the datasets with a MinMaxScaler fitted on the training data

    The scaler maps the training data onto the range (-1, 1). The testing data is
    transformed with the same fitted scaler, so test values that fall outside the
    training minimum/maximum can land outside that range.

    Parameters:
    train_set: np.ndarray
        The training dataset
    test_set: np.ndarray
        The testing dataset

    Returns:
    tuple
        The normalised datasets, in the order (train, test)
    '''
    min_max_scaler = MinMaxScaler(feature_range=(-1, 1))
    x_train_norm = min_max_scaler.fit_transform(train_set)
    # Only transform here: refitting on the test set would scale it with its own
    # min/max, making it inconsistent with the training data
    x_test_norm = min_max_scaler.transform(test_set)
    return x_train_norm, x_test_norm

def normalise_min_max_task_3(train_set: np.ndarray, test_set: np.ndarray) -> tuple:
    '''
    Rescales the datasets with a MinMaxScaler whose target range is (10, 15)

    The scaler is fitted on the training data only and then applied to both datasets.

    Parameters:
    train_set: np.ndarray
        The training dataset
    test_set: np.ndarray
        The testing dataset

    Returns:
    tuple
        The normalised datasets, in the order (train, test)
    '''
    scaler = MinMaxScaler(feature_range=(10,15))
    scaled_train = scaler.fit_transform(train_set)
    return scaled_train, scaler.transform(test_set)

def standardize_std_scaler(train_set: np.ndarray, test_set: np.ndarray) -> tuple:
    '''
    Standardizes the datasets with a StandardScaler

    The scaler is fitted on the training data only and then applied to both datasets.

    Parameters:
    train_set: np.ndarray
        The training dataset
    test_set: np.ndarray
        The testing dataset

    Returns:
    tuple
        The standardized datasets, in the order (train, test)
    '''
    scaler = StandardScaler()
    scaled_train = scaler.fit_transform(train_set)
    return scaled_train, scaler.transform(test_set)

def normalise_robust_scaler(train_set: np.ndarray, test_set: np.ndarray) -> tuple:
    '''
    Scales the datasets with a RobustScaler (median and interquartile range based)

    The scaler is fitted on the training data only and then applied to both datasets.

    Parameters:
    train_set: np.ndarray
        The training dataset
    test_set: np.ndarray
        The testing dataset

    Returns:
    tuple
        The scaled datasets, in the order (train, test)
    '''
    scaler = RobustScaler()
    scaled_train = scaler.fit_transform(train_set)
    return scaled_train, scaler.transform(test_set)

def find_lowest_occurring_class(dataset_y: np.ndarray) -> float:
    '''
    Returns the class label that appears the fewest times

    Parameters:
    dataset_y: np.ndarray
        The classes dataset

    Returns:
    float
        The least frequent class; on a tie, the smallest of the tied labels
    '''
    # np.unique returns the labels sorted, with counts aligned to them
    labels, counts = np.unique(dataset_y, return_counts=True)
    return labels[counts.argmin()]

def find_highest_occurring_class(dataset_y: np.ndarray) -> float:
    '''
    Returns the class label that appears the most times

    Parameters:
    dataset_y: np.ndarray
        The classes dataset

    Returns:
    float
        The most frequent class; on a tie, the smallest of the tied labels
    '''
    # np.unique returns the labels sorted, with counts aligned to them
    labels, counts = np.unique(dataset_y, return_counts=True)
    return labels[counts.argmax()]

def dataset_undersampling(dataset_x: np.ndarray, dataset_y: np.ndarray) -> tuple:
    '''
    Used in imbalanced datasets to reduce every other class to the size of the minority class

    All rows of the minority class are kept. For each other class, as many rows as the
    minority class has are drawn at random without replacement (using the global
    np.random state, so seed it beforehand for reproducible output).

    Parameters:
    dataset_x: np.ndarray
        The features dataset
    dataset_y: np.ndarray
        The classes dataset

    Returns:
    tuple
        The undersampled (x, y) datasets; the minority class rows come first, followed
        by the sampled rows of the remaining classes in sorted label order
    '''
    # One pass over the labels gives both the class list and the minority class,
    # instead of recomputing the minority class on every loop iteration
    unique_classes, class_counts = np.unique(dataset_y, return_counts=True)
    minority_class = unique_classes[np.argmin(class_counts)]

    minority_indices = np.where(dataset_y == minority_class)[0]
    num_occurences = len(minority_indices)

    sampled_indices_other_classes = [
        np.random.choice(np.where(dataset_y == class_label)[0], size=num_occurences, replace=False)
        for class_label in unique_classes
        if class_label != minority_class
    ]

    balanced_indices = np.concatenate((minority_indices, *sampled_indices_other_classes))
    x_balanced = dataset_x[balanced_indices]
    y_balanced = dataset_y[balanced_indices]

    return x_balanced, y_balanced

def dataset_oversampling(dataset_x: np.ndarray, dataset_y: np.ndarray) -> tuple:
    '''
    Used in imbalanced datasets to grow every other class to the size of the majority class

    All rows of the majority class are kept. For each other class, as many rows as the
    majority class has are drawn at random with replacement (using the global np.random
    state, so seed it beforehand for reproducible output). Because the draw is with
    replacement, some original rows of a smaller class may not appear in the result.

    Parameters:
    dataset_x: np.ndarray
        The features dataset
    dataset_y: np.ndarray
        The classes dataset

    Returns:
    tuple
        The oversampled (x, y) datasets; the majority class rows come first, followed
        by the sampled rows of the remaining classes in sorted label order
    '''
    # One pass over the labels gives both the class list and the majority class,
    # instead of recomputing the majority class on every loop iteration
    unique_classes, class_counts = np.unique(dataset_y, return_counts=True)
    majority_class = unique_classes[np.argmax(class_counts)]

    majority_indices = np.where(dataset_y == majority_class)[0]
    num_occurences = len(majority_indices)

    sampled_indices_other_classes = [
        np.random.choice(np.where(dataset_y == class_label)[0], size=num_occurences)
        for class_label in unique_classes
        if class_label != majority_class
    ]

    balanced_indices = np.concatenate((majority_indices, *sampled_indices_other_classes))
    x_balanced = dataset_x[balanced_indices]
    y_balanced = dataset_y[balanced_indices]

    return x_balanced, y_balanced

# ------ Task 4 ------

# Reducing to 2 components projects the data onto its 2 principal components (the directions of greatest variance), not onto 2 of the original features
def reduce_pca_dimensionality(train_set_x: np.ndarray, test_set_x: np.ndarray, components: int) -> tuple:
    '''
    Reduces the dimensionality of the feature datasets with PCA

    The PCA is fitted on the training features only; the testing features are then
    projected with the same fitted components.

    Parameters:
    train_set_x: np.ndarray
        The features training dataset
    test_set_x: np.ndarray
        The features testing dataset
    components: int
        The number of principal components to keep (passed to PCA as n_components)

    Returns:
    tuple
        The PCA-reduced datasets, in the order (train, test)
    '''
    reducer = PCA(n_components=components)
    reduced_train = reducer.fit_transform(train_set_x)
    return reduced_train, reducer.transform(test_set_x)

In [4]:
'''
This file contains all the functions required in the feature selection phase
'''
def calculate_variance_threshold(train_set_x: np.ndarray, test_set_x: np.ndarray, top_10_features: bool) -> tuple:
    '''
    Finds and removes the low variance features from datasets

    The threshold is the mean of the per-feature variances of the training dataset.
    A VarianceThreshold selector is fitted with it on the training data only.

    Parameters:
    train_set_x: np.ndarray
        The features training dataset
    test_set_x: np.ndarray
        The features testing dataset
    top_10_features: bool
        A boolean indicating whether only the top 10 features should be fetched or not

    Returns:
    tuple
        When top_10_features is True: (top_10_features_train, top_10_features_test), two
        arrays of column indices into the original datasets. Each holds up to 10 of the
        kept features, ordered by decreasing variance measured on the training data and
        on the testing data respectively (so the two arrays can differ).
        When top_10_features is False: (x_train_selected, x_test_selected), the datasets
        reduced to the columns the selector kept.
    '''
    # The threshold is the average of the variance of each feature
    # (the author notes a value of 0.05, presumably for their dataset)
    threshold = np.mean(np.var(train_set_x, axis=0))
    variance_calculator = VarianceThreshold(threshold)
    variance_calculator.fit(train_set_x)

    if not top_10_features:
        x_train_selected = variance_calculator.transform(train_set_x)
        x_test_selected = variance_calculator.transform(test_set_x)
        return x_train_selected, x_test_selected

    kept_features_idx = variance_calculator.get_support(indices=True)

    # Rank the kept columns by training variance, highest first
    kept_features_variance = np.var(train_set_x[:, kept_features_idx], axis=0)
    sorted_indices = np.argsort(kept_features_variance)[::-1]
    top_10_features_train = kept_features_idx[sorted_indices][:10]

    # Rank the same kept columns again, this time by their variance in the testing data
    kept_features_variance_test = np.var(test_set_x[:, kept_features_idx], axis=0)
    sorted_indices_test = np.argsort(kept_features_variance_test)[::-1]
    top_10_features_test = kept_features_idx[sorted_indices_test][:10]

    return top_10_features_train, top_10_features_test


def pearson_correlation(train_set_x: np.ndarray, train_set_y: np.ndarray) -> np.ndarray:
    '''
    Ranks the features by the strength of their pearson correlation with the target

    Parameters:
    train_set_x: np.ndarray
        The features training dataset
    train_set_y: np.ndarray
        The classes training dataset

    Returns:
    imp_features: np.ndarray
        The feature indices sorted by absolute correlation in ascending order,
        so the most strongly correlated features are at the end of the array
    '''
    correlation_strength = np.abs(r_regression(train_set_x, train_set_y))
    return np.argsort(correlation_strength)

def f_regression_scores(train_set_x: np.ndarray, train_set_y: np.ndarray) -> np.ndarray:
    '''
    Ranks the features by their f regression score against the target

    Parameters:
    train_set_x: np.ndarray
        The features training dataset
    train_set_y: np.ndarray
        The classes training dataset

    Returns:
    imp_features: np.ndarray
        The feature indices sorted by absolute F-score in ascending order,
        so the highest scoring features are at the end of the array
    '''
    # f_regression also returns the p-values, which are not used here
    score_strength = np.abs(f_regression(train_set_x, train_set_y)[0])
    return np.argsort(score_strength)

def poisson_method(train_set_x: np.ndarray, train_set_y: np.ndarray) -> np.ndarray:
    '''
    Performs feature selection using Poisson method

    A PoissonRegressor is fitted on the training data and the features are ranked by
    the absolute value of their fitted coefficient.

    Parameters:
    train_set_x: np.ndarray
        The features training dataset
    train_set_y: np.ndarray
        The classes training dataset

    Returns:
    top_10_features_idx: np.ndarray
        The column indices of the (up to) 10 features with the largest absolute
        coefficients, in ascending order of importance (most important last)
    '''
    poisson_model = PoissonRegressor()
    poisson_model.fit(train_set_x, train_set_y)
    feature_importance = np.abs(poisson_model.coef_)
    # argsort is ascending, so the last 10 entries are the largest coefficients
    top_10_features_idx = np.argsort(feature_importance)[-10:]

    return top_10_features_idx

In [5]:
'''
This file contains all the functions required in training each model
'''
def svc_classifier(train_set_x: np.ndarray, test_set_x: np.ndarray, train_set_y: np.ndarray, test_set_y: np.ndarray) -> SVC:
    ''' 
    Trains a Support Vector Classifier and prints its validation results

    The classifier is fitted during holdout validation; cross validation and a
    10-fold stratified validation are then reported on the training data.

    Parameters:
    train_set_x: np.ndarray
        The features training dataset
    test_set_x: np.ndarray
        The features testing dataset
    train_set_y: np.ndarray
        The classes training dataset
    test_set_y: np.ndarray
        The classes testing dataset

    Returns:
    svc: SVC
        The trained model
    '''
    # Configuration noted from hyperparameter tuning: kernel='rbf', C=1, gamma="scale", probability=True
    svc = SVC(probability=True)

    holdout_validation(
        classifier=svc,
        train_set_x=train_set_x,
        test_set_x=test_set_x,
        train_set_y=train_set_y,
        test_set_y=test_set_y,
    )
    cross_validation(classifier=svc, train_set_x=train_set_x, train_set_y=train_set_y)
    # The stratified validation is given the model name, not the fitted instance
    k_fold_cross_validation_strat(classifier="svc", train_set_x=train_set_x, train_set_y=train_set_y, size=10)

    return svc

def mlp_classifier(train_set_x: np.ndarray, test_set_x: np.ndarray, train_set_y: np.ndarray, test_set_y: np.ndarray) -> MLPClassifier:
    ''' 
    Trains the model under a Multi Layer Perceptron Classifier

    The classifier is fitted during holdout validation; cross validation and a
    10-fold stratified validation are then reported on the training data.

    Parameters:
    train_set_x: np.ndarray
        The features training dataset
    test_set_x: np.ndarray
        The features testing dataset
    train_set_y: np.ndarray
        The classes training dataset
    test_set_y: np.ndarray
        The classes testing dataset

    Returns:
    mlp: MLPClassifier
        The trained model
    '''
    # Configuration noted from hyperparameter tuning (only random_state and shuffle are applied below):
    # hidden_layer_sizes=(50, 100, 50), activation='tanh', solver='sgd', alpha=0.0001, random_state=42, shuffle=False
    mlp = MLPClassifier(random_state=42, shuffle=False)
    # No separate mlp.fit() call here: holdout_validation fits the classifier itself,
    # so fitting beforehand only trained the same network twice
    holdout_validation(mlp, train_set_x, test_set_x, train_set_y, test_set_y)
    cross_validation(mlp, train_set_x, train_set_y)
    # The stratified validation is given the model name, not the fitted instance
    k_fold_cross_validation_strat("mlp", train_set_x, train_set_y, 10)

    return mlp

In [6]:
'''
This file contains all the functions used for validating the models
'''
def holdout_validation(classifier: Union[MLPClassifier, SVC], train_set_x: np.ndarray, test_set_x: np.ndarray, train_set_y: np.ndarray, test_set_y: np.ndarray) -> Union[MLPClassifier, SVC]:
    '''
    Performs holdout validation on the classifier used for training

    The classifier is fitted on the training data (in place), then its accuracy on
    the training and testing datasets is printed.

    Parameters:
    classifier: Union[SVC, MLPClassifier]
        The model to fit and validate - Only accepts SVC and MLP for now
    train_set_x: np.ndarray
        The features training dataset
    test_set_x: np.ndarray
        The features testing dataset
    train_set_y: np.ndarray
        The classes training dataset
    test_set_y: np.ndarray
        The classes testing dataset
    
    Returns:
    classifier: Union[SVC, MLPClassifier]
        The same classifier instance, now fitted
    '''
    classifier.fit(train_set_x, train_set_y)
    print("------ Holdout Validation ------")
    print(f"Training Accuracy: {classifier.score(train_set_x, train_set_y)}")
    print(f"Testing Accuracy: {classifier.score(test_set_x, test_set_y)}")
    return classifier

def cross_validation(classifier: Union[MLPClassifier, SVC], train_set_x: np.ndarray, train_set_y: np.ndarray) -> Union[MLPClassifier, SVC]:
    '''
    Reports the mean cross validation score of a classifier on the training data

    The scoring is delegated to cross_val_score with its default settings.

    Parameters:
    classifier: Union[SVC, MLPClassifier]
        The model to validate - Only accepts SVC and MLP for now
    train_set_x: np.ndarray
        The features training dataset
    train_set_y: np.ndarray
        The classes training dataset
    
    Returns:
    classifier: Union[SVC, MLPClassifier]
        The classifier that was passed in
    '''
    fold_scores = cross_val_score(classifier, train_set_x, train_set_y)
    print("------ Cross Validation ------")
    print(f"Mean Accuracy: {fold_scores.mean()}")
    return classifier

# def k_fold_valdiation(classifier: Union[MLPClassifier, SVC], train_set_x: np.ndarray, train_set_y: np.ndarray, size: int) -> Union[MLPClassifier, SVC]:
#     '''
#     Performs K-fold validation on the classifier used for training

#     Parameters:
#     classifier: Union[SVC, MLPClassifier]
#         The model that was trained during development - Only accepts SVC and MLP for now
#     train_set_x: np.ndarray
#         The features training dataset
#     train_set_y: np.ndarray
#         The classes training dataset
#     size: int
#         The number of folds
    
#     Returns:
#     classifier: Union[SVC, MLPClassifier]
#         The classifer that was validated
#     '''
#     kf = KFold(n_splits=size, shuffle=True)
#     tracked_scores = np.zeros(size)
#     index = 0
#     for train_idx, test_idx in kf.split(train_set_x):
#         x_train_kfold, x_test_kfold = train_set_x[train_idx], train_set_x[test_idx]
#         y_train_kfold, y_test_kfold = train_set_y[train_idx], train_set_y[test_idx]

#         if classifier == "svc":
#             # kernel='rbf', C=1, gamma="scale"
#             svc_clf = SVC()
#             svc_clf.fit(x_train_kfold, y_train_kfold)
#             tracked_scores[index] = svc_clf.score(x_test_kfold, y_test_kfold)
#             index += 1
#         else:
#             # hidden_layer_sizes=(100,), activation='tanh', solver='adam', alpha=0.05, random_state=42)
#             mlp = MLPClassifier()
#             mlp.fit(x_train_kfold, y_train_kfold)
#             tracked_scores[index] = mlp.score(x_test_kfold, y_test_kfold)
#             index += 1

#     print("------ K fold Validation ------")
#     print(f"Mean Accuracy: {tracked_scores.mean()}")
#     print(f"Std Deviation: {tracked_scores.std()}")
#     return classifier

def k_fold_cross_validation_strat(classifier: Union[MLPClassifier, SVC, str], train_set_x: np.ndarray, train_set_y: np.ndarray, size: int) -> Union[MLPClassifier, SVC, str]:
    '''
    Performs stratified K-fold validation on the classifier used for training

    A fresh model is trained and scored on every fold; the mean and standard deviation
    of the fold accuracies are printed. The folds are shuffled without a fixed seed,
    so the printed numbers vary between runs.

    Parameters:
    classifier: Union[SVC, MLPClassifier, str]
        Either a model name or a model instance:
        - "svc" evaluates a default SVC
        - any other string evaluates a default MLPClassifier
        - an SVC or MLPClassifier instance is evaluated with its own configuration
          (an unfitted copy is made for each fold; the instance itself is not modified)
    train_set_x: np.ndarray
        The features training dataset
    train_set_y: np.ndarray
        The classes training dataset
    size: int
        The number of folds
    
    Returns:
    classifier: Union[SVC, MLPClassifier, str]
        The classifier argument, returned unchanged
    '''
    # Local import keeps this function self-contained within the notebook cell
    from sklearn.base import clone

    kf_strat = StratifiedKFold(n_splits=size, shuffle=True)
    tracked_scores = np.zeros(size)

    for fold, (train_idx, test_idx) in enumerate(kf_strat.split(train_set_x, train_set_y)):
        x_train_kfold, x_test_kfold = train_set_x[train_idx], train_set_x[test_idx]
        y_train_kfold, y_test_kfold = train_set_y[train_idx], train_set_y[test_idx]

        if isinstance(classifier, (SVC, MLPClassifier)):
            # Evaluate the estimator that was actually passed in, rather than
            # silently falling through to a default MLPClassifier
            fold_model = clone(classifier)
        elif classifier == "svc":
            # kernel='rbf', C=1, gamma="scale"
            fold_model = SVC()
        else:
            # hidden_layer_sizes=(100,), activation='tanh', solver='adam', alpha=0.05, random_state=42
            fold_model = MLPClassifier()

        fold_model.fit(x_train_kfold, y_train_kfold)
        tracked_scores[fold] = fold_model.score(x_test_kfold, y_test_kfold)

    print("------ Stratified K fold Validation ------")
    print(f"Mean Accuracy: {tracked_scores.mean()}")
    print(f"Std Deviation: {tracked_scores.std()}")
    return classifier

In [7]:
'''
This file contains all the functions used for evaluating the models
'''
def fetch_classification_report(classifier: Union[SVC, MLPClassifier], test_set_x: np.ndarray, test_set_y: np.ndarray) -> None:
    '''
    Prints the classification report (precision, recall, f1-score, and support) of a classifier

    Parameters:
    classifier: Union[SVC, MLPClassifier]
        The fitted classifier to evaluate - Only accepts SVC and MLP for now
    test_set_x: np.ndarray
        The features testing dataset
    test_set_y: np.ndarray
        The classes testing dataset
    '''
    predictions = classifier.predict(test_set_x)
    print("----- Classification Report -----")
    print(classification_report(test_set_y, predictions))

def fetch_multiple_classification_report(classifiers: Union[SVC, MLPClassifier], classifier_titles: list[str], test_set_x: np.ndarray, test_set_y: np.ndarray):
    '''
    Prints one titled classification report per classifier, for side-by-side comparison

    Parameters:
    classifiers: Union[SVC, MLPClassifier]
        The fitted classifiers to evaluate, passed as an indexable collection
        (the code iterates over it) - Only accepts SVC and MLP for now
    classifier_titles: list[str]
        The list of all titles for each report, aligned by position with classifiers
    test_set_x: np.ndarray
        The features testing dataset
    test_set_y: np.ndarray
        The classes testing dataset
    '''
    for position, model in enumerate(classifiers):
        predictions = model.predict(test_set_x)
        print(f"----- {classifier_titles[position]} -----\n{classification_report(test_set_y, predictions)}")

def fetch_accuracy_score(test_set_y: np.ndarray, predicted_set_y: np.ndarray):
    '''
    Prints the accuracy of a model, computed from the actual and predicted classes

    Parameters:
    test_set_y: np.ndarray
        The classes testing dataset
    predicted_set_y: np.ndarray
        The predictions made by the model
    '''
    print(f"Accuracy: {accuracy_score(test_set_y, predicted_set_y)}")

def fetch_log_loss(test_set_y: np.ndarray, predicted_set_y: np.ndarray):
    '''
    Prints the log loss of a model, computed from the actual classes and the model's predictions

    Parameters:
    test_set_y: np.ndarray
        The classes testing dataset
    predicted_set_y: np.ndarray
        The predictions made by the model (passed to log_loss as its second argument)
    '''
    print(f"Log Loss: {log_loss(test_set_y, predicted_set_y)}")

In [8]:
'''
This file contains all the functions required to plot the visualisations
'''
import matplotlib.pyplot as plt 
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, precision_recall_curve, auc, roc_curve, det_curve
from sklearn.cluster import KMeans
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
import numpy as np
from typing import Union

def plot_feature_split_of_values(input_arrays: list[np.ndarray], labels: list[str]):
    ''' 
    Plots a histogram showing the split of values in each feature dataset 

    One subplot is drawn per dataset, side by side. The x axis of every subplot is
    fixed to the range 0 to 3.

    Parameters:
    input_arrays: list[np.ndarray]
        The datasets containing the plottable inputs (one or more)
    labels: list[str]
        The titles for the datasets, aligned by position with input_arrays
    '''
    # squeeze=False always returns a 2D array of axes, so a single dataset works too
    # (by default plt.subplots returns a bare Axes for a 1x1 grid, which cannot be indexed)
    fig, axs = plt.subplots(1, len(input_arrays), figsize=(15, 4), squeeze=False)
    fig.suptitle('Split of Values', fontsize=16)

    for idx, (ax, input_array) in enumerate(zip(axs[0], input_arrays)):
        flattened_array = input_array.flatten()
        ax.hist(flattened_array, bins=50, color='blue', alpha=0.7)
        ax.set_title(f'{labels[idx]}')
        ax.set_xlabel('Values')
        ax.set_ylabel('Frequency')
        ax.set_xlim(0, 3)
        ax.set_xticks(np.arange(0, 3.1, 0.25))

    plt.tight_layout()
    plt.show()

def plot_class_split_of_values(input_arrays: list[np.ndarray], labels: list[str]):
    ''' 
    Plots a histogram showing the split of values of each class

    One subplot is drawn per dataset, side by side.

    Parameters:
    input_arrays: list[np.ndarray]
        The datasets containing the plottable inputs (one or more)
    labels: list[str]
        The titles for the datasets, aligned by position with input_arrays
    '''
    # squeeze=False always returns a 2D array of axes, so a single dataset works too
    # (by default plt.subplots returns a bare Axes for a 1x1 grid, which cannot be indexed)
    fig, axs = plt.subplots(1, len(input_arrays), figsize=(15, 4), squeeze=False)
    fig.suptitle('Split of Values', fontsize=16)

    for idx, (ax, input_array) in enumerate(zip(axs[0], input_arrays)):
        flattened_array = input_array.flatten()
        ax.hist(flattened_array, bins=50, color='blue', alpha=0.7)
        ax.set_title(f'{labels[idx]}')
        ax.set_xlabel('Values')
        ax.set_ylabel('Frequency')

    plt.tight_layout()
    plt.show()
    
def plot_single_correlation_heatmap(corr_matrix: np.ndarray, title: str):
    ''' 
    Plots a heatmap of a correlation matrix in its own figure, with each cell annotated

    Parameters:
    corr_matrix: np.ndarray
        The 2D correlation matrix to plot (e.g. the output of np.corrcoef)
    title: str
        The title for the figure
    '''
    n_rows, n_cols = corr_matrix.shape

    plt.figure(figsize=(8, 6))
    plt.imshow(corr_matrix, cmap='coolwarm', interpolation='nearest')
    plt.colorbar(label='Correlation')
    # Tick labels are 1-based; the x axis follows the columns and the y axis the rows
    plt.xticks(np.arange(n_cols), np.arange(1, n_cols + 1), rotation=45)
    plt.yticks(np.arange(n_rows), np.arange(1, n_rows + 1))
    plt.xlabel('Features')
    plt.ylabel('Features')
    plt.title(title)
    # Write the value of every cell (2 decimal places) at its centre
    for idx in range(n_rows):
        for idj in range(n_cols):
            plt.text(idj, idx, '{:.2f}'.format(corr_matrix[idx, idj]), ha='center', va='center', color='black')
    plt.show()

def plot_correlation_heatmap(ax, corr_matrix, title):
    ''' 
    Draws an annotated heatmap of a correlation matrix onto an existing axis

    Parameters:
    ax:
        The matplotlib axis to draw on (used when plotting subfigures)
    corr_matrix: np.ndarray
        The 2D correlation matrix to plot
    title: str
        The title for the subplot
    '''
    n_rows, n_cols = corr_matrix.shape

    im = ax.imshow(corr_matrix, cmap='coolwarm', interpolation='nearest')
    cbar = ax.figure.colorbar(im, ax=ax)
    cbar.set_label('Correlation')
    # Tick labels are 1-based; the x axis follows the columns and the y axis the rows
    ax.set_xticks(np.arange(n_cols))
    ax.set_yticks(np.arange(n_rows))
    ax.set_xticklabels(np.arange(1, n_cols + 1), rotation=45)
    ax.set_yticklabels(np.arange(1, n_rows + 1))
    ax.set_title(title)
    # Write the value of every cell (2 decimal places) at its centre
    for idx in range(n_rows):
        for idj in range(n_cols):
            ax.text(idj, idx, '{:.2f}'.format(corr_matrix[idx, idj]), ha='center', va='center', color='black')


def plot_confusion_matrix(classifier: Union[SVC, MLPClassifier], test_set_x: np.ndarray, test_set_y: np.ndarray, table_needed: bool):
    ''' 
    Plots a confusion matrix to show the key metrics (i.e. the True Positive, True Negative, False Positive, and False Negative) for each class

    Parameters:
    classifier: Union[SVC, MLPClassifier]
        The classifier that was trained during development - Only accepts SVC and MLP for now
    test_set_x: np.ndarray
        The features testing dataset
    test_set_y: np.ndarray
        The classes testing dataset
    table_needed: bool
        This determines whether a table should be created as well for each of the metrics mentioned above
    '''
    y_pred = classifier.predict(test_set_x)
    ConfusionMatrixDisplay.from_predictions(test_set_y, y_pred)
    plt.title("Confusion Matrix")
    plt.show()

    if not table_needed:
        return

    conf_mtrx = confusion_matrix(test_set_y, y_pred)
    num_classes = len(conf_mtrx)

    # One-vs-rest counts per class: the diagonal cell is TP, the rest of the column is FP,
    # the rest of the row is FN, and everything else is TN
    confusion_matrix_values = []
    for idx in range(num_classes):
        TP = conf_mtrx[idx, idx]
        FP = np.sum(conf_mtrx[:, idx]) - TP
        FN = np.sum(conf_mtrx[idx, :]) - TP
        TN = np.sum(conf_mtrx) - (TP + FP + FN)
        confusion_matrix_values.append((TP, FP, FN, TN))

    # Row labels follow the row position in the confusion matrix, so the table works for
    # any number of classes (for 10 classes this gives 'Class 0' to 'Class 9')
    row_labels = [f'Class {idx}' for idx in range(num_classes)]

    fig, ax = plt.subplots()
    table = ax.table(cellText=confusion_matrix_values,
                    colLabels=['True Positive', 'False Positive', 'False Negative', 'True Negative'],
                    rowLabels=row_labels,
                    loc='center')

    table.scale(1.2, 1)
    ax.axis('off')
    plt.show()

def plot_precision_recall_curve(test_set_y: np.ndarray, pred_set_y: np.ndarray, ax: plt.axes):
    ''' 
    Plots one precision-recall curve per class (one-vs-rest) on the given axis

    Class idx is scored with column idx of pred_set_y, so the classes are expected to
    be labelled 0, 1, 2, ... in step with the columns.

    Parameters:
    test_set_y: np.ndarray
        The classes testing dataset
    pred_set_y: np.ndarray
        The per-class scores predicted on the testing data (one column per class)
    ax: plt.axes
        Axis when plotting subfigures
    '''
    num_classes = len(np.unique(test_set_y))
    for class_idx in range(num_classes):
        # Binary target: 1 where the sample belongs to this class, 0 otherwise
        is_class = (test_set_y == class_idx).astype(int)
        precision, recall, _ = precision_recall_curve(is_class, pred_set_y[:, class_idx])
        ax.plot(recall, precision, lw=2, label='Class {}'.format(class_idx))

    ax.set_title('Precision-Recall Curve')
    ax.set_xlabel('Recall')
    ax.set_ylabel('Precision')
    ax.legend(loc='best')

def plot_roc_curve(test_set_y, pred_set_y, ax):
    ''' 
    Plots an ROC curve to see the rate of true positives to false positives

    One curve is drawn per class (one-vs-rest) and its area under the curve is shown
    in the legend. Class idx is scored with column idx of pred_set_y, so the classes
    are expected to be labelled 0, 1, 2, ... in step with the columns.

    Parameters:
    test_set_y: np.ndarray
        The classes testing dataset
    pred_set_y: np.ndarray
        The per-class scores predicted on the testing data (one column per class)
    ax: plt.axes
        Axis when plotting subfigures
    '''
    for idx in range(len(np.unique(test_set_y))):
        fpr, tpr, _ = roc_curve((test_set_y == idx).astype(int), pred_set_y[:, idx])
        roc_auc = auc(fpr, tpr)
        # The AUC is included in the label; previously it was computed but never shown
        ax.plot(fpr, tpr, lw=2, label='Class {} (AUC = {:.2f})'.format(idx, roc_auc))

    # Diagonal reference line: the ROC curve of a random classifier
    ax.plot([0, 1], [0, 1], color='gray', lw=1, linestyle='--')
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('ROC Curve')
    ax.legend(loc='lower right')

def plot_det_curve(test_set_y, pred_set_y, ax):
    ''' 
    Plots one DET curve (false negative rate against false positive rate) per class on the given axis

    Class idx is scored with column idx of pred_set_y, so the classes are expected to
    be labelled 0, 1, 2, ... in step with the columns.

    Parameters:
    test_set_y: np.ndarray
        The classes testing dataset
    pred_set_y: np.ndarray
        The per-class scores predicted on the testing data (one column per class)
    ax: plt.axes
        Axis when plotting subfigures
    '''
    num_classes = len(np.unique(test_set_y))
    for class_idx in range(num_classes):
        # Binary target: 1 where the sample belongs to this class, 0 otherwise
        is_class = (test_set_y == class_idx).astype(int)
        false_pos_rate, false_neg_rate, _ = det_curve(is_class, pred_set_y[:, class_idx])
        ax.plot(false_pos_rate, false_neg_rate, lw=2, label='Class {}'.format(class_idx))

    ax.set_title('DET Curve')
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('False Negative Rate')
    ax.legend(loc='best')

def plot_predicted_vs_actual(test_set_y: np.ndarray, pred_set_y: np.ndarray, title: str, ax: plt.axes, no_of_plots: int):
    ''' 
    Plots the predicted and the actual target values against the sample index.
    Only the first no_of_plots samples are drawn so that the individual markers stay visible.

    Parameters:
    test_set_y: np.ndarray
        The classes testing dataset
    pred_set_y: np.ndarray
        The predictions made on the testing data
    title: str
        Title of the plot
    ax: plt.axes
        Axis when plotting subfigures
    no_of_plots: int
        The number of leading samples for which prediction and actual value are shown
    '''
    # With a single data argument, plot() uses the sample index as x and the value as y
    ax.plot(test_set_y[:no_of_plots], 'o', label='Actual')
    ax.plot(pred_set_y[:no_of_plots], 'x', label='Prediction')
    # Labels follow that orientation: index on the x-axis, target value on the y-axis
    ax.set_xlabel('Index of sample')
    ax.set_ylabel('Actual/Predicted value of the target')
    ax.set_title(title)
    ax.legend()

# ------ TASK 4 ------

# def plot_clustering_results(train_set_x, test_set_x, clusters):

#     kmeans = KMeans(n_clusters=clusters)
#     kmeans.fit(train_set_x)
#     cluster_labels = kmeans.predict(test_set_x)

#     plt.figure(figsize=(8, 6))
#     plt.scatter(train_set_x[:, 0], test_set_x[:, 1], c=cluster_labels, cmap='viridis', s=50, alpha=0.5)
#     plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], marker='x', s=200, c='red', label='Cluster Centers')
#     plt.title('PCA Reduced Testing Dataset with K-means Clustering')
#     plt.xlabel('Principal Component 1')
#     plt.ylabel('Principal Component 2')
#     plt.colorbar(label='Cluster')
#     plt.legend()
#     plt.show()

In [15]:
'''
This file contains all the functions required for miscellaneous use-cases
'''
import joblib
import os
import numpy as np

def is_float(string):
    '''
    Check if a value can be converted to float

    Parameters:
    string:
        The value to test, normally a str

    Returns:
    bool
        True when float(string) succeeds, False otherwise
    '''
    try:
        float(string)
        return True
    # TypeError: unsupported input such as None; OverflowError: int too large for a float
    except (TypeError, ValueError, OverflowError):
        return False

def is_int(string):
    '''
    Check if a value can be converted to int

    Parameters:
    string:
        The value to test, normally a str

    Returns:
    bool
        True when int(string) succeeds, False otherwise
    '''
    try:
        int(string)
        return True
    # TypeError: unsupported input such as None; OverflowError: infinite float
    except (TypeError, ValueError, OverflowError):
        return False
    
def save_dataset(model, file_name):
    '''
    Saves a dataset using pickle

    Parameters:
    model:
        The object to be pickled (the dataset, despite the parameter name)
    file_name: str
        The path of the file to write; it is opened in binary write mode
    '''
    with open(file_name, mode='wb') as out_file:
        pickle.dump(model, out_file)
    print("{} has the new dataset!".format(file_name))
    
def load_dataset(file_name):
    '''
    Loads a dataset using pickle

    Parameters:
    file_name: str
        The path of the pickle file to read

    Returns:
        The unpickled object
    '''
    # NOTE: unpickling can execute arbitrary code, so only load files from a trusted source
    with open(file_name, mode='rb') as in_file:
        return pickle.load(in_file)

In [16]:
'''
This file contains all the functions required for model operations post training
'''
def save_model(model: Union[SVC, MLPClassifier], file_name: str):
    '''
    Pickles a trained model to disk and prints a confirmation message

    Parameters:
    model: Union[SVC, MLPClassifier]
        The model that was trained during development - annotated for SVC and MLP only
    file_name: str
        The path of the file the model is written to
    '''
    with open(file_name, mode='wb') as model_file:
        pickle.dump(model, model_file)
    print("{} has the new model!".format(file_name))

def load_model(file_name: str):
    '''
    Reads a pickled model back from disk

    Parameters:
    file_name: str
        The path where the model was saved
    
    Returns:
        The loaded model
    '''
    # NOTE: unpickling can execute arbitrary code, so only load files from a trusted source
    with open(file_name, mode='rb') as model_file:
        return pickle.load(model_file)

def find_best_configuration_svc(parameters: dict, train_set_x: np.ndarray, test_set_x: np.ndarray, train_set_y: np.ndarray, test_set_y: np.ndarray) -> dict:
    '''
    Performs manual hyperparameter optimisation to find the best configuration for the SVC classifier.
    Every combination of kernel, C and gamma is fitted on the training data. The configuration with
    the highest testing accuracy is kept; ties are broken by the higher training accuracy.

    Parameters:
    parameters: dict
        All the parameters to be experimented with, under the keys
        "kernels_values", "c_values" and "gamma_values"
    train_set_x: np.ndarray
        The features training dataset
    test_set_x: np.ndarray
        The features testing dataset
    train_set_y: np.ndarray
        The classes training dataset
    test_set_y: np.ndarray
        The classes testing dataset
    
    Returns:
    dict
        The best parameter configuration with its training accuracy, testing accuracy and
        mean 10-fold cross-validation accuracy ("cv_accuracy")
    '''
    # Defaults are returned unchanged when the parameter grid is empty
    selected_c = 0
    selected_kernel = ''
    selected_gamma = ''
    best_training = 0
    best_test = 0
    best_cv = 0
    found_any = False

    for kernel in parameters["kernels_values"]:
        for c_value in parameters["c_values"]:
            for gamma in parameters["gamma_values"]:
                svc_clf = SVC(C=c_value, gamma=gamma, kernel=kernel)
                svc_clf.fit(train_set_x, train_set_y)
                current_train = svc_clf.score(train_set_x, train_set_y)
                current_test = svc_clf.score(test_set_x, test_set_y)
                current_cv = cross_val_score(svc_clf, train_set_x, train_set_y, cv=10).mean()

                # Rank by testing accuracy first and training accuracy second. Requiring both to
                # improve at once would lock in any early configuration with a perfect training
                # score, no matter how badly it generalises.
                if not found_any or (current_test, current_train) > (best_test, best_training):
                    found_any = True
                    best_training = current_train
                    best_test = current_test
                    best_cv = current_cv
                    selected_c = c_value
                    selected_kernel = kernel
                    selected_gamma = gamma

    return {
        "parameters": {"c": selected_c, "kernel": selected_kernel, "gamma": selected_gamma},
        "training_accuracy": best_training,
        "testing_accuracy": best_test,
        "cv_accuracy": best_cv
    }

def find_best_configuration_mlp(parameters: dict, train_set_x: np.ndarray, test_set_x: np.ndarray, train_set_y: np.ndarray, test_set_y: np.ndarray) -> dict:
    '''
    Performs manual hyperparameter optimisation to find the best configuration for the MLP classifier.
    Every combination of hidden layers, activation, solver and alpha is fitted on the training data.
    The configuration with the highest testing accuracy is kept; ties are broken by the higher
    training accuracy.

    Parameters:
    parameters: dict
        All the parameters to be experimented with, under the keys
        "hidden_layers", "activations", "solvers" and "alphas"
    train_set_x: np.ndarray
        The features training dataset
    test_set_x: np.ndarray
        The features testing dataset
    train_set_y: np.ndarray
        The classes training dataset
    test_set_y: np.ndarray
        The classes testing dataset
    
    Returns:
    dict
        The best parameter configuration with its training accuracy, testing accuracy and
        mean 10-fold cross-validation accuracy ("cv_accuracy")
    '''
    # Defaults are returned unchanged when the parameter grid is empty
    selected_hidden_layers = 0
    selected_activation = ''
    selected_solver = ''
    selected_alpha = ''
    best_training = 0
    best_test = 0
    best_cv = 0
    found_any = False
    # No random_state is set, so the shuffled folds differ between runs
    outer_cv = KFold(n_splits=10, shuffle=True)

    for hidden_layers in parameters["hidden_layers"]:
        for activation in parameters["activations"]:
            for solver in parameters["solvers"]:
                for alpha in parameters["alphas"]:
                    mlp_clf = MLPClassifier(hidden_layer_sizes=hidden_layers, activation=activation, solver=solver, alpha=alpha)
                    mlp_clf.fit(train_set_x, train_set_y)
                    current_train = mlp_clf.score(train_set_x, train_set_y)
                    current_test = mlp_clf.score(test_set_x, test_set_y)
                    # Each fold is scored on its held-out part. Fitting on the validation fold
                    # instead of scoring it would collect estimators rather than accuracies and
                    # make the averaging fail.
                    current_cv = cross_val_score(mlp_clf, train_set_x, train_set_y, cv=outer_cv).mean()

                    # Rank by testing accuracy first and training accuracy second. Requiring both
                    # to improve at once would lock in any early configuration with a perfect
                    # training score, no matter how badly it generalises.
                    if not found_any or (current_test, current_train) > (best_test, best_training):
                        found_any = True
                        best_training = current_train
                        best_test = current_test
                        best_cv = current_cv
                        selected_hidden_layers = hidden_layers
                        selected_activation = activation
                        selected_solver = solver
                        selected_alpha = alpha

    return {
        "parameters": {"hidden_layers": selected_hidden_layers, "activation": selected_activation, "solver": selected_solver, "alpha": selected_alpha},
        "training_accuracy": best_training,
        "testing_accuracy": best_test,
        "cv_accuracy": best_cv
    }

                        # print(f"------- With hidden_layers={hidden_layers[layer]}, activation={activations[activation]}, solver={solvers[solver]}, alpha={alphas[alpha]}")
                        # holdout_validation(mlp_clf, x_train_norm, x_test_norm, y_train, y_test)
                        # print(f"Mean - {mean_score}")