In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import AdaBoostClassifier
import os
from sklearn.metrics import accuracy_score, mean_absolute_error, mean_squared_error, r2_score, \
roc_curve, auc, precision_score, recall_score, confusion_matrix, precision_recall_curve

In [3]:
# Load the pre-computed train/test splits.
# Feature matrices are read from the selected-feature CSVs in `data_dir`;
# the binary response is read from the full train/test CSVs in `labels_dir`.
data_dir = 'lasso_subsets'   # directory holding train_selected_{i}.csv / test_selected_{i}.csv
labels_dir = 'datasets'      # directory holding train_{i}.csv / test_{i}.csv (source of the labels)
label_col = 'mtx_binary'     # response column read from the full train/test files
splits = 10 # from what we decided

# Initialize lists to store data splits
# NOTE: the log_X_* lists are created here but are not filled in this cell.
X_train_list, X_test_list, log_X_train_list, log_X_test_list = [], [], [], []
y_train_list, y_test_list = [], [] # same response variables no matter the X transformation

# Iterate over each split index (files are numbered from 1)
for i in range(1, splits + 1):
    X_train_file = os.path.join(data_dir, f'train_selected_{i}.csv') # load selected features
    X_test_file = os.path.join(data_dir, f'test_selected_{i}.csv')
    train_file = os.path.join(labels_dir, f'train_{i}.csv') # load y train and test separately
    test_file = os.path.join(labels_dir, f'test_{i}.csv')

    # Read training features and the matching training labels
    train_df = pd.read_csv(train_file)
    X_train = pd.read_csv(X_train_file)
    y_train = train_df[label_col]

    # Read test features and the matching test labels
    test_df = pd.read_csv(test_file)
    X_test = pd.read_csv(X_test_file)
    y_test = test_df[label_col]

    # Features and labels come from different files and are paired purely by
    # row position, so fail early (and name the split) if the row counts differ.
    if len(X_train) != len(y_train):
        raise ValueError(
            f"Split {i}: {X_train_file} has {len(X_train)} rows but "
            f"{train_file} has {len(y_train)} labels"
        )
    if len(X_test) != len(y_test):
        raise ValueError(
            f"Split {i}: {X_test_file} has {len(X_test)} rows but "
            f"{test_file} has {len(y_test)} labels"
        )

    # Append to respective lists
    X_train_list.append(X_train)
    X_test_list.append(X_test)
    y_train_list.append(y_train)
    y_test_list.append(y_test)

In [5]:
# Fit an AdaBoost classifier on every split and collect per-fold metrics.
# initialize lists for metrics
acc_list = []
auc_list = []
prec_list = []
tpr_list = []
fpr_list = []

# Perform on the folds: walk the four per-split lists in lockstep instead of
# indexing with i - 1; `i` stays 1-based so the printed split numbers match.
folds = zip(X_train_list, X_test_list, y_train_list, y_test_list)
for i, (X_train, X_test, y_train, y_test) in enumerate(folds, start=1):
    # fit model
    ada_model = AdaBoostClassifier(random_state=42)
    ada_model.fit(X_train, y_train)
    # Column 1 is the score for the second class in ada_model.classes_
    # (assumes the labels are 0/1, consistent with labels=[0, 1] below).
    y_pred_proba = ada_model.predict_proba(X_test)[:, 1]
    y_pred = ada_model.predict(X_test)

    # calculate metrics
    # ROC AUC from the predicted scores
    fprs, tprs, thresholds = roc_curve(y_test, y_pred_proba)
    roc_auc = auc(fprs, tprs)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    tpr = recall_score(y_test, y_pred) # single value at the default decision threshold, not the ROC vector

    # Confusion-matrix counts are needed for the false-positive rate.
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()
    # FPR is undefined when the test fold contains no negative samples;
    # record NaN explicitly instead of dividing 0 by 0.
    n_negatives = fp + tn
    fpr = fp / n_negatives if n_negatives > 0 else np.nan

    # append metrics for averages
    acc_list.append(accuracy)
    auc_list.append(roc_auc)
    prec_list.append(precision)
    tpr_list.append(tpr)
    fpr_list.append(fpr)

    print(f"Split {i} - AUC: {roc_auc}, Accuracy: {accuracy}, Precision: {precision}, TPR: {tpr}, FPR: {fpr}")

# Report avg metrics
print("Mean AUC: ", np.mean(auc_list))
print("Mean Accuracy: ", np.mean(acc_list))
print("Mean Precision: ", np.mean(prec_list))
print("Mean Recall (TPR): ", np.mean(tpr_list))
# nanmean skips folds whose FPR was undefined so they do not turn the average into NaN
print("Mean FPR: ", np.nanmean(fpr_list))

Split 1 - AUC: 0.11111111111111112, Accuracy: 0.3333333333333333, Precision: 0.4, TPR: 0.6666666666666666, FPR: 1.0
Split 2 - AUC: 0.4444444444444445, Accuracy: 0.6666666666666666, Precision: 0.6666666666666666, TPR: 0.6666666666666666, FPR: 0.3333333333333333
Split 3 - AUC: 0.22222222222222224, Accuracy: 0.16666666666666666, Precision: 0.25, TPR: 0.3333333333333333, FPR: 1.0
Split 4 - AUC: 0.75, Accuracy: 0.6666666666666666, Precision: 0.75, TPR: 0.75, FPR: 0.5
Split 5 - AUC: 0.625, Accuracy: 0.8333333333333334, Precision: 0.8, TPR: 1.0, FPR: 0.5
Split 6 - AUC: 0.625, Accuracy: 0.8333333333333334, Precision: 0.8, TPR: 1.0, FPR: 0.5
Split 7 - AUC: 1.0, Accuracy: 1.0, Precision: 1.0, TPR: 1.0, FPR: 0.0
Split 8 - AUC: 0.875, Accuracy: 0.6666666666666666, Precision: 0.6666666666666666, TPR: 1.0, FPR: 1.0
Split 9 - AUC: 0.75, Accuracy: 0.8333333333333334, Precision: 0.8, TPR: 1.0, FPR: 0.5
Split 10 - AUC: 0.5, Accuracy: 0.6666666666666666, Precision: 0.6666666666666666, TPR: 1.0, FPR: 1.0
