In [10]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold, StratifiedKFold
import os
from sklearn.metrics import accuracy_score, mean_absolute_error, mean_squared_error, r2_score, \
roc_curve, auc, precision_score, recall_score, confusion_matrix, precision_recall_curve
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import SGDClassifier
import time

In [2]:
# ---------------------------------------------------------------------------
# Configuration and loading of the pre-generated train/test splits.
# ---------------------------------------------------------------------------
seed = 42    # random_state passed to every estimator in the cells below
splits = 10  # number of train_<i>.csv / test_<i>.csv pairs to read (agreed fold count)
# Planned switches, currently disabled:
#   protein_type = 'linear'  # 'linear' or 'log'; variables would update accordingly
#   subset = 'full'          # 'full' or the type of feature selection used

# The split files are read from a 'datasets' folder that sits next to the
# directory the notebook is executed from.
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
data_dir = os.path.join(parent_dir, 'datasets')

# Column holding the binary response, and every column removed from the
# feature matrix (the response plus two non-feature columns).
target_col = 'mtx_binary'
non_feature_cols = ['mtx_binary', 'Unnamed: 0', 'EAC_ID']

# One entry per split, in file order; position k holds the frames of split k + 1.
# The response is the same no matter how X is transformed.
X_train_list, X_test_list, y_train_list, y_test_list = [], [], [], []

for i in range(1, splits + 1):  # file suffixes are 1-based
    train_file = os.path.join(data_dir, f'train_{i}.csv')
    test_file = os.path.join(data_dir, f'test_{i}.csv')

    # Training part of this split: features first, then the response.
    train_df = pd.read_csv(train_file)
    X_train = train_df.drop(columns=non_feature_cols)
    y_train = train_df[target_col]

    # Held-out part of this split, handled the same way.
    test_df = pd.read_csv(test_file)
    X_test = test_df.drop(columns=non_feature_cols)
    y_test = test_df[target_col]

    # Store all four pieces at the same list position.
    X_train_list.append(X_train)
    X_test_list.append(X_test)
    y_train_list.append(y_train)
    y_test_list.append(y_test)

In [6]:
# Baseline: LinearSVC on the full feature set (no feature selection),
# fitted and scored once per pre-built split.
auc_list, acc_list, prec_list, tpr_list, fpr_list = [], [], [], [], []

# Walk the four parallel lists together; `i` is the 0-based split position.
for i, (X_train, X_test, y_train, y_test) in enumerate(
    zip(X_train_list, X_test_list, y_train_list, y_test_list)
):
    # A fresh estimator is fitted on every split.
    model = LinearSVC(random_state=seed, dual="auto")
    model.fit(X_train, y_train)

    # LinearSVC has no predict_proba, so the signed decision_function values
    # serve as the ranking scores for the ROC curve.
    y_scores = model.decision_function(X_test)
    # Hard class labels for the threshold-dependent metrics.
    y_pred = model.predict(X_test)

    # Ranking metric: area under the ROC curve built from the scores.
    fprs, tprs, thresholds = roc_curve(y_test, y_scores)
    roc_auc = auc(fprs, tprs)

    # Label-based metrics.
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    tpr = recall_score(y_test, y_pred)  # scalar recall, unlike the `tprs` vector above
    # Confusion-matrix counts (label order fixed to [0, 1]) give the scalar FPR.
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()
    fpr = fp / (fp + tn)

    # Keep the per-split values for the averages reported below.
    for metric_store, metric_value in (
        (acc_list, accuracy),
        (auc_list, roc_auc),
        (prec_list, precision),
        (tpr_list, tpr),
        (fpr_list, fpr),
    ):
        metric_store.append(metric_value)

    print(f"Split {i} - AUC: {roc_auc}, Accuracy: {accuracy}, Precision: {precision}, TPR: {tpr}, FPR: {fpr}")

# Averages over all splits.
for summary_label, summary_values in (
    ("Mean AUC: ", auc_list),
    ("Mean Accuracy: ", acc_list),
    ("Mean Precision: ", prec_list),
    ("Mean Recall (TPR): ", tpr_list),
    ("Mean FPR: ", fpr_list),
):
    print(summary_label, np.mean(summary_values))

Split 0 - AUC: 0.22222222222222224, Accuracy: 0.16666666666666666, Precision: 0.25, TPR: 0.3333333333333333, FPR: 1.0
Split 1 - AUC: 0.7777777777777779, Accuracy: 0.8333333333333334, Precision: 0.75, TPR: 1.0, FPR: 0.3333333333333333
Split 2 - AUC: 0.888888888888889, Accuracy: 0.6666666666666666, Precision: 0.6, TPR: 1.0, FPR: 0.6666666666666666
Split 3 - AUC: 0.75, Accuracy: 0.6666666666666666, Precision: 0.6666666666666666, TPR: 1.0, FPR: 1.0
Split 4 - AUC: 0.5, Accuracy: 0.5, Precision: 0.6, TPR: 0.75, FPR: 1.0
Split 5 - AUC: 0.875, Accuracy: 0.6666666666666666, Precision: 0.75, TPR: 0.75, FPR: 0.5
Split 6 - AUC: 0.875, Accuracy: 0.8333333333333334, Precision: 1.0, TPR: 0.75, FPR: 0.0
Split 7 - AUC: 0.875, Accuracy: 0.8333333333333334, Precision: 1.0, TPR: 0.75, FPR: 0.0
Split 8 - AUC: 1.0, Accuracy: 1.0, Precision: 1.0, TPR: 1.0, FPR: 0.0
Split 9 - AUC: 0.625, Accuracy: 0.6666666666666666, Precision: 0.6666666666666666, TPR: 1.0, FPR: 1.0
Mean AUC:  0.7388888888888889
Mean Accuracy

In [9]:
# Baseline: SVC with its default settings on the full feature set
# (no feature selection), fitted and scored once per pre-built split.
#
# NOTE(review): the output recorded with this cell shows TPR = 1.0 and
# FPR = 1.0 on every split, i.e. every test row was predicted positive.
# Possibly the features need scaling before this model -- TODO confirm.
auc_list, acc_list, prec_list, tpr_list, fpr_list = [], [], [], [], []

# Walk the four parallel lists together; `i` is the 0-based split position.
for i, (X_train, X_test, y_train, y_test) in enumerate(
    zip(X_train_list, X_test_list, y_train_list, y_test_list)
):
    # A fresh estimator is fitted on every split.
    model = SVC(random_state=seed)
    model.fit(X_train, y_train)

    # The model is built without requesting probability estimates, so the
    # decision_function values serve as the ranking scores for the ROC curve.
    y_scores = model.decision_function(X_test)
    # Hard class labels for the threshold-dependent metrics.
    y_pred = model.predict(X_test)

    # Ranking metric: area under the ROC curve built from the scores.
    fprs, tprs, thresholds = roc_curve(y_test, y_scores)
    roc_auc = auc(fprs, tprs)

    # Label-based metrics.
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    tpr = recall_score(y_test, y_pred)  # scalar recall, unlike the `tprs` vector above
    # Confusion-matrix counts (label order fixed to [0, 1]) give the scalar FPR.
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()
    fpr = fp / (fp + tn)

    # Keep the per-split values for the averages reported below.
    for metric_store, metric_value in (
        (acc_list, accuracy),
        (auc_list, roc_auc),
        (prec_list, precision),
        (tpr_list, tpr),
        (fpr_list, fpr),
    ):
        metric_store.append(metric_value)

    print(f"Split {i} - AUC: {roc_auc}, Accuracy: {accuracy}, Precision: {precision}, TPR: {tpr}, FPR: {fpr}")

# Averages over all splits.
for summary_label, summary_values in (
    ("Mean AUC: ", auc_list),
    ("Mean Accuracy: ", acc_list),
    ("Mean Precision: ", prec_list),
    ("Mean Recall (TPR): ", tpr_list),
    ("Mean FPR: ", fpr_list),
):
    print(summary_label, np.mean(summary_values))

Split 0 - AUC: 0.5555555555555556, Accuracy: 0.5, Precision: 0.5, TPR: 1.0, FPR: 1.0
Split 1 - AUC: 0.7777777777777778, Accuracy: 0.5, Precision: 0.5, TPR: 1.0, FPR: 1.0
Split 2 - AUC: 0.3333333333333333, Accuracy: 0.5, Precision: 0.5, TPR: 1.0, FPR: 1.0
Split 3 - AUC: 0.25, Accuracy: 0.6666666666666666, Precision: 0.6666666666666666, TPR: 1.0, FPR: 1.0
Split 4 - AUC: 0.625, Accuracy: 0.6666666666666666, Precision: 0.6666666666666666, TPR: 1.0, FPR: 1.0
Split 5 - AUC: 0.875, Accuracy: 0.6666666666666666, Precision: 0.6666666666666666, TPR: 1.0, FPR: 1.0
Split 6 - AUC: 1.0, Accuracy: 0.6666666666666666, Precision: 0.6666666666666666, TPR: 1.0, FPR: 1.0
Split 7 - AUC: 0.875, Accuracy: 0.6666666666666666, Precision: 0.6666666666666666, TPR: 1.0, FPR: 1.0
Split 8 - AUC: 0.75, Accuracy: 0.6666666666666666, Precision: 0.6666666666666666, TPR: 1.0, FPR: 1.0
Split 9 - AUC: 0.0, Accuracy: 0.6666666666666666, Precision: 0.6666666666666666, TPR: 1.0, FPR: 1.0
Mean AUC:  0.6041666666666667
Mean Ac

In [11]:
# Baseline: SGDClassifier with hinge loss (a linear SVM trained by SGD) on the
# full feature set (no feature selection), fitted and scored once per split.
auc_list, acc_list, prec_list, tpr_list, fpr_list = [], [], [], [], []

# Walk the four parallel lists together; `i` is the 0-based split position.
for i, (X_train, X_test, y_train, y_test) in enumerate(
    zip(X_train_list, X_test_list, y_train_list, y_test_list)
):
    # A fresh estimator is fitted on every split.
    model = SGDClassifier(random_state=seed, loss="hinge")
    model.fit(X_train, y_train)

    # The signed decision_function values serve as the ranking scores for the
    # ROC curve.
    y_scores = model.decision_function(X_test)
    # Hard class labels for the threshold-dependent metrics.
    y_pred = model.predict(X_test)

    # Ranking metric: area under the ROC curve built from the scores.
    fprs, tprs, thresholds = roc_curve(y_test, y_scores)
    roc_auc = auc(fprs, tprs)

    # Label-based metrics.
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    tpr = recall_score(y_test, y_pred)  # scalar recall, unlike the `tprs` vector above
    # Confusion-matrix counts (label order fixed to [0, 1]) give the scalar FPR.
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()
    fpr = fp / (fp + tn)

    # Keep the per-split values for the averages reported below.
    for metric_store, metric_value in (
        (acc_list, accuracy),
        (auc_list, roc_auc),
        (prec_list, precision),
        (tpr_list, tpr),
        (fpr_list, fpr),
    ):
        metric_store.append(metric_value)

    print(f"Split {i} - AUC: {roc_auc}, Accuracy: {accuracy}, Precision: {precision}, TPR: {tpr}, FPR: {fpr}")

# Averages over all splits.
for summary_label, summary_values in (
    ("Mean AUC: ", auc_list),
    ("Mean Accuracy: ", acc_list),
    ("Mean Precision: ", prec_list),
    ("Mean Recall (TPR): ", tpr_list),
    ("Mean FPR: ", fpr_list),
):
    print(summary_label, np.mean(summary_values))

Split 0 - AUC: 0.22222222222222224, Accuracy: 0.16666666666666666, Precision: 0.25, TPR: 0.3333333333333333, FPR: 1.0
Split 1 - AUC: 0.5555555555555556, Accuracy: 0.5, Precision: 0.5, TPR: 1.0, FPR: 1.0
Split 2 - AUC: 0.7777777777777778, Accuracy: 0.6666666666666666, Precision: 0.6, TPR: 1.0, FPR: 0.6666666666666666
Split 3 - AUC: 0.125, Accuracy: 0.5, Precision: 0.6, TPR: 0.75, FPR: 1.0
Split 4 - AUC: 0.625, Accuracy: 0.5, Precision: 0.6, TPR: 0.75, FPR: 1.0
Split 5 - AUC: 0.625, Accuracy: 0.6666666666666666, Precision: 1.0, TPR: 0.5, FPR: 0.0
Split 6 - AUC: 1.0, Accuracy: 0.8333333333333334, Precision: 1.0, TPR: 0.75, FPR: 0.0
Split 7 - AUC: 1.0, Accuracy: 0.6666666666666666, Precision: 1.0, TPR: 0.5, FPR: 0.0
Split 8 - AUC: 0.875, Accuracy: 0.8333333333333334, Precision: 0.8, TPR: 1.0, FPR: 0.5
Split 9 - AUC: 0.125, Accuracy: 0.5, Precision: 0.6, TPR: 0.75, FPR: 1.0
Mean AUC:  0.5930555555555556
Mean Accuracy:  0.5833333333333333
Mean Precision:  0.695
Mean Recall (TPR):  0.73333333

In [13]:
# Baseline: SGDClassifier with log loss (logistic regression trained by SGD) on
# the full feature set (no feature selection), fitted and scored once per split.
auc_list, acc_list, prec_list, tpr_list, fpr_list = [], [], [], [], []

# Walk the four parallel lists together; `i` is the 0-based split position.
for i, (X_train, X_test, y_train, y_test) in enumerate(
    zip(X_train_list, X_test_list, y_train_list, y_test_list)
):
    # A fresh estimator is fitted on every split.
    model = SGDClassifier(random_state=seed, loss="log_loss")
    model.fit(X_train, y_train)

    # The decision_function values (not probabilities) serve as the ranking
    # scores for the ROC curve, matching the other model cells.
    y_scores = model.decision_function(X_test)
    # Hard class labels for the threshold-dependent metrics.
    y_pred = model.predict(X_test)

    # Ranking metric: area under the ROC curve built from the scores.
    fprs, tprs, thresholds = roc_curve(y_test, y_scores)
    roc_auc = auc(fprs, tprs)

    # Label-based metrics.
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    tpr = recall_score(y_test, y_pred)  # scalar recall, unlike the `tprs` vector above
    # Confusion-matrix counts (label order fixed to [0, 1]) give the scalar FPR.
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()
    fpr = fp / (fp + tn)

    # Keep the per-split values for the averages reported below.
    for metric_store, metric_value in (
        (acc_list, accuracy),
        (auc_list, roc_auc),
        (prec_list, precision),
        (tpr_list, tpr),
        (fpr_list, fpr),
    ):
        metric_store.append(metric_value)

    print(f"Split {i} - AUC: {roc_auc}, Accuracy: {accuracy}, Precision: {precision}, TPR: {tpr}, FPR: {fpr}")

# Averages over all splits.
for summary_label, summary_values in (
    ("Mean AUC: ", auc_list),
    ("Mean Accuracy: ", acc_list),
    ("Mean Precision: ", prec_list),
    ("Mean Recall (TPR): ", tpr_list),
    ("Mean FPR: ", fpr_list),
):
    print(summary_label, np.mean(summary_values))

Split 0 - AUC: 0.11111111111111112, Accuracy: 0.3333333333333333, Precision: 0.3333333333333333, TPR: 0.3333333333333333, FPR: 0.6666666666666666
Split 1 - AUC: 0.7777777777777779, Accuracy: 0.8333333333333334, Precision: 0.75, TPR: 1.0, FPR: 0.3333333333333333
Split 2 - AUC: 0.6666666666666667, Accuracy: 0.6666666666666666, Precision: 0.6, TPR: 1.0, FPR: 0.6666666666666666
Split 3 - AUC: 0.5, Accuracy: 0.5, Precision: 0.6, TPR: 0.75, FPR: 1.0
Split 4 - AUC: 0.625, Accuracy: 0.5, Precision: 0.6666666666666666, TPR: 0.5, FPR: 0.5
Split 5 - AUC: 0.625, Accuracy: 0.5, Precision: 1.0, TPR: 0.25, FPR: 0.0
Split 6 - AUC: 1.0, Accuracy: 1.0, Precision: 1.0, TPR: 1.0, FPR: 0.0
Split 7 - AUC: 1.0, Accuracy: 0.8333333333333334, Precision: 1.0, TPR: 0.75, FPR: 0.0
Split 8 - AUC: 0.75, Accuracy: 0.8333333333333334, Precision: 0.8, TPR: 1.0, FPR: 0.5
Split 9 - AUC: 0.625, Accuracy: 0.5, Precision: 0.6, TPR: 0.75, FPR: 1.0
Mean AUC:  0.6680555555555555
Mean Accuracy:  0.6499999999999999
Mean Precisi