In [None]:
## Import libraries
import numpy as np
import pandas as pd
import pickle
import time
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV, cross_val_score, RandomizedSearchCV

In [None]:
## Widen the text representation of DataFrames so that frames with many
## columns are not wrapped when printed
pd.options.display.width = 1000

In [None]:
## Experiment configuration
#   N    -> number of experiment repetitions (train/test splits)
#   k    -> number of folds of the cross-validation used for hyperparameter tuning
#   seed -> random seed for reproducibility
N, k, seed = 10, 4, 42

In [None]:
## Accumulators shared by all the training sections below
trials_params = []   # one dict of tuned hyperparameters per (classifier, trial)
trials_results = []  # one dict of train/test metrics per (classifier, trial)

# Load data

In [None]:
## Read list of selected patients
# The file holds one integer patient ID per line. The context manager closes
# the handle as soon as the IDs are read, and blank lines (e.g. a trailing
# empty line) are skipped instead of crashing int().
with open('Data/selected_patients.txt', 'r') as f:
    patient_IDs = [int(line) for line in f if line.strip()]

In [None]:
## Load clinical data
# Semicolon-separated file: the first row holds the column names and the first
# column is used as the row index. `.loc[patient_IDs]` then keeps only the
# selected patients, in the order in which they appear in `patient_IDs`.
data = (
    pd.read_csv('Data/clinical_data.csv', sep=';', header=0, index_col=0)
    .loc[patient_IDs]
)

In [None]:
## Generate binary labels
# 1.0 for patients whose diagnosis is 'Sick', 0.0 for everybody else
diagnosis = data['vx-diagnosis']
is_sick = diagnosis == 'Sick'
labels = np.zeros(len(patient_IDs))
labels[is_sick] = 1

# Pre-process

In [None]:
## Select features
# Columns of the clinical table that are fed to the classifiers, in this order
selected_features = [
    'age_at_visit',
    'vx-PH-Eating habits',
    'vx-PH-Cancer family?',
    'vx-MH-Radiotherapy',
    'vx-MH-Use of hormone replacement?',
    'age_at_menarche',
    'age_at_menopause',
    'diabetes',
    'nodules',
]
data = data.loc[:, selected_features]

In [None]:
## Min-max normalization
# Per-column upper bound, floored at 1 so that columns whose largest value is
# below 1 are not stretched. np.maximum builds a new array rather than writing
# into the buffer returned by pandas, which may be read-only (Copy-on-Write).
M = np.maximum(data.max().to_numpy(), 1)
m = data.min().to_numpy()

# Guard against a zero range (a constant column whose value is >= 1): dividing
# by 1 maps such a column to 0 instead of filling it with NaN.
value_range = M - m
value_range[value_range == 0] = 1

# NOTE(review): the bounds are computed on the whole dataset, i.e. before the
# train/test splits are drawn -- confirm that this is intended.
data = (data - m) / value_range

In [None]:
## Convert to numpy array
# The training loops below index `data` positionally with the integer index
# arrays produced by the splitter (data[train_index] / data[test_index]), so
# the table is turned into a plain float32 ndarray here.
data = np.asarray(data, dtype=np.float32)

# Function definitions

In [None]:
def weighted_error(y_test, y_test_pred, fn_weight=20):
    """Cost-sensitive error count in which false negatives weigh more.

    Parameters
    ----------
    y_test : array-like of 0/1
        Ground-truth labels.
    y_test_pred : array-like of 0/1
        Hard (already thresholded) predictions, same length as ``y_test``.
    fn_weight : int or float, default=20
        Cost of one false negative relative to one false positive (cost 1).

    Returns
    -------
    number
        ``fn_weight * FN + FP``.
    """
    # y_test > y_test_pred  <=>  y_test = 1 and y_test_pred = 0  (false negative)
    fn = np.sum(np.greater(y_test, y_test_pred))

    # y_test < y_test_pred  <=>  y_test = 0 and y_test_pred = 1  (false positive)
    fp = np.sum(np.less(y_test, y_test_pred))

    return fn * fn_weight + fp

In [None]:
def evaluate_model(preds_raw, ground_truth):
    """Score positive-class probabilities against binary ground-truth labels.

    Parameters
    ----------
    preds_raw : array-like of float
        Predicted scores for the positive class. Hard decisions are obtained
        with ``np.round`` (which rounds halves to the nearest even value, so a
        score of exactly 0.5 is counted as class 0).
    ground_truth : array-like of 0/1
        True labels.

    Returns
    -------
    dict
        Keys, in order: 'BCELoss', 'Accuracy', 'TP', 'FP', 'TN', 'FN',
        'Sensitivity', 'Specificity', 'Precision', 'Recall', 'F1', 'ROC_AUC',
        'WE'. 'Recall' holds the same value as 'Sensitivity'.

    Notes
    -----
    The ratios are plain divisions of confusion-matrix counts; no guard is
    applied when a denominator is zero (e.g. Precision when nothing is
    predicted positive).
    """
    from sklearn.metrics import confusion_matrix, log_loss, roc_auc_score

    # Binary cross-entropy, computed on the raw scores
    bce = log_loss(ground_truth, preds_raw)

    # Thresholded decisions and the four confusion-matrix cells
    hard_preds = np.round(preds_raw)
    cells = confusion_matrix(ground_truth, hard_preds).ravel()
    TN, FP, FN, TP = cells

    accuracy = (TP + TN) / cells.sum()
    recall = TP / (TP + FN)          # a.k.a. sensitivity / true-positive rate
    true_negative_rate = TN / (TN + FP)
    precision = TP / (TP + FP)
    f1_score = 2 * TP / (2 * TP + FP + FN)

    # Threshold-independent ranking quality, computed on the raw scores
    auc = roc_auc_score(ground_truth, preds_raw)

    # Cost-sensitive error, computed on the thresholded decisions
    we = weighted_error(ground_truth, hard_preds)

    return {
        'BCELoss': bce,
        'Accuracy': accuracy,
        'TP': TP,
        'FP': FP,
        'TN': TN,
        'FN': FN,
        'Sensitivity': recall,
        'Specificity': true_negative_rate,
        'Precision': precision,
        'Recall': recall,
        'F1': f1_score,
        'ROC_AUC': auc,
        'WE': we,
    }


# Decision Tree (DT)

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Splitter that draws N stratified partitions of the dataset into an 85% train
# set and a 15% test set (the test size is given as an absolute sample count)
splitter = StratifiedShuffleSplit(
    n_splits=N,
    test_size=round(0.15 * len(labels)),
    random_state=seed,
)

In [None]:
# Repeated hold-out evaluation of a class-weighted Decision Tree. For every
# stratified split: tune the hyperparameters with k-fold cross-validation on
# the training part, save the best model and its predictions, and record the
# train/test metrics in `trials_params` / `trials_results`.
for trial, (train_index, test_index) in enumerate(splitter.split(data, labels)):

    print(f'Trial {trial + 1}')
    print()

    ## Split the dataset into train (85%) and test (15%) sets
    crossval_data, crossval_labels = data[train_index], labels[train_index]
    test_data, test_labels = data[test_index], labels[test_index]

    ## Weight of the positive class to address class imbalance.
    # Computed from the training labels only, so that the held-out test labels
    # never influence the fitted model.
    n_sick = crossval_labels.sum()
    n_healthy = len(crossval_labels) - n_sick
    rate_train = n_healthy / n_sick

    ## Use training set for k-fold cross-validation to tune the hyperparameters
    ti = time.time()
    # random_state makes the tree construction repeatable between runs
    estimator = DecisionTreeClassifier(class_weight={0: 1, 1: rate_train}, random_state=seed)
    param_grid = {'ccp_alpha' : np.arange(0, 0.1, 0.01), # Complexity parameter used for Minimal Cost-Complexity Pruning. Default: ccp_alpha = 0.0
                  'criterion': ['gini','entropy'],#'log_loss'], # The function to measure the quality of a split. Default: criterion='gini'
                  'max_depth' : [None, 1, 5, 10, 15], # The maximum depth of the tree. Default: max_depth=None
                  #'max_features': [None, 'sqrt', 'log2'], # The number of features to consider when looking for the best split. Default: max_features=None
                  'max_leaf_nodes': [None, 3, 6, 9], # Grow a tree with max_leaf_nodes in best-first fashion. Default: None
                  'min_samples_leaf': [1, 2, 3, 4], # The minimum number of samples required to be at a leaf node. Default: min_samples_leaf=1
                  'min_samples_split' : [2, 5, 10, 15], # The minimum number of samples required to split an internal node. Default: min_samples_split=2
                  'min_weight_fraction_leaf' : np.arange(0.0, 0.5, 0.05), # The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Default: min_weight_fraction_leaf=0
                  #'splitter': ['best','random'] # The strategy used to choose the split at each node. Default: splitter='best'
                 }
    search = GridSearchCV(estimator, param_grid, scoring='roc_auc', cv=k)
    search.fit(crossval_data, crossval_labels)

    parameters = search.best_params_  # Parameter setting that gave the best results on the hold out data
    classifier = search.best_estimator_  # Estimator which gave highest score on the left out data

    train_time = time.time() - ti
    hours, remainder = divmod(train_time, 3600)
    minutes, seconds = divmod(remainder, 60)
    print(f'Hyperparameter tuning took {hours} hours, {minutes} minutes, and {seconds} seconds.')

    ## Store tuned hyperparameters
    trials_params.append({'classifier': 'DT', 'trial': trial+1, **parameters})

    ## Save the trained model
    with open('Models/DT_'+str(trial+1)+'.pkl','wb') as f:
        pickle.dump(classifier,f)

    ## Predict (probability of the positive class) and save the predictions
    predictions_train = classifier.predict_proba(crossval_data)[:,1]
    np.save('Predictions/DT_train_'+str(trial+1)+'.npy',predictions_train)
    predictions_test = classifier.predict_proba(test_data)[:,1]
    np.save('Predictions/DT_test_'+str(trial+1)+'.npy',predictions_test)

    ## Print the number of parameters in the model (here: number of tree nodes)
    num_params = classifier.tree_.node_count
    print(f'Classifier has {num_params} parameters.')
    print()

    ## Evaluate the model
    results_train = evaluate_model(predictions_train, crossval_labels)
    print('TRAIN results:')
    for metric, value in results_train.items():
        print(f'{metric}: {value:.4f}' if isinstance(value, (float, int)) else f'{metric}: {value}')
    print()

    results_test = evaluate_model(predictions_test, test_labels)
    print('TEST results:')
    for metric, value in results_test.items():
        print(f'{metric}: {value:.4f}' if isinstance(value, (float, int)) else f'{metric}: {value}')
    print()

    ## Store results (metric names are prefixed with 'train_' / 'test_')
    trials_results.append(
        {'classifier':'DT',
         'trial':trial+1,
         'parameters':num_params,
         'trainTime':train_time,
         **{'train_'+name: score for name, score in results_train.items()},
         **{'test_'+name: score for name, score in results_test.items()}
        }
    )

    print()
    print(100*'#')
    print()

# Random Forest (RF)

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Splitter that draws N stratified partitions of the dataset into an 85% train
# set and a 15% test set (the test size is given as an absolute sample count)
splitter = StratifiedShuffleSplit(
    n_splits=N,
    test_size=round(0.15 * len(labels)),
    random_state=seed,
)

In [None]:
# Repeated hold-out evaluation of a class-weighted Random Forest. For every
# stratified split: tune the hyperparameters with k-fold cross-validation on
# the training part, save the best model and its predictions, and record the
# train/test metrics in `trials_params` / `trials_results`.
for trial, (train_index, test_index) in enumerate(splitter.split(data, labels)):

    print(f'Trial {trial + 1}')
    print()

    ## Split the dataset into train (85%) and test (15%) sets
    crossval_data, crossval_labels = data[train_index], labels[train_index]
    test_data, test_labels = data[test_index], labels[test_index]

    ## Weight of the positive class to address class imbalance.
    # Computed from the training labels only, so that the held-out test labels
    # never influence the fitted model.
    n_sick = crossval_labels.sum()
    n_healthy = len(crossval_labels) - n_sick
    rate_train = n_healthy / n_sick

    ## Use training set for k-fold cross-validation to tune the hyperparameters
    ti = time.time()
    # random_state fixes the bootstrap samples and feature sub-sampling, so the
    # forest is repeatable between runs
    estimator = RandomForestClassifier(class_weight={0: 1, 1: rate_train}, random_state=seed)
    param_grid = {'n_estimators': np.arange(50, 225, 25),  # Number of trees in random forest. Default: n_estimators=100
                  'criterion': ['gini','entropy'],#'log_loss'], # The function to measure the quality of a split. Default: criterion='gini'
                  'max_features': ['sqrt', 'log2'],  # Number of features to consider at every split. Default: max_features='sqrt'
                  'max_depth': list(np.arange(10, 110, 10))+[None],  # Maximum number of levels in tree. Default: max_depth=None
                  'min_samples_split': [2, 3, 5, 10],  # Minimum number of samples required to split a node. Default: min_samples_split=2
                  'min_samples_leaf': [1, 2, 3, 4],  # Minimum number of samples required at each leaf node. Default: min_samples_leaf=1
                  #'bootstrap': [True, False]  # Method of selecting samples for training each tree. Default: bootstrap=True
                 }
    search = GridSearchCV(estimator, param_grid, scoring='roc_auc', cv=k)
    search.fit(crossval_data, crossval_labels)

    parameters = search.best_params_  # Parameter setting that gave the best results on the hold out data
    classifier = search.best_estimator_  # Estimator which gave highest score on the left out data

    train_time = time.time() - ti
    hours, remainder = divmod(train_time, 3600)
    minutes, seconds = divmod(remainder, 60)
    print(f'Hyperparameter tuning took {hours} hours, {minutes} minutes, and {seconds} seconds.')

    ## Store tuned hyperparameters
    trials_params.append({'classifier': 'RF', 'trial': trial+1, **parameters})

    ## Save the trained model
    with open('Models/RF_'+str(trial+1)+'.pkl','wb') as f:
        pickle.dump(classifier,f)

    ## Predict (probability of the positive class) and save the predictions
    predictions_train = classifier.predict_proba(crossval_data)[:,1]
    np.save('Predictions/RF_train_'+str(trial+1)+'.npy',predictions_train)
    predictions_test = classifier.predict_proba(test_data)[:,1]
    np.save('Predictions/RF_test_'+str(trial+1)+'.npy',predictions_test)

    ## Print the number of parameters in the model (here: total node count over all trees)
    num_params = sum(tree.tree_.node_count for tree in classifier.estimators_)
    print(f'Classifier has {num_params} parameters.')
    print()

    ## Evaluate the model
    results_train = evaluate_model(predictions_train, crossval_labels)
    print('TRAIN results:')
    for metric, value in results_train.items():
        print(f'{metric}: {value:.4f}' if isinstance(value, (float, int)) else f'{metric}: {value}')
    print()

    results_test = evaluate_model(predictions_test, test_labels)
    print('TEST results:')
    for metric, value in results_test.items():
        print(f'{metric}: {value:.4f}' if isinstance(value, (float, int)) else f'{metric}: {value}')
    print()

    ## Store results (metric names are prefixed with 'train_' / 'test_')
    trials_results.append(
        {'classifier':'RF',
         'trial':trial+1,
         'parameters':num_params,
         'trainTime':train_time,
         **{'train_'+name: score for name, score in results_train.items()},
         **{'test_'+name: score for name, score in results_test.items()}
        }
    )

    print()
    print(100*'#')
    print()

# Support Vector Machines (SVM)

In [None]:
from sklearn.svm import SVC

In [None]:
# Splitter that draws N stratified partitions of the dataset into an 85% train
# set and a 15% test set (the test size is given as an absolute sample count)
splitter = StratifiedShuffleSplit(
    n_splits=N,
    test_size=round(0.15 * len(labels)),
    random_state=seed,
)

In [None]:
# Repeated hold-out evaluation of a class-weighted SVM. For every stratified
# split: tune the hyperparameters with k-fold cross-validation on the training
# part, save the best model and its predictions, and record the train/test
# metrics in `trials_params` / `trials_results`.
for trial, (train_index, test_index) in enumerate(splitter.split(data, labels)):

    print(f'Trial {trial + 1}')
    print()

    ## Split the dataset into train (85%) and test (15%) sets
    crossval_data, crossval_labels = data[train_index], labels[train_index]
    test_data, test_labels = data[test_index], labels[test_index]

    ## Weight of the positive class to address class imbalance.
    # Computed from the training labels only, so that the held-out test labels
    # never influence the fitted model.
    n_sick = crossval_labels.sum()
    n_healthy = len(crossval_labels) - n_sick
    rate_train = n_healthy / n_sick

    ## Use training set for k-fold cross-validation to tune the hyperparameters
    ti = time.time()
    # probability=True enables predict_proba; random_state makes the data
    # shuffling used for those probability estimates repeatable between runs
    estimator = SVC(class_weight={0: 1, 1: rate_train}, probability=True, random_state=seed)
    param_grid = {'C': [1,10,100,1000], # Regularization parameter. Default: C=1.0
                  'kernel': ['linear', 'rbf', 'sigmoid', 'poly'], # Default: kernel='rbf'
                  'gamma': ['scale', 'auto', 1.0, 0.1, 0.01, 0.001, 0.0001, 0.00001] # Kernel coefficient for ‘rbf’, ‘poly’ and ‘sigmoid’. Default: gamma='scale'
                 }
    search = GridSearchCV(estimator, param_grid, scoring='roc_auc', cv=k)
    search.fit(crossval_data, crossval_labels)

    parameters = search.best_params_  # Parameter setting that gave the best results on the hold out data
    classifier = search.best_estimator_  # Estimator which gave highest score on the left out data

    train_time = time.time() - ti
    hours, remainder = divmod(train_time, 3600)
    minutes, seconds = divmod(remainder, 60)
    print(f'Hyperparameter tuning took {hours} hours, {minutes} minutes, and {seconds} seconds.')

    ## Store tuned hyperparameters
    trials_params.append({'classifier': 'SVM', 'trial': trial+1, **parameters})

    ## Save the trained model
    with open('Models/SVM_'+str(trial+1)+'.pkl','wb') as f:
        pickle.dump(classifier,f)

    ## Predict (probability of the positive class) and save the predictions
    predictions_train = classifier.predict_proba(crossval_data)[:,1]
    np.save('Predictions/SVM_train_'+str(trial+1)+'.npy',predictions_train)
    predictions_test = classifier.predict_proba(test_data)[:,1]
    np.save('Predictions/SVM_test_'+str(trial+1)+'.npy',predictions_test)

    ## Print the number of parameters in the model
    # Counted as the number of support vectors plus the number of dual coefficients
    n_support_vectors = len(classifier.support_vectors_)
    n_coefficients = len(classifier.dual_coef_[0])
    num_params = n_support_vectors + n_coefficients
    print(f'Classifier has {num_params} parameters.')
    print()

    ## Evaluate the model
    results_train = evaluate_model(predictions_train, crossval_labels)
    print('TRAIN results:')
    for metric, value in results_train.items():
        print(f'{metric}: {value:.4f}' if isinstance(value, (float, int)) else f'{metric}: {value}')
    print()

    results_test = evaluate_model(predictions_test, test_labels)
    print('TEST results:')
    for metric, value in results_test.items():
        print(f'{metric}: {value:.4f}' if isinstance(value, (float, int)) else f'{metric}: {value}')
    print()

    ## Store results (metric names are prefixed with 'train_' / 'test_')
    trials_results.append(
        {'classifier':'SVM',
         'trial':trial+1,
         'parameters':num_params,
         'trainTime':train_time,
         **{'train_'+name: score for name, score in results_train.items()},
         **{'test_'+name: score for name, score in results_test.items()}
        }
    )

    print()
    print(100*'#')
    print()

# AdaBoost with DT base estimator

In [None]:
from sklearn.ensemble import AdaBoostClassifier

In [None]:
# Splitter that draws N stratified partitions of the dataset into an 85% train
# set and a 15% test set (the test size is given as an absolute sample count)
splitter = StratifiedShuffleSplit(
    n_splits=N,
    test_size=round(0.15 * len(labels)),
    random_state=seed,
)

In [None]:
# Repeated hold-out evaluation of AdaBoost over class-weighted Decision Trees.
# For every stratified split: tune the boosting hyperparameters with k-fold
# cross-validation on the training part, save the best model and its
# predictions, and record the train/test metrics in `trials_params` /
# `trials_results`.
for trial, (train_index, test_index) in enumerate(splitter.split(data, labels)):

    print(f'Trial {trial + 1}')
    print()

    ## Split the dataset into train (85%) and test (15%) sets
    crossval_data, crossval_labels = data[train_index], labels[train_index]
    test_data, test_labels = data[test_index], labels[test_index]

    ## Weight of the positive class to address class imbalance.
    # Computed from the training labels only, so that the held-out test labels
    # never influence the fitted model.
    n_sick = crossval_labels.sum()
    n_healthy = len(crossval_labels) - n_sick
    rate_train = n_healthy / n_sick

    ## Use training set for k-fold cross-validation to tune the hyperparameters
    ti = time.time()
    # random_state on both estimators makes the boosted ensemble repeatable between runs
    base_estimator = DecisionTreeClassifier(class_weight={0: 1, 1: rate_train}, random_state=seed)
    estimator = AdaBoostClassifier(base_estimator, random_state=seed)
    param_grid = {'n_estimators': np.arange(10, 110, 10), # The maximum number of estimators at which boosting is terminated. Default: n_estimators=50
                  'learning_rate': [0.01, 0.1, 0.5, 1.0], # Weight applied to each classifier at each boosting iteration. Default: learning_rate=1.0

                  ## Decision Tree parameters
                  #'base_estimator__ccp_alpha' : np.arange(0, 0.1, 0.01), # Complexity parameter used for Minimal Cost-Complexity Pruning. Default: ccp_alpha = 0.0
                  #'base_estimator__criterion': ['gini','entropy'],#'log_loss'], # The function to measure the quality of a split. Default: criterion='gini'
                  #'base_estimator__max_depth' : [None, 1, 5, 10, 15], # The maximum depth of the tree. Default: max_depth=None
                  ##'base_estimator__max_features': [None, 'sqrt', 'log2'], # The number of features to consider when looking for the best split. Default: max_features=None
                  #'base_estimator__max_leaf_nodes': [None, 3, 6, 9], # Grow a tree with max_leaf_nodes in best-first fashion. Default: None
                  #'base_estimator__min_samples_leaf': [1, 2, 3, 4], # The minimum number of samples required to be at a leaf node. Default: min_samples_leaf=1
                  #'base_estimator__min_samples_split' : [2, 5, 10, 15], # The minimum number of samples required to split an internal node. Default: min_samples_split=2
                  #'base_estimator__min_weight_fraction_leaf' : np.arange(0.0, 0.5, 0.05), # The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Default: min_weight_fraction_leaf=0
                  ##'base_estimator__splitter': ['best','random'] # The strategy used to choose the split at each node. Default: splitter='best'
                 }
    search = GridSearchCV(estimator, param_grid, scoring='roc_auc', cv=k)
    search.fit(crossval_data, crossval_labels)

    parameters = search.best_params_  # Parameter setting that gave the best results on the hold out data
    classifier = search.best_estimator_  # Estimator which gave highest score on the left out data

    train_time = time.time() - ti
    hours, remainder = divmod(train_time, 3600)
    minutes, seconds = divmod(remainder, 60)
    print(f'Hyperparameter tuning took {hours} hours, {minutes} minutes, and {seconds} seconds.')

    ## Store tuned hyperparameters
    trials_params.append({'classifier': 'DT_AdaBoost', 'trial': trial+1, **parameters})

    ## Save the trained model
    with open('Models/DT_AdaBoost_'+str(trial+1)+'.pkl','wb') as f:
        pickle.dump(classifier,f)

    ## Predict (probability of the positive class) and save the predictions
    predictions_train = classifier.predict_proba(crossval_data)[:,1]
    np.save('Predictions/DT_AdaBoost_train_'+str(trial+1)+'.npy',predictions_train)
    predictions_test = classifier.predict_proba(test_data)[:,1]
    np.save('Predictions/DT_AdaBoost_test_'+str(trial+1)+'.npy',predictions_test)

    ## Print the number of parameters in the model (here: total node count over all boosted trees)
    num_params = sum(tree.tree_.node_count for tree in classifier.estimators_)
    print(f'Classifier has {num_params} parameters.')
    print()

    ## Evaluate the model
    results_train = evaluate_model(predictions_train, crossval_labels)
    print('TRAIN results:')
    for metric, value in results_train.items():
        print(f'{metric}: {value:.4f}' if isinstance(value, (float, int)) else f'{metric}: {value}')
    print()

    results_test = evaluate_model(predictions_test, test_labels)
    print('TEST results:')
    for metric, value in results_test.items():
        print(f'{metric}: {value:.4f}' if isinstance(value, (float, int)) else f'{metric}: {value}')
    print()

    ## Store results (metric names are prefixed with 'train_' / 'test_')
    trials_results.append(
        {'classifier':'DT_AdaBoost',
         'trial':trial+1,
         'parameters':num_params,
         'trainTime':train_time,
         **{'train_'+name: score for name, score in results_train.items()},
         **{'test_'+name: score for name, score in results_test.items()}
        }
    )

    print()
    print(100*'#')
    print()

# AdaBoost with RF base estimator

In [None]:
from sklearn.ensemble import AdaBoostClassifier

In [None]:
# Splitter that draws N stratified partitions of the dataset into an 85% train
# set and a 15% test set (the test size is given as an absolute sample count)
splitter = StratifiedShuffleSplit(
    n_splits=N,
    test_size=round(0.15 * len(labels)),
    random_state=seed,
)

In [None]:
# Repeated hold-out evaluation of AdaBoost over class-weighted Random Forests.
# For every stratified split: tune the boosting hyperparameters with k-fold
# cross-validation on the training part, save the best model and its
# predictions, and record the train/test metrics in `trials_params` /
# `trials_results`.
for trial, (train_index, test_index) in enumerate(splitter.split(data, labels)):

    print(f'Trial {trial + 1}')
    print()

    ## Split the dataset into train (85%) and test (15%) sets
    crossval_data, crossval_labels = data[train_index], labels[train_index]
    test_data, test_labels = data[test_index], labels[test_index]

    ## Weight of the positive class to address class imbalance.
    # Computed from the training labels only, so that the held-out test labels
    # never influence the fitted model.
    n_sick = crossval_labels.sum()
    n_healthy = len(crossval_labels) - n_sick
    rate_train = n_healthy / n_sick

    ## Use training set for k-fold cross-validation to tune the hyperparameters
    ti = time.time()
    # random_state on both estimators makes the boosted ensemble repeatable between runs
    base_estimator = RandomForestClassifier(class_weight={0: 1, 1: rate_train}, random_state=seed)
    estimator = AdaBoostClassifier(base_estimator, random_state=seed)
    param_grid = {'n_estimators': np.arange(10, 110, 10), # The maximum number of estimators at which boosting is terminated. Default: n_estimators=50
                  'learning_rate': [0.01, 0.1, 0.2, 0.5], # Weight applied to each classifier at each boosting iteration. Default: learning_rate=1.0

                  ## Random Forest parameters
                  #'base_estimator__n_estimators': np.arange(50, 225, 25),  # Number of trees in random forest. Default: n_estimators=100
                  #'base_estimator__criterion': ['gini','entropy'],#'log_loss'], # The function to measure the quality of a split. Default: criterion='gini'
                  #'base_estimator__max_features': ['sqrt', 'log2'],  # Number of features to consider at every split. Default: max_features='sqrt'
                  #'base_estimator__max_depth': list(np.arange(10, 110, 10))+['None'],  # Maximum number of levels in tree. Default: max_depth=None
                  #'base_estimator__min_samples_split': [2, 3, 5, 10],  # Minimum number of samples required to split a node. Default: min_samples_split=2
                  #'base_estimator__min_samples_leaf': [1, 2, 3, 4],  # Minimum number of samples required at each leaf node. Default: min_samples_leaf=1
                  ##'base_estimator__bootstrap': [True, False]  # Method of selecting samples for training each tree. Default: bootstrap=True
                 }
    search = GridSearchCV(estimator, param_grid, scoring='roc_auc', cv=k)
    search.fit(crossval_data, crossval_labels)

    parameters = search.best_params_  # Parameter setting that gave the best results on the hold out data
    classifier = search.best_estimator_  # Estimator which gave highest score on the left out data

    train_time = time.time() - ti
    hours, remainder = divmod(train_time, 3600)
    minutes, seconds = divmod(remainder, 60)
    print(f'Hyperparameter tuning took {hours} hours, {minutes} minutes, and {seconds} seconds.')

    ## Store tuned hyperparameters
    trials_params.append({'classifier': 'RF_AdaBoost', 'trial': trial+1, **parameters})

    ## Save the trained model
    with open('Models/RF_AdaBoost_'+str(trial+1)+'.pkl','wb') as f:
        pickle.dump(classifier,f)

    ## Predict (probability of the positive class) and save the predictions
    predictions_train = classifier.predict_proba(crossval_data)[:,1]
    np.save('Predictions/RF_AdaBoost_train_'+str(trial+1)+'.npy',predictions_train)
    predictions_test = classifier.predict_proba(test_data)[:,1]
    np.save('Predictions/RF_AdaBoost_test_'+str(trial+1)+'.npy',predictions_test)

    ## Print the number of parameters in the model
    # Total node count over every tree of every boosted forest
    num_params = sum(
        sum(tree.tree_.node_count for tree in forest.estimators_)
        for forest in classifier.estimators_
    )
    print(f'Classifier has {num_params} parameters.')
    print()

    ## Evaluate the model
    results_train = evaluate_model(predictions_train, crossval_labels)
    print('TRAIN results:')
    for metric, value in results_train.items():
        print(f'{metric}: {value:.4f}' if isinstance(value, (float, int)) else f'{metric}: {value}')
    print()

    results_test = evaluate_model(predictions_test, test_labels)
    print('TEST results:')
    for metric, value in results_test.items():
        print(f'{metric}: {value:.4f}' if isinstance(value, (float, int)) else f'{metric}: {value}')
    print()

    ## Store results (metric names are prefixed with 'train_' / 'test_')
    trials_results.append(
        {'classifier':'RF_AdaBoost',
         'trial':trial+1,
         'parameters':num_params,
         'trainTime':train_time,
         **{'train_'+name: score for name, score in results_train.items()},
         **{'test_'+name: score for name, score in results_test.items()}
        }
    )

    print()
    print(100*'#')
    print()

In [None]:
## Persist the tuned hyperparameters and the per-trial results as CSV files
## (the number of repetitions N is part of the file names)
params_table = pd.DataFrame(trials_params)
params_table.to_csv(f'CD_Parameters_{N}trials.csv')

results_table = pd.DataFrame(trials_results).round(decimals=5)  # results are stored with 5 decimals
results_table.to_csv(f'CD_Results_{N}trials.csv')

# Compare models

In [None]:
# Turn the list of per-trial result dicts (one row per classifier and trial)
# into a DataFrame for the comparisons below.
# NOTE: this rebinds `trials_results` from a list to a DataFrame; the training
# loops above call `trials_results.append(...)` on the list, so the
# initialization cell must be re-run before any of them is executed again.
trials_results = pd.DataFrame(trials_results)

In [None]:
## Summary table: mean and standard deviation of every test metric, per model
models = trials_results.classifier.unique()

# All test-set columns except the raw confusion-matrix counts
confusion_counts = ['test_TP','test_FP','test_TN','test_FN']
metrics = [c for c in trials_results.columns if 'test_' in c and c not in confusion_counts]

# Columns come in (<metric>_mean, <metric>_std) pairs, in the order of `metrics`
stat_columns = []
for metric in metrics:
    stat_columns += [metric+'_mean', metric+'_std']
statistics = pd.DataFrame(index=models, columns=stat_columns)

for model in models:
    model_rows = trials_results[trials_results['classifier']==model]
    for metric in metrics:
        values = model_rows[metric].values  # NumPy array -> .std() is the population std
        statistics.at[model, metric+'_mean'] = values.mean()
        statistics.at[model, metric+'_std'] = values.std()

statistics

In [None]:
# For every test metric, report the model with the best mean over the trials:
# the lowest mean for losses / weighted error, the highest mean otherwise.
for metric in (name for name in metrics if 'test' in name):
    mn, st = metric+'_mean', metric+'_std'
    mean_by_model = pd.to_numeric(statistics[metric+'_mean'])
    lower_is_better = 'Loss' in metric or 'WE' in metric
    if lower_is_better:
        model_best = mean_by_model.idxmin()
        print(f'Model with lowest {metric} is {model_best} with value {statistics.loc[model_best,mn]} and standard deviation {statistics.loc[model_best,st]}')
    else:
        model_best = mean_by_model.idxmax()
        print(f'Model with highest {metric} is {model_best} with value {statistics.loc[model_best,mn]} and standard deviation {statistics.loc[model_best,st]}')

In [None]:
## Print mean and std metrics for each model
# Metric names without the 'train_' / 'test_' prefix. Kept under its own name
# so that the global `metrics` list (prefixed test columns, used by the cells
# above) is not overwritten and those cells can still be re-run afterwards.
summary_metrics = ['BCELoss','Accuracy','Sensitivity','Specificity','ROC_AUC','Precision','F1','WE']

for classifier_type in trials_results.classifier.unique():
    print(f'Classifier: {classifier_type}')
    model_rows = trials_results[trials_results['classifier'] == classifier_type]

    # Number of parameters
    n_parameters = model_rows['parameters'].values
    print(f'Mean number of parameters: {n_parameters.mean()} [{n_parameters.min()}, {n_parameters.max()}], std {n_parameters.std()}')

    # Training time (stored in seconds), shown as hours / minutes / seconds
    trainTime = model_rows['trainTime'].values
    hours, remainder = divmod(trainTime.mean(), 3600)
    minutes, seconds = divmod(remainder, 60)
    print(f'Mean training time: {hours} hours, {minutes} minutes, and {seconds} seconds, (std {trainTime.std()} sec)')
    print()

    # TRAIN results
    for metric in summary_metrics:
        values = model_rows['train_' + metric].values
        print(f'Mean train {metric}: {values.mean()}, std {values.std()}')
    print()

    # TEST results
    for metric in summary_metrics:
        values = model_rows['test_' + metric].values
        print(f'Mean test {metric}: {values.mean()}, std {values.std()}')
    print()

    print('-'*120)
    print()

In [None]:
## Show boxplots
# One panel per compared metric, laid out on a two-column grid; each panel
# shows the distribution of that metric over the trials, per classifier.
compared_metrics = ['test_BCELoss','test_Accuracy','test_F1','test_ROC_AUC','test_WE']  #[c for c in cd_trials_results.columns if 'test_' in c and c not in ['test_TP','test_FP','test_TN','test_FN','test_WE','test_Loss']]
n_metrics = len(compared_metrics)
n_rows = n_metrics//2 + int(n_metrics % 2 > 0)  # ceil(n_metrics / 2)
plt.figure(figsize=(5*n_rows, 20))
for i,metric in enumerate(compared_metrics):
    plt.subplot(n_rows,2,i+1)
    sns.boxplot(x='classifier', y=metric, data=trials_results, palette='Set2')
    plt.xticks(rotation=25, ha='right')  # Rotate labels 25 degrees
    plt.grid(axis='y')
    plt.xlabel('')
    if 'Loss' in metric or 'WE' in metric:
        # Unbounded metrics: y axis from 0 to 5% above the largest observed value
        largest = trials_results[metric].values.max()
        plt.ylim([0,largest+0.05*largest])
    else:
        # The remaining metrics are shown on a fixed [0, 1] axis
        plt.ylim([0,1])
    plt.title(metric, fontsize=16)
    #plt.tight_layout()

# Adjust the spacing BEFORE saving, so that the saved image has the same
# layout as the figure that is displayed
plt.subplots_adjust(hspace=0.3)  # Increase hspace for more vertical spacing
plt.savefig('CD_Boxplots_allModels.png')
plt.show()

In [None]:
## Statistical model comparison
from scipy.stats import probplot, shapiro
from statsmodels.multivariate.manova import MANOVA
from statsmodels.stats.multicomp import pairwise_tukeyhsd

models = trials_results.classifier.unique()
compared_metrics = ['test_BCELoss','test_Accuracy','test_F1','test_ROC_AUC','test_WE']

## 1. Select independent metrics
# The metrics should be independent: the correlation matrix is printed so that
# dependent variables can be spotted (and removed by hand, if necessary)
print('Check that the metrics are independent:')
print(trials_results[compared_metrics].corr())
print()

# Drop the metrics that have zero variance across repetitions
variances = trials_results[compared_metrics].var()
compared_metrics = [name for name in compared_metrics if variances[name] != 0]

## 2. Check for Normality
# normality[i] stays 1 only if every model passes the Shapiro-Wilk test for metric i
normality = np.ones(len(compared_metrics))
qq_rows = -(-len(models) // 5)  # rows of the Q-Q plot grid (5 plots per row)
for i, metric in enumerate(compared_metrics):
    print(metric)
    plt.figure(figsize=(20, 2*qq_rows))
    for j, model in enumerate(models):
        scores = trials_results[trials_results['classifier']==model][metric]

        # Shapiro-Wilk test: a p-value < 0.05 rejects normality
        _, p_value = shapiro(scores)
        print(f'{model}: Shapiro-Wilk p-value={p_value}')
        normality[i] *= int(p_value > 0.05)

        # Q-Q plot for a visual inspection of normality
        plt.subplot(qq_rows, 5, j+1)
        probplot(scores, dist='norm', plot=plt)
        plt.title(model)
    plt.show()
    print(f'\nNormality test for {metric}: {bool(normality[i])}')
    print(100*'-')
print()
print()


## 3. Statistical test: MANOVA
# All the retained metrics are the dependent variables; the classifier is the factor
formula = ' + '.join(compared_metrics) + ' ~ classifier'
manova = MANOVA.from_formula(formula, data=trials_results)
manova_test = manova.mv_test()
print(manova_test)

## 4. Post-hoc test: Tukey's HSD
# Only run when the classifier effect is significant according to Pillai's trace
pillai_p_value = manova_test.results['classifier']['stat']['Pr > F']['Pillai\'s trace']
if pillai_p_value < 0.05:
    for metric in compared_metrics:
        print(metric)
        tukey = pairwise_tukeyhsd(
            trials_results[metric],        # Metric data
            trials_results['classifier'],  # Grouping variable (model)
            alpha=0.05,                    # Significance level
        )
        print(tukey)
else:
    print(f"MANOVA is not significant, no need to apply Tukey's test.")

# Selected model

In [None]:
# Keep only the per-trial results of the model chosen for the final report
selected_model = 'RF'
is_selected = trials_results['classifier'] == selected_model
results_selected = trials_results[is_selected]

In [None]:
## Boxplots of the compared test metrics for the selected model only
# Long format: one row per (trial, metric) pair, as expected by seaborn
df_long = results_selected[compared_metrics].melt(
    value_vars=compared_metrics,
    var_name='metric',
    value_name='value',
)

plt.figure(figsize=(8, 5))
sns.boxplot(data=df_long, x='metric', y='value', palette='Set2', fliersize=5)

# Tick labels: metric names without the 'test_' prefix
new_labels = [name.replace('test_','') for name in compared_metrics]
tick_positions = range(len(new_labels))
plt.xticks(ticks=tick_positions, labels=new_labels, fontsize=18, rotation=25, ha='right')
plt.yticks(fontsize=18)

plt.xlabel('', fontsize=18)
plt.ylabel('', fontsize=18)
plt.grid(axis='y')
# NOTE(review): the y axis is fixed to [-0.05, 1.05]; metrics that are not
# bounded by 1 (e.g. test_WE) may fall outside the visible range -- confirm.
plt.ylim([-0.05,1.05])

plt.savefig('CD_Boxplot_'+selected_model+'.png')
plt.show()