In [None]:
## Import libraries
import numpy as np
import pandas as pd
import pickle
import time
import os

from sklearn.model_selection import StratifiedShuffleSplit
from Utils import evaluate_model_skl, store_results, visualize_boxplots, visualize_boxplot_onemodel, compare_models #, weighted_error

In [None]:
# Create the output folders. exist_ok=True keeps the cell re-runnable
# (e.g. Restart Kernel -> Run All) when the folders are already present.
for output_dir in ('Clinical_data_classifiers',
                   'Clinical_data_classifiers/Models',
                   'Clinical_data_classifiers/Predictions'):
    os.makedirs(output_dir, exist_ok=True)

In [None]:
# Let pandas use up to 1000 characters per line when printing frames
pd.options.display.width = 1000

# Load data

In [None]:
# The two cohorts are stored in separate CSV files (labelled 0 and 1 respectively further down)
clinical_data_h, clinical_data_s = (
    pd.read_csv(csv_path)
    for csv_path in ('Clinical_data/clinical_data_h.csv',
                     'Clinical_data/clinical_data_s.csv')
)
# Column names are taken from the first file and reused for the merged table
cd_colnames = clinical_data_h.columns

In [None]:
## Generate labels
# One label per row: 0 for every record of clinical_data_h, 1 for every record of clinical_data_s
labels_h = [0 for _ in range(len(clinical_data_h))]
labels_s = [1 for _ in range(len(clinical_data_s))]

In [None]:
# Stack both cohorts row-wise; features and labels stay aligned because both use the h-then-s order
data = pd.concat((clinical_data_h, clinical_data_s), axis=0)
labels = np.concatenate([labels_h, labels_s], axis=0)

# Pre-processing

In [None]:
## Normalize
# Column-wise min-max scaling. The upper bound of every column is floored at 1, so a
# column whose maximum is below 1 is scaled against 1 instead of its own maximum.
# np.maximum builds a fresh array instead of writing into the `.values` buffer of the
# Series returned by data.max(); that buffer is read-only when pandas Copy-on-Write is
# active, which made the previous in-place assignment fail.
# NOTE(review): the statistics are computed on the full dataset, i.e. before the
# train/test split performed further down - confirm this is intended.
# NOTE(review): a constant column with value >= 1 has M == m and turns into NaN (0/0).
M = np.maximum(data.max().to_numpy(), 1)
m = data.min().to_numpy()

data = (data-m)/(M-m)

In [None]:
## Select columns
# Positions (in the CSV column order) of the features that are discarded
id_columns_to_delete = [1, 3, 4, 6, 8, 9, 10, 12, 13, 14, 15, 16, 18, 22, 23, 25]
columns_to_delete = cd_colnames[id_columns_to_delete]
data = data.drop(columns=columns_to_delete)
# Keep the list of column names in sync with the columns that remain in `data`
cd_colnames = [name for name in cd_colnames if name not in columns_to_delete]

In [None]:
## Convert to numpy array
# From here on `data` is a float32 array; the training loop selects rows by integer
# position (data[train_index]), which needs positional rather than label indexing.
data = np.asarray(data, dtype=np.float32)

# Function definitions

In [None]:
## Hyperparameter tuning
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score, RandomizedSearchCV

def tune_hyperparameters(classifier_type, data, labels, k=4):
    """Grid-search the hyperparameters of one classifier family with k-fold cross-validation.

    Label 1 is weighted by ``n_healthy / n_sick`` (label 0 keeps weight 1), so both
    classes carry the same total weight. Candidates are ranked by ROC AUC.

    Parameters
    ----------
    classifier_type : str
        One of 'SVC' / 'SVM', 'DT', 'RF', 'DT_AdaBoost', 'RF_AdaBoost'.
    data : array-like
        Feature matrix handed to ``GridSearchCV.fit``.
    labels : numpy.ndarray
        Binary labels (0 = healthy, 1 = sick); must support ``.sum()`` and ``len()``.
    k : int, default 4
        Passed as ``cv`` to ``GridSearchCV``.

    Returns
    -------
    tuple or None
        ``(best_estimator_, best_params_)`` of the fitted search, or ``None`` (after
        printing a message) when ``classifier_type`` is not recognised.
    """
    n_sick = labels.sum()
    n_healthy = len(labels) - n_sick
    rate_train = n_healthy / n_sick
    class_weight = {0: 1, 1: rate_train}

    if classifier_type == 'SVC' or classifier_type == 'SVM':
        # Support Vector Machine (SVM) classifier; probability=True enables predict_proba
        estimator = SVC(class_weight=class_weight, probability=True)
        param_grid = {'C': [1,10,100,1000], # Regularization parameter. Default: C=1.0
                      'kernel': ['linear', 'rbf', 'sigmoid', 'poly'], # Default: kernel='rbf'
                      'gamma': ['scale', 'auto', 1.0, 0.1, 0.01, 0.001, 0.0001, 0.00001] # Kernel coefficient for 'rbf', 'poly' and 'sigmoid'. Default: gamma='scale'
                     }

    elif classifier_type == 'DT':
        # Decision Tree (DT) classifier
        estimator = DecisionTreeClassifier(class_weight=class_weight)
        param_grid = {'ccp_alpha' : np.arange(0, 0.1, 0.01), # Complexity parameter used for Minimal Cost-Complexity Pruning. Default: ccp_alpha = 0.0
                      'criterion': ['gini','entropy'],#'log_loss'], # The function to measure the quality of a split. Default: criterion='gini'
                      'max_depth' : [None, 1, 5, 10, 15], # The maximum depth of the tree. Default: max_depth=None
                      #'max_features': [None, 'sqrt', 'log2'], # The number of features to consider when looking for the best split. Default: max_features=None
                      'max_leaf_nodes': [None, 3, 6, 9], # Grow a tree with max_leaf_nodes in best-first fashion. Default: None
                      'min_samples_leaf': [1, 2, 3, 4], # The minimum number of samples required to be at a leaf node. Default: min_samples_leaf=1
                      'min_samples_split' : [2, 5, 10, 15], # The minimum number of samples required to split an internal node. Default: min_samples_split=2
                      'min_weight_fraction_leaf' : np.arange(0.0, 0.5, 0.05), # The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Default: min_weight_fraction_leaf=0
                      #'splitter': ['best','random'] # The strategy used to choose the split at each node. Default: splitter='best'
                     }

    elif classifier_type == 'RF':
        # Random Forest (RF) classifier
        estimator = RandomForestClassifier(class_weight=class_weight)
        param_grid = {'n_estimators': np.arange(50, 225, 25),  # Number of trees in random forest. Default: n_estimators=100
                      'criterion': ['gini','entropy'],#'log_loss'], # The function to measure the quality of a split. Default: criterion='gini'
                      'max_features': ['sqrt', 'log2'],  # Number of features to consider at every split. Default: max_features='sqrt'
                      # The unlimited-depth option must be the None object: the string 'None'
                      # is not a valid max_depth, so every such candidate would fail to fit.
                      'max_depth': list(np.arange(10, 110, 10))+[None],  # Maximum number of levels in tree. Default: max_depth=None
                      'min_samples_split': [2, 3, 5, 10],  # Minimum number of samples required to split a node. Default: min_samples_split=2
                      'min_samples_leaf': [1, 2, 3, 4],  # Minimum number of samples required at each leaf node. Default: min_samples_leaf=1
                      #'bootstrap': [True, False]  # Method of selecting samples for training each tree. Default: bootstrap=True
                     }

    elif classifier_type == 'DT_AdaBoost':
        # AdaBoost with Decision Tree base estimator; only the boosting parameters are searched
        base_estimator = DecisionTreeClassifier(class_weight=class_weight)
        estimator = AdaBoostClassifier(base_estimator)
        param_grid = {'n_estimators': np.arange(10, 110, 10), # The maximum number of estimators at which boosting is terminated. Default: n_estimators=50
                      'learning_rate': [0.01, 0.1, 0.5, 1.0], # Weight applied to each classifier at each boosting iteration. Default: learning_rate=1.0

                      ## Decision Tree parameters (disabled)
                      #'base_estimator__ccp_alpha' : np.arange(0, 0.1, 0.01), # Complexity parameter used for Minimal Cost-Complexity Pruning. Default: ccp_alpha = 0.0
                      #'base_estimator__criterion': ['gini','entropy'],#'log_loss'], # The function to measure the quality of a split. Default: criterion='gini'
                      #'base_estimator__max_depth' : [None, 1, 5, 10, 15], # The maximum depth of the tree. Default: max_depth=None
                      ##'base_estimator__max_features': [None, 'sqrt', 'log2'], # The number of features to consider when looking for the best split. Default: max_features=None
                      #'base_estimator__max_leaf_nodes': [None, 3, 6, 9], # Grow a tree with max_leaf_nodes in best-first fashion. Default: None
                      #'base_estimator__min_samples_leaf': [1, 2, 3, 4], # The minimum number of samples required to be at a leaf node. Default: min_samples_leaf=1
                      #'base_estimator__min_samples_split' : [2, 5, 10, 15], # The minimum number of samples required to split an internal node. Default: min_samples_split=2
                      #'base_estimator__min_weight_fraction_leaf' : np.arange(0.0, 0.5, 0.05), # The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Default: min_weight_fraction_leaf=0
                      ##'base_estimator__splitter': ['best','random'] # The strategy used to choose the split at each node. Default: splitter='best'
                     }

    elif classifier_type == 'RF_AdaBoost':
        # AdaBoost with Random Forest base estimator; only the boosting parameters are searched
        base_estimator = RandomForestClassifier(class_weight=class_weight)
        estimator = AdaBoostClassifier(base_estimator)
        param_grid = {'n_estimators': np.arange(10, 110, 10), # The maximum number of estimators at which boosting is terminated. Default: n_estimators=50
                      'learning_rate': [0.01, 0.1, 0.2, 0.5], # Weight applied to each classifier at each boosting iteration. Default: learning_rate=1.0

                      ## Random Forest parameters (disabled)
                      #'base_estimator__n_estimators': np.arange(50, 225, 25),  # Number of trees in random forest. Default: n_estimators=100
                      #'base_estimator__criterion': ['gini','entropy'],#'log_loss'], # The function to measure the quality of a split. Default: criterion='gini'
                      #'base_estimator__max_features': ['sqrt', 'log2'],  # Number of features to consider at every split. Default: max_features='sqrt'
                      #'base_estimator__max_depth': list(np.arange(10, 110, 10))+[None],  # Maximum number of levels in tree. Default: max_depth=None
                      #'base_estimator__min_samples_split': [2, 3, 5, 10],  # Minimum number of samples required to split a node. Default: min_samples_split=2
                      #'base_estimator__min_samples_leaf': [1, 2, 3, 4],  # Minimum number of samples required at each leaf node. Default: min_samples_leaf=1
                      ##'base_estimator__bootstrap': [True, False]  # Method of selecting samples for training each tree. Default: bootstrap=True
                     }

    else:
        # Unknown family: report it and hand back None (callers that unpack the result will fail)
        print('Wrong classifier type')
        return None

    cost_scorer = 'roc_auc'  # cost_scorer = make_scorer(weighted_error, greater_is_better=False)

    # Tune hyperparameters with k-fold cross-validation on the given (training) data
    classifier = GridSearchCV(estimator, param_grid, scoring=cost_scorer, cv=k)
    classifier.fit(data, labels)

    return classifier.best_estimator_, classifier.best_params_

In [None]:
## Number of parameters in a classifier

def num_parameters(classifier_type, classifier):
    """Return a size measure of a fitted classifier, or None for an unknown type.

    - 'SVC' / 'SVM': number of support vectors plus the length of the first row of ``dual_coef_``.
    - 'DT': node count of the fitted tree.
    - 'RF' / 'DT_AdaBoost': node counts summed over ``estimators_``.
    - 'RF_AdaBoost': node counts summed over every tree of every forest in ``estimators_``.
    """
    if classifier_type in ('SVC', 'SVM'):
        return len(classifier.support_vectors_) + len(classifier.dual_coef_[0])

    if classifier_type == 'DT':
        return classifier.tree_.node_count

    if classifier_type in ('RF', 'DT_AdaBoost'):
        # Both ensembles hold plain decision trees in estimators_
        node_total = 0
        for member in classifier.estimators_:
            node_total += member.tree_.node_count
        return node_total

    if classifier_type == 'RF_AdaBoost':
        # Each boosting stage is itself a forest, so walk two levels of estimators_
        node_total = 0
        for forest in classifier.estimators_:
            for member in forest.estimators_:
                node_total += member.tree_.node_count
        return node_total

    return None

# Tune hyperparameters and train N times

In [None]:
# N: number of random train/test repetitions
# k: k for k-fold cross-validation in hyperparameter tuning
# seed: random state of the train/test splitter
N, k, seed = 10, 4, 42

In [None]:
trials_params, trials_results = [], []


def _print_metrics(header, results):
    """Print `header`, one 'metric: value' line per entry (numbers with 4 decimals), then a blank line."""
    print(header)
    for metric, value in results.items():
        if isinstance(value, (float, int)):
            print(f'{metric}: {value:.4f}')
        else:
            print(f'{metric}: {value}')
    print()


# N stratified random partitions, each holding out 15% of the samples for testing
n_test_samples = int(round(0.15*len(labels)))
splitter = StratifiedShuffleSplit(n_splits=N, test_size=n_test_samples, random_state=seed)

for trial, (train_index, test_index) in enumerate(splitter.split(data, labels)):

    print(f'Trial {trial + 1}')
    print()

    ### Split the dataset
    crossval_data, test_data = data[train_index], data[test_index]
    crossval_labels, test_labels = labels[train_index], labels[test_index]

    for classifier_type in ['SVM','DT','RF','DT_AdaBoost','RF_AdaBoost']:

        print(f'Classifier: {classifier_type}')

        ## Tune hyperparameters with k-fold cross-validation; the returned model is the best estimator of the search
        ti = time.time()
        classifier, parameters = tune_hyperparameters(classifier_type, crossval_data, crossval_labels, k)
        trials_params.append({'classifier': classifier_type, 'trial': trial+1, **parameters})
        train_time = time.time() - ti

        hours, remainder = divmod(train_time, 3600)
        minutes, seconds = divmod(remainder, 60)
        print(f'Hyperparameter tuning took {hours} hours, {minutes} minutes, and {seconds} seconds.')

        ## Save the model
        model_path = 'Clinical_data_classifiers/Models/'+classifier_type+'_'+str(trial+1)+'.pkl'
        with open(model_path, 'wb') as f:
            pickle.dump(classifier, f)

        ## Predict (probability of class 1) and store the scores of both partitions
        predictions_train = classifier.predict_proba(crossval_data)[:,1]
        np.save('Clinical_data_classifiers/Predictions/'+classifier_type+'_train_'+str(trial+1)+'.npy', predictions_train)
        predictions_test = classifier.predict_proba(test_data)[:,1]
        np.save('Clinical_data_classifiers/Predictions/'+classifier_type+'_test_'+str(trial+1)+'.npy', predictions_test)

        ## Print the number of parameters in the model
        num_params = num_parameters(classifier_type, classifier)
        print(f'Classifier has {num_params} parameters.')
        print()

        ## Evaluate the model
        results_train = evaluate_model_skl(predictions_train, crossval_labels)
        _print_metrics('TRAIN results:', results_train)

        results_test = evaluate_model_skl(predictions_test, test_labels)
        _print_metrics('TEST results:', results_test)

        ## Store results
        trials_results.append({'classifier': classifier_type,
                               'trial': trial+1,
                               **store_results(num_params, train_time, results_train, results_test)})

    print()
    print(100*'#')
    print()

# Persist the tuned hyperparameters and the metrics of every (trial, classifier) pair
pd.DataFrame(trials_params).to_csv('Clinical_data_classifiers/Parameters_'+str(N)+'trials.csv')
pd.DataFrame(trials_results).round(decimals=5).to_csv('Clinical_data_classifiers/Results_'+str(N)+'trials.csv')
        

# Compare models

In [None]:
## Read results
# Reload the saved table (first CSV column is the index) and replace missing values with a tiny constant
trials_results = pd.read_csv('Clinical_data_classifiers/Results_'+str(N)+'trials.csv', index_col=0).fillna(1e-10)

In [None]:
## Print statistics
models = trials_results.classifier.unique()
# Every test metric except the raw confusion-matrix counts
metrics = [c for c in trials_results.columns
           if 'test_' in c and c not in ('test_TP','test_FP','test_TN','test_FN')]

# One row per model, a '<metric>_mean' / '<metric>_std' column pair per metric
statistics = pd.DataFrame(index=models,
                          columns=[metric+suffix for metric in metrics for suffix in ('_mean', '_std')])

for model in models:
    model_rows = trials_results[trials_results['classifier']==model]
    for metric in metrics:
        mn, st = metric+'_mean', metric+'_std'
        results = model_rows[metric].values
        statistics.at[model,mn] = results.mean()
        statistics.at[model,st] = results.std()  # numpy std, i.e. population std (ddof=0)

statistics

In [None]:
# Report, for every test metric, the model with the best mean over the trials
for metric in [name for name in metrics if 'test' in name]:
    mn, st = metric+'_mean', metric+'_std'
    mean_by_model = pd.to_numeric(statistics[mn])
    if 'Loss' in metric or 'WE' in metric:
        # Loss / WE metrics: the smallest mean wins
        model_best = mean_by_model.idxmin()
        print(f'Model with lowest {metric} is {model_best} with value {statistics.loc[model_best,mn]} and standard deviation {statistics.loc[model_best,st]}')
    else:
        # All other metrics: the largest mean wins
        model_best = mean_by_model.idxmax()
        print(f'Model with highest {metric} is {model_best} with value {statistics.loc[model_best,mn]} and standard deviation {statistics.loc[model_best,st]}')

In [None]:
## Print mean and std metrics for each model
# Metric base names, prefixed with 'train_' / 'test_' below. A dedicated name is used so
# the notebook-level `metrics` list of 'test_*' columns is not overwritten: cells that
# iterate over `metrics` would otherwise silently print nothing when re-run after this one.
summary_metrics = ['BCELoss','Accuracy','Sensitivity','Specificity','ROC_AUC','Precision','F1','WE']

for classifier_type in trials_results.classifier.unique():
    print(f'Classifier: {classifier_type}')
    results = trials_results[trials_results['classifier'] == classifier_type]

    # Number of parameters
    parameters = results['Parameters'].values
    print(f'Mean number of parameters: {parameters.mean()} [{parameters.min()}, {parameters.max()}], std {parameters.std()}')

    # training time (stored in seconds, reported as h/min/s)
    trainTime = results['trainTime'].values
    hours, remainder = divmod(trainTime.mean(), 3600)
    minutes, seconds = divmod(remainder, 60)
    print(f'Mean training time: {hours} hours, {minutes} minutes, and {seconds} seconds, (std {trainTime.std()} sec)')
    print()

    # TRAIN results
    for metric in summary_metrics:
        values = results['train_' + metric].values
        print(f'Mean train {metric}: {values.mean()}, std {values.std()}')
    print()

    # TEST results
    for metric in summary_metrics:
        values = results['test_' + metric].values
        print(f'Mean test {metric}: {values.mean()}, std {values.std()}')
    print()

    print('-'*120)
    print()

In [None]:
## Show boxplots
# Test metrics compared across all models.
# Alternative selection: [c for c in cd_trials_results.columns if 'test_' in c and c not in ['test_TP','test_FP','test_TN','test_FN','test_WE','test_Loss']]
boxplot_metrics = ['test_BCELoss','test_Accuracy','test_F1','test_ROC_AUC','test_WE']
# NOTE(review): the third argument presumably toggles saving to the given path - confirm in Utils
visualize_boxplots(trials_results, boxplot_metrics, True, 'Clinical_data_classifiers/Boxplots_allModels.png')

In [None]:
## Statistical model comparison
# compare_models is imported from the project's Utils module and receives the full per-trial results table.
# NOTE(review): which statistical test it applies is not visible in this notebook - confirm in Utils.
compare_models(trials_results)

# Model selection

In [None]:
# Classifier examined in detail below; must match a value of the 'classifier' column of trials_results
selected_model = 'RF'

In [None]:
# Restrict the results table to the selected classifier before plotting its test metrics
selected_results = trials_results[trials_results['classifier']==selected_model]
selected_metrics = ['test_Accuracy','test_Sensitivity','test_Specificity','test_F1','test_ROC_AUC']
visualize_boxplot_onemodel(selected_results, selected_metrics,
                           True, 'Clinical_data_classifiers/Boxplot_'+selected_model+'.png')