In [None]:
import pandas as pd
import numpy as np
import warnings
import mrmr
warnings.filterwarnings("ignore")

In [None]:
# Column names for the headerless feature matrix, in file order
feature_names =  ['age', 'rank', 'height', 'weight', 'bmi', 'bp_s', 'bp_d', 'bp', 'map', 'smoker', 'alcohol', 'faam', 'eilp', 'chronicity', 'ttp', 'ttd', 'ttt', 'wait time', 'time_dg', 'co_morb', 'prior_injuries', 'prior_surgery', 'prior_courses']

# Two candidate features are treated as redundant when |Pearson r| exceeds this value
CORR_THRESHOLD = 0.5

# Load the headerless feature matrix and name its columns
X = pd.read_csv('data//X.csv', header=None)
X.columns = feature_names

# Load the target as a one-column DataFrame named 'labels'
y = pd.read_csv('data//y.csv', header=None)
y.columns = ['labels']

# Rank the features with mRMR (minimum redundancy, maximum relevance).
# K requests every column, but the code below does not rely on getting exactly
# K names back: it walks the returned ranking itself instead of range(X.shape[1]),
# so a shorter ranking (presumably possible when mrmr discards features it scores
# as irrelevant -- confirm against the mrmr docs) no longer raises IndexError.
selected_features = mrmr.mrmr_classif(X, y.iloc[:, 0].values, K=X.shape[1])
if len(selected_features) == 0:
    raise ValueError("mrmr_classif returned no features; cannot build the reduced feature matrix")

# Greedy redundancy filter: walk the ranking from best to worst and keep a feature
# only if it is not strongly correlated with any feature kept so far.
# NOTE(review): both the ranking and this filter use every row of X/y, including
# rows that later serve as cross-validation test folds -- consider moving the
# selection inside the CV pipeline.
valid_features = []
for candidate in selected_features:
    if valid_features:
        # Last column of the correlation matrix = candidate vs. each kept feature.
        # NaN correlations (e.g. constant columns) compare False and do not block.
        corr_to_kept = X[valid_features + [candidate]].corr().iloc[:-1, -1].abs()
        if (corr_to_kept > CORR_THRESHOLD).any():
            continue
    valid_features.append(candidate)

# Reduced feature matrix containing only the retained features
X_ = X[valid_features].copy()

In [None]:
import inspect

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate, StratifiedKFold, RandomizedSearchCV
from sklearn.svm import SVC
from xgboost import XGBClassifier

In [None]:
# Search spaces for RandomizedSearchCV; keys are parameter names of the bare classifier
param_grid_logistic = {
    'C': [0.001, 0.01, 0.1, 0.5, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'saga']
}

param_grid_svc = {
    'C': [0.001, 0.01, 0.1, 0.5, 1, 10, 100],
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
    'gamma': ['scale', 'auto']
}

param_grid_rf = {
    'n_estimators': [10, 20, 50, 100],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [1, 2]
}

param_grid_lda = {}  # no hyperparameters are tuned for LinearDiscriminantAnalysis

param_grid_xgb = {
    'learning_rate': [0.001, 0.005, 0.01, 0.05, 0.1],
    'max_depth': [1, 2],
    'n_estimators': [10, 20, 50, 100],
    'booster': ['gbtree', 'gblinear', 'dart']
}

# Base estimators. The first three handle class imbalance through class weights,
# the last two through SMOTE oversampling inside an imblearn Pipeline.
# Every stochastic estimator is seeded so that a re-run reproduces the same scores.
models = [LogisticRegression(class_weight='balanced', random_state=42),
          SVC(probability=True, max_iter=100, class_weight='balanced', random_state=42),
          RandomForestClassifier(class_weight='balanced', warm_start=True, random_state=42),
          Pipeline([('smote', SMOTE(sampling_strategy='auto', random_state=42)), ('clf', LinearDiscriminantAnalysis())]),
          Pipeline([('smote', SMOTE(sampling_strategy='auto', random_state=42)), ('clf', XGBClassifier(random_state=42))])]

model_names = ['Logistic Regression',
               'SVM',
               'Random Forest',
               'Linear Discriminant Analysis',
               'XGBoost']

param_grids = [param_grid_logistic, param_grid_svc, param_grid_rf, param_grid_lda, param_grid_xgb]

# Wrap every model in a randomized hyperparameter search (ROC AUC, 4-fold inner CV)
tuned_models = []
for model, param_grid in zip(models, param_grids):
    if isinstance(model, Pipeline):
        # Tune the pipeline as a whole rather than placing the search inside it.
        # With the search as the pipeline's last step, SMOTE would run once before
        # the inner CV split, so synthetic samples derived from inner-validation
        # rows would leak into the inner training folds. Searching over the
        # pipeline re-fits SMOTE on the training part of each inner split.
        final_step_name = model.steps[-1][0]
        search_space = {f'{final_step_name}__{param}': values for param, values in param_grid.items()}
    else:
        search_space = param_grid

    tuned_models.append(
        RandomizedSearchCV(model, param_distributions=search_space, scoring='roc_auc', n_iter=10, random_state=42, cv=4)
    )

# Display name -> tuned estimator, in the order of model_names
model_dict = dict(zip(model_names, tuned_models))

In [None]:
def sensitivity_function(y_true, y_pred):
  """Return the sensitivity (true-positive rate), TP / (TP + FN).

  The counts come from ``metrics.confusion_matrix(y_true, y_pred)``; index 1 of
  that matrix is taken as the positive class.
  """
  cm = metrics.confusion_matrix(y_true, y_pred)
  # Row 1 holds the samples whose true class is the positive one
  true_pos = cm[1][1]
  false_neg = cm[1][0]
  return true_pos / float(true_pos + false_neg)

def specificity_function(y_true, y_pred):
  """Return the specificity (true-negative rate), TN / (TN + FP).

  The counts come from ``metrics.confusion_matrix(y_true, y_pred)``; index 0 of
  that matrix is taken as the negative class.
  """
  cm = metrics.confusion_matrix(y_true, y_pred)
  # Row 0 holds the samples whose true class is the negative one
  true_neg = cm[0][0]
  false_pos = cm[0][1]
  return true_neg / float(true_neg + false_pos)

def _optimal_roc_point(y_true, y_score):
  """Return ``(fpr, tpr)`` at the ROC point where sensitivity is closest to specificity.

  ``tpr - (1 - fpr)`` is sensitivity minus specificity, so the point minimising
  its absolute value is the operating point where the two are most balanced.
  Ties are resolved in favour of the first such point on the curve.
  """
  fpr, tpr, _ = metrics.roc_curve(y_true, y_score)
  balance_gap = np.abs(tpr - (1 - fpr))
  best = int(np.argmin(balance_gap))
  return float(fpr[best]), float(tpr[best])

def orp_fpr_function(y_true, y_pred):
  """Return the false-positive rate at the optimal ROC operating point.

  ``y_pred`` holds continuous scores (it is passed to ``metrics.roc_curve``),
  not hard class labels.
  """
  return _optimal_roc_point(y_true, y_pred)[0]

def orp_tpr_function(y_true, y_pred):
  """Return the true-positive rate at the optimal ROC operating point.

  ``y_pred`` holds continuous scores (it is passed to ``metrics.roc_curve``),
  not hard class labels.
  """
  return _optimal_roc_point(y_true, y_pred)[1]

def evaluate(y_test, y_hat, y_prob):
  """Compute the full metric set for one set of predictions.

  ``y_hat`` feeds the label-based metrics, ``y_prob`` the ROC-based ones.
  Returns ``[accuracy, sensitivity, specificity, AUC, ORP FPR, ORP TPR]``.
  """
  return [
    metrics.accuracy_score(y_test, y_hat),
    sensitivity_function(y_test, y_hat),
    specificity_function(y_test, y_hat),
    metrics.roc_auc_score(y_test, y_prob),
    orp_fpr_function(y_test, y_prob),
    orp_tpr_function(y_test, y_prob),
  ]

# Scorers built on hard class predictions
sensitivity = metrics.make_scorer(sensitivity_function, greater_is_better=True)
specificity = metrics.make_scorer(specificity_function, greater_is_better=True)

# The ORP metrics need predicted probabilities. Newer scikit-learn releases
# replaced make_scorer's `needs_proba` flag with `response_method`; where
# `needs_proba` no longer exists it would be forwarded to the metric function as
# an unexpected keyword and cross_validate would report NaN instead of failing.
# Pick whichever keyword the installed version supports.
if 'response_method' in inspect.signature(metrics.make_scorer).parameters:
    proba_scorer_kwargs = {'response_method': 'predict_proba'}
else:
    proba_scorer_kwargs = {'needs_proba': True}

# greater_is_better stays True for both so the reported values keep their sign;
# these scorers are used for reporting only, not for model selection.
orp_fpr = metrics.make_scorer(orp_fpr_function, greater_is_better=True, **proba_scorer_kwargs)
orp_tpr = metrics.make_scorer(orp_tpr_function, greater_is_better=True, **proba_scorer_kwargs)

# Metric name -> scorer, as consumed by cross_validate
scoring = {'Accuracy' : 'accuracy', 'Sensitivity': sensitivity, 'Specificity' : specificity, 'AUC': 'roc_auc', 'ORP FPR' : orp_fpr, 'ORP TPR' : orp_tpr, 'F1': 'f1'}

In [None]:
def _mean_pm_std(fold_scores):
    """Format per-fold scores as 'mean ± std', both rounded to two decimals."""
    return str(np.round(np.mean(fold_scores), 2)) + ' ± ' + str(np.round(np.std(fold_scores), 2))

# Metrics written to the results table, in column order ('F1' is scored but not reported)
_reported_metrics = ['Accuracy', 'Sensitivity', 'Specificity', 'AUC', 'ORP FPR', 'ORP TPR']

# Two rows per model, both labelled with the model name:
# first the training-fold summary, then the test-fold summary.
results = []
for name, model in model_dict.items():
    scores = cross_validate(
        estimator=model,
        X=X_.values,
        y=y['labels'].values,
        cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
        scoring=scoring,
        return_train_score=True,
    )
    for split in ('train', 'test'):
        row = [name]
        row.extend(_mean_pm_std(scores[f'{split}_{metric}']) for metric in _reported_metrics)
        results.append(row)

In [None]:
# Stack the per-model rows into one table; every cell is already a formatted string
results_df = pd.DataFrame(
    np.vstack(results),
    columns=['Model', 'Accuracy', 'Sensitivity', 'Specificity', 'AUROC', 'ORP FPR', 'ORP TPR'],
)

# Persist the table next to the input data, without the row index
results_df.to_csv('data//classification_result.csv', index=None)