## Importing necessary libraries

In [1]:
%pwd

'C:\\Users\\Eleonora\\statistical learning for healthcare data\\progetto'

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
np.random.seed(130298) 

In [3]:
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, auc, roc_curve, mean_squared_error

In [4]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer 
from fancyimpute import IterativeImputer, KNN

In [6]:
from sklearn import svm
from sklearn.ensemble import IsolationForest
from sklearn.tree import DecisionTreeClassifier

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import auc, f1_score, recall_score, precision_score, roc_curve, mean_squared_error

In [8]:
from sklearn.model_selection import GridSearchCV

In [40]:
from sklearn.ensemble import AdaBoostClassifier

In [9]:
def compute_metrics(Y, final_pred):
    fpr, tpr, _ = roc_curve(Y,final_pred)
    AUC = auc(fpr, tpr)
    f1 = f1_score(Y,final_pred)
    rec = recall_score(Y,final_pred)
    prec = precision_score(Y,final_pred)
    print(f'AUC: {AUC}, F1: {f1}, Recall: {rec}, Precision: {prec}')
    return [AUC, f1, rec, prec]

## Datasets definition

In [10]:
outcome_name = 're.admission.within.6.months'

# take the training set
X_train = pd.read_csv('train_data_drugs.csv')
X_train.set_index('inpatient.number', inplace = True)

# separate the outcome
Y_train = X_train[outcome_name].copy()
X_train.drop(columns = outcome_name, inplace = True)

# take the test set
X_test = pd.read_csv('test_data_drugs.csv')
X_test.set_index('inpatient.number', inplace = True)

# separate the outcome
Y_test = X_test[outcome_name].copy()
X_test.drop(columns = outcome_name, inplace = True)


In [11]:
print(f'Size of traing set: {X_train.shape} and of test set: {X_test.shape}')

Size of traing set: (1567, 81) and of test set: (397, 81)


In [12]:
# take the lists of variables by type

cat_columns = ['DestinationDischarge','admission.ward','admission.way','discharge.department',
                       'type.of.heart.failure', 'NYHA.cardiac.function.classification', 'Killip.grade',
                       'consciousness', 'ageCat']

ordinal_columns = ['CCI.score', 'eye.opening','verbal.response', 'movement', 'GCS']

not_continuous = cat_columns.copy()

binary_columns = ['gender', 'diabetes', 'moderate.to.severe.chronic.kidney.disease',
                  'return.to.emergency.department.within.6.months', 'diuretics',
                  'hypertension', 'heart_failure', 'angina_etal', 'cholesterol_drug']

not_continuous.extend(binary_columns)
not_continuous.extend(ordinal_columns)

In [13]:
# for each category we add its specification, needed to retreive columns after OneHotEcoding
for cat in cat_columns:
    X_train[cat] = cat + '_' + X_train[cat]
    X_test[cat] = cat + '_' + X_test[cat]

In [14]:
cont_columns = [col_name for col_name in X_train.columns if col_name not in not_continuous]

In [15]:
var_to_log = ['creatinine.enzymatic.method', 'urea', 'glomerular.filtration.rate', 
              'cystatin', 'lymphocyte.count', 'neutrophil.count',
              'activated.partial.thromboplastin.time', 'prothrombin.time.ratio',
              'glutamyltranspeptidase','indirect.bilirubin','alkaline.phosphatase',
              'globulin','direct.bilirubin','cholesterol',
              #'low.density.lipoprotein.cholesterol', # removed in the last corr analysis
              'triglyceride']

## Pipeline definition

In [16]:
def log_transf_stand(x):
    "Transform to log and standardize"
    x = np.log(x)
    return (x - x.mean())/x.std()

log_transformer = FunctionTransformer(log_transf_stand)

In [17]:
full_pipeline = ColumnTransformer([
        ("log", log_transformer, var_to_log),
        ("num", StandardScaler(), list(set(cont_columns) - set(var_to_log))),     
        ("cat", OneHotEncoder(), cat_columns)
], remainder = 'passthrough') # do not modify columns not listed

#total_pipeline = Pipeline(['stationarity', starting_pipeline])

X_train_prepared = full_pipeline.fit_transform(X_train)
X_test_prepared = full_pipeline.transform(X_test) 

  result = func(self.values, **kwargs)
  result = func(self.values, **kwargs)


In [18]:
print(f'Size of traing set: {X_train_prepared.shape} and of test set: {X_test_prepared.shape}')

Size of traing set: (1567, 107) and of test set: (397, 107)


In [19]:
# imputation cannot be done inside the Pipeline because the method 'transform' is missing

knn_imputer = KNN()
X_train_prepared = knn_imputer.fit_transform(X_train_prepared)
X_test_prepared = knn_imputer.fit_transform(X_test_prepared)

Imputing row 1/1567 with 1 missing, elapsed time: 1.688
Imputing row 101/1567 with 3 missing, elapsed time: 1.695
Imputing row 201/1567 with 4 missing, elapsed time: 1.702
Imputing row 301/1567 with 1 missing, elapsed time: 1.711
Imputing row 401/1567 with 17 missing, elapsed time: 1.718
Imputing row 501/1567 with 5 missing, elapsed time: 1.727
Imputing row 601/1567 with 5 missing, elapsed time: 1.734
Imputing row 701/1567 with 6 missing, elapsed time: 1.742
Imputing row 801/1567 with 2 missing, elapsed time: 1.751
Imputing row 901/1567 with 1 missing, elapsed time: 1.758
Imputing row 1001/1567 with 2 missing, elapsed time: 1.766
Imputing row 1101/1567 with 2 missing, elapsed time: 1.775
Imputing row 1201/1567 with 5 missing, elapsed time: 1.786
Imputing row 1301/1567 with 2 missing, elapsed time: 1.794
Imputing row 1401/1567 with 5 missing, elapsed time: 1.802
Imputing row 1501/1567 with 1 missing, elapsed time: 1.809
Imputing row 1/397 with 10 missing, elapsed time: 0.117
Imputing ro

In [20]:
print(f'Size of traing set: {X_train_prepared.shape} and of test set: {X_test_prepared.shape}')

Size of traing set: (1567, 107) and of test set: (397, 107)


In [20]:
# list of all the columns names
features_names = ['body.temperature', 'pulse', 'respiration', 'systolic.blood.pressure',
                  'diastolic.blood.pressure', 'weight', 'BMI', 'fio2',
                  'left.ventricular.end.diastolic.diameter.LV', 'creatinine.enzymatic.method',
                  'urea', 'uric.acid', 'glomerular.filtration.rate', 'cystatin', 'monocyte.count',
                  'red.blood.cell', 'coefficient.of.variation.of.red.blood.cell.distribution.width',
                  'standard.deviation.of.red.blood.cell.distribution.width', 'mean.corpuscular.volume', 
                  'lymphocyte.count', 'mean.hemoglobin.concentration', 'mean.platelet.volume',
                  'eosinophil.count', 'hemoglobin', 'platelet', 'platelet.distribution.width',
                  'neutrophil.count', 'D.dimer', 'activated.partial.thromboplastin.time',
                  'thrombin.time', 'prothrombin.activity', 'prothrombin.time.ratio', 'fibrinogen',
                  'high.sensitivity.troponin', 'carbon.dioxide.binding.capacity', 'potassium', 
                  'chloride', 'sodium', 'glutamic.oxaloacetic.transaminase', 'creatine.kinase',
                  'creatine.kinase.isoenzyme', 'lactate.dehydrogenase', 'brain.natriuretic.peptide',
                  'nucleotidase', 'fucosidase', 'albumin', 'white.globulin.ratio',
                  'glutamyltranspeptidase', 'glutamic.pyruvic.transaminase', 'indirect.bilirubin',
                  'alkaline.phosphatase', 'globulin', 'direct.bilirubin', 'total.bile.acid',
                  'total.protein', 'cholesterol', #'low.density.lipoprotein.cholesterol',
                  'triglyceride', 'high.density.lipoprotein.cholesterol', 'dischargeDay',
                  'gender', 'diabetes', 'moderate.to.severe.chronic.kidney.disease',
                  #'return.to.emergency.department.within.6.months',
                  'diuretics', 
                  'hypertension', 'heart_failure', 'angina_etal', 'cholesterol_drug', 
                  'CCI.score', 'eye.opening', 'verbal.response', 'movement', 'GCS',
                  'DestinationDischarge_HealthcareFacility', 'DestinationDischarge_Home',
                  'admission.ward_Cardiology', 'admission.ward_GeneralWard', 'admission.ward_ICU',
                  'admission.ward_Others', 'admission.way_Emergency', 'admission.way_NonEmergency',
                  'discharge.department_Cardiology', 'discharge.department_GeneralWard',
                  'discharge.department_ICU', 'discharge.department_Others',
                  'type.of.heart.failure_Both', 'type.of.heart.failure_Left', 'type.of.heart.failure_Right',
                  'NYHA.cardiac.function.classification_II', 'NYHA.cardiac.function.classification_III',
                  'NYHA.cardiac.function.classification_IV', 'Killip.grade_I', 'Killip.grade_II',
                  'Killip.grade_III', 'Killip.grade_IV', 'consciousness_Clear', 'consciousness_Nonresponsive',
                  'consciousness_ResponsiveToPain', 'consciousness_ResponsiveToSound',
                  'ageCat_(21,29]', 'ageCat_(29,39]', 'ageCat_(39,49]', 'ageCat_(49,59]',
                  'ageCat_(59,69]', 'ageCat_(69,79]', 'ageCat_(79,89]', 'ageCat_(89,110]']

In [22]:
#for idx,elem in enumerate(features_names):
#    if elem == 'nan':
#        print(idx)

In [23]:
#X_train_prepared = np.delete(X_train_prepared, idx, axis = 1)
#X_test_prepared = np.delete(X_test_prepared, idx, axis = 1)

# ADJUST THIS!!!

## SVM

In [21]:
clf = svm.SVC()

scores = cross_val_score(clf,X_train_prepared,Y_train,cv=10,scoring='f1_weighted')
print(f'Mean F1-weighted score: {scores.mean()} and stadard deviation: {scores.std()}')

Mean F1-weighted score: 0.5606718796218569 and stadard deviation: 0.027442772112927217


In [22]:
clf = svm.SVC(class_weight = 'balanced')

scores = cross_val_score(clf,X_train_prepared,Y_train,cv=10,scoring='f1_weighted')
print(f'Mean F1-weighted score: {scores.mean()} and stadard deviation: {scores.std()}')

Mean F1-weighted score: 0.6228722684625217 and stadard deviation: 0.02822839784352566


In [24]:
def compute_metrics(Y, final_pred):
    """
    Compute AUC, F1, Recall and Precision given the target and the model predictions
    """
    fpr, tpr, _ = roc_curve(Y,final_pred)
    AUC = auc(fpr, tpr)
    f1 = f1_score(Y,final_pred)
    rec = recall_score(Y,final_pred)
    prec = precision_score(Y,final_pred)
    #print(f'AUC: {AUC}, F1: {f1}, Recall: {rec}, Precision: {prec}')
    return [AUC, f1, rec, prec]

In [25]:
def final_svm_model(X_train, y_train, svm_clf, scoring):
    """
    Performs a Randomized Grid Search trying to maximize the scoring defined 
    svm_clf can be the vanilla model svm.SVC() with any added specification
    """
    param_grid = {'C': np.linspace(0.5, 10, 100), 'gamma': np.logspace(-4, 1, 100)}
    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)

    grid = RandomizedSearchCV(svm_clf, param_grid, n_iter=10, scoring=scoring, cv=cv, random_state= 1)

    grid.fit(X_train,y_train)
    final_model = grid.best_estimator_
    
    return final_model

In [26]:
def svm_testing(X_train, y_train, X_test, y_test, svm_clf, scoring):
    """
    1) Find the best model through CV
    2) Predict on test data
    3) Evaluate AUC, F1, Recall and Precision
    """
    final_model = final_svm_model(X_train, y_train, svm_clf, scoring)
    final_predictions = final_model.predict(X_test)
    metrics = compute_metrics(Y = y_test, final_pred = final_predictions)
    
    return metrics

In [28]:
import time

In [30]:
metrics_1 = svm_testing(X_train_prepared, Y_train, X_test_prepared, Y_test, svm.SVC(),'f1')

In [31]:
metrics_2 = svm_testing(X_train_prepared, Y_train, X_test_prepared, Y_test, svm.SVC(),'f1_weighted')

In [32]:
metrics_3 = svm_testing(X_train_prepared, Y_train, X_test_prepared, Y_test, svm.SVC(class_weight = 'balanced'),'f1')

In [33]:
metrics_4 = svm_testing(X_train_prepared, Y_train, X_test_prepared, Y_test, svm.SVC(class_weight = 'balanced'),'f1_weighted')

In [34]:
metrics_5 = svm_testing(X_train_prepared, Y_train, X_test_prepared, Y_test, svm.SVC(class_weight = 'balanced'),'recall')

In [36]:
metrics_6 = svm_testing(X_train_prepared, Y_train, X_test_prepared, Y_test, svm.SVC(class_weight = 'balanced'),'balanced_accuracy')

In [37]:
metrics_7 = svm_testing(X_train_prepared, Y_train, X_test_prepared, Y_test, svm.SVC(class_weight = 'balanced'),'roc_auc')

In [38]:
pd.DataFrame([metrics_1, metrics_2, metrics_3, metrics_4, metrics_5, metrics_6, metrics_7], 
             columns = ['AUC', 'F1-score', 'Recall', 'Precision'],
             index = ['SVM, F1-score','SVM, f1_weighted','SVM(balanced), f1','SVM(balanced), f1_weighted',
                     'SVM(balanced), recall','SVM(balanced), balanced_accuracy','SVM(balanced), roc_auc'])

Unnamed: 0,AUC,F1-score,Recall,Precision
"SVM, F1-score",0.598097,0.45977,0.38961,0.560748
"SVM, f1_weighted",0.598097,0.45977,0.38961,0.560748
"SVM(balanced), f1",0.586032,0.504673,0.525974,0.48503
"SVM(balanced), f1_weighted",0.598097,0.45977,0.38961,0.560748
"SVM(balanced), recall",0.603455,0.530488,0.564935,0.5
"SVM(balanced), balanced_accuracy",0.607798,0.523962,0.532468,0.515723
"SVM(balanced), roc_auc",0.607798,0.523962,0.532468,0.515723


### AdaBoost

In [42]:
clf = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(), n_estimators = 100)

param_grid = {'base_estimator__max_depth': np.linspace(3,10,3).astype(int), 
              'base_estimator__min_samples_leaf': np.linspace(1,20,5).astype(int), 
              'base_estimator__min_samples_split': np.linspace(2,40,10).astype(int), 
              'base_estimator__min_impurity_decrease': np.linspace(0,0.4,10), 
              'base_estimator__max_leaf_nodes': np.linspace(2,20,5).astype(int)}
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)

grid_clf = RandomizedSearchCV(clf, param_distributions = param_grid, n_iter=5, cv=cv) # if greater than 5 runtime is too high
grid_clf.fit(X_train_prepared,Y_train)
best_params_clf = grid_clf.best_params_

In [43]:
clf.set_params(**best_params_clf)
clf.fit(X_train_prepared,Y_train)

AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=3,
                                                         max_leaf_nodes=20,
                                                         min_impurity_decrease=0.26666666666666666,
                                                         min_samples_leaf=5,
                                                         min_samples_split=27),
                   n_estimators=100)