In [33]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [4]:
##load features and cohort data
path = '/gpfs/commons/groups/gursoy_lab/aelhussein/DCI_FL/Data/'
data = pd.read_csv(path+'dataset.csv')
cohort = pd.read_csv(path+'cohort_use.csv')

In [29]:
##create full dataset then split features and outcomes (needed as not full cohort in the dataset) 
full_df = data.merge(cohort[['hadm_id', 'outcome']], on = 'hadm_id').set_index('hadm_id')
X = full_df.loc[:, ~full_df.columns.isin(['outcome', 'hours_since_admit'])]
y = full_df['outcome']

In [31]:
## train-test split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.8)

## create scaler and apply only to numeric data before adding binary data
scaler = StandardScaler()
X_train_norm = scaler.fit_transform(X_train.iloc[:,:-5])
X_train_norm = pd.DataFrame(X_train_norm, index = X_train.index, columns = X_train.columns[:-5])
X_train_norm = X_train_norm.merge(X_train.iloc[:,-5:], left_index = True, right_index = True)

##apply scaler to test data
X_test_norm = scaler.transform(X_test.iloc[:,:-5])
X_test_norm = pd.DataFrame(X_test_norm, index = X_test.index, columns = X_test.columns[:-5])
X_test_norm = X_test_norm.merge(X_test.iloc[:,-5:], left_index = True, right_index = True)

In [36]:
##RF model
clf = RandomForestClassifier(max_depth=8, random_state=0)
clf.fit(X_train_norm, y_train)
fpr, tpr, thresholds = metrics.roc_curve(y_test.values, clf.predict_proba(X_test_norm)[:,1])
AUC = metrics.auc(fpr, tpr)
AUC

0.798938333512392

In [37]:
##LR model
clf = LogisticRegression()
clf.fit(X_train_norm, y_train)
fpr, tpr, thresholds = metrics.roc_curve(y_test.values, clf.predict_proba(X_test_norm)[:,1])
AUC = metrics.auc(fpr, tpr)
AUC

0.793846812994218