In [1]:
#supress warnings (especially from sklearn)
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn
import os
import numpy as np
np.random.seed(1337) # for reproducibility
import random as rn

os.environ['PYTHONHASHSEED'] = '0'
rn.seed(1254)


import pandas as pd
import csv
from scipy import interp

In [2]:
from sklearn.model_selection import train_test_split, GridSearchCV,StratifiedKFold, cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, average_precision_score, precision_recall_curve, roc_curve, auc, precision_score, roc_curve, confusion_matrix, precision_recall_fscore_support, f1_score, precision_score, recall_score, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.scorer import make_scorer

In [3]:
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras.losses import binary_crossentropy
from tensorflow.keras.activations import softmax, relu
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.backend import clear_session
import tensorflow
from tensorflow.compat.v1 import ConfigProto
from tensorflow.compat.v1 import InteractiveSession
from tensorflow.keras.backend import set_session

In [4]:
import xgboost as xgb
from xgboost.sklearn import XGBClassifier

In [5]:
import keras
from tensorflow.keras import backend as K

Using TensorFlow backend.


<h1> All Data Results </h1>


In [6]:
def preprocess():
    df = pd.read_csv('../data/x_lace_df.csv')
    df = df.drop(['subject_id', 'hadm_id'], axis=1)
    y = pd.read_csv('../data/y_more_no_df_clean.csv')
    return df, y

def sensitivity(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    return true_positives / (possible_positives + K.epsilon())

def specificity(y_true, y_pred):
    true_negatives = K.sum(K.round(K.clip((1-y_true) * (1-y_pred), 0, 1)))
    possible_negatives = K.sum(K.round(K.clip(1-y_true, 0, 1)))
    return true_negatives / (possible_negatives + K.epsilon())

In [17]:
def create_model():
    model = Sequential()
    model.add(Dense(300,input_dim=64,activation='relu'))
#     model.add(Dropout(0.2))
    model.add(Dense(150,activation='relu'))
    model.add(Dense(1,activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=[sensitivity])
    return model

In [18]:
df, labels = preprocess()
X_train, X_test, y_train, y_test = train_test_split(df, labels, test_size=0.15)
def run_pipeline():
    epochs = 1000
    batch_size = 3000
    nn_grid = create_model()
    num_classes = 2
    nn_grid.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, verbose=0)
    y_pred = np.around(nn_grid.predict(X_test))
    return y_pred, nn_grid

In [19]:
config = ConfigProto(allow_soft_placement=True)
config.gpu_options.per_process_gpu_memory_fraction = 1.0
session = InteractiveSession(config=config)
set_session(session)

In [25]:
y_pred, nn_grid = run_pipeline()

<h2> All Data Neural Network Results </h2>


In [21]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.91      0.98      0.94      4846
           1       0.52      0.15      0.23       566

   micro avg       0.90      0.90      0.90      5412
   macro avg       0.71      0.57      0.59      5412
weighted avg       0.87      0.90      0.87      5412



In [22]:
tensorflow.keras.models.save_model(nn_grid,"./alldata_model.h5", overwrite=True, include_optimizer=True)

In [12]:
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [13]:
lr = LogisticRegression(random_state = 0, n_jobs=-1)
rf = RandomForestClassifier(random_state=0, n_jobs=-1)
K = 5
skf = StratifiedKFold(n_splits=K)
df, labels = preprocess()

<h2> All Data Baseline Results </h2>

In [14]:
#LR
y_real = []
y_predication = []
for train_indices, test_indices in skf.split(df, labels):
    Xtrain, Ytrain = df.iloc[train_indices], labels.iloc[train_indices]
    Xvalid, Yvalid = df.iloc[test_indices], labels.iloc[test_indices]
    y_real = y_real + Yvalid.values.tolist()
    lr.fit(Xtrain, Ytrain)
    y_preds = [0 if x < 0.5 else 1 for x in lr.predict(Xvalid)]
    y_predication = y_predication + y_preds
print(classification_report(y_real, y_predication))

              precision    recall  f1-score   support

           0       0.90      0.96      0.93     32191
           1       0.27      0.11      0.16      3885

   micro avg       0.87      0.87      0.87     36076
   macro avg       0.59      0.54      0.55     36076
weighted avg       0.83      0.87      0.85     36076



In [15]:
#RF
y_real = []
y_predication = []
for train_indices, test_indices in skf.split(df, labels):
    Xtrain, Ytrain = df.iloc[train_indices], labels.iloc[train_indices]
    Xvalid, Yvalid = df.iloc[test_indices], labels.iloc[test_indices]
    y_real = y_real + Yvalid.values.tolist()
    rf.fit(Xtrain, Ytrain)
    y_preds = [0 if x < 0.5 else 1 for x in rf.predict(Xvalid)]
    y_predication = y_predication + y_preds
print(classification_report(y_real, y_predication))

              precision    recall  f1-score   support

           0       0.90      0.86      0.88     32191
           1       0.13      0.17      0.14      3885

   micro avg       0.79      0.79      0.79     36076
   macro avg       0.51      0.51      0.51     36076
weighted avg       0.81      0.79      0.80     36076



In [18]:
def run_xgboost(optimize=False):
    x_df, y_df = preprocess()
    xgb_opt = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1, colsample_bytree=0.6, gamma=0.0, learning_rate=0.1, max_delta_step=0, max_depth=2, min_child_weight=2, missing=None, n_estimators=50, n_jobs=1, nthread=4, objective='binary:logistic', random_state=0, reg_alpha=5.0, reg_lambda=1, scale_pos_weight=8.286036036036036, seed=1, silent=True, subsample=0.5)
    K = 5
    eval_size = int(np.round(1./K))
    skf = StratifiedKFold(n_splits=K)
    prediction = np.array([])
    reals = np.array([])
    name = 'XGBoost'
    for train_indices, test_indices in skf.split(x_df, y_df):
        X_train, y_train = x_df.iloc[train_indices], y_df.iloc[train_indices]
        X_valid, y_valid = x_df.iloc[test_indices], y_df.iloc[test_indices]
        class_weight_scale = 1.*y_df['label'].value_counts()[0]/y_df['label'].value_counts()[1]
        xgb_opt.set_params(**{'scale_pos_weight' : class_weight_scale})
        xgb_opt.fit(X_train,y_train)
        xgb_opt_pred_prob = xgb_opt.predict_proba(X_valid)[:, 1]

        y_valid = y_valid.values.tolist()
            
        reals = np.append(reals,y_valid)
        reals = reals.astype(int)
        
        prediction = np.append(prediction, xgb_opt_pred_prob)
        prediction = prediction.astype(int)
    print(classification_report(y_true = y_df.label, y_pred = xgb_opt.predict(x_df)))
    print(classification_report(reals, prediction))

<h2> All Data XGBoost Results </h2>

In [19]:
run_xgboost()

              precision    recall  f1-score   support

           0       0.95      0.64      0.77     32191
           1       0.20      0.75      0.32      3885

   micro avg       0.66      0.66      0.66     36076
   macro avg       0.58      0.69      0.54     36076
weighted avg       0.87      0.66      0.72     36076

              precision    recall  f1-score   support

           0       0.89      1.00      0.94     32191
           1       0.00      0.00      0.00      3885

   micro avg       0.89      0.89      0.89     36076
   macro avg       0.45      0.50      0.47     36076
weighted avg       0.80      0.89      0.84     36076



In [None]:
df = pd.read_csv('../data/x_lace_df.csv')
x_df = df.drop(['subject_id', 'hadm_id'], axis=1)
y_df = pd.read_csv('../data/y_more_no_df_clean.csv')
class_weight_scale = 1.*y_df.label.value_counts()[0]/y_df.label.value_counts()[1]
X_train = x_df
y_train = y_df['label']
xgb_opt = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1, colsample_bytree=0.6, gamma=0.0, learning_rate=0.1, max_delta_step=0, max_depth=2, min_child_weight=2, missing=None, n_estimators=50, n_jobs=1, nthread=4, objective='binary:logistic', random_state=0, reg_alpha=5.0, reg_lambda=1, scale_pos_weight=8.286036036036036, seed=1, silent=True, subsample=0.5)
K = 5
eval_size = int(np.round(1./K))
skf = StratifiedKFold(n_splits=K)
mean_tpr = 0.0
mean_fpr = np.linspace(0, 1, 100)
lw = 2
i = 0
roc_aucs_xgbopt = []
for train_indices, test_indices in skf.split(x_df, y_df['label']):
    X_train, y_train = x_df.iloc[train_indices], y_df['label'].iloc[train_indices]
    X_valid, y_valid = x_df.iloc[test_indices], y_df['label'].iloc[test_indices]
    class_weight_scale = 1.*y_train.value_counts()[0]/y_train.value_counts()[1]
    xgb_opt.set_params(**{'scale_pos_weight' : class_weight_scale})
    xgb_opt.fit(X_train,y_train)
    xgb_opt_pred_prob = xgb_opt.predict_proba(X_valid)[:,1]

print(classification_report(y_true = y_df.label, y_pred = xgb_opt.predict(x_df)))

XGBOOST Results

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.6, gamma=0.0, learning_rate=0.1,
       max_delta_step=0, max_depth=2, min_child_weight=2, missing=None,
       n_estimators=50, n_jobs=1, nthread=4, objective='binary:logistic',
       random_state=0, reg_alpha=5.0, reg_lambda=1,
       scale_pos_weight=8.286036036036036, seed=1, silent=True,
       subsample=0.5)
       
              precision    recall  f1-score   support

           0       0.95      0.64      0.77     32191
           1       0.20      0.75      0.32      3885

       micro avg       0.66      0.66      0.66     36076
       macro avg       0.58      0.69      0.54     36076
       weighted avg    0.87      0.66      0.72     36076

<h1> Clustered Data Results </h1>

In [20]:
def preprocess():
    data = pd.read_csv('../data/labeled_clustered_data.csv')
    df_0 = data[(data[['cluster_num']] == 0).any(axis=1)]
    df_0_label=df_0.pop('label')
    df_1 = data[(data[['cluster_num']] == 1).any(axis=1)]
    df_1_label=df_1.pop('label')
    df_2 = data[(data[['cluster_num']] == 2).any(axis=1)]
    df_2_label=df_2.pop('label')
    dfs = [df_0, df_1, df_2]
    dfs_labels = [df_0_label, df_1_label, df_2_label]
    return dfs, dfs_labels

def create_pipeline_baseline():
    pipeline = []
    skf = StratifiedKFold(n_splits=2)
    lr = GridSearchCV(LogisticRegression(random_state = 0), cv=skf, verbose=0, param_grid={})
    rf = GridSearchCV(RandomForestClassifier(random_state=0), cv=skf, verbose=0, param_grid={})
    pipeline = [['LogisticRegression', lr], ['RandomForest',rf]]
    return pipeline


def run_pipeline():
    dfs, dfs_labels = preprocess()
    print('finished preprocess')
    baseline_grid = create_pipeline_baseline()
    print('created pipeline and running ...')
    skf = StratifiedKFold(n_splits=5)
    for i, df in enumerate(dfs):
        labels = dfs_labels[i]
        opt_df, _, labels, _ = train_test_split(df, labels, test_size=0)
        for name, grid in baseline_grid:
            prediction = np.array([])
            reals = np.array([])
            for train_indices, test_indices in skf.split(opt_df, labels):
                X_train, y_train = opt_df.iloc[train_indices], labels.iloc[train_indices]
                X_valid, y_valid = opt_df.iloc[test_indices], labels.iloc[test_indices]
                grid.fit(X_train, y_train)
                y_pred = np.around(grid.predict(X_valid))
                reals = np.append(reals,y_valid)
                reals = reals.astype(int)
                prediction = np.append(prediction, y_pred)
                prediction = prediction.astype(int)
            print(name, 'cluster: ' + str(i))
            print(classification_report(reals, prediction))
        print('RUN_XGBOOST CLUSTER: '+str(i))
        if i == 0:
            xgb_opt = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.4, gamma=0.0, learning_rate=0.1,
       max_delta_step=0, max_depth=1, min_child_weight=1, missing=None,
       n_estimators=50, n_jobs=1, nthread=4, objective='binary:logistic',
       random_state=0, reg_alpha=1.0000000000000002e-06, reg_lambda=1,
       scale_pos_weight=10.186375321336762, seed=1, silent=True,
        subsample=0.2)
        elif i == 1:
            xgb_opt = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.5, gamma=0.0, learning_rate=0.1,
       max_delta_step=0, max_depth=2, min_child_weight=3, missing=None,
       n_estimators=50, n_jobs=1, nthread=4, objective='binary:logistic',
       random_state=0, reg_alpha=0, reg_lambda=1,
       scale_pos_weight=8.653391412570006, seed=1, silent=True,
       subsample=0.6)
        else:
            xgb_opt = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.5, gamma=0.1, learning_rate=0.1,
       max_delta_step=0, max_depth=5, min_child_weight=6, missing=None,
       n_estimators=50, n_jobs=1, nthread=4, objective='binary:logistic',
       random_state=0, reg_alpha=0.0005, reg_lambda=1,
       scale_pos_weight=5.418508287292818, seed=1, silent=True,
       subsample=0.7)
        skf = StratifiedKFold(n_splits=5)        
        name = 'XGBoost'
        fold = 0
        prediction = np.array([])
        reals = np.array([])
        for train_indices, test_indices in skf.split(opt_df, labels):
            X_train, y_train = opt_df.iloc[train_indices], labels.iloc[train_indices]
            X_valid, y_valid = opt_df.iloc[test_indices], labels.iloc[test_indices]
            class_weight_scale = 1.*y_train.value_counts()[0]/y_train.value_counts()[1]
            xgb_opt.set_params(**{'scale_pos_weight' : class_weight_scale})
            xgb_opt.fit(X_train,y_train)
            xgb_opt_pred_prob = xgb_opt.predict_proba(X_valid)[:, 1]
            xgb_opt_pred_prob = np.around(xgb_opt_pred_prob)
            y_valid = y_valid.values.tolist()
            reals = np.append(reals,y_valid)
            reals = reals.astype(int)
            prediction = np.append(prediction, xgb_opt_pred_prob)
            prediction = prediction.astype(int)
        print(classification_report(y_true = labels, y_pred = xgb_opt.predict(opt_df)))
        print(classification_report(reals, prediction))

In [21]:
run_pipeline()

finished preprocess
created pipeline and running ...
LogisticRegression cluster: 0
              precision    recall  f1-score   support

           0       0.91      0.99      0.95      9906
           1       0.40      0.05      0.10       972

   micro avg       0.91      0.91      0.91     10878
   macro avg       0.65      0.52      0.52     10878
weighted avg       0.87      0.91      0.88     10878

RandomForest cluster: 0
              precision    recall  f1-score   support

           0       0.91      1.00      0.95      9906
           1       0.47      0.04      0.07       972

   micro avg       0.91      0.91      0.91     10878
   macro avg       0.69      0.52      0.51     10878
weighted avg       0.87      0.91      0.87     10878

RUN_XGBOOST CLUSTER: 0
              precision    recall  f1-score   support

           0       0.96      0.72      0.82      9906
           1       0.19      0.67      0.30       972

   micro avg       0.71      0.71      0.71     1087

In [27]:
def preprocess():
    data = pd.read_csv('../data/labeled_clustered_data.csv')
    df_0 = data[(data[['cluster_num']] == 0).any(axis=1)]
    df_0_label=df_0.pop('label')
    df_1 = data[(data[['cluster_num']] == 1).any(axis=1)]
    df_1_label=df_1.pop('label')
    df_2 = data[(data[['cluster_num']] == 2).any(axis=1)]
    df_2_label=df_2.pop('label')
    dfs = [df_0, df_1, df_2]
    dfs_labels = [df_0_label, df_1_label, df_2_label]
    return dfs, dfs_labels

def sensitivity(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    return true_positives / (possible_positives + K.epsilon())

def specificity(y_true, y_pred):
    true_negatives = K.sum(K.round(K.clip((1-y_true) * (1-y_pred), 0, 1)))
    possible_negatives = K.sum(K.round(K.clip(1-y_true, 0, 1)))
    return true_negatives / (possible_negatives + K.epsilon())
df, labels = preprocess()
X_train, X_test, y_train, y_test = train_test_split(df, labels, test_size=0.15)

In [28]:
def create_cluster_0_model():
    model = Sequential()
    model.add(Dense(5,input_dim=66,activation='relu'))
    model.add(Dense(1,activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=[sensitivity])
    return model

In [29]:
def create_cluster_1_model():
    model = Sequential()
    model.add(Dense(5,input_dim=66,activation='relu'))
    model.add(Dense(1,activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=[sensitivity])
    return model

In [30]:
def create_cluster_2_model():
    model = Sequential()
    model.add(Dense(5,input_dim=66,activation='relu'))
    model.add(Dense(1,activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=[sensitivity])
    return model

In [31]:
def run_pipeline():
    dfs, dfs_labels = preprocess()
    print('finished preprocess')
    print('created pipeline and running ...')
    skf = StratifiedKFold(n_splits=5)
    epochs = 500
    batch_size = 1028
    num_classes = 2
    df0 = df[0]
    labels0 = dfs_labels[0]
    df1 = df[1]
    labels1 = dfs_labels[1]
    df2 = df[2]
    labels2 = dfs_labels[2]
    X_train, X_test, y_train, y_test = train_test_split(df0, labels0, test_size=.2, random_state=2)
    nn_grid = create_cluster_0_model()
    nn_grid.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, verbose=0, shuffle=False)
    y_pred = np.around(nn_grid.predict(X_test))
    print('CLUSTER 0:')
    print(classification_report(y_test, y_pred))
    
    X_train, X_test, y_train, y_test = train_test_split(df1, labels1, test_size=.2, random_state=2)
    nn_grid = create_cluster_1_model()
    nn_grid.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, verbose=0, shuffle=False)
    y_pred = np.around(nn_grid.predict(X_test))
    print('CLUSTER 1:')
    print(classification_report(y_test, y_pred))
    
#     epochs = 100
#     batch_size = 124
    X_train, X_test, y_train, y_test = train_test_split(df2, labels2, test_size=.2, random_state=2)
    nn_grid = create_cluster_2_model()
    nn_grid.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, verbose=0, shuffle=False)
    y_pred = np.around(nn_grid.predict(X_test))
    print('CLUSTER 2:')
    print(classification_report(y_test, y_pred))

In [32]:
run_pipeline()

finished preprocess
created pipeline and running ...
CLUSTER 0:
              precision    recall  f1-score   support

           0       0.90      1.00      0.95      1967
           1       0.00      0.00      0.00       209

   micro avg       0.90      0.90      0.90      2176
   macro avg       0.45      0.50      0.47      2176
weighted avg       0.82      0.90      0.86      2176

CLUSTER 1:
              precision    recall  f1-score   support

           0       0.90      1.00      0.95      3486
           1       0.39      0.03      0.05       392

   micro avg       0.90      0.90      0.90      3878
   macro avg       0.65      0.51      0.50      3878
weighted avg       0.85      0.90      0.86      3878

CLUSTER 2:
              precision    recall  f1-score   support

           0       0.87      0.99      0.92       997
           1       0.61      0.07      0.12       165

   micro avg       0.86      0.86      0.86      1162
   macro avg       0.74      0.53      0.5

In [33]:
run_pipeline()

finished preprocess
created pipeline and running ...
CLUSTER 0:
              precision    recall  f1-score   support

           0       0.90      1.00      0.95      1967
           1       0.00      0.00      0.00       209

   micro avg       0.90      0.90      0.90      2176
   macro avg       0.45      0.50      0.47      2176
weighted avg       0.82      0.90      0.86      2176

CLUSTER 1:
              precision    recall  f1-score   support

           0       0.91      0.96      0.94      3486
           1       0.37      0.19      0.25       392

   micro avg       0.88      0.88      0.88      3878
   macro avg       0.64      0.58      0.59      3878
weighted avg       0.86      0.88      0.87      3878

CLUSTER 2:
              precision    recall  f1-score   support

           0       0.86      1.00      0.92       997
           1       0.00      0.00      0.00       165

   micro avg       0.86      0.86      0.86      1162
   macro avg       0.43      0.50      0.4

In [None]:
# dfs, dfs_labels = preprocess()
# for df in dfs:
#     print(df.shape)