In [1]:
import pandas as pd
import pandas.api.types as ptypes
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [3]:
df = pd.read_csv("impute_check.csv")

In [4]:
df.shape

(131021, 147)

In [5]:
df.drop(['bilirubin_apache','albumin_apache','urineoutput_apache'],inplace=True,axis=1)

In [6]:
df.drop(['D','encounter_id','hospital_id','patient_id'],inplace=True,axis=1)

In [8]:
# Create correlation matrix
corr_matrix = df.corr().abs()

# Select upper triangle of correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))

# Find index of feature columns with correlation greater than 0.95
to_drop = [column for column in upper.columns if any(upper[column] > 0.85)]

In [9]:
to_drop

['apache_2_bodysystem_Gastrointestinal',
 'apache_2_bodysystem_Haematologic',
 'apache_2_bodysystem_Metabolic',
 'apache_2_bodysystem_Neurologic',
 'apache_2_bodysystem_Renal/Genitourinary',
 'apache_2_bodysystem_Respiratory',
 'apache_2_bodysystem_Trauma',
 'elective_surgery',
 'weight',
 'apache_post_operative',
 'd1_diasbp_noninvasive_max',
 'd1_diasbp_noninvasive_min',
 'd1_mbp_min',
 'd1_mbp_noninvasive_max',
 'd1_mbp_noninvasive_min',
 'd1_sysbp_noninvasive_max',
 'd1_sysbp_noninvasive_min',
 'h1_diasbp_noninvasive_max',
 'h1_diasbp_noninvasive_min',
 'h1_heartrate_min',
 'h1_mbp_min',
 'h1_mbp_noninvasive_max',
 'h1_mbp_noninvasive_min',
 'h1_sysbp_noninvasive_max',
 'h1_sysbp_noninvasive_min',
 'h1_temp_min',
 'd1_bun_max',
 'd1_bun_min',
 'd1_creatinine_max',
 'd1_creatinine_min',
 'd1_glucose_max',
 'd1_hemaglobin_min',
 'd1_hematocrit_max',
 'd1_hematocrit_min',
 'd1_platelets_min',
 'd1_sodium_min',
 'd1_wbc_max',
 'd1_wbc_min',
 'apache_4a_icu_death_prob']

In [10]:
df = df.drop(df[to_drop], axis=1)

In [11]:
df.shape

(131021, 101)

In [12]:
df.columns

Index(['train', 'ethnicity_Asian', 'ethnicity_Caucasian', 'ethnicity_Hispanic',
       'ethnicity_Native American', 'ethnicity_Other/Unknown', 'gender_M',
       'icu_admit_source_Floor', 'icu_admit_source_Operating Room / Recovery',
       'icu_admit_source_Other Hospital',
       ...
       'd1_sodium_max', 'apache_4a_hospital_death_prob', 'aids', 'cirrhosis',
       'diabetes_mellitus', 'hepatic_failure', 'immunosuppression', 'leukemia',
       'lymphoma', 'solid_tumor_with_metastasis'],
      dtype='object', length=101)

In [13]:
train = df[df['train'] == 1]

In [14]:
train.shape

(91713, 101)

In [15]:
train.columns

Index(['train', 'ethnicity_Asian', 'ethnicity_Caucasian', 'ethnicity_Hispanic',
       'ethnicity_Native American', 'ethnicity_Other/Unknown', 'gender_M',
       'icu_admit_source_Floor', 'icu_admit_source_Operating Room / Recovery',
       'icu_admit_source_Other Hospital',
       ...
       'd1_sodium_max', 'apache_4a_hospital_death_prob', 'aids', 'cirrhosis',
       'diabetes_mellitus', 'hepatic_failure', 'immunosuppression', 'leukemia',
       'lymphoma', 'solid_tumor_with_metastasis'],
      dtype='object', length=101)

In [16]:
test =  df[df['train'] == 0]

In [17]:
test.shape

(39308, 101)

In [18]:
Y = train['hospital_death']
X = train.drop(['train','hospital_death'],axis=1)

In [19]:
X.shape

(91713, 99)

In [20]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Fit on training set only.
scaler.fit(X)
# Apply transform to both the training set and the test set.
X = scaler.transform(X)

In [21]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,Y,test_size=0.30, random_state=40,stratify=Y)


### Using Grid Search to get the best hyper parameters

In [23]:
from keras.models import Sequential
from keras.layers import Dense
from sklearn.model_selection import GridSearchCV
def build_model():
    model = Sequential()
    model.add(Dense(99, input_dim=99, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    # Compile model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

from keras.wrappers.scikit_learn import KerasClassifier
batch_size = [20, 60, 100]
epochs = [10, 50, 100]
model = KerasClassifier(build_fn=build_model, verbose=0)
param_grid = dict(batch_size=batch_size, epochs=epochs)
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=3)
grid_result = grid.fit(X, Y)



In [26]:
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.924460 using {'batch_size': 100, 'epochs': 10}
0.923664 (0.007859) with: {'batch_size': 20, 'epochs': 10}
0.908072 (0.009608) with: {'batch_size': 20, 'epochs': 50}
0.894039 (0.008821) with: {'batch_size': 20, 'epochs': 100}
0.924155 (0.006401) with: {'batch_size': 60, 'epochs': 10}
0.908301 (0.010545) with: {'batch_size': 60, 'epochs': 50}
0.899055 (0.010710) with: {'batch_size': 60, 'epochs': 100}
0.924460 (0.006857) with: {'batch_size': 100, 'epochs': 10}
0.909173 (0.010680) with: {'batch_size': 100, 'epochs': 50}
0.900592 (0.007519) with: {'batch_size': 100, 'epochs': 100}


### Using the best parameters as the hyper parameters for Keras Classification

In [30]:
from keras.models import Sequential
from keras.layers import Dense

def build_model():
    model = Sequential()
    model.add(Dense(99, input_dim=99, activation='sigmoid'))
    model.add(Dense(1, activation='sigmoid'))
    # Compile model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

from keras.wrappers.scikit_learn import KerasClassifier
keras_model = build_model()
keras_model.fit(X_train, y_train, epochs=10, batch_size=100, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x207089a2e88>

In [None]:
from sklearn.metrics import roc_curve
y_pred_keras = keras_model.predict(X_test).ravel()
fpr_keras, tpr_keras, thresholds_keras = roc_curve(y_test, y_pred_keras)

In [32]:
score = keras_model.evaluate(X_test, y_test, verbose=1)



In [33]:
print(score)

[0.199706832422345, 0.9265101552009583]


In [37]:
from sklearn.metrics import auc
auc_keras = auc(fpr_keras, tpr_keras)

In [38]:
print(auc_keras)

0.8826704072724074
