In [2]:
# Load in our libraries
import pandas as pd
import numpy as np
import re
import sklearn
import xgboost as xgb
import matplotlib.pyplot as plt# Load in our libraries
import os
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, 
                              GradientBoostingClassifier, ExtraTreesClassifier)
from sklearn.svm import SVC
from sklearn.model_selection import KFold

In [4]:
train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')

In [5]:
train.head(3)

Unnamed: 0.1,Unnamed: 0,encounter_id,patient_id,hospital_id,hospital_death,age,bmi,elective_surgery,ethnicity,height,...,d1_arterial_po2_min,apache_4a_hospital_death_prob,apache_4a_icu_death_prob,aids,apache_3j_bodysystem,apache_2_bodysystem,gcs_total,cancer,liver_disease,other
0,1,66154,25312,118,0,68,3.123686,0,Caucasian,180.3,...,85.0,0.1,0.05,0,Sepsis,Cardiovascular,13,0,0,1
1,2,114252,59342,81,0,77,3.311273,0,Caucasian,160.0,...,51.0,0.47,0.29,0,Respiratory,Respiratory,5,0,0,1
2,3,119783,50777,118,0,25,3.464172,0,Caucasian,172.7,...,85.0,0.0,0.0,0,Metabolic,Metabolic,14,0,0,0


In [6]:
def dummies(train, test):
    columns = ['ethnicity', 'hospital_admit_source', 'icu_admit_source', 'icu_type', 'apache_3j_bodysystem', 'apache_2_bodysystem', 'cancer', 'liver_disease', 'other']
    for column in columns:
        train[column] = train[column].apply(lambda x: str(x))
        test[column] = test[column].apply(lambda x: str(x))
        good_cols = [column+'_'+i for i in train[column].unique() if i in test[column].unique()]
        train = pd.concat((train, pd.get_dummies(train[column], prefix = column)[good_cols]), axis = 1)
        test = pd.concat((test, pd.get_dummies(test[column], prefix = column)[good_cols]), axis = 1)
        del train[column]
        del test[column]
    return train, test

In [7]:
train, test = dummies(train, test)

In [8]:
train.head(2)

Unnamed: 0.1,Unnamed: 0,encounter_id,patient_id,hospital_id,hospital_death,age,bmi,elective_surgery,height,icu_id,...,cancer_0,cancer_1,cancer_2,cancer_3,liver_disease_0,liver_disease_2,liver_disease_1,other_1,other_0,other_2
0,1,66154,25312,118,0,68,3.123686,0,180.3,92,...,1,0,0,0,1,0,0,1,0,0
1,2,114252,59342,81,0,77,3.311273,0,160.0,90,...,1,0,0,0,1,0,0,1,0,0


### Random Forest

In [9]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(criterion='gini', 
                             n_estimators=30,
                             min_samples_split=3,
                             min_samples_leaf=1,
                             max_features='auto',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
rf.fit(train.loc[:, train.columns != 'hospital_death'], train.loc[:, train.columns == 'hospital_death'])
print("%.4f" % rf.oob_score_)

  # This is added back by InteractiveShellApp.init_path()


0.9224


In [10]:
pd.concat((pd.DataFrame(train.loc[:, train.columns != 'hospital_death'].columns, columns = ['variable']), 
           pd.DataFrame(rf.feature_importances_, columns = ['importance'])), 
          axis = 1).sort_values(by='importance', ascending = False)[:20]

Unnamed: 0,variable,importance
45,apache_4a_hospital_death_prob,0.077562
46,apache_4a_icu_death_prob,0.06763
22,d1_spo2_min,0.034173
42,d1_arterial_ph_min,0.03135
21,d1_heartrate_min,0.024514
23,d1_temp_max,0.024267
41,d1_arterial_ph_max,0.02304
24,d1_temp_min,0.022024
16,heart_rate_apache,0.021308
38,d1_wbc_min,0.021067


In [11]:
predictions = rf.predict(test.loc[:, test.columns != 'hospital_death'])
predictions = pd.DataFrame(predictions, columns=['hospital_death'])
test1 = pd.read_csv(os.path.join('.', 'test.csv'))
predictions = pd.concat((test1.loc[:, test1.columns == 'encounter_id'], predictions), axis = 1)
predictions.to_csv('y_test15.csv', sep=",", index = False)

### XGBoost

In [12]:
xgboost = xgb.XGBClassifier(n = 25)
xgboost.fit(train.loc[:, train.columns != 'hospital_death'], train.loc[:, train.columns == 'hospital_death'])

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n=25, n_estimators=100,
              n_jobs=1, nthread=None, objective='binary:logistic',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              seed=None, silent=None, subsample=1, verbosity=1)

In [16]:
xgpredictions = xgboost.predict(test.loc[:, test.columns != 'hospital_death'])
xgpredictions = pd.DataFrame(xgpredictions, columns=['hospital_death'])
test4 = pd.read_csv(os.path.join('.', 'test.csv'))
predictions = pd.concat((test4.loc[:, test4.columns == 'encounter_id'], xgpredictions), axis = 1)
predictions.to_csv('submission_xgboost.csv', sep=",", index = False)

### Neural Network

In [24]:
import tensorflow as tf
#creating log directory
TENSORBOARD_SUMMARIES_DIR = './tmp/neuralnet_logs'
def prepare_log_dir():
    '''Clears the log file then creates new directory to place
        the tensorbard log file.''' 
    if tf.gfile.Exists(TENSORBOARD_SUMMARIES_DIR):
        tf.gfile.DeleteRecursively(TENSORBOARD_SUMMARIES_DIR)
    tf.gfile.MakeDirs(TENSORBOARD_SUMMARIES_DIR)

In [25]:
#creating next batch for training
def batch_gen(X,Y,batchsize = 512):
    for i in np.arange(0, Y.shape[0], batchsize):
        end = min(X.shape[0], i+batchsize)
        yield(X[i:end,:],Y[i:end,:])

In [45]:
#create placeholder for X and y and dropout rate
tf.logging.set_verbosity(tf.logging.INFO)

x = tf.placeholder(tf.float32, shape = [None,116], name = 'input_data')
y = tf.placeholder(tf.float32, shape = [None,2], name = 'input_labels')
y_cls = tf.argmax(y,1)

discard_rate = tf.placeholder(tf.float32, name='discard_rate')
os.environ['TF_CPP_MIN_LOG_LEVEL']='2'

In [46]:
def cnn_model(features):
    
    #dense layer 1
    dense_layer_1 = tf.layers.dense(inputs = features, units = 32, activation = tf.nn.relu)
    
    #dense layer 3
    dense_layer_3 = tf.layers.dense(inputs = dense_layer_1, units = 16, activation = tf.nn.relu)
    
    #logits layer
    logits = tf.layers.dense(inputs = dense_layer_3, units = 2)
    
    return logits

In [47]:
#initialize variables for training
epochs = 4
num_examples = train.shape[0]

In [48]:
prepare_log_dir()

In [52]:
#predictor model
prediction = cnn_model(train[:,train.loc[train.columns != 'hospital death']])
prediction_cls = tf.argmax(prediction,1)

#softmax loss
loss = tf.reduce_mean(tf.losses.softmax_cross_entropy(onehot_labels = train[:,train.loc[train.columns == 'hospital_death']], logits = prediction))

#We use Adam Optimizer
optimizer = tf.train.AdamOptimizer().minimize(loss)

IndexError: Item wrong length 116 instead of 50000.

In [53]:
train.shape

(50000, 116)