In [22]:
from __future__ import print_function

# Import libraries
import numpy as np
import pandas as pd
import matplotlib
import sklearn
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties # for unicode fonts
import sys
import datetime as dt
import mp_utils as mp

from collections import OrderedDict

# used to print out pretty pandas dataframes
from IPython.display import display, HTML

from sklearn.pipeline import Pipeline
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

# used to impute mean for data and standardize for computational stability
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler

# logistic regression is our favourite model ever
from sklearn import linear_model
from sklearn import ensemble

# used to calculate AUROC/accuracy
from sklearn import metrics

# used to create confusion matrix
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score

# gradient boosting - must download package https://github.com/dmlc/xgboost
import xgboost as xgb

#from eli5 import show_weights

# default colours for prettier plots
col = [[0.9047, 0.1918, 0.1988],
    [0.2941, 0.5447, 0.7494],
    [0.3718, 0.7176, 0.3612],
    [1.0000, 0.5482, 0.1000],
    [0.4550, 0.4946, 0.4722],
    [0.6859, 0.4035, 0.2412],
    [0.9718, 0.5553, 0.7741],
    [0.5313, 0.3359, 0.6523]];
marker = ['v','o','d','^','s','o','+']
ls = ['-','-','-','-','-','s','--','--']

%matplotlib inline

In [24]:
df = pd.read_csv('X_design_matrix.csv').set_index('icustay_id')

# create X by dropping idxK and the outcome
y = df['death'].values
idxK = df['idxK']
X = df.drop(['death','idxK'],axis=1).values

print('{} observations. Outcome rate: {:2.2f}%.'.format(X.shape[0],
                                                        100.0*np.mean(y)))

50488 observations. Outcome rate: 11.20%.


    colsample_bytree     - 0.7 	[ 0.7 0.7 0.7 0.7 0.7 ]
    silent               - 1.0 	[ 1 1 1 1 1 ]
    learning_rate        - 0.01 	[ 0.01 0.01 0.01 0.01 0.01 ]
    n_estimators         - 1000.0 	[ 1000 1000 1000 1000 1000 ]
    subsample            - 0.8 	[ 0.8 0.8 0.8 0.8 0.8 ]
    objective            - skipping as it is a string
    max_depth            - 9.0 	[ 9 6 9 9 6 ]

In [26]:
# Rough timing info:
#     rf - 3 seconds per fold
#    xgb - 30 seconds per fold
# logreg - 4 seconds per fold
#  lasso - 8 seconds per fold
np.random.seed(7390984)

# parameters from grid search
xgb_mdl = xgb.XGBClassifier(colsample_bytree=0.7, silent=1,
                            learning_rate = 0.01, n_estimators=1000,
                            subsample=0.8, max_depth=9)

models = {'xgb': xgb_mdl,
          'lasso': linear_model.LassoCV(cv=5,fit_intercept=True,normalize=True,max_iter=10000),
          'logreg': linear_model.LogisticRegression(fit_intercept=True),
          'l2': linear_model.LogisticRegressionCV()
          #'rf': ensemble.RandomForestClassifier()
         }


# create k-fold indices
K = 5 # number of folds
idxK = np.random.permutation(X.shape[0])
idxK = np.mod(idxK,K)

mdl_val = dict()
results_val = dict()
pred_val = dict()
pred_val_merged = dict()
for mdl in models:
    print('=============== {} ==============='.format(mdl))
    mdl_val[mdl] = list()
    results_val[mdl] = list() # initialize list for scores
    pred_val[mdl] = dict()
    pred_val_merged[mdl] = np.zeros(X.shape[0])
    
    if mdl == 'xgb':
        # no pre-processing of data necessary for xgb
        estimator = Pipeline([(mdl, models[mdl])])

    else:
        estimator = Pipeline([("imputer", Imputer(missing_values='NaN',
                                          strategy="mean",
                                          axis=0)),
                      ("scaler", StandardScaler()),
                      (mdl, models[mdl])]) 

    for k in range(K):
        # train the model using all but the kth fold
        curr_mdl = sklearn.base.clone(estimator).fit(X[idxK != k, :], y[idxK != k])

        # get prediction on this dataset
        if mdl in ('lasso','ridge'):
            curr_prob = curr_mdl.predict(X[idxK == k, :])
        else:
            curr_prob = curr_mdl.predict_proba(X[idxK == k, :])
            curr_prob = curr_prob[:,1]
            
        pred_val_merged[mdl][idxK==k] = curr_prob
        pred_val[mdl][k] = curr_prob

        # calculate score (AUROC)
        curr_score = metrics.roc_auc_score(y[idxK == k], curr_prob)

        # add score to list of scores
        results_val[mdl].append(curr_score)

        # save the current model
        mdl_val[mdl].append(curr_mdl)
        
        print('{} - Finished fold {} of {}. AUROC {:0.3f}.'.format(dt.datetime.now(), k+1, K, curr_score))
    
tar_val = dict()
for k in range(K):
    tar_val[k] = y[idxK==k]

2017-07-06 08:44:36.522302 - Finished fold 1 of 5. AUROC 0.893.
2017-07-06 08:44:44.736272 - Finished fold 2 of 5. AUROC 0.896.
2017-07-06 08:44:51.897262 - Finished fold 3 of 5. AUROC 0.893.
2017-07-06 08:44:58.932115 - Finished fold 4 of 5. AUROC 0.890.
2017-07-06 08:45:06.853988 - Finished fold 5 of 5. AUROC 0.888.
2017-07-06 08:48:32.392924 - Finished fold 1 of 5. AUROC 0.924.
2017-07-06 08:51:58.636281 - Finished fold 2 of 5. AUROC 0.920.
2017-07-06 08:55:32.974751 - Finished fold 3 of 5. AUROC 0.921.
2017-07-06 08:59:12.003701 - Finished fold 4 of 5. AUROC 0.918.
2017-07-06 09:03:00.009820 - Finished fold 5 of 5. AUROC 0.919.
2017-07-06 09:03:03.046011 - Finished fold 1 of 5. AUROC 0.893.
2017-07-06 09:03:06.030786 - Finished fold 2 of 5. AUROC 0.896.
2017-07-06 09:03:08.907647 - Finished fold 3 of 5. AUROC 0.893.
2017-07-06 09:03:11.748804 - Finished fold 4 of 5. AUROC 0.890.
2017-07-06 09:03:15.194134 - Finished fold 5 of 5. AUROC 0.887.
2017-07-06 09:03:18.332521 - Finished fo

In [27]:
# average AUROC + min/max
for mdl in models:
    curr_score = np.zeros(K)
    for k in range(K):
        curr_score[k] = metrics.roc_auc_score(tar_val[k], pred_val[mdl][k])
    print('{}\t{:0.3f} [{:0.3f}, {:0.3f}]'.format(mdl, np.mean(curr_score), np.min(curr_score), np.max(curr_score)))

l2	0.892 [0.888, 0.896]
xgb	0.920 [0.918, 0.924]
logreg	0.892 [0.887, 0.896]
lasso	0.888 [0.882, 0.894]


In [28]:
# average AUPRC + min/max
for mdl in models:
    curr_score = np.zeros(K)
    for k in range(K):
        curr_score[k] = metrics.average_precision_score(tar_val[k], pred_val[mdl][k])
    print('{}\t{:0.3f} [{:0.3f}, {:0.3f}]'.format(mdl, np.mean(curr_score), np.min(curr_score), np.max(curr_score)))

l2	0.588 [0.568, 0.597]
xgb	0.665 [0.654, 0.669]
logreg	0.588 [0.568, 0.597]
lasso	0.579 [0.557, 0.593]
