In [2]:
from __future__ import print_function

# Import libraries
import numpy as np
import pandas as pd
import matplotlib
import sklearn
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties # for unicode fonts
import sys
import datetime as dt
import mp_utils as mp

from collections import OrderedDict

# used to print out pretty pandas dataframes
from IPython.display import display, HTML

from sklearn.pipeline import Pipeline
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

# used to impute mean for data and standardize for computational stability
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler

# logistic regression is our favourite model ever
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV # l2 regularized regression
from sklearn.linear_model import LassoCV
from sklearn.ensemble import RandomForestClassifier

# used to calculate AUROC/accuracy
from sklearn import metrics

# used to create confusion matrix
from sklearn.metrics import confusion_matrix
from sklearn import model_selection

# gradient boosting - must download package https://github.com/dmlc/xgboost
import xgboost as xgb

#from eli5 import show_weights

# Default plotting palette: eight RGB triples, one per plotted series.
col = [[0.9047, 0.1918, 0.1988],
       [0.2941, 0.5447, 0.7494],
       [0.3718, 0.7176, 0.3612],
       [1.0000, 0.5482, 0.1000],
       [0.4550, 0.4946, 0.4722],
       [0.6859, 0.4035, 0.2412],
       [0.9718, 0.5553, 0.7741],
       [0.5313, 0.3359, 0.6523]]

# Marker codes for the series.
# NOTE(review): only seven markers for eight colours/line styles, so
# marker[7] raises IndexError - confirm which marker the eighth series wants.
marker = ['v', 'o', 'd', '^', 's', 'o', '+']

# Line styles, index-aligned with `col`.
# The sixth entry used to be 's', which is a marker code rather than a line
# style; it is now solid like the entries before it.
ls = ['-', '-', '-', '-', '-', '-', '--', '--']

%matplotlib inline

In [3]:
# Read the design matrix from disk and key the rows by ICU stay.
df = (
    pd.read_csv('X_design_matrix.csv')
    .set_index('icustay_id')
)

# Columns that are not model features: the outcome, the fold label used by
# the cross-validation loop, and the grouping key used for the inner CV.
non_feature_cols = ['death', 'idxK', 'subject_id']

# Pull each bookkeeping column out as a plain numpy array.
y = df['death'].values
idxK = df['idxK'].values
subject_id = df['subject_id'].values

# Number of distinct fold labels present in the data.
K = np.unique(idxK).size

# Everything that remains is the feature matrix.
X = df.drop(non_feature_cols, axis=1).values

n_obs = X.shape[0]
outcome_pct = 100.0 * y.mean()
print('{} observations. Outcome rate: {:2.2f}%.'.format(n_obs, outcome_pct))

50488 observations. Outcome rate: 11.20%.


In [None]:
# Nested cross-validation of a gradient boosting classifier.
#   outer loop: each fold label k in idxK is held out once for evaluation
#   inner loop: a 3-way GroupKFold (grouped on subject_id) grid search picks
#               the hyper-parameters using only the outer-training rows
# Results are accumulated in mdl_val / results_val / pred_val / pred_val_merged,
# each keyed by the model name in `mdl`.
# NOTE(review): the loop assumes the fold labels in idxK are exactly
# 0..K-1 - confirm against the code that writes X_design_matrix.csv.
estimator = xgb.XGBClassifier(max_depth=3, n_estimators=300, learning_rate=0.05)

#brute force scan for all parameters, here are the tricks
#usually max_depth is 6,7,8
#learning rate is around 0.05, but small changes may make big diff
#tuning min_child_weight subsample colsample_bytree can work well
#n_estimators is how many round of boosting
#finally, ensemble xgboost with multiple seeds may reduce variance
params = {'objective':['binary:logistic'],
              'learning_rate': [0.01,0.05], #so called `eta` value
              'max_depth': [3,6,9],
              #'min_child_weight': [11],
              'silent': [1],
              'subsample': [0.8],
              'colsample_bytree': [0.7],
              'n_estimators': [500,1000]}


mdl_val = dict()          # fitted GridSearchCV object per outer fold
results_val = dict()      # held-out AUROC per outer fold
pred_val = dict()         # held-out predicted probabilities, keyed by fold
pred_val_merged = dict()  # the same predictions, placed back in row order

mdl = 'xgb'

print('=============== {} ==============='.format(mdl))
mdl_val[mdl] = list()
results_val[mdl] = list() # initialize list for scores
pred_val[mdl] = dict()
pred_val_merged[mdl] = np.zeros(X.shape[0])

for k in range(K):
    # Build the fold masks once and reuse the slices below.
    test_mask = idxK == k
    train_mask = idxK != k

    X_train, y_train = X[train_mask, :], y[train_mask]
    X_test, y_test = X[test_mask, :], y[test_mask]

    # Inner folds index into the training subset, so they must be derived
    # from (and later applied to) exactly the same X_train/y_train slices.
    idxK_internal = list(
        model_selection.GroupKFold(n_splits=3).split(X_train, y_train, subject_id[train_mask])
    )

    # train the model using all but the kth fold
    curr_mdl = model_selection.GridSearchCV(sklearn.base.clone(estimator), params, n_jobs=4,
                       cv=idxK_internal,
                       scoring='roc_auc',
                       verbose=2, refit=True)

    curr_mdl = curr_mdl.fit(X_train, y_train)

    # Predicted probability of the positive class on the held-out fold.
    # (The former `mdl == 'lasso'` branch was unreachable: mdl is fixed to
    # 'xgb' above.)
    curr_prob = curr_mdl.predict_proba(X_test)[:, 1]

    pred_val_merged[mdl][test_mask] = curr_prob
    pred_val[mdl][k] = curr_prob

    # calculate score (AUROC)
    curr_score = metrics.roc_auc_score(y_test, curr_prob)

    # add score to list of scores
    results_val[mdl].append(curr_score)

    # save the current model
    mdl_val[mdl].append(curr_mdl)

    print('{} - Finished fold {} of {}. AUROC {:0.3f}.'.format(dt.datetime.now(), k+1, K, curr_score))

Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV] colsample_bytree=0.7, silent=1, learning_rate=0.01, n_estimators=500, subsample=0.8, objective=binary:logistic, max_depth=3 
[CV] colsample_bytree=0.7, silent=1, learning_rate=0.01, n_estimators=500, subsample=0.8, objective=binary:logistic, max_depth=3 
[CV] colsample_bytree=0.7, silent=1, learning_rate=0.01, n_estimators=500, subsample=0.8, objective=binary:logistic, max_depth=3 
[CV] colsample_bytree=0.7, silent=1, learning_rate=0.01, n_estimators=1000, subsample=0.8, objective=binary:logistic, max_depth=3 
[CV]  colsample_bytree=0.7, silent=1, learning_rate=0.01, n_estimators=500, subsample=0.8, objective=binary:logistic, max_depth=3, total= 1.1min
[CV] colsample_bytree=0.7, silent=1, learning_rate=0.01, n_estimators=1000, subsample=0.8, objective=binary:logistic, max_depth=3 
[CV]  colsample_bytree=0.7, silent=1, learning_rate=0.01, n_estimators=500, subsample=0.8, objective=binary:logistic, max_depth=3, total= 1.1m

In [29]:
# Summarise held-out performance across the outer folds as
# "mean [min, max]" - first AUROC, then AUPRC.
mdl = 'xgb'
row_fmt = '{}\t{:0.3f} [{:0.3f}, {:0.3f}]'

scorers = [
    ('AUROC', metrics.roc_auc_score),
    ('AUPRC', metrics.average_precision_score),
]

for pos, (label, score_fn) in enumerate(scorers):
    if pos > 0:
        # blank line between the two metric sections
        print()
    print(label)

    # one score per outer fold, computed from the stored fold predictions
    curr_score = np.zeros(K)
    for k in range(K):
        curr_score[k] = score_fn(y[idxK == k], pred_val[mdl][k])

    print(row_fmt.format(mdl, curr_score.mean(), curr_score.min(), curr_score.max()))

AUROC
xgb	0.916 [0.911, 0.922]

AUPRC
xgb	0.654 [0.639, 0.669]


In [70]:
import copy  # not needed by this cell any more; kept in case later cells use it

# Combine the selected hyper-parameters of every outer-fold grid search into
# one dict mapping parameter name -> list of chosen values (one per fold).
# The dict is rebuilt from scratch on every run, so an empty model list
# prints {} instead of a stale result or a NameError.
# Every fold searched the same grid, so all best_params_ share the same keys.
best_params = {}
for clf in mdl_val[mdl]:
    for p, chosen in clf.best_params_.items():
        best_params.setdefault(p, []).append(chosen)

print(best_params)

{'colsample_bytree': [0.7, 0.7, 0.7, 0.7, 0.7], 'silent': [1, 1, 1, 1, 1], 'learning_rate': [0.01, 0.01, 0.01, 0.01, 0.01], 'n_estimators': [1000, 1000, 1000, 1000, 1000], 'subsample': [0.8, 0.8, 0.8, 0.8, 0.8], 'objective': ['binary:logistic', 'binary:logistic', 'binary:logistic', 'binary:logistic', 'binary:logistic'], 'max_depth': [9, 6, 9, 9, 6]}


In [76]:
# for each parameter, print out the median parameter and list
for p in best_params:
    fold_values = best_params[p]

    # Medians are undefined for string-valued parameters. Test the value
    # itself rather than searching the type's repr for the text "str", which
    # also matches unrelated types whose name merely contains those letters.
    if isinstance(fold_values[0], str):
        print('{:20s} - skipping as it is a string'.format(p))
        continue

    print('{:20s} - {} \t[ '.format(p, np.median(fold_values)),end='')
    # `value`, not `k`: avoids overwriting the fold index `k` used elsewhere
    for value in fold_values:
        print('{}'.format(value),end=' ')
    print(']')

colsample_bytree     - 0.7 	[ 0.7 0.7 0.7 0.7 0.7 ]
silent               - 1.0 	[ 1 1 1 1 1 ]
learning_rate        - 0.01 	[ 0.01 0.01 0.01 0.01 0.01 ]
n_estimators         - 1000.0 	[ 1000 1000 1000 1000 1000 ]
subsample            - 0.8 	[ 0.8 0.8 0.8 0.8 0.8 ]
objective            - skipping as it is a string
max_depth            - 9.0 	[ 9 6 9 9 6 ]


In [34]:
# Print the inner-CV score of every grid candidate, for each outer fold:
# mean test score, +/- two standard deviations, and the candidate settings.
for i, clf in enumerate(mdl_val[mdl]):
    print('=== FOLD {} ==='.format(i))
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    # `candidate`, not `params`: the loop must not overwrite the notebook-level
    # hyper-parameter grid named `params`.
    for mean, std, candidate in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, candidate))

=== FOLD 0 ===
0.894 (+/-0.010) for {'colsample_bytree': 0.7, 'silent': 1, 'learning_rate': 0.01, 'n_estimators': 500, 'subsample': 0.8, 'objective': 'binary:logistic', 'max_depth': 3}
0.907 (+/-0.010) for {'colsample_bytree': 0.7, 'silent': 1, 'learning_rate': 0.01, 'n_estimators': 1000, 'subsample': 0.8, 'objective': 'binary:logistic', 'max_depth': 3}
0.909 (+/-0.010) for {'colsample_bytree': 0.7, 'silent': 1, 'learning_rate': 0.01, 'n_estimators': 500, 'subsample': 0.8, 'objective': 'binary:logistic', 'max_depth': 6}
0.915 (+/-0.010) for {'colsample_bytree': 0.7, 'silent': 1, 'learning_rate': 0.01, 'n_estimators': 1000, 'subsample': 0.8, 'objective': 'binary:logistic', 'max_depth': 6}
0.912 (+/-0.011) for {'colsample_bytree': 0.7, 'silent': 1, 'learning_rate': 0.01, 'n_estimators': 500, 'subsample': 0.8, 'objective': 'binary:logistic', 'max_depth': 9}
0.916 (+/-0.011) for {'colsample_bytree': 0.7, 'silent': 1, 'learning_rate': 0.01, 'n_estimators': 1000, 'subsample': 0.8, 'objective