# Model A

Model A is trained using data extracted at a random time point for each patient. We first train this model, then evaluate it on separate datasets in which the data are extracted at fixed lead times before death for the patients who died in-hospital.

In [None]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib
import sklearn
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties # for unicode fonts
import psycopg2
import sys
import datetime as dt
import mp_utils as mp

from sklearn.pipeline import Pipeline

# used for train/test splits and cross validation
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV

# used to impute mean for data and standardize for computational stability
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler

# logistic regression is our favourite model ever
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV # l2 regularized regression
from sklearn.linear_model import LassoCV
from sklearn.ensemble import RandomForestClassifier

# used to calculate AUROC/accuracy
from sklearn import metrics

# used to create confusion matrix
from sklearn.metrics import confusion_matrix
from sklearn.cross_validation import cross_val_score

# gradient boosting - must download package https://github.com/dmlc/xgboost
import xgboost as xgb

# Shared plot styling: one RGB triple per series, indexed by series number.
col = [
    [0.9047, 0.1918, 0.1988],
    [0.2941, 0.5447, 0.7494],
    [0.3718, 0.7176, 0.3612],
    [1.0000, 0.5482, 0.1000],
    [0.4550, 0.4946, 0.4722],
    [0.6859, 0.4035, 0.2412],
    [0.9718, 0.5553, 0.7741],
    [0.5313, 0.3359, 0.6523],
]

# Marker symbols, indexed by series number (7 entries versus 8 colours above).
marker = ['v', 'o', 'd', '^', 's', 'o', '+']

# Line styles, indexed by series number.
# NOTE(review): the sixth entry 's' looks like a marker code rather than a
# line style - confirm before using `ls` in a plot call.
ls = ['-'] * 5 + ['s'] + ['--'] * 2

%matplotlib inline

from __future__ import print_function

In [None]:
# Database settings (below config used on pc70)
sqluser = 'alistairewj'
dbname = 'mimic'
schema_name = 'mimiciii'

# Cohort query. Exclusion criteria applied by the SQL below:
#   - less than 16 years old at ICU admission
#   - stayed in the ICU less than 4 hours
#   - never have any chartevents data (i.e. likely administrative error)
#   - organ donor / donor account admissions that have a recorded deathtime
# NOTE(review): `rn` (the per-subject ICU stay order) is computed in the CTE
# but is not used by the final select, so every qualifying stay is returned.
query = \
"""
with t1 as
(
select ie.icustay_id
    , adm.HOSPITAL_EXPIRE_FLAG
    , ROW_NUMBER() over (partition by ie.subject_id order by intime) as rn
from icustays ie
inner join admissions adm
    on ie.hadm_id = adm.hadm_id
inner join patients pat
    on ie.subject_id = pat.subject_id
    and ie.intime > (pat.dob + interval '16' year)
where adm.HAS_CHARTEVENTS_DATA = 1
and 
not (
       (lower(diagnosis) like '%organ donor%' and deathtime is not null)
    or (lower(diagnosis) like '%donor account%' and deathtime is not null)
    )
and (ie.outtime - ie.intime) >= interval '4' hour
)
select 
    icustay_id
    , HOSPITAL_EXPIRE_FLAG
from t1
"""

# Connect to local postgres version of mimic.
# The cursor and connection are released in `finally` blocks so that a failing
# query does not leave an open database session behind.
con = psycopg2.connect(dbname=dbname, user=sqluser)
try:
    cur = con.cursor()
    try:
        cur.execute('SET search_path to ' + schema_name)

        # cohort: one row per ICU stay with the hospital mortality flag
        co = pd.read_sql_query(query, con)

        # extract static vars into a separate dataframe
        df_static = pd.read_sql_query('select * from mpap_static_vars', con)
    finally:
        cur.close()
finally:
    con.close()

# index both frames by ICU stay so they can be aligned later
co.set_index('icustay_id', inplace=True)

for dtvar in ['intime', 'outtime', 'deathtime']:
    df_static[dtvar] = pd.to_datetime(df_static[dtvar])
df_static.set_index('icustay_id', inplace=True)

# columns of df_static that are passed to the design matrix as additional data
vars_static = [u'male', u'emergency', u'age',
               u'cmed', u'csurg', u'surg', u'nsurg',
               u'surg_other', u'traum', u'nmed',
               u'omed', u'ortho', u'gu', u'gyn', u'ent']

In [None]:
# Random seeds, looked up by dataset extension (e.g. seeds['base']) before
# calling np.random.seed so that each analysis is reproducible.
seeds = {
    # random-time dataset variants
    'base': 473010,
    'base_nodeathfix': 217632,
    # numeric-suffix datasets
    '00': 724311,
    '04': 952227,
    '08': 721297,
    '16': 968879,
    '24': 608972,
    'fixed': 585794,
    # 'wt*' dataset variants
    'wt8': 176381,
    'wt16': 658229,
    'wt24': 635170,
    'wt8_00': 34741,
    'wt8_08': 95467,
    'wt8_16': 85349,
    'wt8_24': 89642,
}


# Candidate estimators, keyed by the short model name that the training loops
# iterate over and that the result dictionaries are indexed by.
models = dict()
models['xgb'] = xgb.XGBClassifier(max_depth=3, n_estimators=300, learning_rate=0.05)
models['lasso'] = LassoCV(cv=5, fit_intercept=True, normalize=True)
models['logreg'] = LogisticRegression(fit_intercept=True)
models['rf'] = RandomForestClassifier()
# Disabled: RBF-kernel SVM tuned with a grid search
# (svm_parameters is not defined in the visible cells).
#models['svm'] = GridSearchCV(sklearn.svm.SVC(kernel='rbf',class_weight='balanced',probability=False),
#                             svm_parameters, cv=5, scoring='roc_auc')

## Model 1: Using random time segments

The above reported cross-validation performance in a variety of settings. We're also interested in *evaluating* the same model across those settings: that is, training a model on data extracted at random offsets, and then evaluating how it performs at fixed lead times (e.g. 4 hours before death, 8 hours before death, and so on).

In [None]:
# Train Model A on the 'base' data with K-fold cross-validation and score
# every fold's model on the held-out rows of each dataset in `analyses`.
from sklearn.base import clone  # local import: gives each fold its own unfitted copy

reload(mp)
path_for_data = ''
analyses = ['base', '00', '04', '08', '16', '24']

# extract the data used to train the model
data_ext = 'base'
np.random.seed(seed=seeds[data_ext])

# load the data into a numpy array
X, y, X_header = mp.load_design_matrix(co,
                                       df_additional_data=df_static[vars_static],
                                       data_ext=data_ext,
                                       path=path_for_data)

# load into a dictionary the other various datasets/models
X_val = dict()
y_val = dict()
X_header_val = dict()
results_val = dict() # stores AUROCs across datasets
mdl_val = dict() # stores the model trained on each of the k folds

for data_ext in analyses:
    # load the data into a numpy array
    X_val[data_ext], y_val[data_ext], X_header_val[data_ext] = mp.load_design_matrix(co,
                                           df_additional_data=df_static[vars_static],
                                           data_ext=data_ext, path=path_for_data)
    results_val[data_ext] = dict()

print('{} - Finished loading data'.format(dt.datetime.now()))

# NOTE(review): `data_ext` was rebound by the loop above, so this seeds with
# seeds[analyses[-1]] (i.e. '24'), not seeds['base']. The behaviour is kept
# as-is to preserve the existing fold assignment - confirm which was intended.
np.random.seed(seed=seeds[data_ext])

# create k-fold indices: every training row gets a fold label in 0..K-1
K = 5 # number of folds
idxK = np.random.permutation(X.shape[0])
idxK = np.mod(idxK,K)

for mdl in models:
    print('=============== {} ==============='.format(mdl))
    mdl_val[mdl] = list()

    for data_ext in X_val:
        results_val[data_ext][mdl] = list() # initialize list for scores

    if mdl == 'xgb':
        # no pre-processing of data necessary for xgb
        estimator = Pipeline([(mdl, models[mdl])])
    else:
        # impute column means, then standardize, before fitting the model
        estimator = Pipeline([("imputer", Imputer(missing_values='NaN',
                                          strategy="mean",
                                          axis=0)),
                      ("scaler", StandardScaler()),
                      (mdl, models[mdl])])

    for k in range(K):
        idx_train = idxK != k
        idx_test = idxK == k

        # train the model using all but the kth fold; fit a fresh clone so the
        # model stored for this fold is not overwritten by later refits
        curr_mdl = clone(estimator).fit(X[idx_train, :], y[idx_train])

        for data_ext in X_val:
            # get prediction on the held-out rows of this dataset
            # (assumes every dataset is row-aligned with X - TODO confirm
            # against mp.load_design_matrix)
            if mdl == 'lasso':
                # LassoCV is a regressor: use its continuous output as the score
                curr_prob = curr_mdl.predict(X_val[data_ext][idx_test, :])
            else:
                curr_prob = curr_mdl.predict_proba(X_val[data_ext][idx_test, :])
                curr_prob = curr_prob[:,1]

            # calculate score (AUROC)
            curr_score = metrics.roc_auc_score(y_val[data_ext][idx_test], curr_prob)

            # add score to list of scores
            results_val[data_ext][mdl].append(curr_score)

        # save the current model (one entry per fold)
        mdl_val[mdl].append(curr_mdl)

        print('{} - Finished fold {} of {}.'.format(dt.datetime.now(), k+1, K))

In [None]:
# Re-run the cross-validation for a single model: whichever key `mdl` is
# currently bound to (this cell does not assign it). Relies on X, y, X_val,
# y_val, results_val, mdl_val, idxK and K from the cell that loads the data.
from sklearn.base import clone  # local import: gives each fold its own unfitted copy

print('=============== {} ==============='.format(mdl))
mdl_val[mdl] = list()

for data_ext in X_val:
    results_val[data_ext][mdl] = list() # initialize list for scores

if mdl == 'xgb':
    # no pre-processing of data necessary for xgb
    estimator = Pipeline([(mdl, models[mdl])])
else:
    # impute column means, then standardize, before fitting the model
    estimator = Pipeline([("imputer", Imputer(missing_values='NaN',
                                      strategy="mean",
                                      axis=0)),
                  ("scaler", StandardScaler()),
                  (mdl, models[mdl])])

for k in range(K):
    idx_train = idxK != k
    idx_test = idxK == k

    # train the model using all but the kth fold; fit a fresh clone so the
    # model stored for this fold is not overwritten by later refits
    curr_mdl = clone(estimator).fit(X[idx_train, :], y[idx_train])

    for data_ext in X_val:
        # get prediction on the held-out rows of this dataset
        if mdl == 'lasso':
            # LassoCV is a regressor: use its continuous output as the score
            curr_prob = curr_mdl.predict(X_val[data_ext][idx_test, :])
        else:
            curr_prob = curr_mdl.predict_proba(X_val[data_ext][idx_test, :])
            curr_prob = curr_prob[:,1]

        # calculate score (AUROC)
        curr_score = metrics.roc_auc_score(y_val[data_ext][idx_test], curr_prob)

        # add score to list of scores
        results_val[data_ext][mdl].append(curr_score)

    # save the current model (one entry per fold)
    mdl_val[mdl].append(curr_mdl)

    print('{} - Finished fold {} of {}.'.format(dt.datetime.now(), k+1, K))

In [None]:
# Display the per-model lists of fold AUROCs for the dataset that `data_ext`
# currently refers to (the value left over from the last loop that ran).
results_val[data_ext]

In [None]:
# Legend labels for the plots, keyed by the short model name.
pretty_labels = dict(xgb='GB', rf='RF', logreg='LR')

In [None]:
# plot a figure of the results: AUROC of each model against lead time
xi_str = ['00','04','08','16','24']
xi = [int(x) for x in xi_str]

plt.figure(figsize=[8,5])
ax = plt.gca()

for m, mdl in enumerate(['rf','xgb','logreg']):
    all_score = list()
    for x in xi_str:
        curr_score = results_val[x][mdl]

        # individual fold scores, stacked vertically at this lead time
        plt.plot(int(x) * np.ones(len(curr_score)), curr_score,
                marker=marker[m], color=col[m],
                markersize=10, linewidth=2, linestyle=':')

        all_score.append(np.median(curr_score))

    # plot a line through the median across all evaluations
    plt.plot(xi, all_score,
            marker=marker[m], color=col[m],
            markersize=10, linewidth=2, linestyle='-',
            label=pretty_labels[mdl])

ax.set_xticks(np.linspace(0,24,7))
ax.set_xlim([-1,25])
# reverse the x-axis so the plot reads from the longest lead time down to 0
ax.invert_xaxis()
plt.xlabel('Lead time (hours)',fontsize=18)
plt.ylabel('AUROC',fontsize=18)
plt.legend(loc='lower left',fontsize=16)

# set the tick label size through the axes API rather than the per-tick
# `tick.label` attribute, which newer matplotlib releases no longer provide
ax.tick_params(axis='both', which='major', labelsize=16)

plt.grid()
plt.savefig('auroc_over_time.pdf')
plt.show()

Repeat the same experiment as above, but this time train a model whose outcome is "did the patient die within the next 24 hours?"

In [None]:
# Display the shape of X, the training array returned by
# mp.load_design_matrix in an earlier cell.
X.shape

In [None]:
# Repeat the K-fold train/evaluate experiment with the training data loaded
# using diedWithin=24. Relies on `analyses` and `path_for_data` from an
# earlier cell.
from sklearn.base import clone  # local import: gives each fold its own unfitted copy

# extract the data
data_ext = 'base'

# load the data into a numpy array
X, y, X_header = mp.load_design_matrix(co,
                                       df_additional_data=df_static[vars_static],
                                       data_ext=data_ext,
                                       diedWithin=24, path=path_for_data)

# load into a dictionary the other various datasets/models
X_val = dict()
y_val = dict()
X_header_val = dict()
results_val_dw24 = dict() # stores AUROCs across datasets
mdl_val_dw24 = dict() # stores the model trained on each of the k folds

for data_ext in analyses:
    # load the data into a numpy array
    # NOTE(review): the evaluation sets are requested with the extension
    # '_' + data_ext and without diedWithin, unlike the training set above -
    # confirm against mp.load_design_matrix that this is intended.
    X_val[data_ext], y_val[data_ext], X_header_val[data_ext] = mp.load_design_matrix(co,
                                           df_additional_data=df_static[vars_static],
                                           data_ext='_' + data_ext, path=path_for_data)
    results_val_dw24[data_ext] = dict()

print('{} - Finished loading data'.format(dt.datetime.now()))

# NOTE(review): `data_ext` was rebound by the loop above, so this seeds with
# seeds[analyses[-1]], not seeds['base']. The behaviour is kept as-is to
# preserve the existing fold assignment - confirm which was intended.
np.random.seed(seed=seeds[data_ext])

# create k-fold indices: every training row gets a fold label in 0..K-1
K = 5 # number of folds
idxK = np.random.permutation(X.shape[0])
idxK = np.mod(idxK,K)

for mdl in models:
    print('=============== {} ==============='.format(mdl))
    mdl_val_dw24[mdl] = list()

    for data_ext in X_val:
        results_val_dw24[data_ext][mdl] = list() # initialize list for scores

    if mdl == 'xgb':
        # no pre-processing of data necessary for xgb
        estimator = Pipeline([(mdl, models[mdl])])
    else:
        # impute column means, then standardize, before fitting the model
        estimator = Pipeline([("imputer", Imputer(missing_values='NaN',
                                          strategy="mean",
                                          axis=0)),
                      ("scaler", StandardScaler()),
                      (mdl, models[mdl])])

    for k in range(K):
        idx_train = idxK != k
        idx_test = idxK == k

        # train the model using all but the kth fold; fit a fresh clone so the
        # model stored for this fold is not overwritten by later refits
        curr_mdl = clone(estimator).fit(X[idx_train, :], y[idx_train])

        for data_ext in X_val:
            # get prediction on the held-out rows of this dataset
            # (assumes every dataset is row-aligned with X - TODO confirm
            # against mp.load_design_matrix)
            if mdl == 'lasso':
                # LassoCV is a regressor: use its continuous output as the score
                curr_prob = curr_mdl.predict(X_val[data_ext][idx_test, :])
            else:
                curr_prob = curr_mdl.predict_proba(X_val[data_ext][idx_test, :])
                curr_prob = curr_prob[:,1]

            # calculate score (AUROC)
            curr_score = metrics.roc_auc_score(y_val[data_ext][idx_test], curr_prob)

            # add score to list of scores
            results_val_dw24[data_ext][mdl].append(curr_score)

        # save the current model (one entry per fold)
        mdl_val_dw24[mdl].append(curr_mdl)

        print('{} - Finished fold {} of {}.'.format(dt.datetime.now(), k+1, K))

In [None]:
# plot a figure of the results: AUROC of each died-within-24h model against lead time
xi_str = ['00','04','08','16','24']
xi = [int(x) for x in xi_str]

plt.figure(figsize=[8,6])
ax = plt.gca()

for m, mdl in enumerate(['rf','xgb','logreg']):
    all_score = list()
    for x in xi_str:
        curr_score = results_val_dw24[x][mdl]

        # individual fold scores, stacked vertically at this lead time
        plt.plot(int(x) * np.ones(len(curr_score)), curr_score,
                marker=marker[m], color=col[m],
                markersize=10, linewidth=2, linestyle=':')

        all_score.append(np.median(curr_score))

    # plot a line through the median across all evaluations
    plt.plot(xi, all_score,
            marker=marker[m], color=col[m],
            markersize=10, linewidth=2, linestyle='-',
            label=pretty_labels[mdl])

ax.set_xticks(np.linspace(0,24,7))
ax.set_xlim([-1,25])
# reverse the x-axis so the plot reads from the longest lead time down to 0
ax.invert_xaxis()
plt.legend(loc='lower center',fontsize=16)
plt.xlabel('Lead time (hours)',fontsize=18)
plt.ylabel('AUROC',fontsize=18)

# set the tick label size through the axes API rather than the per-tick
# `tick.label` attribute, which newer matplotlib releases no longer provide
ax.tick_params(axis='both', which='major', labelsize=16)

plt.grid()
plt.savefig('auroc_over_time_dw24.pdf')
plt.show()