# Model A

Model A is trained using random time points for each patient. We first train this model, then evaluate it on a separate dataset in which data are extracted at fixed lead times before death for the patients who died in-hospital.

In [None]:
from __future__ import print_function

# Import libraries
import numpy as np
import pandas as pd
import matplotlib
import sklearn
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties # for unicode fonts
import psycopg2
import sys
import datetime as dt
import mp_utils as mp

# used to print out pretty pandas dataframes
from IPython.display import display, HTML

from sklearn.pipeline import Pipeline

# used for train/test splits and cross validation
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV

# used to impute mean for data and standardize for computational stability
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler

# logistic regression is our favourite model ever
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV # l2 regularized regression
from sklearn.linear_model import LassoCV
from sklearn.ensemble import RandomForestClassifier

# used to calculate AUROC/accuracy
from sklearn import metrics

# used to create confusion matrix
from sklearn.metrics import confusion_matrix
from sklearn.cross_validation import cross_val_score

# gradient boosting - must download package https://github.com/dmlc/xgboost
import xgboost as xgb

# Default colours for prettier plots: eight RGB triplets, indexed by the
# position of the series being drawn.
col = [[0.9047, 0.1918, 0.1988],
    [0.2941, 0.5447, 0.7494],
    [0.3718, 0.7176, 0.3612],
    [1.0000, 0.5482, 0.1000],
    [0.4550, 0.4946, 0.4722],
    [0.6859, 0.4035, 0.2412],
    [0.9718, 0.5553, 0.7741],
    [0.5313, 0.3359, 0.6523]]

# Marker symbols, indexed the same way as `col`.
# NOTE(review): only seven markers are listed for eight colours, and 'o'
# appears twice - confirm whether an eighth, distinct marker was intended.
marker = ['v','o','d','^','s','o','+']

# Line styles, one per colour. The sixth entry was previously 's', which is a
# marker code rather than a line style and is rejected by matplotlib when
# passed as `linestyle`; it is replaced with a dashed line here.
# NOTE(review): '--' is a best guess at the intended style - confirm.
ls = ['-','-','-','-','-','--','--','--']

%matplotlib inline

# Database settings (the values below were used on the machine "pc70").
schema_name = 'mimiciii'
dbname = 'mimic'
sqluser = 'alistairewj'

# Statement that is prepended to the queries in this notebook; it sets the
# postgres search path to `public` followed by the schema named above.
query_schema = 'SET search_path to public,{};'.format(schema_name)

In [None]:
# Open a connection to the local postgres copy of MIMIC using the database
# name and user configured earlier in the notebook.
connection_args = dict(dbname=dbname, user=sqluser)
con = psycopg2.connect(**connection_args)

In [None]:
# Cohort: every ICU stay in mp_cohort that has not been flagged as excluded.
# According to the original notes, the exclusion criteria are:
#   - less than 16 years old
#   - stayed in the ICU less than 4 hours
#   - never have any chartevents data (i.e. likely administrative error)
cohort_sql = (
    "\n"
    "select \n"
    "    subject_id, hadm_id, icustay_id\n"
    "from mp_cohort\n"
    "where excluded = 0\n"
)
query = query_schema + cohort_sql
co = pd.read_sql_query(query,con)

# Static (time-invariant) variables live in their own table and are kept in a
# separate dataframe.
df_static = pd.read_sql_query(query_schema + 'select * from mp_static_data', con)
#for dtvar in ['intime','outtime','deathtime']:
#    df_static[dtvar] = pd.to_datetime(df_static[dtvar])

# Names of the static columns used as features, assembled group by group.
demographic_vars = [u'is_male', u'emergency_admission', u'age']
service_vars = [u'service_' + svc for svc in
                [u'nusrg', u'tsurg', u'gu',
                 u'cmed', u'gyn', u'traum',
                 u'ent', u'omed', u'psurg',
                 u'ortho', u'surg', u'nmed',
                 u'csurg', u'vsurg']]
ethnicity_vars = [u'race_black', u'race_hispanic', u'race_asian', u'race_other']
body_size_vars = [u'height', u'weight', u'bmi']

vars_static = demographic_vars + service_vars + ethnicity_vars + body_size_vars

In [None]:
# Load the full mp_data table into memory. The original notes describe it as
# ~5 million rows and report a previous %%time measurement of:
#   CPU times: user 42.8 s, sys: 1min 3s, total: 1min 46s
#   Wall time: 2min 7s
df = pd.read_sql_query(query_schema + 'select * from mp_data', con)

# Only icustay_id is kept as an identifier; the other two ID columns are
# removed in place.
df.drop(['subject_id', 'hadm_id'], axis=1, inplace=True)

# Order rows by stay, then by the `hr` column within each stay.
df.sort_values(by=['icustay_id', 'hr'], ascending=True, inplace=True)
print(df.shape)

In [None]:
# Outcome table, one row per non-excluded ICU stay joined to its admission:
#   dischtime_hours - hours from ICU intime to ICU outtime, rounded up
#   deathtime_hours - hours from ICU intime to the admission's deathtime,
#                     rounded up
#   death           - 1 when the admission has a deathtime, otherwise 0
death_sql = (
    "\n"
    "select \n"
    "co.icustay_id\n"
    ", ceil(extract(epoch from (co.outtime - co.intime))/60.0/60.0) as dischtime_hours\n"
    ", ceil(extract(epoch from (adm.deathtime - co.intime))/60.0/60.0) as deathtime_hours\n"
    ", case when adm.deathtime is null then 0 else 1 end as death\n"
    "from mp_cohort co\n"
    "inner join admissions adm\n"
    "on co.hadm_id = adm.hadm_id\n"
    "where co.excluded = 0\n"
)
df_death = pd.read_sql_query(query_schema + death_sql, con)

## Create dataframe with design matrix

Takes ~2 seconds.

In [None]:
# Re-import the helper module so that edits to mp_utils are picked up.
reload(mp)

# Build the design matrix for the 'base' analysis.
# NOTE(review): the meaning of T / t_extra is defined inside mp_utils and is
# not visible here; T=4 is passed to generate_times but T=8 to
# get_design_matrix - confirm that the mismatch is intentional.
time_dict = mp.generate_times(df_death, T=4, analysis_type='base')
df_data = mp.get_design_matrix(df, time_dict, T=8, t_extra=24)

# Feature names are exactly the columns of the design matrix.
X_header = df_data.columns

# Attach the outcome by matching the design matrix index to icustay_id.
# Merging index-to-index keeps icustay_id out of the columns; previously the
# merge used right_on='icustay_id', which carried the ID through as an extra
# column, so it ended up inside X as a feature and X no longer lined up with
# X_header.
# (assumes df_data is indexed by icustay_id, as the original merge did)
df_outcome = df_death.set_index('icustay_id')[['death']]
df_merged = df_data.merge(df_outcome, how='inner',
                          left_index=True, right_index=True)

# load the data into numpy arrays: features only in X, outcome in y
X = df_merged[X_header].values
y = df_merged['death'].values

## Model 1: Using random time segments

The above reported cross-validation performance in a variety of settings. We're also interested in *evaluating* the same model in the various settings. That is, training a model using random offsets, and then evaluating how it performs 4 hours before death, 8 hours, etc.

In [None]:
# Candidate models, keyed by the short name used throughout the notebook.
models = dict()

# gradient boosting
models['xgb'] = xgb.XGBClassifier(max_depth=3, n_estimators=300, learning_rate=0.05)

# L1-penalised linear regression with the penalty chosen by 5-fold CV
models['lasso'] = LassoCV(cv=5,fit_intercept=True,normalize=True)

# logistic regression with default settings
models['logreg'] = LogisticRegression(fit_intercept=True)

# random forest with default settings
models['rf'] = RandomForestClassifier()

# An RBF-kernel SVM wrapped in a grid search was considered but is disabled:
#models['svm'] = GridSearchCV(sklearn.svm.SVC(kernel='rbf',class_weight='balanced',probability=False),
#                             svm_parameters, cv=5, scoring='roc_auc')

In [None]:
# K-fold cross-validation of every model in `models` on (X, y).
#
# Outputs:
#   results_val[mdl] - list of K AUROC values, one per held-out fold
#   mdl_val[mdl]     - list of K fitted pipelines, one per fold
#
# Rough timing info (from the original notes):
#     rf - 3 seconds per fold
#    xgb - 30 seconds per fold
# logreg - 4 seconds per fold
#  lasso - 8 seconds per fold

# clone() gives each fold its own unfitted copy of the pipeline
from sklearn.base import clone

# create k-fold indices: a random permutation of the row numbers taken
# modulo K assigns every row to one of K (near-)equally sized folds
K = 5 # number of folds
idxK = np.random.permutation(X.shape[0])
idxK = np.mod(idxK,K)

mdl_val = dict()
results_val = dict()

for mdl in models:
    print('=============== {} ==============='.format(mdl))
    mdl_val[mdl] = list()
    results_val[mdl] = list() # initialize list for scores

    if mdl == 'xgb':
        # no pre-processing of data necessary for xgb
        estimator = Pipeline([(mdl, models[mdl])])

    else:
        # impute column means, then standardize, then fit the model
        estimator = Pipeline([("imputer", Imputer(missing_values='NaN',
                                          strategy="mean",
                                          axis=0)),
                      ("scaler", StandardScaler()),
                      (mdl, models[mdl])])

    for k in range(K):
        idx_test = idxK == k
        idx_train = ~idx_test

        # train the model using all but the kth fold
        # A fresh clone is fitted each time: fitting `estimator` directly
        # returns the same object on every fold, so the list of saved models
        # would end up holding K references to the last fit only.
        curr_mdl = clone(estimator).fit(X[idx_train, :], y[idx_train])

        # get prediction on the held-out fold
        if mdl == 'lasso':
            # LassoCV is a regressor and has no predict_proba; its continuous
            # output is used directly as the ranking score. (Previously this
            # result was discarded, leaving curr_prob undefined or stale.)
            curr_prob = curr_mdl.predict(X[idx_test, :])
        else:
            # probability of the positive class
            curr_prob = curr_mdl.predict_proba(X[idx_test, :])[:, 1]

        # calculate score (AUROC)
        curr_score = metrics.roc_auc_score(y[idx_test], curr_prob)

        # add score to list of scores
        results_val[mdl].append(curr_score)

        # save the current model
        mdl_val[mdl].append(curr_mdl)

        print('{} - Finished fold {} of {}.'.format(dt.datetime.now(), k+1, K))

In [None]:
# Display name for each model key.
pretty_labels = {'xgb': 'GB', 'rf': 'RF', 'logreg': 'LR', 'lasso': 'LASSO'}

# One column of points per model: the x position is the model's index and the
# y values are its per-fold AUROCs.
plt.figure(figsize=[8,5])
model_names = list(results_val)
for m, mdl in enumerate(model_names):
    curr_score = results_val[mdl]
    x_pos = m*np.ones(len(curr_score))
    plt.plot(x_pos, curr_score,
             label=pretty_labels[mdl],
             marker=marker[m], color=col[m],
             markersize=10, linewidth=2, linestyle=':')

ax = plt.gca()
plt.ylabel('AUROC',fontsize=18)
plt.legend(loc='best',fontsize=18)

# `m` still holds the index of the last model, so the x axis gets one unit of
# padding on either side and one tick per model.
plt.xlim([-1,m+1])
plt.ylim([0.7,1.0])
plt.grid()
ax.set_xticks(np.linspace(0,m,m+1))
ax.set_xticklabels([pretty_labels[name] for name in model_names])
for tick in ax.xaxis.get_major_ticks():
    tick.label.set_fontsize(20)

plt.show()

## Old code for training model using one design matrix and applying to other design matrices

In [None]:
# Old code: train each model on one design matrix (X, y) with K-fold splits
# and score every fold's model on each of the other design matrices.
#
# Outputs:
#   results_val[data_ext][mdl] - list of K AUROCs for that dataset and model
#   mdl_val[mdl]               - list of K fitted pipelines, one per fold
#
# NOTE(review): `df_dict`, `analyses`, `path_for_data` and `seeds` are not
# defined anywhere in the visible notebook - confirm before running.

# clone() gives each fold its own unfitted copy of the pipeline
from sklearn.base import clone

reload(mp)
# load the data into a numpy array
X, y, X_header = mp.load_design_matrix(df, df_dict)

# load into a dictionary the other various datasets/models
X_val = dict()
y_val = dict()
X_header_val = dict()
results_val = dict() # stores AUROCs across datasets
mdl_val = dict() # stores the model trained across k-folds

for data_ext in analyses:

    # load the data into a numpy array
    X_val[data_ext], y_val[data_ext], X_header_val[data_ext] = mp.load_design_matrix(co,
                                           df_additional_data=df_static[vars_static],
                                           data_ext=data_ext, path=path_for_data)
    results_val[data_ext] = dict()

print('{} - Finished loading data'.format(dt.datetime.now()))

# NOTE(review): this runs after the loop, so the seed used is the one for the
# last entry of `analyses` - confirm that is the intended seed.
np.random.seed(seed=seeds[data_ext])

# create k-fold indices
K = 5 # number of folds
idxK = np.random.permutation(X.shape[0])
idxK = np.mod(idxK,K)

for mdl in models:
    print('=============== {} ==============='.format(mdl))
    mdl_val[mdl] = list()

    for data_ext in X_val:
        results_val[data_ext][mdl] = list() # initialize list for scores

    if mdl == 'xgb':
        # no pre-processing of data necessary for xgb
        estimator = Pipeline([(mdl, models[mdl])])

    else:
        # impute column means, then standardize, then fit the model
        estimator = Pipeline([("imputer", Imputer(missing_values='NaN',
                                          strategy="mean",
                                          axis=0)),
                      ("scaler", StandardScaler()),
                      (mdl, models[mdl])])

    for k in range(K):
        # train the model using all but the kth fold
        # A fresh clone is fitted each time; fitting `estimator` directly
        # returns the same object on every fold, so all saved models would be
        # the last fit.
        curr_mdl = clone(estimator).fit(X[idxK != k, :],y[idxK != k])

        for data_ext in X_val:
            # rows of this dataset belonging to the held-out fold
            # (assumes every validation matrix has the same rows, in the same
            # order, as X - TODO confirm)
            X_fold = X_val[data_ext][idxK == k, :]

            # get prediction on this dataset
            if mdl == 'lasso':
                # LassoCV has no predict_proba; use its continuous output as
                # the score. (Previously the result was discarded, leaving
                # curr_prob undefined or stale.)
                curr_prob = curr_mdl.predict(X_fold)
            else:
                # probability of the positive class
                curr_prob = curr_mdl.predict_proba(X_fold)[:,1]

            # calculate score (AUROC)
            curr_score = metrics.roc_auc_score(y_val[data_ext][idxK == k], curr_prob)

            # add score to list of scores
            results_val[data_ext][mdl].append(curr_score)

        # save the current model - once per fold (it used to be appended once
        # per dataset inside the loop above, duplicating entries)
        mdl_val[mdl].append(curr_mdl)

        print('{} - Finished fold {} of {}.'.format(dt.datetime.now(), k+1, K))

## TODO

Update the code below; it is currently outdated.

Repeat the same experiment as above, but this time, let's train a model with the outcome "did the patient die in the next 24 hours?"

In [None]:
# Train each model on the 'base' design matrix, loaded with diedWithin=24,
# using K-fold splits, and score every fold's model on each of the other
# design matrices.
#
# Outputs:
#   results_val_dw24[data_ext][mdl] - list of K AUROCs per dataset and model
#   mdl_val_dw24[mdl]               - list of K fitted pipelines, one per fold
#
# NOTE(review): `analyses`, `path_for_data` and `seeds` are not defined
# anywhere in the visible notebook - confirm before running.

# clone() gives each fold its own unfitted copy of the pipeline
from sklearn.base import clone

# extract the data
data_ext = 'base'

# load the data into a numpy array
X, y, X_header = mp.load_design_matrix(co,
                                       df_additional_data=df_static[vars_static],
                                       data_ext=data_ext,
                                       diedWithin=24, path=path_for_data)

# load into a dictionary the other various datasets/models
X_val = dict()
y_val = dict()
X_header_val = dict()
results_val_dw24 = dict() # stores AUROCs across datasets
mdl_val_dw24 = dict() # stores the model trained across k-folds

for data_ext in analyses:

    # load the data into a numpy array
    # (the loader is given the name with a leading underscore, while the
    # dictionaries are keyed by the bare name)
    X_val[data_ext], y_val[data_ext], X_header_val[data_ext] = mp.load_design_matrix(co,
                                           df_additional_data=df_static[vars_static],
                                           data_ext='_' + data_ext, path=path_for_data)
    results_val_dw24[data_ext] = dict()

print('{} - Finished loading data'.format(dt.datetime.now()))

# NOTE(review): this runs after the loop, so the seed used is the one for the
# last entry of `analyses`, not for 'base' - confirm that is intended.
np.random.seed(seed=seeds[data_ext])
# create k-fold indices
K = 5 # number of folds
idxK = np.random.permutation(X.shape[0])
idxK = np.mod(idxK,K)


for mdl in models:
    print('=============== {} ==============='.format(mdl))
    mdl_val_dw24[mdl] = list()

    for data_ext in X_val:
        results_val_dw24[data_ext][mdl] = list() # initialize list for scores

    if mdl == 'xgb':
        # no pre-processing of data necessary for xgb
        estimator = Pipeline([(mdl, models[mdl])])

    else:
        # impute column means, then standardize, then fit the model
        estimator = Pipeline([("imputer", Imputer(missing_values='NaN',
                                          strategy="mean",
                                          axis=0)),
                      ("scaler", StandardScaler()),
                      (mdl, models[mdl])])

    for k in range(K):
        # train the model using all but the kth fold
        # A fresh clone is fitted each time; fitting `estimator` directly
        # returns the same object on every fold, so all saved models would be
        # the last fit.
        curr_mdl = clone(estimator).fit(X[idxK != k, :],y[idxK != k])

        for data_ext in X_val:
            # rows of this dataset belonging to the held-out fold
            # (assumes every validation matrix has the same rows, in the same
            # order, as X - TODO confirm)
            X_fold = X_val[data_ext][idxK == k, :]

            # get prediction on this dataset
            if mdl == 'lasso':
                # LassoCV has no predict_proba; use its continuous output as
                # the score. (Previously the result was discarded, leaving
                # curr_prob undefined or stale.)
                curr_prob = curr_mdl.predict(X_fold)
            else:
                # probability of the positive class
                curr_prob = curr_mdl.predict_proba(X_fold)[:,1]

            # calculate score (AUROC)
            curr_score = metrics.roc_auc_score(y_val[data_ext][idxK == k], curr_prob)

            # add score to list of scores
            results_val_dw24[data_ext][mdl].append(curr_score)

        # save the current model - once per fold (it used to be appended once
        # per dataset inside the loop above, duplicating entries)
        mdl_val_dw24[mdl].append(curr_mdl)

        print('{} - Finished fold {} of {}.'.format(dt.datetime.now(), k+1, K))

In [None]:
# Plot AUROC against lead time for the died-within-24h models.
# The zero-padded strings are the keys of results_val_dw24; their integer
# values are the x positions (lead time in hours).
xi_str = ['00','04','08','16','24']
xi = [int(x) for x in xi_str]

plt.figure(figsize=[8,6])
ax = plt.gca()

for m, mdl in enumerate(['rf','xgb','logreg']):
    all_score = list()

    # per-fold scores at each lead time, drawn as individual markers
    for lead_key, lead_hr in zip(xi_str, xi):
        curr_score = results_val_dw24[lead_key][mdl]
        plt.plot(lead_hr * np.ones(len(curr_score)), curr_score,
                 linestyle=':', linewidth=2, markersize=10,
                 marker=marker[m], color=col[m])
        all_score.append(np.median(curr_score))

    # solid line through the median score at each lead time
    plt.plot(xi, all_score,
             linestyle='-', linewidth=2, markersize=10,
             marker=marker[m], color=col[m],
             label=pretty_labels[mdl])

# x axis runs from 24 hours down to 0, i.e. the axis is inverted
ax.set_xticks(np.linspace(0,24,7))
ax.set_xlim([-1,25])
ax.invert_xaxis()
plt.legend(loc='lower center',fontsize=16)
plt.xlabel('Lead time (hours)',fontsize=18)
plt.ylabel('AUROC',fontsize=18)

# enlarge the tick labels on both axes
for axis in (ax.xaxis, ax.yaxis):
    for tick in axis.get_major_ticks():
        tick.label.set_fontsize(16)

plt.grid()
plt.savefig('auroc_over_time_dw24.pdf')
plt.show()