In [None]:
from __future__ import print_function

# Import libraries
import numpy as np
import pandas as pd
import matplotlib
import sklearn
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties # for unicode fonts
import psycopg2
import sys
import datetime as dt
import mp_utils as mp

from collections import OrderedDict

# used to print out pretty pandas dataframes
from IPython.display import display, HTML

from sklearn.pipeline import Pipeline

# used to impute mean for data and standardize for computational stability
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler

# logistic regression is our favourite model ever
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV # l2 regularized regression
from sklearn.linear_model import LassoCV
from sklearn.ensemble import RandomForestClassifier

# used to calculate AUROC/accuracy
from sklearn import metrics

# used to create confusion matrix
from sklearn.metrics import confusion_matrix

# gradient boosting - must download package https://github.com/dmlc/xgboost
import xgboost as xgb

%matplotlib inline

# Database connection settings (these values were used on pc70).
sqluser = 'alistairewj'
dbname = 'mimic'
schema_name = 'mimiciii'

# Statement prepended to every SQL query in this notebook: it sets the
# postgres search_path to `public` followed by `schema_name`.
query_schema = 'SET search_path to public,{};'.format(schema_name)

In [None]:
USE_SQL=0

# Names of the static (per-stay) covariates that are merged into the design
# matrix by later cells. Defined outside the `if USE_SQL` branch because
# those cells reference `vars_static` no matter how the data were loaded;
# previously it only existed when USE_SQL was truthy.
vars_static = [u'is_male', u'emergency_admission', u'age',
               # services
               u'service_any_noncard_surg',
               u'service_any_card_surg',
               u'service_cmed',
               u'service_traum',
               u'service_nmed',
               # ethnicities
               u'race_black',u'race_hispanic',u'race_asian',u'race_other',
               # body size
               u'height', u'weight', u'bmi']

if USE_SQL:
    # Connect to local postgres version of mimic
    con = psycopg2.connect(dbname=dbname, user=sqluser)

    # exclusion criteria:
    #   - less than 16 years old
    #   - stayed in the ICU less than 4 hours
    #   - never have any chartevents data (i.e. likely administrative error)
    query = query_schema + \
    """
    select 
        subject_id, hadm_id, icustay_id
    from mp_cohort
    where excluded = 0
    """
    co = pd.read_sql_query(query,con)

    # extract static vars into a separate dataframe
    df_static = pd.read_sql_query(query_schema + 'select * from mp_static_data', con)
    #for dtvar in ['intime','outtime','deathtime']:
    #    df_static[dtvar] = pd.to_datetime(df_static[dtvar])

    # get ~5 million rows containing data from every patient
    # this takes a little bit of time to load into memory (~2 minutes)

    # %%time results
    # CPU times: user 42.8 s, sys: 1min 3s, total: 1min 46s
    # Wall time: 2min 7s

    df = pd.read_sql_query(query_schema + 'select * from mp_data', con)
    # only icustay_id is kept as the identifier; rows are ordered by stay, then hour
    df = df.drop(['subject_id', 'hadm_id'], axis=1)
    df = df.sort_values(['icustay_id','hr'], axis=0, ascending=True)
    print(df.shape)

    # get death information
    # NOTE(review): df_death is only created in this SQL branch, yet later
    # cells use it unconditionally - the CSV cell in this notebook does not
    # load an equivalent frame.
    df_death = pd.read_sql_query(query_schema + """
    select 
    co.subject_id, co.hadm_id, co.icustay_id
    , ceil(extract(epoch from (co.outtime - co.intime))/60.0/60.0) as dischtime_hours
    , ceil(extract(epoch from (adm.deathtime - co.intime))/60.0/60.0) as deathtime_hours
    , case when adm.deathtime is null then 0 else 1 end as death
    from mp_cohort co
    inner join admissions adm
    on co.hadm_id = adm.hadm_id
    where co.excluded = 0
    """, con)

    # get severity scores (left joins: a stay without a given score keeps a null)
    df_soi = pd.read_sql_query(query_schema + """
    select 
    co.icustay_id
    , case when adm.deathtime is null then 0 else 1 end as death
    , sa.saps
    , sa2.sapsii
    , aps.apsiii
    , so.sofa
    , lo.lods
    , oa.oasis
    from mp_cohort co
    inner join admissions adm
    on co.hadm_id = adm.hadm_id
    left join saps sa
    on co.icustay_id = sa.icustay_id
    left join sapsii sa2
    on co.icustay_id = sa2.icustay_id
    left join apsiii aps
    on co.icustay_id = aps.icustay_id
    left join sofa so
    on co.icustay_id = so.icustay_id
    left join lods lo
    on co.icustay_id = lo.icustay_id
    left join oasis oa
    on co.icustay_id = oa.icustay_id
    where co.excluded = 0
    """, con)

In [None]:
USE_CSV=1

if USE_CSV:
    # Load the cached extracts from CSV files in the working directory.
    # NOTE(review): this branch does not create `df_death`, which later cells
    # use - confirm where it is expected to come from when USE_SQL is off.
    co = pd.read_csv('df_cohort.csv')

    # every column whose name begins with "inclusion_" is cast to boolean
    for c in co.columns:
        if not c.startswith('inclusion_'):
            continue
        co[c] = co[c].astype(bool)

    df = pd.read_csv('df_data.csv')
    df_static = pd.read_csv('df_static_data.csv')
    df_soi = pd.read_csv('df_soi.csv')

In [None]:
# Build the time dictionary {icustay_id: window time}.
# Since everything is relative to admission, the window time is simply fixed
# at 24 hours for every stay.
time_dict = df_death.set_index('icustay_id')
time_dict['windowtime'] = 24
time_dict = time_dict['windowtime'].to_dict()
df_data = mp.get_design_matrix(df, time_dict, W=24, W_extra=24)

# Assemble the modelling table (rows are keyed by icustay_id in the index).

# first, append the static variables from df_static to the right of df_data
X = df_data.merge(df_static.set_index('icustay_id')[vars_static], how='left', left_index=True, right_index=True)
# next, add in the outcome: death in hospital
X = X.merge(df_death.set_index('icustay_id')[['death']], left_index=True, right_index=True)

# Feature names, taken from the merged frame so they follow the real column
# order (df_data columns first, then vars_static). The previous
# `vars_static + df_data.columns` ordering labelled the columns incorrectly.
X_header = list(X.columns[:-1])

# convert to numpy data (assumes target, death, is the last column)
X = X.values
y = X[:,-1]
X = X[:,0:-1]

In [None]:
# Rough timing info:
#     rf - 3 seconds per fold
#    xgb - 30 seconds per fold
# logreg - 4 seconds per fold
#  lasso - 8 seconds per fold
models = {'xgb': xgb.XGBClassifier(max_depth=3, n_estimators=300, learning_rate=0.05),
          'lasso': LassoCV(cv=5,fit_intercept=True,normalize=True,max_iter=10000),
          'logreg': LogisticRegression(fit_intercept=True),
          'rf': RandomForestClassifier()
         }


# create k-fold indices: each row of X is randomly assigned a fold in 0..K-1
K = 5 # number of folds
idxK = np.random.permutation(X.shape[0])
idxK = np.mod(idxK,K)

# per-model lists with one entry per fold
mdl_val = dict()      # fitted pipelines
results_val = dict()  # AUROC
pred_val = dict()     # predictions on the held-out fold
tar_val = dict()      # targets of the held-out fold

for mdl in models:
    print('=============== {} ==============='.format(mdl))
    mdl_val[mdl] = list()
    results_val[mdl] = list() # initialize list for scores
    pred_val[mdl] = list()
    tar_val[mdl] = list()

    if mdl == 'xgb':
        # no pre-processing of data necessary for xgb
        estimator = Pipeline([(mdl, models[mdl])])

    else:
        # impute the column mean, then standardize, then fit the model
        estimator = Pipeline([("imputer", Imputer(missing_values='NaN',
                                          strategy="mean",
                                          axis=0)),
                      ("scaler", StandardScaler()),
                      (mdl, models[mdl])]) 

    for k in range(K):
        idx_test = idxK == k
        idx_train = ~idx_test

        # Train on all but the kth fold. Fit a fresh clone each time:
        # Pipeline.fit returns the pipeline itself, so without cloning every
        # entry saved in mdl_val[mdl] would be the same object holding only
        # the last fold's fit.
        curr_mdl = sklearn.base.clone(estimator).fit(X[idx_train, :], y[idx_train])

        # get prediction on the held-out fold
        if mdl == 'lasso':
            # LassoCV is a regressor: use its raw prediction as the score
            curr_prob = curr_mdl.predict(X[idx_test, :])
        else:
            curr_prob = curr_mdl.predict_proba(X[idx_test, :])
            curr_prob = curr_prob[:,1]

        pred_val[mdl].append(curr_prob)
        tar_val[mdl].append(y[idx_test])

        # calculate score (AUROC)
        curr_score = metrics.roc_auc_score(y[idx_test], curr_prob)

        # add score to list of scores
        results_val[mdl].append(curr_score)

        # save the current model
        mdl_val[mdl].append(curr_mdl)

        print('{} - Finished fold {} of {}. AUROC {:0.3f}.'.format(dt.datetime.now(), k+1, K, curr_score))
        
# Evaluate each severity-of-illness score on the same folds, using the raw
# score directly as the prediction (nothing is fitted, so mdl_val stays empty).
# NOTE(review): df_soi is masked positionally with idxK, which presumes its
# rows line up one-to-one with the rows of X/y - confirm.
for mdl in ['saps','sapsii','apsiii','sofa','lods','oasis']:
    print('=============== {} ==============='.format(mdl))
    mdl_val[mdl], results_val[mdl] = list(), list()
    pred_val[mdl], tar_val[mdl] = list(), list()

    for k in range(K):
        curr_prob = df_soi.loc[idxK == k, mdl].values
        pred_val[mdl].append(curr_prob)

        curr_tar = y[idxK == k]
        tar_val[mdl].append(curr_tar)

        # AUROC of the score against the outcome within this fold
        curr_score = metrics.roc_auc_score(curr_tar, curr_prob)
        results_val[mdl].append(curr_score)

        print('{} - Finished fold {} of {}. AUROC {:0.3f}.'.format(dt.datetime.now(), k+1, K, curr_score))

In [None]:
# Mean AUROC with [min, max] across folds: severity scores first, then the
# fitted models, one line each.
for mdl in ['saps','sapsii','apsiii','sofa','lods','oasis'] + list(models):
    curr_score = np.asarray(results_val[mdl],dtype=float)
    print('{}\t{:0.3f} [{:0.3f}, {:0.3f}]'.format(mdl, curr_score.mean(), curr_score.min(), curr_score.max()))

The code below is only half complete. It aims to calculate additional scores (log loss and Brier score, alongside AUROC) for each model.

```python
# Mapping of display name -> metric function.
# NOTE(review): this mapping is built but the loop below does not read it;
# the three metric functions are called directly instead.
metrics_eval = OrderedDict([['AUROC', metrics.roc_auc_score],
               ['LogLoss', metrics.log_loss],
               ['MeanSqErr', metrics.brier_score_loss]
                            ])

# NOTE(review): `scores` is created but never filled in this draft.
scores = list()
for mdl in models:
    #scores[mdl] = list()
    for k in range(K):
        # get the stored predictions and targets for fold k
        curr_prob = pred_val[mdl][k]
        curr_tar = tar_val[mdl][k]
        
        # print one row per model/fold: AUROC, log loss, Brier score
        # NOTE(review): the 'lasso' predictions come from `predict`, presumably
        # not bounded to [0, 1] - verify the last two metrics accept them.
        print('{:10s}\t{:0.3f}\t{:0.3f}\t{:0.3f}'.format(mdl,
                                                         metrics.roc_auc_score(curr_tar, curr_prob),
                                                        metrics.log_loss(curr_tar, curr_prob),
                                                        metrics.brier_score_loss(curr_tar, curr_prob)))
```

# Evaluate other time intervals

* 12 hour
* 24 hour
* 48 hour
* 72 hour

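The cohort already requires an ICU stay of at least 4 hours; for each window below, only stays with data recorded up to at least the window length are kept. First, define K folds at the patient (`subject_id`) level so that the same folds can be reused for every window.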

In [None]:
K=5

# sorted array of the distinct subject_id values (this is needed later);
# np.unique already returns its result in sorted order
sid = np.unique(df_death['subject_id'].values)

# randomly give every patient a fold number in 0..K-1 (fold sizes differ by
# at most one because a permutation of 0..n-1 is reduced modulo K)
idxK_sid = np.mod(np.random.permutation(sid.shape[0]), K)

# position of each df_death row's subject_id within sid ...
idxMap = np.searchsorted(sid, df_death['subject_id'].values)

# ... which looks up that patient's fold, giving one fold label per row
idxK = idxK_sid[idxMap]

## 12 hours

In [None]:
# == 12 hours == #
W=12

# Build the time dictionary {icustay_id: window time}.
# Since everything is relative to admission, the window time is fixed at
# W hours for every stay.
time_dict = df_death.set_index('icustay_id')
time_dict['windowtime'] = W
time_dict = time_dict['windowtime'].to_dict()
# NOTE(review): the W / W_extra arguments stay at 24 for every window length
# evaluated in this notebook - confirm that is intended.
df_data = mp.get_design_matrix(df, time_dict, W=24, W_extra=24)

# icustay_ids whose largest recorded `hr` is at least W
iid_min_stay = df.groupby('icustay_id')['hr'].max() >= W
iid_min_stay=iid_min_stay.index[iid_min_stay.values].values

print('Looking at the first {} hours of the ICU stay.'.format(W))
print('Reducing sample size from {} to {} ({:2.2f}%) to ensure patients stayed long enough.'.format(
        df_data.shape[0], iid_min_stay.shape[0], iid_min_stay.shape[0]*100.0 / df_data.shape[0]))
df_data = df_data.loc[iid_min_stay,:]
print('')

# Assemble the modelling table (rows are keyed by icustay_id in the index).

# first, append the static variables from df_static to the right of df_data
X = df_data.merge(df_static.set_index('icustay_id')[vars_static], how='left', left_index=True, right_index=True)
# next, add in the outcome: death in hospital
X = X.merge(df_death.set_index('icustay_id')[['death']], left_index=True, right_index=True)

# map the patient-level K-fold assignment onto the rows of this dataset
X = X.merge(df_death.set_index('icustay_id')[['subject_id']], left_index=True, right_index=True)
# position of each row's subject_id within sid ...
idxMap = np.searchsorted(sid, X['subject_id'].values)
# ... which looks up that patient's fold
idxK = idxK_sid[idxMap]
# subject_id was only needed for the fold lookup
X = X.drop('subject_id', axis=1)

# Feature names, taken from the merged frame so they follow the real column
# order (df_data columns first, then vars_static). The previous
# `vars_static + df_data.columns` ordering labelled the columns incorrectly.
X_header = list(X.columns[:-1])

# convert to numpy data (assumes target, death, is the last column)
X = X.values
y = X[:,-1]
X = X[:,0:-1]

# Rough timing info:
#     rf - 3 seconds per fold
#    xgb - 30 seconds per fold
# logreg - 4 seconds per fold
#  lasso - 8 seconds per fold
models = {'xgb': xgb.XGBClassifier(max_depth=3, n_estimators=300, learning_rate=0.05),
          'lasso': LassoCV(cv=5,fit_intercept=True,normalize=True,max_iter=10000),
          'logreg': LogisticRegression(fit_intercept=True),
          'rf': RandomForestClassifier()
         }

# per-model lists with one entry per fold
mdl_val = dict()      # fitted pipelines
results_val = dict()  # AUROC
pred_val = dict()     # predictions on the held-out fold
tar_val = dict()      # targets of the held-out fold

for mdl in models:
    print('=============== {} ==============='.format(mdl))
    mdl_val[mdl] = list()
    results_val[mdl] = list() # initialize list for scores
    pred_val[mdl] = list()
    tar_val[mdl] = list()

    if mdl == 'xgb':
        # no pre-processing of data necessary for xgb
        estimator = Pipeline([(mdl, models[mdl])])

    else:
        # impute the column mean, then standardize, then fit the model
        estimator = Pipeline([("imputer", Imputer(missing_values='NaN',
                                          strategy="mean",
                                          axis=0)),
                      ("scaler", StandardScaler()),
                      (mdl, models[mdl])]) 

    for k in range(K):
        idx_test = idxK == k
        idx_train = ~idx_test

        # Train on all but the kth fold. Fit a fresh clone each time:
        # Pipeline.fit returns the pipeline itself, so without cloning every
        # entry saved in mdl_val[mdl] would be the same object holding only
        # the last fold's fit.
        curr_mdl = sklearn.base.clone(estimator).fit(X[idx_train, :], y[idx_train])

        # get prediction on the held-out fold
        if mdl == 'lasso':
            # LassoCV is a regressor: use its raw prediction as the score
            curr_prob = curr_mdl.predict(X[idx_test, :])
        else:
            curr_prob = curr_mdl.predict_proba(X[idx_test, :])
            curr_prob = curr_prob[:,1]

        pred_val[mdl].append(curr_prob)
        tar_val[mdl].append(y[idx_test])

        # calculate score (AUROC)
        curr_score = metrics.roc_auc_score(y[idx_test], curr_prob)

        # add score to list of scores
        results_val[mdl].append(curr_score)

        # save the current model
        mdl_val[mdl].append(curr_mdl)

        print('{} - Finished fold {} of {}. AUROC {:0.3f}.'.format(dt.datetime.now(), k+1, K, curr_score))

# keep references to the dicts above under window-specific names;
# the generic names are re-bound to new dicts in subsequent cells
mdl_val_12 = mdl_val
results_val_12 = results_val
pred_val_12 = pred_val
tar_val_12 = tar_val

## 24 hours

In [None]:
W=24

# Build the time dictionary {icustay_id: window time}.
# Since everything is relative to admission, the window time is fixed at
# W hours for every stay.
time_dict = df_death.set_index('icustay_id')
time_dict['windowtime'] = W
time_dict = time_dict['windowtime'].to_dict()
df_data = mp.get_design_matrix(df, time_dict, W=24, W_extra=24)

# icustay_ids whose largest recorded `hr` is at least W
iid_min_stay = df.groupby('icustay_id')['hr'].max() >= W
iid_min_stay=iid_min_stay.index[iid_min_stay.values].values

# report the actual window length (this message previously said "12 hours")
print('Looking at the first {} hours of the ICU stay.'.format(W))
print('Reducing sample size from {} to {} ({:2.2f}%) to ensure patients stayed long enough.'.format(
        df_data.shape[0], iid_min_stay.shape[0], iid_min_stay.shape[0]*100.0 / df_data.shape[0]))
df_data = df_data.loc[iid_min_stay,:]
print('')

# Assemble the modelling table (rows are keyed by icustay_id in the index).

# first, append the static variables from df_static to the right of df_data
X = df_data.merge(df_static.set_index('icustay_id')[vars_static], how='left', left_index=True, right_index=True)
# next, add in the outcome: death in hospital
X = X.merge(df_death.set_index('icustay_id')[['death']], left_index=True, right_index=True)

# map the patient-level K-fold assignment onto the rows of this dataset
X = X.merge(df_death.set_index('icustay_id')[['subject_id']], left_index=True, right_index=True)
# position of each row's subject_id within sid ...
idxMap = np.searchsorted(sid, X['subject_id'].values)
# ... which looks up that patient's fold
idxK = idxK_sid[idxMap]
# subject_id was only needed for the fold lookup
X = X.drop('subject_id', axis=1)

# Feature names, taken from the merged frame so they follow the real column
# order (df_data columns first, then vars_static). The previous
# `vars_static + df_data.columns` ordering labelled the columns incorrectly.
X_header = list(X.columns[:-1])

# convert to numpy data (assumes target, death, is the last column)
X = X.values
y = X[:,-1]
X = X[:,0:-1]

# Rough timing info:
#     rf - 3 seconds per fold
#    xgb - 30 seconds per fold
# logreg - 4 seconds per fold
#  lasso - 8 seconds per fold
models = {'xgb': xgb.XGBClassifier(max_depth=3, n_estimators=300, learning_rate=0.05),
          'lasso': LassoCV(cv=5,fit_intercept=True,normalize=True,max_iter=10000),
          'logreg': LogisticRegression(fit_intercept=True),
          'rf': RandomForestClassifier()
         }

# per-model lists with one entry per fold
mdl_val = dict()      # fitted pipelines
results_val = dict()  # AUROC
pred_val = dict()     # predictions on the held-out fold
tar_val = dict()      # targets of the held-out fold

for mdl in models:
    print('=============== {} ==============='.format(mdl))
    mdl_val[mdl] = list()
    results_val[mdl] = list() # initialize list for scores
    pred_val[mdl] = list()
    tar_val[mdl] = list()

    if mdl == 'xgb':
        # no pre-processing of data necessary for xgb
        estimator = Pipeline([(mdl, models[mdl])])

    else:
        # impute the column mean, then standardize, then fit the model
        estimator = Pipeline([("imputer", Imputer(missing_values='NaN',
                                          strategy="mean",
                                          axis=0)),
                      ("scaler", StandardScaler()),
                      (mdl, models[mdl])]) 

    for k in range(K):
        idx_test = idxK == k
        idx_train = ~idx_test

        # Train on all but the kth fold. Fit a fresh clone each time:
        # Pipeline.fit returns the pipeline itself, so without cloning every
        # entry saved in mdl_val[mdl] would be the same object holding only
        # the last fold's fit.
        curr_mdl = sklearn.base.clone(estimator).fit(X[idx_train, :], y[idx_train])

        # get prediction on the held-out fold
        if mdl == 'lasso':
            # LassoCV is a regressor: use its raw prediction as the score
            curr_prob = curr_mdl.predict(X[idx_test, :])
        else:
            curr_prob = curr_mdl.predict_proba(X[idx_test, :])
            curr_prob = curr_prob[:,1]

        pred_val[mdl].append(curr_prob)
        tar_val[mdl].append(y[idx_test])

        # calculate score (AUROC)
        curr_score = metrics.roc_auc_score(y[idx_test], curr_prob)

        # add score to list of scores
        results_val[mdl].append(curr_score)

        # save the current model
        mdl_val[mdl].append(curr_mdl)

        print('{} - Finished fold {} of {}. AUROC {:0.3f}.'.format(dt.datetime.now(), k+1, K, curr_score))

# keep references to the dicts above under window-specific names;
# the generic names are re-bound to new dicts in subsequent cells
mdl_val_24 = mdl_val
results_val_24 = results_val
pred_val_24 = pred_val
tar_val_24 = tar_val

## 48 hours

In [None]:
W=48

# Build the time dictionary {icustay_id: window time}.
# Since everything is relative to admission, the window time is fixed at
# W hours for every stay.
time_dict = df_death.set_index('icustay_id')
time_dict['windowtime'] = W
time_dict = time_dict['windowtime'].to_dict()
# NOTE(review): the W / W_extra arguments stay at 24 for every window length
# evaluated in this notebook - confirm that is intended.
df_data = mp.get_design_matrix(df, time_dict, W=24, W_extra=24)

# icustay_ids whose largest recorded `hr` is at least W
iid_min_stay = df.groupby('icustay_id')['hr'].max() >= W
iid_min_stay=iid_min_stay.index[iid_min_stay.values].values

print('Looking at the first {} hours of the ICU stay.'.format(W))
print('Reducing sample size from {} to {} ({:2.2f}%) to ensure patients stayed long enough.'.format(
        df_data.shape[0], iid_min_stay.shape[0], iid_min_stay.shape[0]*100.0 / df_data.shape[0]))
df_data = df_data.loc[iid_min_stay,:]
print('')

# Assemble the modelling table (rows are keyed by icustay_id in the index).

# first, append the static variables from df_static to the right of df_data
X = df_data.merge(df_static.set_index('icustay_id')[vars_static], how='left', left_index=True, right_index=True)
# next, add in the outcome: death in hospital
X = X.merge(df_death.set_index('icustay_id')[['death']], left_index=True, right_index=True)

# map the patient-level K-fold assignment onto the rows of this dataset
X = X.merge(df_death.set_index('icustay_id')[['subject_id']], left_index=True, right_index=True)
# position of each row's subject_id within sid ...
idxMap = np.searchsorted(sid, X['subject_id'].values)
# ... which looks up that patient's fold
idxK = idxK_sid[idxMap]
# subject_id was only needed for the fold lookup
X = X.drop('subject_id', axis=1)

# Feature names, taken from the merged frame so they follow the real column
# order (df_data columns first, then vars_static). The previous
# `vars_static + df_data.columns` ordering labelled the columns incorrectly.
X_header = list(X.columns[:-1])

# convert to numpy data (assumes target, death, is the last column)
X = X.values
y = X[:,-1]
X = X[:,0:-1]

# Rough timing info:
#     rf - 3 seconds per fold
#    xgb - 30 seconds per fold
# logreg - 4 seconds per fold
#  lasso - 8 seconds per fold
models = {'xgb': xgb.XGBClassifier(max_depth=3, n_estimators=300, learning_rate=0.05),
          'lasso': LassoCV(cv=5,fit_intercept=True,normalize=True,max_iter=10000),
          'logreg': LogisticRegression(fit_intercept=True),
          'rf': RandomForestClassifier()
         }

# per-model lists with one entry per fold
mdl_val = dict()      # fitted pipelines
results_val = dict()  # AUROC
pred_val = dict()     # predictions on the held-out fold
tar_val = dict()      # targets of the held-out fold

for mdl in models:
    print('=============== {} ==============='.format(mdl))
    mdl_val[mdl] = list()
    results_val[mdl] = list() # initialize list for scores
    pred_val[mdl] = list()
    tar_val[mdl] = list()

    if mdl == 'xgb':
        # no pre-processing of data necessary for xgb
        estimator = Pipeline([(mdl, models[mdl])])

    else:
        # impute the column mean, then standardize, then fit the model
        estimator = Pipeline([("imputer", Imputer(missing_values='NaN',
                                          strategy="mean",
                                          axis=0)),
                      ("scaler", StandardScaler()),
                      (mdl, models[mdl])]) 

    for k in range(K):
        idx_test = idxK == k
        idx_train = ~idx_test

        # Train on all but the kth fold. Fit a fresh clone each time:
        # Pipeline.fit returns the pipeline itself, so without cloning every
        # entry saved in mdl_val[mdl] would be the same object holding only
        # the last fold's fit.
        curr_mdl = sklearn.base.clone(estimator).fit(X[idx_train, :], y[idx_train])

        # get prediction on the held-out fold
        if mdl == 'lasso':
            # LassoCV is a regressor: use its raw prediction as the score
            curr_prob = curr_mdl.predict(X[idx_test, :])
        else:
            curr_prob = curr_mdl.predict_proba(X[idx_test, :])
            curr_prob = curr_prob[:,1]

        pred_val[mdl].append(curr_prob)
        tar_val[mdl].append(y[idx_test])

        # calculate score (AUROC)
        curr_score = metrics.roc_auc_score(y[idx_test], curr_prob)

        # add score to list of scores
        results_val[mdl].append(curr_score)

        # save the current model
        mdl_val[mdl].append(curr_mdl)

        print('{} - Finished fold {} of {}. AUROC {:0.3f}.'.format(dt.datetime.now(), k+1, K, curr_score))

# keep references to the dicts above under window-specific names;
# the generic names are re-bound to new dicts in subsequent cells
mdl_val_48 = mdl_val
results_val_48 = results_val
pred_val_48 = pred_val
tar_val_48 = tar_val

## 72 hours

In [None]:
W=72

# Build the time dictionary {icustay_id: window time}.
# Since everything is relative to admission, the window time is fixed at
# W hours for every stay.
time_dict = df_death.set_index('icustay_id')
time_dict['windowtime'] = W
time_dict = time_dict['windowtime'].to_dict()
# NOTE(review): the W / W_extra arguments stay at 24 for every window length
# evaluated in this notebook - confirm that is intended.
df_data = mp.get_design_matrix(df, time_dict, W=24, W_extra=24)

# icustay_ids whose largest recorded `hr` is at least W
iid_min_stay = df.groupby('icustay_id')['hr'].max() >= W
iid_min_stay=iid_min_stay.index[iid_min_stay.values].values

print('Looking at the first {} hours of the ICU stay.'.format(W))
print('Reducing sample size from {} to {} ({:2.2f}%) to ensure patients stayed long enough.'.format(
        df_data.shape[0], iid_min_stay.shape[0], iid_min_stay.shape[0]*100.0 / df_data.shape[0]))
df_data = df_data.loc[iid_min_stay,:]
print('')

# Assemble the modelling table (rows are keyed by icustay_id in the index).

# first, append the static variables from df_static to the right of df_data
X = df_data.merge(df_static.set_index('icustay_id')[vars_static], how='left', left_index=True, right_index=True)
# next, add in the outcome: death in hospital
X = X.merge(df_death.set_index('icustay_id')[['death']], left_index=True, right_index=True)

# map the patient-level K-fold assignment onto the rows of this dataset
X = X.merge(df_death.set_index('icustay_id')[['subject_id']], left_index=True, right_index=True)
# position of each row's subject_id within sid ...
idxMap = np.searchsorted(sid, X['subject_id'].values)
# ... which looks up that patient's fold
idxK = idxK_sid[idxMap]
# subject_id was only needed for the fold lookup
X = X.drop('subject_id', axis=1)

# Feature names, taken from the merged frame so they follow the real column
# order (df_data columns first, then vars_static). The previous
# `vars_static + df_data.columns` ordering labelled the columns incorrectly.
X_header = list(X.columns[:-1])

# convert to numpy data (assumes target, death, is the last column)
X = X.values
y = X[:,-1]
X = X[:,0:-1]

# Rough timing info:
#     rf - 3 seconds per fold
#    xgb - 30 seconds per fold
# logreg - 4 seconds per fold
#  lasso - 8 seconds per fold
models = {'xgb': xgb.XGBClassifier(max_depth=3, n_estimators=300, learning_rate=0.05),
          'lasso': LassoCV(cv=5,fit_intercept=True,normalize=True,max_iter=10000),
          'logreg': LogisticRegression(fit_intercept=True),
          'rf': RandomForestClassifier()
         }

# per-model lists with one entry per fold
mdl_val = dict()      # fitted pipelines
results_val = dict()  # AUROC
pred_val = dict()     # predictions on the held-out fold
tar_val = dict()      # targets of the held-out fold

for mdl in models:
    print('=============== {} ==============='.format(mdl))
    mdl_val[mdl] = list()
    results_val[mdl] = list() # initialize list for scores
    pred_val[mdl] = list()
    tar_val[mdl] = list()

    if mdl == 'xgb':
        # no pre-processing of data necessary for xgb
        estimator = Pipeline([(mdl, models[mdl])])

    else:
        # impute the column mean, then standardize, then fit the model
        estimator = Pipeline([("imputer", Imputer(missing_values='NaN',
                                          strategy="mean",
                                          axis=0)),
                      ("scaler", StandardScaler()),
                      (mdl, models[mdl])]) 

    for k in range(K):
        idx_test = idxK == k
        idx_train = ~idx_test

        # Train on all but the kth fold. Fit a fresh clone each time:
        # Pipeline.fit returns the pipeline itself, so without cloning every
        # entry saved in mdl_val[mdl] would be the same object holding only
        # the last fold's fit.
        curr_mdl = sklearn.base.clone(estimator).fit(X[idx_train, :], y[idx_train])

        # get prediction on the held-out fold
        if mdl == 'lasso':
            # LassoCV is a regressor: use its raw prediction as the score
            curr_prob = curr_mdl.predict(X[idx_test, :])
        else:
            curr_prob = curr_mdl.predict_proba(X[idx_test, :])
            curr_prob = curr_prob[:,1]

        pred_val[mdl].append(curr_prob)
        tar_val[mdl].append(y[idx_test])

        # calculate score (AUROC)
        curr_score = metrics.roc_auc_score(y[idx_test], curr_prob)

        # add score to list of scores
        results_val[mdl].append(curr_score)

        # save the current model
        mdl_val[mdl].append(curr_mdl)

        print('{} - Finished fold {} of {}. AUROC {:0.3f}.'.format(dt.datetime.now(), k+1, K, curr_score))

# keep references to the dicts above under window-specific names;
# the generic names are re-bound to new dicts in subsequent cells
mdl_val_72 = mdl_val
results_val_72 = results_val
pred_val_72 = pred_val
tar_val_72 = tar_val

# Summarize

In [None]:
# print out model performance for the severity of illness scores
# NOTE(review): idxK and y here are whatever the most recently run window
# cell left behind, and df_soi is masked positionally with idxK - confirm the
# rows of df_soi line up with that subset.
for mdl in ['saps','sapsii','apsiii','sofa','lods','oasis']:
    print('=============== {} ==============='.format(mdl))
    mdl_val[mdl] = list()
    results_val[mdl] = list() # initialize list for scores
    pred_val[mdl] = list()
    tar_val[mdl] = list()
    
    for k in range(K):
        # the raw score of the stays in fold k is used as the prediction
        curr_prob = df_soi.loc[idxK == k, mdl].values
        
        pred_val[mdl].append(curr_prob)
        # targets must come from the same fold as the predictions
        # (this previously selected `idxK != k`, i.e. every other fold)
        tar_val[mdl].append(y[idxK == k])