In [None]:
# Import libraries
import numpy as np
import pandas as pd
import sklearn as sk
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties # for unicode fonts
import psycopg2
import sys
import datetime as dt
import mp_utils as mp

from sklearn.pipeline import Pipeline

# used for train/test splits and cross validation
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV

# used to impute mean for data and standardize for computational stability
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler

# logistic regression is our favourite model ever
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV # l2 regularized regression
from sklearn.linear_model import LassoCV

# used to calculate AUROC/accuracy
from sklearn import metrics

# used to create confusion matrix
from sklearn.metrics import confusion_matrix
from sklearn.cross_validation import cross_val_score

# gradient boosting - must download package https://github.com/dmlc/xgboost
import xgboost as xgb

# Default plot palette: eight RGB triples with components already in [0, 1].
col = [
    [0.9047, 0.1918, 0.1988],
    [0.2941, 0.5447, 0.7494],
    [0.3718, 0.7176, 0.3612],
    [1.0000, 0.5482, 0.1000],
    [0.4550, 0.4946, 0.4722],
    [0.6859, 0.4035, 0.2412],
    [0.9718, 0.5553, 0.7741],
    [0.5313, 0.3359, 0.6523],
]

# The "Tableau 20" palette, written here as 8-bit RGB triples.
tableau20 = [
    (31, 119, 180), (174, 199, 232), (255, 127, 14), (255, 187, 120),
    (44, 160, 44), (152, 223, 138), (214, 39, 40), (255, 152, 150),
    (148, 103, 189), (197, 176, 213), (140, 86, 75), (196, 156, 148),
    (227, 119, 194), (247, 182, 210), (127, 127, 127), (199, 199, 199),
    (188, 189, 34), (219, 219, 141), (23, 190, 207), (158, 218, 229),
]

# matplotlib takes colour components in [0, 1], so rescale each triple in place.
for i, (r, g, b) in enumerate(tableau20):
    tableau20[i] = (r / 255., g / 255., b / 255.)

# Marker codes, picked by series index in the plotting cells.
marker = ['v', 'o', 'd', '^', 's', '>', '+']

# Line-style codes.
# NOTE(review): 's' is a marker code rather than a line style - confirm it is intended.
ls = ['-', '-', '-', '-', '-', 's', '--', '--']

# bigger font !
# Raises matplotlib's default font size for every figure drawn after this point.
plt.rcParams.update({'font.size': 22})

# IPython magic: render figures inline in the notebook output.
%matplotlib inline

# Make print() a function, as the print(...) calls in later cells expect.
# NOTE(review): __future__ imports are conventionally the first statement;
# consider moving this to the top of the imports.
from __future__ import print_function

In [None]:

# Report the versions of both modelling libraries, each under its own label
# (sk is scikit-learn, xgb is xgboost - see the imports).
print('The scikit-learn version is {}.'.format(sk.__version__))
print('The xgboost version is {}.'.format(xgb.__version__))

In [None]:
# below config used on pc70
sqluser = 'alistairewj'
dbname = 'mimic'
schema_name = 'mimiciii'

# Cohort query: one row per ICU stay with the hospital mortality flag.
# exclusion criteria:
#   - less than 16 years old
#   - stayed in the ICU less than 4 hours
#   - never have any chartevents data (i.e. likely administrative error)
#   - organ donor / donor account admissions that have a death time
# NOTE(review): rn (per-patient stay number, ordered by intime) is computed but
# not used by the final select - confirm whether only first stays were intended.
query = \
"""
with t1 as
(
select ie.icustay_id
    , adm.HOSPITAL_EXPIRE_FLAG
    , ROW_NUMBER() over (partition by ie.subject_id order by intime) as rn
from icustays ie
inner join admissions adm
    on ie.hadm_id = adm.hadm_id
inner join patients pat
    on ie.subject_id = pat.subject_id
    and ie.intime > (pat.dob + interval '16' year)
where adm.HAS_CHARTEVENTS_DATA = 1
and 
not (
       (lower(diagnosis) like '%organ donor%' and deathtime is not null)
    or (lower(diagnosis) like '%donor account%' and deathtime is not null)
    )
and (ie.outtime - ie.intime) >= interval '4' hour
)
select 
    icustay_id
    , HOSPITAL_EXPIRE_FLAG
from t1
"""

# Connect to local postgres version of mimic.
# try/finally guarantees the cursor and connection are released even when a
# query fails, so a failed run does not leave a session open on the server.
con = psycopg2.connect(dbname=dbname, user=sqluser)
try:
    cur = con.cursor()
    try:
        cur.execute('SET search_path to ' + schema_name)

        # cohort: icustay_id + hospital_expire_flag
        co = pd.read_sql_query(query,con)

        # extract static vars into a separate dataframe
        df_static = pd.read_sql_query('select * from mpap_static_vars',con)
    finally:
        cur.close()
finally:
    con.close()

# Post-processing needs no database access, so it runs after the connection is closed.
co.set_index('icustay_id',inplace=True)

for dtvar in ['intime','outtime','deathtime']:
    df_static[dtvar] = pd.to_datetime(df_static[dtvar])
df_static.set_index('icustay_id',inplace=True)

# names of the static (per-stay) variables
vars_static = [u'male', u'emergency', u'age',
               u'cmed', u'csurg', u'surg', u'nsurg',
               u'surg_other', u'traum', u'nmed',
               u'omed', u'ortho', u'gu', u'gyn', u'ent']

# Plot data from an example patient's time series

In [None]:
# ICU stay (icustay_id) used for the worked example
iid = 200001

# this stay's measurements, indexed by the charttime_elapsed column
patient_csv = './data/' + str(iid) + '.0.csv'
df = pd.read_csv(patient_csv).set_index('charttime_elapsed')

# per-stay offsets from the actual experiment, keyed by icustay_id
df_offset = pd.read_csv('./data_with_treatment/icustays_offset_base.csv').set_index('icustay_id')

# start time recorded for this stay (units are not shown here; the plotting
# cell divides it by 60 - TODO confirm it is in minutes)
starttime = df_offset.loc[iid, 'starttime']

# length of the ICU stay, first as a time difference and then in hours
icu_in = pd.to_datetime(df_offset.loc[iid, 'intime'])
icu_out = pd.to_datetime(df_offset.loc[iid, 'outtime'])
stay_length = icu_out - icu_in
stay_hours = stay_length / np.timedelta64(1,'h')
print('{} discharged at {} (time elapsed {:2.2f} hours).'.format(iid, stay_length, stay_hours))

# later cells use dischtime as a number of hours
dischtime = stay_hours
df.head()

In [None]:
# Two stacked panels sharing the time axis (the axes array is 1-d):
# vital signs on top, laboratory measurements underneath.
f, axarr = plt.subplots(2, sharex=True, figsize=[10,10])

pretty_labels = {'heartrate': 'Heart rate',
                 'meanbp': 'Mean blood pressure',
                 'resprate': 'Respiratory rate',
                 'spo2': 'Peripheral oxygen saturation',
                 'temp': 'Temperature',
                 'bg_ph': 'pH',
                 'bg_bicarbonate': 'Serum bicarbonate',
                 'hemoglobin': 'Hemoglobin',
                 'potassium': 'Potassium',
                 'inr': 'International normalized ratio',
                 'bg_lactate': 'Lactate',
                 'wbc': 'White blood cell count'}
#var_list = df.columns

# columns drawn in the top panel
var_vitals = [u'heartrate', u'meanbp', u'resprate', u'temp', u'spo2']

# columns drawn in the bottom panel
var_labs = ['bg_ph', 'bg_bicarbonate', 'hemoglobin',
            'potassium', 'inr', 'bg_lactate', 'wbc']

# other candidate lab columns:
# u'aniongap', u'albumin', u'bands',
# u'bicarbonate', u'bilirubin', u'creatinine', u'chloride', u'glucose',
# u'hematocrit', u'hemoglobin', u'inr', u'lactate', u'platelet',
# u'potassium', u'ptt', u'pt', u'sodium', u'bun', u'wbc'

t_scale = 60.0*60.0 # divide by this to get from seconds to t_unit
t_unit = 'Hours elapsed since ICU admission'


def plot_patient_panel(ax, data, variables, y_range, window_end, window_length,
                       discharge, ylabel, markersize=None):
    """Draw one panel of the example-patient figure.

    Each column in `variables` that has at least one non-null value is drawn
    as a dashed, marked line against the frame index divided by `t_scale`.
    A dotted vertical line marks `discharge`, and a grey rectangle spanning
    the full height of the axis covers the `window_length` hours that end at
    `window_end`.

    Reads the notebook-level `pretty_labels`, `marker`, `tableau20` and
    `t_scale`.

    Parameters
    ----------
    ax : matplotlib axes to draw on.
    data : DataFrame holding the columns named in `variables`.
    variables : column names to plot; each must be a key of `pretty_labels`.
    y_range : [low, high] limits applied to the y-axis.
    window_end : x position (on the plotted time axis) where the shaded window ends.
    window_length : width of the shaded window, in x-axis units.
    discharge : x position of the ICU discharge marker.
    ylabel : y-axis label.
    markersize : optional marker size; matplotlib's default is used when None.

    Returns
    -------
    The (low, high) y-limits of `ax` after `y_range` has been applied.
    """
    n_drawn = 0  # counts plotted series only, so colours/markers stay consecutive
    for v in variables:
        observed = data[v].dropna()
        if observed.empty:
            continue
        style = dict(label=pretty_labels[v],
                     marker=marker[n_drawn % len(marker)],
                     color=tableau20[n_drawn % len(tableau20)],
                     linewidth=2)
        if markersize is not None:
            style['markersize'] = markersize
        ax.plot(observed.index/t_scale, observed.values, '--', **style)
        n_drawn += 1

    ax.set_ylim(y_range)
    y_lim = ax.get_ylim()

    # add ICU discharge
    ax.plot([discharge, discharge], y_lim, 'k:', linewidth=3)

    # shade the window; the height is the axis span so the patch still fills
    # the panel if the lower limit is ever non-zero
    window = matplotlib.patches.Rectangle((window_end - window_length, y_lim[0]),
                                          window_length, y_lim[1] - y_lim[0],
                                          color='#bdbdbd')
    ax.add_patch(window)

    ax.set_ylabel(ylabel, fontsize=16)
    ax.legend(shadow=True, fancybox=True, loc='upper center',
              bbox_to_anchor=(0.5, 1.1), ncol=3)
    return y_lim


# starttime/60.0 puts the end of the window on the hour axis
# (assumes starttime is in minutes - TODO confirm)
window_end = starttime/60.0

# first plot all the vitals in subfigure 1, with a 4 hour window
plot_patient_panel(axarr[0], df, var_vitals, [0,150], window_end, 4,
                   dischtime, 'Vital signs')

# next plot the labs in subfigure 2, with a 28 hour window (4 + 24)
# y_lim / y_mid are kept at notebook level: a later cell inspects y_mid
y_lim = y_mid = plot_patient_panel(axarr[1], df, var_labs, [0,10], window_end, 28,
                                   dischtime, 'Laboratory measurements',
                                   markersize=8)

axarr[1].set_xlabel(t_unit,fontsize=16)
plt.savefig('example_patient.pdf')
plt.show()

In [None]:
# Display the upper y-axis limit held in y_mid (set by the plotting cell above).
y_mid[1]

# Plot data from an example patient

In [None]:
# define the patient
# (iid is passed to the mp.query_* helpers in the next cell)
iid = 200001

In [None]:
# Fetch this stay's infusion records and code-status entries via mp_utils,
# adding copies of the time columns divided by one hour.
one_hour = np.timedelta64(1,'h')

inputs = mp.query_infusions(iid)
for source, target in [('icustarttime', 'icustarttimehr'),
                       ('icuendtime', 'icuendtimehr')]:
    inputs[target] = inputs[source] / one_hour

codestatus = mp.query_codestatus(iid) # takes 2-3 minutes
# the hour column is only added when at least one row came back
if len(codestatus) > 0:
    codestatus['icutimehr'] = codestatus['icutime'] / one_hour

In [None]:
# Timeline for one ICU stay: cumulative intake volume, one row per
# intravenous medication, and code-status annotations.
# Credit: Randal Olson for styling (http://www.randalolson.com/2014/06/28/)

# Prepare the size of the figure
fig = plt.figure(figsize=(22, 20))

# Keep only the bottom and left frame lines.
ax = plt.subplot(111)
ax.spines["top"].set_visible(False)
ax.spines["bottom"].set_visible(True)
ax.spines["right"].set_visible(False)
ax.spines["left"].set_visible(True)

# Ensure that the axis ticks only show up on the bottom and left of the plot.
ax.get_xaxis().tick_bottom()
ax.get_yaxis().tick_left()

# Plot input events: running total of every amount recorded in mL, shown in dL
is_ml = inputs.amountuom == 'mL'
plt.plot(inputs.icustarttimehr[is_ml],
         inputs.amount[is_ml].cumsum()/100,
         color=tableau20[9], lw=2.5,
         marker='o', markersize=6, label='Intake volume, dL')

#plt.plot(outputs.icutimehr,
#         outputs.value.cumsum()/100,
#         color=tableau20[10], lw=2.5,
#         marker='o', markersize=6, label='Output volume, dL')

# Plot intravenous meds: every label except the two enteral categories
iv_meds = [x for x in inputs.label.unique()
           if x not in ('Gastric Meds', 'PO Intake')]

# right-hand edge of the data, used to place labels and guide lines
last_end = np.max(inputs.icuendtimehr)

for m, med in enumerate(iv_meds):
    y_loc = 165-(m*5) # where on the y-axis the data is plotted

    # rows for this medication, and the subset given at a positive rate
    med_rows = inputs.loc[inputs['label'] == med]
    rate_rows = med_rows.loc[med_rows['rate'] > 0]

    # first usable rate unit; null and empty units are skipped so the label
    # never ends in ", None" or ", nan"
    units = [u for u in med_rows['rateuom'].unique()
             if pd.notnull(u) and u != '']
    unit = ', ' + str(units[0]) if units else ''

    plt.text(last_end+0.5, y_loc, med + unit, fontsize=17)

    # one horizontal bar per infusion, annotated with its rate
    for _, row in rate_rows.iterrows():
        plt.plot([row['icustarttimehr'],row['icuendtimehr']],[y_loc]*2,
                 color=tableau20[16], lw=4,marker='o', markersize=6)
        plt.text(row['icustarttimehr'], y_loc,
                 str(round(row['rate'],1)),
                 fontsize=15)

    # medications without any positive rate are drawn as points at their start times
    if rate_rows.shape[0] == 0:
        plt.plot(med_rows.icustarttimehr,
                 [y_loc+1.5]*len(med_rows),
                 color=tableau20[16], lw=0, marker='o', markersize=6)

        # dashed line to guide the eye
        plt.plot([0,last_end],
                 [y_loc+1.5]*2,
                 color=[0.3,0.3,0.3], lw=2, linestyle=':', markersize=6)

# Plot code status
# (placed at y=220, which is above the y-axis limit of 170 applied below)
if codestatus.shape[0]>0:
    plt.text(-10,220,'Code status',fontsize=17)
    is_code_status = codestatus.label == 'Code Status'
    for when, txt in zip(codestatus.icutimehr[is_code_status].values,
                         codestatus.value[is_code_status].values):
        plt.annotate(txt, (when, 220), fontsize=17)

plt.legend(loc=5,fontsize=18)
plt.xlabel('Time after admission to the intensive care unit, hours', fontsize=22)
plt.ylabel('Measurement, absolute value', fontsize=22)
plt.yticks(np.arange(0, 140, 20))

ax.set_ylim([0,170])
plt.show()

In [None]:
# Overlaid, normalised histograms of one column, split on hospital_expire_flag.
# NOTE(review): df was last assigned from a single patient's CSV in an earlier
# cell; confirm it holds a 'hospital_expire_flag' column when this cell runs.
all_var = df.columns
var = all_var[15]
#var = 'moribund'

# 50 evenly spaced bin edges between the 0.5th and 99.5th percentiles of the
# non-null values
observed = df[var].dropna().values
xi = np.percentile(observed, [0.5, 99.5])
xi_bins = np.linspace(xi[0], xi[1], num=50)
xi_delta = xi_bins[1] - xi_bins[0]

plt.figure(figsize=[7,7])
idx0 = df['hospital_expire_flag'] == 0

# keyword arguments common to both histograms
# NOTE(review): newer matplotlib releases replace `normed` with `density` -
# check against the installed version.
shared_style = dict(range=xi, width=xi_delta, alpha=0.7, lw=0, normed=True)

# the first histogram's bin edges are shifted left by a quarter of a bin width
plt.hist(df.loc[idx0, var].values, bins=xi_bins - xi_delta/4,
         label='Survived', color=col[1], **shared_style)
plt.hist(df.loc[~idx0, var].values, bins=xi_bins,
         label='Expired', color=col[0], **shared_style)

plt.xlabel(var,fontsize=14)
plt.legend(loc='best')
plt.show()