In [1]:


from __future__ import print_function 

# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import psycopg2
import sys
import datetime as dt
import mp_utils as mp

# Data-source switches read by the loading cells: a truthy USE_SQL pulls the
# tables from Postgres, a truthy USE_CSV reads them from CSV files on disk.
USE_SQL, USE_CSV = 0, 1

In [None]:
# Postgres connection settings (the values below are the ones used on pc70).
dbname, schema_name = 'mimic', 'mimiciii'
sqluser = 'alistairewj'

# Statement prepended to every query: sets the search path to `public`
# followed by the schema named above.
query_schema = 'SET search_path to public,{};'.format(schema_name)


if USE_SQL:
    # Open a connection to the local Postgres copy of MIMIC.
    con = psycopg2.connect(dbname=dbname, user=sqluser)

    def _run_query(sql):
        """Run `sql` on `con` with the search-path statement prepended; return a DataFrame."""
        return pd.read_sql_query(query_schema + sql, con)

    # Cohort identifiers for every stay that was not excluded.
    # exclusion criteria:
    #   - less than 16 years old
    #   - stayed in the ICU less than 4 hours
    #   - never have any chartevents data (i.e. likely administrative error)
    query = query_schema + """
    select 
        subject_id, hadm_id, icustay_id
    from mp_cohort
    where excluded = 0
    """
    co = pd.read_sql_query(query, con)

    # Static variables live in their own dataframe.
    # (Datetime conversion of intime/outtime/deathtime is intentionally not
    # performed here; the original code had it commented out.)
    df_static = _run_query('select * from mp_static_data')

    # Names of the static covariates, grouped by theme.
    vars_static = [
        u'is_male',
        u'emergency_admission',
        u'age',
        # services
        u'service_any_noncard_surg',
        u'service_any_card_surg',
        u'service_cmed',
        u'service_traum',
        u'service_nmed',
        # ethnicities
        u'race_black',
        u'race_hispanic',
        u'race_asian',
        u'race_other',
        # body size
        u'height',
        u'weight',
        u'bmi',
    ]

    # Time-varying data for every patient: roughly 5 million rows, which take
    # a couple of minutes to load into memory. Author's recorded %%time:
    #   CPU times: user 42.8 s, sys: 1min 3s, total: 1min 46s
    #   Wall time: 2min 7s
    df = _run_query('select * from mp_data')

    # Keep icustay_id as the only identifier and order rows by stay, then hour.
    df = df.drop(['subject_id', 'hadm_id'], axis=1)
    df = df.sort_values(['icustay_id', 'hr'])
    print(df.shape)

    # Death information: discharge/death offsets in hours from ICU admission
    # (rounded up) and a 0/1 flag for whether a death time is recorded.
    df_death = _run_query("""
    select 
    co.subject_id, co.hadm_id, co.icustay_id
    , ceil(extract(epoch from (co.outtime - co.intime))/60.0/60.0) as dischtime_hours
    , ceil(extract(epoch from (adm.deathtime - co.intime))/60.0/60.0) as deathtime_hours
    , case when adm.deathtime is null then 0 else 1 end as death
    from mp_cohort co
    inner join admissions adm
    on co.hadm_id = adm.hadm_id
    where co.excluded = 0
    """)

    # Severity-of-illness scores, left-joined so stays lacking a score are kept.
    df_soi = _run_query("""
    select 
    co.icustay_id
    , case when adm.deathtime is null then 0 else 1 end as death
    , sa.saps
    , sa2.sapsii
    , aps.apsiii
    , so.sofa
    , lo.lods
    , oa.oasis
    from mp_cohort co
    inner join admissions adm
    on co.hadm_id = adm.hadm_id
    left join saps sa
    on co.icustay_id = sa.icustay_id
    left join sapsii sa2
    on co.icustay_id = sa2.icustay_id
    left join apsiii aps
    on co.icustay_id = aps.icustay_id
    left join sofa so
    on co.icustay_id = so.icustay_id
    left join lods lo
    on co.icustay_id = lo.icustay_id
    left join oasis oa
    on co.icustay_id = oa.icustay_id
    where co.excluded = 0
    """)

    # Censoring information: earliest charted code-status row per stay where
    # any of the cmo/dnr/dni/dncpr/cmo_notes flags is set, as a timestamp and
    # as hours from ICU admission (rounded up).
    df_censor = _run_query("""
    select co.icustay_id, min(cs.charttime) as censortime
    , ceil(extract(epoch from min(cs.charttime-co.intime) )/60.0/60.0) as censortime_hours
    from mp_cohort co 
    inner join mp_code_status cs
    on co.icustay_id = cs.icustay_id
    where cmo+dnr+dni+dncpr+cmo_notes>0
    and co.excluded = 0
    group by co.icustay_id
    """)

In [2]:
if USE_CSV:
    # Cohort table, read from its CSV export.
    co = pd.read_csv('df_cohort.csv')

    # Cast every inclusion_* flag column to boolean.
    for name in co.columns:
        if not name.startswith('inclusion_'):
            continue
        co[name] = co[name].astype(bool)

    # Remaining tables, one CSV each.
    df = pd.read_csv('df_data.csv')
    df_static = pd.read_csv('df_static_data.csv')
    df_censor = pd.read_csv('df_censor.csv')
    df_soi = pd.read_csv('df_soi.csv')

    # NOTE(review): df_death is not loaded on this path, yet the design-matrix
    # cell at the end of the notebook reads it. With USE_SQL = 0 that cell will
    # raise NameError unless df_death is defined elsewhere -- confirm which
    # file holds it and load it here.

In [3]:
# Preview the first rows of the static-variable table.
df_static.head()

Unnamed: 0,subject_id,hadm_id,icustay_id,is_male,curr_service,service_med,service_cmed,service_omed,service_nmed,service_nsurg,...,service_any_card_surg,age,race_black,race_hispanic,race_asian,race_other,emergency_admission,height,weight,bmi
0,3,145834,211552,1,VSURG,0,0,0,0,0,...,1,76.5246,0,0,0,0,1,179.07,96.8,30.187677
1,4,185777,294638,0,MED,1,0,0,0,0,...,0,47.845,0,0,0,0,1,,53.6,
2,6,107064,228232,0,SURG,0,0,0,0,0,...,0,65.9398,0,0,0,0,0,,,
3,9,150750,220597,1,NMED,0,0,0,1,0,...,0,41.7887,0,0,0,1,1,182.88,104.0,31.095741
4,11,194540,229441,0,NSURG,0,0,0,0,1,...,0,50.1476,0,0,0,0,1,,,


In [4]:
# List the column names available in the cohort table.
co.columns

Index([u'subject_id', u'hadm_id', u'icustay_id', u'intime', u'outtime', u'age',
       u'gender', u'ethnicity', u'dischtime_hours', u'deathtime_hours',
       u'censortime_hours', u'icu_los', u'hosp_los',
       u'death_48hr_post_icu_admit', u'death_icu', u'hospital_expire_flag',
       u'death_in_hospital', u'death_30dy_post_icu_admit',
       u'death_30dy_post_icu_disch', u'death_30dy_post_hos_disch',
       u'death_6mo_post_hos_disch', u'death_1yr_post_hos_disch',
       u'death_2yr_post_hos_disch', u'death_30dy_post_hos_admit',
       u'exclusion_over_15', u'exclusion_valid_data', u'exclusion_stay_lt_4hr',
       u'exclusion_organ_donor', u'excluded', u'inclusion_over_16',
       u'inclusion_over_18', u'inclusion_stay_ge_12hr',
       u'inclusion_stay_ge_17hr', u'inclusion_stay_ge_24hr',
       u'inclusion_stay_ge_48hr', u'inclusion_stay_le_500hr',
       u'inclusion_first_admission', u'inclusion_only_mimicii',
       u'inclusion_only_micu', u'inclusion_non_alc_icd9',
       u'incl

In [9]:
# Report how many stays each exclusion criterion flags, then the union of all
# criteria (removed) and its complement (final cohort).
row_fmt = '{:5g} - {:2.2f}% - {}'
idxRem = np.zeros(co.shape[0], dtype=bool)

for c in co.columns:
    if 'exclusion_' not in c:
        continue
    flag = co[c]
    print(row_fmt.format(flag.sum(), flag.mean()*100.0, c))
    # a stay is removed as soon as any one criterion equals 1
    idxRem |= (flag.values == 1)

for mask, label in ((idxRem, 'total removed'), (~idxRem, 'final cohort')):
    print(row_fmt.format(np.sum(mask), np.mean(mask)*100.0, label))


 8101 - 13.17% - exclusion_over_15
 1347 - 2.19% - exclusion_valid_data
 3641 - 5.92% - exclusion_stay_lt_4hr
    4 - 0.01% - exclusion_organ_donor
 9447 - 15.35% - total removed
52085 - 84.65% - final cohort


In [10]:
# Sanity check: (rows, columns) of the static table, then the number of
# distinct icustay_id values present in df.
# NOTE(review): the recorded output shows 52058 static rows, while the
# exclusion summary above reports a final cohort of 52085 -- confirm whether
# that difference is expected.
print(df_static.shape)
print(df['icustay_id'].nunique())

(52058, 31)

In [6]:
# Patient-level cross-validation folds.
np.random.seed(111)  # fixed seed so the fold assignment is repeatable
K = 5  # number of folds

# Distinct subject_ids in ascending order (np.unique returns sorted output).
sid = np.unique(df_static['subject_id'].values)

# Shuffle 0..n-1 and reduce modulo K: each subject lands in a fold 0..K-1.
idxK_sid = np.random.permutation(len(sid)) % K

In [None]:
# Variable groupings supplied by mp_utils; only var_static is used in this cell.
var_min, var_max, var_first, var_last, var_sum, var_first_early, var_last_early, var_static = mp.vars_of_interest()

# NOTE(review): df_death is only created inside the `if USE_SQL:` cell. When
# the notebook is run from CSV (USE_SQL = 0) it is undefined here -- confirm
# how it should be loaded on that path.

# create window time for each patient
# (merge returns a new frame, so df_death itself is left untouched)
df_tmp = df_death.merge(df_censor, how='left', on='icustay_id')
time_dict = mp.generate_times(df_tmp, T=2, seed=111, censor=True)

# generate windows
df_data = mp.get_design_matrix(df, time_dict, W=8, W_extra=24)

# remove icustay_ids if they were censored (made DNR) before icu admission, or close enough to that
# errors='ignore' keeps this from raising KeyError when a censored stay has no
# row in df_data; the message reports the rows actually dropped.
idx = df_censor.loc[df_censor['censortime_hours']<=0, 'icustay_id']
n_before = df_data.shape[0]
df_data = df_data.drop(idx, axis=0, errors='ignore')
print('Removed {} icustay_id as they were censored on/before ICU admission.'.format(n_before - df_data.shape[0]))

# first, the data from static vars from df_static
X = df_data.merge(df_static.set_index('icustay_id')[var_static], how='left', left_index=True, right_index=True)

# next, add in the outcome (death) and the subject_id used for fold assignment;
# both come from df_death, so a single inner merge on icustay_id is enough
X = X.merge(df_death.set_index('icustay_id')[['death', 'subject_id']], left_index=True, right_index=True)

# get indices which map subject_ids in sid to the X dataframe
subject_ids = X['subject_id'].values
idxMap = np.searchsorted(sid, subject_ids)

# searchsorted returns insertion points, not matches: a subject_id that is
# absent from sid silently takes a neighbour's slot, or an index one past the
# end (IndexError below). Clip to the valid range and report any mismatch.
idxMap = np.minimum(idxMap, sid.shape[0] - 1)
n_unmatched = int(np.sum(sid[idxMap] != subject_ids))
if n_unmatched > 0:
    print('Warning: {} rows have a subject_id that is not in sid; '
          'they were assigned the fold of the nearest subject_id.'.format(n_unmatched))

# use these indices to map the k-fold integers
idxK = idxK_sid[idxMap]

# add idxK to design matrix
X['idxK'] = idxK

# write to file
X.to_csv('X_design_matrix.csv')