In [305]:
%load_ext autoreload
%autoreload 2
import copy, math, os, pickle, time, pandas as pd, numpy as np, scipy.stats as ss
from tqdm import tqdm
import functools

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import average_precision_score, roc_auc_score, accuracy_score, f1_score

import torch, torch.utils.data as utils, torch.nn as nn, torch.nn.functional as F, torch.optim as optim
from torch.autograd import Variable
from torch.nn.parameter import Parameter

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
# Path of the HDF5 file read by the loading cells below. The default is the
# original hard-coded location; set the MIMIC_EXTRACT_H5 environment variable
# to run the notebook against a copy stored elsewhere without editing this cell.
DATA_FILEPATH = os.environ.get(
    'MIMIC_EXTRACT_H5',
    '/data/adarsh/mimic_iii_extraction/all_hourly_data.h5',
)

In [8]:
# Fully flattened variant: every table is loaded from the HDF5 file and all
# index levels are moved into ordinary columns.
# NOTE(review): the next loading cell rebinds the same four names (vitals,
# statics, icd_codes, treatments), and extract_vitals indexes
# vitals[feature]['mean'], which needs the un-flattened two-level columns --
# so this cell looks superseded; confirm before relying on it.
vitals  = pd.read_hdf(DATA_FILEPATH, 'vitals_labs')
# Collapse the multi-level column labels into single space-joined strings.
vitals.columns = [' '.join(col).strip() for col in vitals.columns.values]
vitals = vitals.reset_index()
statics = pd.read_hdf(DATA_FILEPATH, 'patients').reset_index()
icd_codes = pd.read_hdf(DATA_FILEPATH, 'codes').reset_index()
treatments = pd.read_hdf(DATA_FILEPATH, 'interventions').reset_index()

In [139]:
# Un-flattened variant: column labels are left as stored, and only the
# 'hadm_id' / 'icustay_id' index levels are moved out into columns.
vitals = (
    pd.read_hdf(DATA_FILEPATH, key='vitals_labs')
    .reset_index(level=['hadm_id', 'icustay_id'])
)
statics = (
    pd.read_hdf(DATA_FILEPATH, key='patients')
    .reset_index(level=['hadm_id', 'icustay_id'])
)
icd_codes = (
    pd.read_hdf(DATA_FILEPATH, key='codes')
    .reset_index(level=['hadm_id', 'icustay_id'])
)
treatments = (
    pd.read_hdf(DATA_FILEPATH, key='interventions')
    .reset_index(level=['hadm_id', 'icustay_id'])
)

In [202]:
# Time-varying measurements used as SAPS inputs; each string must match a
# top-level column label of `vitals` exactly (spelling included).
saps_vital_features = [
    'heart rate',
    'systolic blood pressure',
    'temperature',
    'blood urea nitrogen',
    'white blood cell count',
    'potassium',
    'sodium',
    'bicarbonate',
    'bilirubin',
    'glascow coma scale total',
    'partial pressure of oxygen',
    'fraction inspired oxygen',
]
# Per-patient (non time-varying) inputs taken from `statics`.
saps_static_features = ['age', 'admission_type']

# Three-character ICD-9 prefixes treated as AIDS.
aids_icd = ['042', '043', '044']

# Metastatic cancer codes: two half-open integer spans rendered as strings,
# followed by two stand-alone codes.
metastatic_cancer_icd = [
    str(code)
    for start, stop in [(19600, 19920), (20970, 20976)]
    for code in range(start, stop)
] + ['20979', '78951']

# Hematologic malignancy codes: half-open integer spans rendered as strings.
# The spans are kept in their original order and are allowed to overlap, so
# a code can appear more than once in the list.
hematologic_malignancy_icd = [
    str(code)
    for start, stop in [
        (20000, 20239),
        (20240, 20249),
        (20250, 20303),
        (20310, 20313),
        (20302, 20383),
        (20400, 20523),
        (20580, 20703),
        (20720, 20893),
        (23860, 23870),
        (27330, 27340),
    ]
    for code in range(start, stop)
]



In [228]:
# Sorted array of the distinct index values of `statics`; downstream cells use
# it as the cohort key (presumably subject_id -- the extracted frames name
# their index 'subject_id').
subject_ids = np.array(sorted(statics.index.unique().values))

In [338]:
# taken from https://github.com/MIT-LCP/mimic-code/blob/52d7df53348a6e25dfbe795c0e28c389efc40be9/mimic-iii/concepts/severityscores/sapsii.sql
def check_icd_AIDS(code_list):
    """Report whether any ICD-9 code in ``code_list`` falls in the AIDS range.

    A code matches when its first three characters compare (as strings,
    lexicographically) between '042' and '044' inclusive.

    Parameters
    ----------
    code_list : iterable of str
        ICD-9 code strings to test.

    Returns
    -------
    numpy.bool_
        True if at least one code matches; False otherwise (including for an
        empty ``code_list``).
    """
    matches = ['042' <= code[:3] <= '044' for code in code_list]
    return np.any(matches)

In [339]:
# taken from https://github.com/MIT-LCP/mimic-code/blob/52d7df53348a6e25dfbe795c0e28c389efc40be9/mimic-iii/concepts/severityscores/sapsii.sql
def check_icd_metastatic_cancer(code_list):
    """Report whether any ICD-9 code in ``code_list`` denotes metastatic cancer.

    A code matches when any of the following holds (all comparisons are
    lexicographic string comparisons):

    * its first four characters lie between '1960' and '1991' inclusive;
    * the full code lies between '20970' and '20975' inclusive;
    * the full code is exactly '20979' or '78951'.

    Parameters
    ----------
    code_list : iterable of str
        ICD-9 code strings to test.

    Returns
    -------
    numpy.bool_
        True if at least one code matches; False otherwise (including for an
        empty ``code_list``).
    """
    def is_metastatic(code):
        if '1960' <= code[:4] <= '1991':
            return True
        if '20970' <= code <= '20975':
            return True
        return code in ('20979', '78951')

    return np.any([is_metastatic(code) for code in code_list])

In [344]:
# taken from https://github.com/MIT-LCP/mimic-code/blob/52d7df53348a6e25dfbe795c0e28c389efc40be9/mimic-iii/concepts/severityscores/sapsii.sql
def check_icd_hem_malignancy(code_list):
    """Report whether any ICD-9 code in ``code_list`` is a hematologic malignancy.

    A code matches when its first four characters are '2386' or '2733', or
    when the full code lies (lexicographic string comparison, both ends
    inclusive) inside one of the ranges listed in ``inclusive_ranges`` below.

    Parameters
    ----------
    code_list : iterable of str
        ICD-9 code strings to test.

    Returns
    -------
    numpy.bool_
        True if at least one code matches; False otherwise (including for an
        empty ``code_list``).
    """
    four_char_prefixes = ('2386', '2733')
    inclusive_ranges = (
        ('20000', '20238'),
        ('20240', '20248'),
        ('20250', '20302'),
        ('20310', '20312'),
        ('20302', '20382'),
        ('20400', '20522'),
        ('20580', '20702'),
        ('20720', '20892'),
    )

    def is_hem_malignancy(code):
        if code[:4] in four_char_prefixes:
            return True
        return any(low <= code <= high for low, high in inclusive_ranges)

    return np.any([is_hem_malignancy(code) for code in code_list])

In [357]:
# This is slow

def extract_chronic_conditions(subject_ids):
    """Build 0/1 chronic-condition flags per subject from first-day ICD-9 codes.

    For each id, the rows of the global ``icd_codes`` frame selected by
    ``icd_codes.loc[sid]`` with ``hours_in < 24`` are gathered, the contents
    of their ``icd9_codes`` cells are merged into one set of codes, and the
    three ``check_icd_*`` predicates are evaluated on that set.

    Parameters
    ----------
    subject_ids : sequence
        Keys usable with ``icd_codes.loc[...]``; also becomes the index of the
        returned frame.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``subject_ids`` (index name ``'subject_id'``) with integer
        columns ``'AIDS'``, ``'metastatic cancer'`` and
        ``'hematologic malignancy'``.

    Notes
    -----
    Each ``icd9_codes`` cell is assumed to be an iterable of code strings
    (TODO confirm against the HDF5 schema). A subject with no rows under 24
    hours gets all-zero flags instead of raising.
    """
    aids_list = []
    cancer_list = []
    malignancy_list = []

    for sid in tqdm(subject_ids):
        by_hour_codes = icd_codes.loc[sid].query('hours_in < 24')['icd9_codes'].values
        # set().union(...) copes with zero rows; functools.reduce without an
        # initial value raised TypeError on an empty sequence.
        pt_collected_codes = list(set().union(*by_hour_codes))

        aids_list.append(int(check_icd_AIDS(pt_collected_codes)))
        cancer_list.append(int(check_icd_metastatic_cancer(pt_collected_codes)))
        malignancy_list.append(int(check_icd_hem_malignancy(pt_collected_codes)))

    # Stack the three flag vectors as rows, then transpose to one row per subject.
    icd_data = np.vstack([aids_list, cancer_list, malignancy_list]).T
    icd_df = pd.DataFrame(
        data=icd_data,
        index=subject_ids,
        columns=['AIDS', 'metastatic cancer', 'hematologic malignancy'],
    )
    icd_df.index.name = 'subject_id'
    return icd_df

In [358]:
# Compute the per-subject chronic-condition flags (AIDS, metastatic cancer,
# hematologic malignancy). The recorded run below took roughly five minutes
# for 34,386 subjects.
extracted_chronic_df = extract_chronic_conditions(subject_ids)

100%|██████████| 34386/34386 [05:09<00:00, 111.08it/s]

(34386, 3)





In [409]:
def extract_vitals(subject_ids):
    """Per-subject min and max of each SAPS vital over the first 24 hours.

    Reads the global ``vitals`` frame, keeps rows for ``subject_ids`` with
    ``hours_in < 24``, and for every name in ``saps_vital_features`` takes the
    ``'mean'`` sub-column and reduces it per ``subject_id`` to its minimum and
    maximum.

    Parameters
    ----------
    subject_ids : sequence
        Keys usable with ``vitals.loc[...]``; also becomes the index of the
        returned frame, in the order given.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``subject_ids`` (index name ``'subject_id'``) with columns
        ``'<feature> min'`` and ``'<feature> max'`` for each feature, in the
        order of ``saps_vital_features``. A subject with no qualifying rows
        gets NaN in every column.
    """
    # The cohort / time-window slice does not depend on the feature, so it is
    # computed once rather than once per feature.
    first_day = vitals.loc[subject_ids].query('hours_in < 24')

    column_names = []
    columns = []
    for vital_feature in tqdm(saps_vital_features):
        grouped = first_day[vital_feature]['mean'].groupby('subject_id')
        columns += [grouped.min(), grouped.max()]
        column_names += [vital_feature + " min", vital_feature + " max"]

    # Keep the groupby index and align on it: positional re-labelling with
    # subject_ids silently mis-assigns rows when the ids are not sorted, and
    # fails when a subject has no first-day rows.
    vitals_df = pd.concat(columns, axis=1, keys=column_names).reindex(subject_ids)
    vitals_df.index.name = 'subject_id'

    return vitals_df

In [410]:
# Compute first-24-hour min/max of every SAPS vital for the whole cohort; the
# recorded run below took about 45 seconds for the 12 features.
extracted_vitals_df = extract_vitals(subject_ids)

100%|██████████| 12/12 [00:45<00:00,  3.77s/it]


In [277]:
# One-hot encode the static SAPS inputs for the cohort (get_dummies leaves
# numeric columns as they are and expands object/categorical ones).
extracted_statics_df = pd.get_dummies(
    statics.loc[subject_ids, saps_static_features]
)

In [372]:
# Prediction target: in-hospital mortality, kept as a one-column DataFrame
# so that the feature tables can be joined onto it.
outcome_df = statics.loc[subject_ids, ['mort_hosp']]

In [487]:
# Deliberately crude imputation: every missing vital extreme is replaced by a
# single out-of-range sentinel value.
missing_value = -9999
imputed_vitals_df = extracted_vitals_df.fillna(value=missing_value)

In [488]:
# Assemble the modelling table: the outcome column first, then the imputed
# vitals, the chronic-condition flags and the one-hot statics, all joined on
# the shared index.
ml_ready_df = outcome_df.join([imputed_vitals_df, extracted_chronic_df, extracted_statics_df])

In [405]:
# Static columns of `statics` examined as candidate shortcut variables.
candidate_static_shortcuts = [
    'ethnicity',
    'insurance',
]

In [441]:
# Columns of `treatments` examined as candidate shortcut variables.
candidate_treatment_shortcuts = [
    'vent',
    'vaso',
    'adenosine',
    'dobutamine',
    'dopamine',
    'epinephrine',
    'isuprel',
    'milrinone',
    'norepinephrine',
    'phenylephrine',
    'vasopressin',
    'colloid_bolus',
    'crystalloid_bolus',
    'nivdurations',
]

In [442]:
# One-hot encode the candidate static shortcut columns for the cohort.
static_shortcut_df = pd.get_dummies(
    statics.loc[subject_ids, candidate_static_shortcuts]
)

In [446]:
# Per-subject totals of each candidate treatment column over the rows with
# hours_in < 24.
treatment_shortcut_df = (
    treatments
    .loc[subject_ids]
    .query('hours_in < 24')
    [candidate_treatment_shortcuts]
    .groupby('subject_id')
    .sum()
)

In [447]:
# Binarise the first-day totals: 1 if the column's total is positive for that
# subject, else 0. A whole-frame comparison gives the same integers as the
# former row-by-row apply without calling a Python lambda once per subject.
# Re-running this cell is harmless because the values are already 0/1.
treatment_shortcut_df = (treatment_shortcut_df > 0) * 1

In [461]:
# Missingness indicators: 1 where a feature has no first-day value for a
# subject, else 0. Only the "<feature> min" columns are inspected, one per
# feature. Columns are selected by the exact ' min' suffix; a plain substring
# test would also pick up the max column of any feature whose own name
# contains "min".
missing_df = (
    extracted_vitals_df
    .loc[subject_ids, [col for col in extracted_vitals_df.columns if col.endswith(' min')]]
    .isnull()
) * 1
# Rename "<feature> min" -> "<feature> missing".
missing_df.columns = [" ".join(x.split()[:-1] + ['missing']) for x in missing_df.columns]

In [484]:
# Combine all candidate shortcut variables into one table: one-hot statics,
# binary treatment flags and per-feature missingness indicators, joined on
# the shared index.
candidate_shortcuts_df = static_shortcut_df.join([treatment_shortcut_df, missing_df])

In [489]:
# Write the candidate shortcut table to CSV in the current working directory
# (the index is written too, as to_csv does by default).
candidate_shortcuts_df.to_csv('candidate_shortcuts.csv')

In [490]:
# Write the outcome + feature table to CSV in the current working directory
# (the index is written too, as to_csv does by default).
ml_ready_df.to_csv('model_dataset.csv')