In [1]:
import os, sys
import pandas as pd
import numpy as np
import params as pm
import datetime as dt
import json
import pickle
import tqdm


# Select cohorts

In [20]:
def gather_cohort(adults=True, patient_weight=True, icd_diagnoses=[], min_los=1, max_los=8, verbose=False):
    """Select ICU stays and return the matching admissions.

    Parameters
    ----------
    adults : bool
        When True, keep only patients whose ``anchor_age`` is at least 18.
        When False, no age filter is applied.
    patient_weight : bool
        When True, inner-join ``patientweight`` from ``procedureevents`` on
        ``hadm_id``/``stay_id``. Stays without any procedure event are
        dropped by this join, and a stay with several distinct recorded
        weights yields one row per weight.
    icd_diagnoses : sequence of (icd_code, icd_version) pairs
        When non-empty, keep only admissions carrying at least one of these
        diagnoses; the matching ``icd_code``/``icd_version`` columns are added.
        (The default list is never mutated.)
    min_los, max_los : number
        Inclusive bounds on the ICU length of stay (``los`` column).
    verbose : bool
        Print progress messages.

    Returns
    -------
    (hadm_ids, admissions)
        ``hadm_ids`` is the array of unique ``hadm_id`` values in
        ``admissions``; ``admissions`` is the merged DataFrame.
    """
    cohort = pd.read_csv("data/mimic-iv-1.0/icu/icustays.csv.gz",
                         compression='gzip')
    # Keep stays whose length of stay lies within [min_los, max_los].
    cohort = cohort[(cohort['los'] >= min_los) & (cohort['los'] <= max_los)]
    cohort = cohort[['subject_id', 'stay_id', 'hadm_id', 'intime', 'outtime', 'los']]

    patients = pd.read_csv("data/mimic-iv-1.0/core/patients.csv.gz",
                           compression='gzip')
    if adults:
        # Previously this filter ran unconditionally and `adults` was ignored.
        patients = patients[patients['anchor_age'] >= 18]
    patients = patients[['subject_id', 'gender', 'anchor_age']]

    # Stay + demographic information. A single inner merge is sufficient:
    # re-merging the result with the same patient table adds nothing.
    admissions = pd.merge(cohort, patients, on=['subject_id'])

    if patient_weight:
        if verbose: print("Adding information about patientweight")
        weights = pd.read_csv("data/mimic-iv-1.0/icu/procedureevents.csv.gz",
                              compression='gzip')
        weights = weights[['stay_id', 'hadm_id', 'patientweight']]
        admissions = pd.merge(admissions, weights, on=['hadm_id', 'stay_id']).drop_duplicates()

    if len(icd_diagnoses) > 0:
        if verbose: print("Filtering for ICD diagnoses")
        diagnoses_icd = pd.read_csv("data/mimic-iv-1.0/hosp/diagnoses_icd.csv.gz",
                                    compression='gzip')
        # Set membership over zipped columns avoids a slow row-wise apply().
        wanted = {tuple(pair) for pair in icd_diagnoses}
        keep = [pair in wanted
                for pair in zip(diagnoses_icd['icd_code'], diagnoses_icd['icd_version'])]
        diagnoses = diagnoses_icd.loc[keep, ['hadm_id', 'icd_code', 'icd_version']]
        admissions = pd.merge(admissions, diagnoses, on=['hadm_id']).drop_duplicates()

    hadm_ids = admissions.hadm_id.unique()

    return hadm_ids, admissions

# Gather dataframes with basic vitals + meds + labs for each cohort

In [13]:
def gather_meds(meds_dict, verbose=False):
    """Collect ICU medication input events for the requested item ids.

    meds_dict maps a medication name to an iterable of item ids; all ids are
    pooled. Rows of ``inputevents`` with one of those ids and an ``amount``
    strictly between 0 and 9999 are kept, exact duplicate rows are removed,
    and the result is pickled to the file ``meds`` in the working directory
    before being returned.
    """
    wanted_ids = [item for ids in meds_dict.values() for item in ids]

    if verbose:
        print("Gathering meds")
    events = pd.read_csv(
        "data/mimic-iv-1.0/icu/inputevents.csv.gz",
        compression='gzip',
    )
    selected = events.loc[events['itemid'].isin(wanted_ids)]
    plausible = (selected['amount'] > 0) & (selected['amount'] < 9999)
    selected = selected.loc[plausible]

    if verbose:
        print("Saving dataframes")
    deduped = selected.drop_duplicates()
    deduped.to_pickle('meds')
    return deduped

def gather_vitals(vitals_dict, verbose=False):
    """Collect chart events for the requested vital-sign item ids.

    vitals_dict maps a vital name to an iterable of item ids; all ids are
    pooled. Rows of ``chartevents`` with one of those ids and a ``valuenum``
    in [0, 9999) are kept, exact duplicate rows are removed, and the result
    is pickled to the file ``vitals`` in the working directory before being
    returned.
    """
    wanted_ids = [item for ids in vitals_dict.values() for item in ids]

    if verbose:
        print("Gathering vitals")
    events = pd.read_csv(
        "data/mimic-iv-1.0/icu/chartevents.csv.gz",
        compression='gzip',
    )
    selected = events.loc[events['itemid'].isin(wanted_ids)]
    plausible = (selected['valuenum'] >= 0) & (selected['valuenum'] < 9999)
    selected = selected.loc[plausible]

    if verbose:
        print("Saving dfs")
    deduped = selected.drop_duplicates()
    deduped.to_pickle('vitals')

    return deduped

def gather_outputs(output_itemids, verbose=False):
    """Collect output events (e.g. urine output) for the requested item ids.

    Rows of ``outputevents`` whose ``itemid`` is in ``output_itemids`` and
    whose ``value`` lies in [0, 99999) are kept. The result is pickled to the
    file ``output`` in the working directory and returned.
    """
    if verbose: print("Gathering outputs")
    outputs = pd.read_csv("data/mimic-iv-1.0/icu/outputevents.csv.gz",
                          compression='gzip')
    # The itemid filter used to be assigned to a misspelled name and was
    # silently discarded, so every output item leaked into the result.
    outputs = outputs[outputs['itemid'].isin(output_itemids)]
    outputs = outputs[(outputs['value'] >= 0) & (outputs['value'] < 99999)]

    filtered_output_df = outputs

    if verbose: print("Saving dataframes")
    filtered_output_df.to_pickle('output')
    return filtered_output_df

def gather_labs(labs_dict, verbose=False):
    """Collect ICU-level lab chart events for the requested item ids.

    labs_dict maps a lab name to an iterable of item ids; all ids are pooled.
    Rows of ``chartevents`` with one of those ids and a ``valuenum`` in
    [0, 99999) are kept, exact duplicate rows are removed, and the result is
    pickled to the file ``labs`` in the working directory before being
    returned.
    """
    wanted_ids = [item for ids in labs_dict.values() for item in ids]

    if verbose:
        print("Gathering labs")
    events = pd.read_csv(
        "data/mimic-iv-1.0/icu/chartevents.csv.gz",
        compression='gzip',
    )
    selected = events.loc[events['itemid'].isin(wanted_ids)]
    plausible = (selected['valuenum'] >= 0) & (selected['valuenum'] < 99999)
    deduped = selected.loc[plausible].drop_duplicates()

    if verbose:
        print("Saving dataframes")
    deduped.to_pickle('labs')
    return deduped

# ChartFrames for both cohorts

In [14]:
def buildTimeFrame(df_adm, delta=6):
    """Build the time grid for one admission.

    The first unique ``intime`` is truncated to the hour to give the grid
    start. The first unique ``outtime`` is truncated to the hour and then
    pushed one hour later to give the (exclusive) grid end. Timestamps are
    generated every ``delta`` hours from start while they remain before end.

    Returns a DataFrame with a single ``timestamp`` column, indexed by those
    same timestamps.
    """
    def _truncate_to_hour(ts):
        # Drop the minute/second/microsecond components.
        return ts - dt.timedelta(minutes=ts.minute, seconds=ts.second, microseconds=ts.microsecond)

    first_in = pd.to_datetime(df_adm.intime.unique()).tolist()[0]
    first_out = pd.to_datetime(df_adm.outtime.unique()).tolist()[0]

    grid_start = _truncate_to_hour(first_in)
    grid_end = _truncate_to_hour(first_out) + dt.timedelta(hours=1)

    step = dt.timedelta(hours=delta)
    stamps = []
    cursor = grid_start
    while cursor < grid_end:
        stamps.append(cursor)
        cursor = cursor + step

    return pd.DataFrame(data={'timestamp': stamps}, index=stamps)

# Medications charted as a summed amount (plus an 'hours-<name>' duration
# column) instead of an administration count.
_AMOUNT_TRACKED_MEDS = ['K-IV', 'K-nonIV', 'Mg-IV', 'Mg-nonIV', 'P-IV', 'P-nonIV']
# Labs charted as a 0/1 "measured in this window" indicator instead of a value.
_INDICATOR_LABS = ['Ca', 'Glucose', 'CPK', 'PTH', 'LDH', 'AST', 'ALT']
# Multiplier applied to a medication amount by unit label; any other unit
# (including 'ml') is left unscaled. Presumably this normalises to mg - TODO confirm.
_UNIT_SCALERS = {'grams': 1000, 'mcg': 0.001, 'pg': 1e-9}


def _add_med_to_window(chart, t, k, row):
    """Record one medication event `row` for medication `k` in the window starting at `t`."""
    col = k.lower()
    if k in _AMOUNT_TRACKED_MEDS:
        chart.loc[t, col] += _UNIT_SCALERS.get(row.amountuom, 1) * float(row.amount)
        td = pd.to_datetime(row.endtime) - pd.to_datetime(row.starttime)
        # Whole hours between start and end of the administration.
        chart.loc[t, 'hours-' + col] = td.days * 24 + td.seconds // 3600
    else:
        chart.loc[t, col] += 1


def chartFrames(hadm_id, tables, verbose=False, timedelta=6):
    """Build a time-indexed DataFrame for one admission with resampled variables.

    Parameters
    ----------
    hadm_id :
        ``hadm_id`` of the admission to chart.
    tables : tuple
        ``(admissions, meds, vitals, labs, outputs)`` DataFrames. ``outputs``
        is unpacked but not used here.
    verbose : bool
        Print progress messages.
    timedelta : int
        Length in hours of each row's time window. It is used both as the
        grid step and as the window width.

    Returns
    -------
    pandas.DataFrame
        One row per time window, with admission columns, one column per key
        of ``pm.labs_dict_icu``, ``pm.vitals_dict_icu`` (except
        ``URINE_OUTPUT``) and ``pm.meds_dict_icu`` (lower-cased), plus
        ``hours-<med>`` columns for amount-tracked medications that occur.
        Rows containing any NaN are dropped before returning. Comorbidity
        columns are not produced.
    """
    admissions, meds, vitals, labs, _outputs = tables

    df_adm = admissions[admissions.hadm_id == hadm_id].drop_duplicates()
    # Forward the window length so the grid step always matches the window
    # width used below (previously the grid was always built with 6 hours).
    chart = buildTimeFrame(df_adm, delta=timedelta)
    window = dt.timedelta(hours=timedelta)

    if verbose: print('Admission Data')
    # Static per-admission values, taken from the first matching row.
    for var in ['stay_id', 'anchor_age', 'patientweight', 'los', 'gender']:
        chart[var.lower()] = df_adm[var].head(1).item()
    chart['gender'] = (chart['gender'] == 'F').astype(int)

    # The former comorbidity branch was permanently disabled and referenced
    # names (`comorbs`, `self`) that do not exist in this function; removed.

    if verbose: print('Lab tests')  # Using result date
    df_labs = labs[labs.hadm_id == hadm_id].drop_duplicates()
    for k in sorted(pm.labs_dict_icu.keys()):
        col = k.lower()
        chart[col] = np.nan
        # Depends only on k, so it is computed once rather than per timestamp.
        subset = df_labs[df_labs['itemid'].isin(pm.labs_dict_icu[k])]
        for t in chart.timestamp:
            for i in subset.index:
                charttime = pd.to_datetime(subset.loc[i, 'charttime'])
                if (charttime >= t) & (charttime < t + window):
                    if k not in _INDICATOR_LABS:
                        # NOTE(review): this is a pairwise running mean, so with
                        # three or more readings later ones weigh more than a
                        # plain average would give - confirm this is intended.
                        chart.loc[t, col] = np.nanmean([chart.loc[t, col], subset.loc[i, 'valuenum']])
                    else:
                        chart.loc[t, col] = 1
        # Carry a reading forward for at most 3 windows, then fall back to 0.
        chart[col] = chart[col].ffill(limit=3).fillna(value=0)

    if verbose: print('Vitals')
    df_vits = vitals[vitals.hadm_id == hadm_id]
    for k in sorted(pm.vitals_dict_icu.keys()):
        if k == "URINE_OUTPUT":
            continue
        col = k.lower()
        chart[col] = np.nan
        subset = df_vits[df_vits.itemid.isin(pm.vitals_dict_icu[k])]
        for t in chart.timestamp:
            for i in subset.index:
                # NOTE(review): vitals read the 'value' column while labs read
                # 'valuenum'; a non-numeric 'value' makes nanmean raise and the
                # reading is skipped - confirm 'value' is intended.
                try:
                    charttime = pd.to_datetime(subset.loc[i, 'charttime'])
                    if (charttime >= t) & (charttime < t + window):
                        chart.loc[t, col] = np.nanmean([chart.loc[t, col], subset.loc[i, 'value']])
                except Exception:
                    # Best-effort fallback kept from the original: retry with
                    # element-wise any(); unusable readings are ignored.
                    try:
                        charttime = pd.to_datetime(subset.loc[i, 'charttime'])
                        if (charttime >= t).any() & (charttime < t + window).any():
                            chart.loc[t, col] = np.nanmean([chart.loc[t, col], subset.loc[i, 'value']])
                    except Exception:
                        pass
        chart[col] = chart[col].ffill().fillna(value=0)

    if verbose: print('Medications')
    df_meds = meds[meds.hadm_id == hadm_id].drop_duplicates()
    for k in sorted(pm.meds_dict_icu.keys()):
        chart[k.lower()] = 0
        subset = df_meds[df_meds.itemid.isin(pm.meds_dict_icu[k])]
        # Events whose amount unit is 'dose' are ignored.
        rows = [row for _, row in subset.iterrows() if row.amountuom != 'dose']
        for t in chart.timestamp:
            for row in rows:
                start = pd.to_datetime(row.starttime)
                # Administration started inside this window.
                if (start >= t) and (start < t + window):
                    _add_med_to_window(chart, t, k, row)
                # Administration started before this window and is still
                # running at its start. The comparison is strict so an event
                # starting exactly at `t` is not counted a second time.
                if pd.notna(row.endtime):
                    if (start < t) and (pd.to_datetime(row.endtime) > t):
                        _add_med_to_window(chart, t, k, row)

    chart = chart[~np.isnat(chart.timestamp)]
    # NOTE(review): 'hours-<med>' columns are only filled for windows with a
    # matching event, so this also drops every other window once such a
    # column exists - confirm this is intended.
    chart = chart.dropna()
    if verbose: print('Done!')

    return chart

In [15]:
def gridBatch(patient_ids, tables, name):
    """Build chart frames for every admission id and write them to CSV.

    For each id in ``patient_ids``, ``chartFrames`` is called with ``tables``.
    Every 50th iteration (starting at 0) that succeeds, everything gathered
    so far is written to ``<i>_<name>_checkpoint.csv``. At the end the full
    result is written to ``<name>_allFrames.csv``. An admission that raises
    is reported and skipped. Returns None.
    """
    # Collect frames in a list and concatenate on demand: growing a DataFrame
    # with DataFrame.append is quadratic and the method no longer exists in
    # pandas 2.x.
    charts = []

    for i, vnum in enumerate(tqdm.tqdm(patient_ids)):
        try:
            charts.append(chartFrames(vnum, tables, verbose=False))
            if i % 50 == 0:
                checkpoint = pd.concat(charts, ignore_index=True)
                checkpoint.to_csv(str(i) + '_' + str(name) + '_checkpoint.csv', index=False)
        except Exception as err:
            # Keep the batch going, but say why this admission failed.
            print("Issue in : " + str(vnum) + " (" + repr(err) + ")")
    print('Batch done!')
    # pd.concat rejects an empty list, so fall back to an empty frame.
    batchCharts = pd.concat(charts, ignore_index=True) if charts else pd.DataFrame()
    batchCharts.to_csv(str(name) + '_allFrames.csv', index=False)

# Generate tabular data

In [18]:
# Run this cell if you have saved dataframes.
# pd.read_pickle is the counterpart of DataFrame.to_pickle and closes the file
# itself (pickle.load(open(...)) left the handles open).
# Only load pickle files you created yourself: unpickling can execute arbitrary code.
meds = pd.read_pickle('meds')
vitals = pd.read_pickle('vitals')
outputs = pd.read_pickle('output')
labs = pd.read_pickle('labs')

## Melanoma

In [21]:
# Look up every ICD code whose long title contains 'melanoma' (case-sensitive
# substring match) and gather the ICU cohort carrying one of those diagnoses.
diagnoses_icd = pd.read_csv("data/mimic-iv-1.0/hosp/d_icd_diagnoses.csv")
melanoma_codes = diagnoses_icd[diagnoses_icd['long_title'].str.contains('melanoma')]
# (icd_code, icd_version) pairs, in table order.
melanoma = list(zip(melanoma_codes['icd_code'], melanoma_codes['icd_version']))
hadm_ids, admissions = gather_cohort(icd_diagnoses=melanoma, verbose=True)

Adding information about patientweight
Filtering for ICD diagnoses


In [22]:
# Chart every admission of the melanoma cohort; results are written to
# 'melanoma_allFrames.csv', with periodic '<i>_melanoma_checkpoint.csv' files.
gridBatch(hadm_ids, (admissions, meds, vitals, labs, outputs), 'melanoma')

561it [24:43,  1.81s/it]

Batch done!





## Heart Disease

In [23]:
# Exact long titles of the coronary-artery-disease diagnoses to include.
cad_names = [
    'Atherosclerotic heart disease of native coronary artery without angina pectoris',
    'Atherosclerotic heart disease of native coronary artery with unstable angina pectoris',
    'Atherosclerotic heart disease of native coronary artery with angina pectoris with documented spasm',
    'Atherosclerotic heart disease of native coronary artery with other forms of angina pectoris',
    'Atherosclerotic heart disease of native coronary artery with unspecified angina pectoris',
]
diagnoses_icd = pd.read_csv("data/mimic-iv-1.0/hosp/d_icd_diagnoses.csv")
hd_codes = diagnoses_icd[diagnoses_icd['long_title'].isin(cad_names)]
# (icd_code, icd_version) pairs, in table order.
hd = list(zip(hd_codes['icd_code'], hd_codes['icd_version']))
hadm_ids, admissions = gather_cohort(icd_diagnoses=hd, verbose=True)
admissions

Adding information about patientweight
Filtering for ICD diagnoses


Unnamed: 0,subject_id,stay_id,hadm_id,intime,outtime,los,gender,anchor_age,patientweight,icd_code,icd_version
0,10215159,34755606,24039782,2127-12-14 20:04:00,2127-12-20 21:47:17,6.071725,F,67,73.0,I2510,10
1,10215159,38137388,24039782,2127-12-22 06:57:18,2127-12-25 22:27:46,3.646157,F,67,73.0,I2510,10
2,12974563,32563675,29618057,2138-11-13 23:30:01,2138-11-15 16:25:19,1.705069,F,72,90.7,I2510,10
3,12974563,36274915,24320856,2139-04-15 16:32:41,2139-04-17 00:23:17,1.326806,F,72,850.0,I2510,10
4,12687112,37445058,26132667,2162-05-31 18:08:45,2162-06-04 10:16:13,3.671852,M,57,119.1,I2510,10
...,...,...,...,...,...,...,...,...,...,...,...
7461,14263294,39928603,27354668,2126-03-04 09:57:14,2126-03-05 11:50:07,1.078391,M,61,76.0,I2510,10
7462,10836444,39934059,25551438,2170-12-09 13:50:58,2170-12-10 17:23:26,1.147546,M,47,95.0,I2510,10
7463,16979986,39944977,29542651,2159-10-06 07:34:56,2159-10-07 22:34:31,1.624711,M,79,73.9,I2510,10
7464,19113885,39978206,25409746,2122-06-25 02:13:00,2122-06-27 16:35:53,2.599225,M,84,67.2,I2510,10


In [None]:
# Chart every admission of the heart-disease cohort; results are written to
# 'hd_allFrames.csv', with periodic '<i>_hd_checkpoint.csv' files.
gridBatch(hadm_ids, (admissions, meds, vitals, labs, outputs), 'hd')

10it [00:35,  4.55s/it]