## Create Creatinine dataframe and I/O dataframe from STM db
* This code first queries the STM db to create a pandas dataframe of encounters with creatinine measurement.
    * If there is a creatinine dataframe that was already queried previously, then the code loads the dataframe
* The creatinine dataframe is then filtered by age, length of stay, and minimum stable time from admission. Only the longest encounter is included if a patient has more than one encounter.
* Based on the creatinine filtered dataframe, AKI stage is assigned to each creatinine measurement, and AKI onset is defined.
* For encounters without AKI onset, reference time is assigned randomly after observation window + prediction window.
* I/O dataframe is created by querying each feature according to the reference time determined from creatinine dataframe, predefined prediction window and observation window.
    * If there is a feature dataframe that was already queried previously, but not filtered, then the code loads and queries that dataframe.


### Updates:
* 07/30/2016: Used to create scr_stm2. New criteria to calculate scr_rate is used. Used to create io_stm3. io_stm3 is created based on scr_stm2 and does not include composite variables, such as si, osi, oi.

In [1]:
import pandas as pd
import numpy as np
import stm_utilities as stm
import pickle
import os
import time
from scipy import signal, stats, io
import itertools

# Query object from stm_utilities; the cells below call its methods
# (getEncounterData, getScrDF, getIOMatrix).
stmdb = stm.queryDB()
# Note: this takes the dirname of the literal string "__file__", which is "",
# so every path joined onto fileDir is relative to the current working directory.
fileDir = os.path.dirname("__file__")

In [2]:
# Load the pickled feature statistics and feature dictionary.
# `with` guarantees the file handles are closed even if unpickling fails.
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load pickles that were produced by this project.
with open(os.path.join(fileDir, "pickle_files_stm", "feature_stats.pkl"), 'rb') as f1:
    feature_stats = pickle.load(f1)

with open(os.path.join(fileDir, "pickle_files_stm", "feature_dict_stm.pkl"), 'rb') as f2:
    feature_dict = pickle.load(f2)

# Features to extract for the I/O dataframes.
feature_wanted = ['albumin', 'creatinine', 'glucose', 'hemoglobin', 'hr',
                  'lactic_acid', 'ndbp', 'nsbp', 'ph', 'platelet',
                  'potassium', 'ratio_pao2_flo2', 'spo2', 'temperature',
                  'urine', 'wbc']
# Subset of feature_dict restricted to the wanted features
# (raises KeyError if a wanted feature is missing from feature_dict).
feature_ids = {feature: feature_dict[feature] for feature in feature_wanted}

In [3]:
# Configuration for the (timelag, timewin) grid used by the generation loop.
stable_time = 12
timelag_all = list(-np.arange(25))   # 0, -1, ..., -24
timewin_all = [12, 6]

# Every (timelag, timewin) pairing, lag-major: (lag, 12) comes before (lag, 6).
all_pairs = list(itertools.product(timelag_all, timewin_all))

# Keep only the pairs whose |timelag| is at least the window length.
mask = [abs(lag) >= abs(win) for lag, win in all_pairs]
combination = [pair for pair, keep in zip(all_pairs, mask) if keep]

In [4]:
# Show the (timelag, timewin) pairs that passed the |timelag| >= |timewin| filter.
combination

[(-6, 6),
 (-7, 6),
 (-8, 6),
 (-9, 6),
 (-10, 6),
 (-11, 6),
 (-12, 12),
 (-12, 6),
 (-13, 12),
 (-13, 6),
 (-14, 12),
 (-14, 6),
 (-15, 12),
 (-15, 6),
 (-16, 12),
 (-16, 6),
 (-17, 12),
 (-17, 6),
 (-18, 12),
 (-18, 6),
 (-19, 12),
 (-19, 6),
 (-20, 12),
 (-20, 6),
 (-21, 12),
 (-21, 6),
 (-22, 12),
 (-22, 6),
 (-23, 12),
 (-23, 6),
 (-24, 12),
 (-24, 6)]

In [5]:
# Load the cached creatinine measurements. While the 'age' column is still
# missing, merge in the encounter census data, normalise units/columns and
# write the enriched dataframe back to the same pickle.
scr_pickle_path = os.path.join(fileDir, 'item_df_stm', 'stm_creatinine_df.pkl')
scr_huge = pd.read_pickle(scr_pickle_path)

if 'age' not in scr_huge.columns:
    encounter_ids = scr_huge.encounter_id.unique()

    # Keep census rows that have a gender, an age at admission and an admission
    # timestamp, and that belong to an encounter present in scr_huge.
    encounter_census = stmdb.getEncounterData()
    has_gender = encounter_census.gender != 'NaN'
    has_age = pd.notnull(encounter_census.age_at_admit)
    has_admit_time = pd.notnull(encounter_census.adm_tstamp)
    mask_finite = has_gender & has_age & has_admit_time
    encounter_census = encounter_census.loc[mask_finite, :]
    in_scr = np.in1d(encounter_census.encounter_id, encounter_ids)
    encounter_census = stm.filterEncPerPat(encounter_census.loc[in_scr])

    scr_huge = scr_huge.merge(encounter_census, how='inner', on='encounter_id')

    # NOTE(review): only the first unit value is inspected and the factor is
    # applied to every row -- presumably a umol/L -> mg/dl conversion; confirm
    # that the units in this dataframe are homogeneous.
    first_unit = scr_huge.valueUOM.unique()[0]
    if first_unit != 'mg/dl':
        scr_huge.loc[:, 'value'] = scr_huge.loc[:, 'value'] * 0.01131
        scr_huge.loc[:, 'valueUOM'] = 'mg/dl'

    # Scale age_at_admit by 365 (original note: "change age in days").
    scr_huge.loc[:, 'age_at_admit'] = scr_huge.loc[:, 'age_at_admit'] * 365

    # Recode the gender strings before the column is renamed to 'sex'.
    for code, label in (('1.0', 'M'), ('2.0', 'F')):
        scr_huge.loc[scr_huge.gender == code, 'gender'] = label

    column_names = {'tstamp': 'charttime', 'age_at_admit': 'age',
                    'adm_tstamp': 'intime', 'discharge_tstamp': 'outtime',
                    'gender': 'sex'}
    scr_huge = scr_huge.rename(columns=column_names)

    # Persist the enriched dataframe so later runs skip this branch.
    scr_huge.to_pickle(scr_pickle_path)

In [6]:
# List the columns currently present on the creatinine dataframe.
scr_huge.columns

Index([u'id', u'encounter_id', u'attr_concept_code', u'attr_concept_label',
       u'attr_short_label', u'attr_long_label', u'intv_concept_code',
       u'intv_concept_label', u'intv_short_label', u'intv_long_label',
       u'value', u'valueUOM', u'charttime', u'patient_id', u'age', u'sex',
       u'intime', u'outtime', u'ICU_LOS_min', u'is_24hr_readmit',
       u'is_discharged', u'is_deceased', u'is_transferred', u'dtime_hr',
       u'bs_scr', u'scr_rate', u'AKI_stage', u'reftime'],
      dtype='object')

In [7]:
def _load_or_build(path, build):
    """Return the DataFrame cached at `path`, building and pickling it if needed.

    Parameters
    ----------
    path : str
        Location of the pickle cache.
    build : callable
        Zero-argument function producing the DataFrame when the cache is
        missing or cannot be read.

    Returns
    -------
    The cached DataFrame, or the freshly built one (which is also written
    to `path`).
    """
    if os.path.isfile(path):
        try:
            return pd.read_pickle(path)
        except Exception as err:
            # An unreadable cache is still rebuilt, but the reason is reported
            # and KeyboardInterrupt/SystemExit are no longer swallowed the way
            # a bare `except:` would.
            print("could not read {} ({}); rebuilding".format(path, err))
    frame = build()
    frame.to_pickle(path)
    return frame


# Make sure the output directories exist.
for out_dir in ('scr_stm', 'io_stm'):
    out_path = os.path.join(fileDir, out_dir)
    if not os.path.exists(out_path):
        os.makedirs(out_path)

for timelag, timewin in combination:
    print("time lag:{}, time window: {}".format(timelag, timewin))

    # Value passed as aki_adm_hr to getScrDF: the larger of the stable time
    # and the absolute time lag.
    aki_adm_hr = np.max([stable_time, abs(timelag)])

    # Creatinine caches are keyed by time lag and stable time.
    scr_stem = "stm_onset_scr_tlag{:03d}_stime{:03d}".format(int(abs(timelag)), int(stable_time))
    fname_stm_scr_tot = os.path.join(fileDir, "scr_stm", scr_stem + "_tot.pkl")
    fname_stm_scr_aki = os.path.join(fileDir, "scr_stm", scr_stem + "_aki.pkl")
    fname_stm_scr_con = os.path.join(fileDir, "scr_stm", scr_stem + "_con.pkl")

    # All encounters that pass the filters.
    stm_scr_tot = _load_or_build(
        fname_stm_scr_tot,
        lambda: stmdb.getScrDF(scr_huge, ex_age=True, ex_los=True,
                               ex_aki_adm=True, aki_adm_hr=aki_adm_hr,
                               enc_per_pat=True))

    # Same query on the filtered dataframe with ex_noaki=True (AKI group).
    stm_scr_aki = _load_or_build(
        fname_stm_scr_aki,
        lambda: stmdb.getScrDF(pre_scr_df=stm_scr_tot, ex_age=True, ex_los=True,
                               ex_aki_adm=True, aki_adm_hr=aki_adm_hr,
                               enc_per_pat=True, ex_noaki=True))

    # Control group: encounters in the total set that are not in the AKI set.
    stm_scr_con = _load_or_build(
        fname_stm_scr_con,
        lambda: stm_scr_tot.loc[~np.in1d(stm_scr_tot.encounter_id,
                                         stm_scr_aki.encounter_id.unique()), :])

    # I/O caches are keyed by time lag and time window.
    io_stem = "stm_onset_io_tlag{:03d}_twin{:03d}".format(int(abs(timelag)), int(timewin))
    fname_stm_io_aki = os.path.join(fileDir, "io_stm", io_stem + "_aki.pkl")
    fname_stm_io_con = os.path.join(fileDir, "io_stm", io_stem + "_con.pkl")

    # The I/O matrices are only written, not reused here, so an existing file
    # is skipped without being loaded.
    if os.path.isfile(fname_stm_io_aki):
        print("AKI group dataframe already exists..")
    else:
        print("creating AKI group io dataframe")
        stm_onset_io_aki = stmdb.getIOMatrix(stm_scr_aki, feature_ids, feature_stats,
                                             timelag, timewin=timewin)
        stm_onset_io_aki.to_pickle(fname_stm_io_aki)

    if os.path.isfile(fname_stm_io_con):
        print("Control group dataframe already exists..")
    else:
        print("creating Control group io dataframe")
        stm_onset_io_con = stmdb.getIOMatrix(stm_scr_con, feature_ids, feature_stats,
                                             timelag, timewin=timewin)
        stm_onset_io_con.to_pickle(fname_stm_io_con)

time lag:-6, time window: 6
creating AKI group io dataframe
platelet: {'attr_concept_code': [61928009], 'intv_concept_code': [61928009]}
wbc: {'attr_concept_code': [767002], 'intv_concept_code': [767002]}
hemoglobin: {'attr_concept_code': [104142005], 'intv_concept_code': [313995005]}
urine: {'attr_concept_code': [365678000], 'intv_concept_code': [365678000]}
potassium: {'attr_concept_code': [59573005], 'intv_concept_code': [59573005]}
ndbp: {'attr_concept_code': [271650006], 'intv_concept_code': [17146006]}
nsbp: {'attr_concept_code': [271649006], 'intv_concept_code': [17146006]}
hr: {'attr_concept_code': [364075005], 'intv_concept_code': [364075005]}
lactic_acid: {'attr_concept_code': [14395009], 'intv_concept_code': [3926003]}
temperature: {'attr_concept_code': [386725007], 'intv_concept_code': [258710007]}
spo2: {'attr_concept_code': [250554003], 'intv_concept_code': [250554003]}
creatinine: {'attr_concept_code': [113075003], 'intv_concept_code': [113075003]}
albumin: {'attr_concep

In [8]:
# NOTE(review): leftover post-mortem debugging magic; with no pending
# traceback it only prints an error -- consider removing this cell.
%debug

ERROR: No traceback has been produced, nothing to debug.


In [9]:
# Count the distinct encounter_id values present in scr_huge.
len(scr_huge.encounter_id.unique())

1388

## Create SCr dataframe and I/O dataframe from ISM db

In [10]:
# import pedAKI_ism_gen_scr_io as pakio
# for tlag, twin in combination:
#     print("time lag: {}, time window: {}".format(tlag, twin))
#     pakio.genIO_onset(tlag, twin, stable_time)