### Script to filter patients by inclusion criteria
* Age between 1 month and 21 years old
* Length of stay more than 24 hours
* Single encounter per patient (N/A to Banner since there is no patient_id)
* Stable for at least the first 12 hours from admission
  * Tag AKI stage for each creatinine measurement
  * Assign reference time as the AKI onset time for AKI group
  * Assign random reference time for control group

In [1]:
import os
import pandas as pd
import numpy as np
fileDir = os.path.dirname('__file__')

In [26]:
enc_df = pd.read_csv(os.path.join(fileDir, 'csv_banner', 'Banner_encounters.csv'))

In [27]:
enc_df.columns

Index([u'Unnamed: 0', u'encntr_id', u'reg_dt_tm', u'mrn', u'disch_dt_tm',
       u'gender', u'race', u'deceased_flag', u'age', u'LOSm'],
      dtype='object')

In [28]:
# Precondition the encounter dataframe before merging to the item's dataframe
enc_df.reg_dt_tm = pd.to_datetime(enc_df.reg_dt_tm)
enc_df.disch_dt_tm = pd.to_datetime(enc_df.reg_dt_tm)
enc_df.gender[enc_df.gender=='Female'] = 'F'
enc_df.gender[enc_df.gender=='Male'] = 'M'
enc_df = enc_df.loc[:, ['encntr_id', 'reg_dt_tm', 'disch_dt_tm', 
                        'gender', 'race', 'deceased_flag', 'age', 
                        'LOSm']]
enc_df.drop_duplicates(keep='first', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [30]:
def filterByAge(enc_df):
    age_mask = (enc_df.age>=1/12.) & (enc_df.age<=21.)
    enc_df = enc_df.loc[age_mask, :]
    return enc_df

def filterByLOS(enc_df):
    los_mask = enc_df.LOSm>=24*60
    enc_df = enc_df.loc[los_mask, :]
    return enc_df

In [31]:
# Load the creatinine dataframe
cr_df = pd.read_pickle(os.path.join(fileDir, 'item_df_banner', 'banner_creatinine_df.pkl'))

* ** Apply patient inclusion criteria for Age and LOS **

In [32]:
enc_df = filterByAge(enc_df)
print(len(enc_df.encntr_id.unique()))
enc_df = filterByLOS(enc_df)
print(len(enc_df.encntr_id.unique()))
print(len(enc_df.encntr_id))

3891
3358
3358


In [33]:
enc_df.rename(columns={'encntr_id': 'ENCNTR_ID'}, inplace=True)
cr_df_big = pd.merge(cr_df, enc_df, on='ENCNTR_ID', how='inner')

In [34]:
print(len(cr_df.ENCNTR_ID.unique()))
print(len(cr_df_big.ENCNTR_ID.unique()))
print(len(cr_df_big))

1880
1688
6383


In [35]:
cr_df_big.sort_values(by=['ENCNTR_ID', 'EVENT_END_DT_TM'], inplace=True)

In [36]:
cr_df_big.columns

Index([u'EVENT_CD', u'RESULT_UNITS_CD', u'ENCNTR_ID', u'MRN', u'EVENT_ID',
       u'EVENT_NAME', u'EVENT_MEASURE', u'EVENT_END_DT_TM', u'ORDER_ID',
       u'RESULT_VAL_num', u'reg_dt_tm', u'disch_dt_tm', u'gender', u'race',
       u'deceased_flag', u'age', u'LOSm'],
      dtype='object')

In [37]:
cr_df_big.dtypes

EVENT_CD                    int64
RESULT_UNITS_CD             int64
ENCNTR_ID                  object
MRN                        object
EVENT_ID                   object
EVENT_NAME                 object
EVENT_MEASURE              object
EVENT_END_DT_TM    datetime64[ns]
ORDER_ID                   object
RESULT_VAL_num            float64
reg_dt_tm          datetime64[ns]
disch_dt_tm        datetime64[ns]
gender                     object
race                       object
deceased_flag               int64
age                       float64
LOSm                      float64
dtype: object

In [38]:
path2normscr = os.path.join(os.path.dirname("__file__"), "csv_files", "ped_normal_scr.csv")
norm_scr_lim = pd.DataFrame.from_csv(path2normscr)
age_yr_ll = np.array(norm_scr_lim.low_age)
age_yr_ul = np.array(norm_scr_lim.upp_age)

def tagBSCr(group):
    age_yr = group.age.unique()[0]
    age_mask = (age_yr_ll < age_yr) & (age_yr_ul >= age_yr)

    sex = group.gender.unique()[0]

    sex_arr = norm_scr_lim.sex.as_matrix().astype(str)
    sex_mask = [sex in normsex for normsex in sex_arr]
    group['bs_scr'] = norm_scr_lim.upp_scr[age_mask & sex_mask].unique()[0]

    return group

def tagDtimeHr(group):
    dtime_hr = (group.loc[:, 'EVENT_END_DT_TM'] \
                - group.loc[group.index[0], 'reg_dt_tm']).astype('timedelta64[m]') / 60
    group['dtime_hr'] = dtime_hr

    return group

In [44]:
def tagAKI(group):
    dtime_hr = group.dtime_hr

    for idx, row in group.iterrows():

        # Calculate average rate of change of SCr within 48 hour time window
        # charttime -48h is used as the time window
        # time_mask = (dtime_hr > dtime_hr[idx] - 30) \
        #             & (dtime_hr <= dtime_hr[idx] + 30)
        scr_rate = np.NaN
        group.loc[idx, 'scr_rate'] = scr_rate
        if row.dtime_hr>=48:
            time_mask = (dtime_hr >= row.dtime_hr - 48) & (dtime_hr <= row.dtime_hr)
            y = np.array(group.RESULT_VAL_num[time_mask])
            x = np.array(group.dtime_hr[time_mask])
            A = np.vstack([x, np.ones(len(x))]).T
            if A.shape[0]>1:
                slope, intercept = np.linalg.lstsq(A,y)[0]
                scr_rate = slope*48
                group.loc[idx, 'scr_rate'] = scr_rate
        cur_val = row.RESULT_VAL_num

        if (cur_val >= 3.0 * group.bs_scr[idx]) or (cur_val >= 4.0):
            AKI_stage = 3
        elif cur_val >= 2.0 * group.bs_scr[idx]:
            AKI_stage = 2
        elif (cur_val >= 1.5 * group.bs_scr[idx]) or (scr_rate >= 0.3):
            AKI_stage = 1
        else:
            AKI_stage = 0
        group.loc[idx, 'AKI_stage'] = AKI_stage    
    return group

In [81]:
norm_scr_lim

Unnamed: 0,low_age,upp_age,sex,low_scr,upp_scr
0,0.08,1,FM,0.2,0.4
1,1.0,2,FM,0.2,0.5
2,2.0,4,FM,0.3,0.7
3,4.0,6,FM,0.3,0.7
4,6.0,8,FM,0.2,0.6
5,8.0,10,FM,0.3,0.7
6,10.0,12,FM,0.3,0.9
7,12.0,15,FM,0.4,0.9
8,15.0,21,M,0.6,1.2
9,15.0,21,F,0.5,1.0


* **Tag baseline creatinine, time from admission, AKI stage for each creatinine measurement**

In [45]:
if 'bs_scr' not in cr_df_big.columns:
    cr_df_big = cr_df_big.groupby('ENCNTR_ID').apply(tagBSCr)
if 'dtime_hr' not in cr_df_big.columns:
    cr_df_big = cr_df_big.groupby('ENCNTR_ID').apply(tagDtimeHr)
if 'AKI_stage' not in cr_df_big.columns:
    cr_df_big = cr_df_big.groupby('ENCNTR_ID').apply(tagAKI)

* **Include patients that were stable for the first 12 hours from admission**

In [72]:
stableTime = 12
cr_df_big = cr_df_big.groupby('ENCNTR_ID'). \
            filter(lambda group: group.loc[group.dtime_hr<=stableTime, 'AKI_stage'].sum()==0)

In [73]:
maxAKI = cr_df_big.groupby('ENCNTR_ID')['AKI_stage'].max()

In [75]:
nb_AKI = np.sum(maxAKI>0)
nb_Stable = np.sum(maxAKI==0)
print('Number of AKI pateints: {}'.format(nb_AKI))
print('Number of Stable pateints: {}'.format(nb_Stable))

Number of AKI pateints: 62
Number of Stable pateints: 1593


In [76]:
import random
def randomDate(_start, _end, _prop):
    rtime = _start + _prop * (_end - _start)
    return rtime

def tagRefTime(group, lag=-24):
    if group.AKI_stage.sum()>0:
        group['reftime'] = group.EVENT_END_DT_TM[group.AKI_stage>0].min()
    else:
        stime = group.reg_dt_tm.unique()[0] + np.timedelta64(int(abs(lag)*60), 'm')
        etime = group.EVENT_END_DT_TM.max()
        rand_reftime = randomDate(stime, etime, random.random())
        group['reftime'] = rand_reftime
    return group

* **Tag reference time**

In [77]:
if 'reftime' not in cr_df_big.columns:
    cr_df_big = cr_df_big.groupby('ENCNTR_ID').apply(tagRefTime)

In [78]:
cr_df_big.loc[:, ['EVENT_END_DT_TM', 'RESULT_VAL_num', 'reg_dt_tm', 
                  'disch_dt_tm', 'age', 'bs_scr', 'dtime_hr', 'scr_rate',
                  'AKI_stage', 'reftime'] ]

Unnamed: 0,EVENT_END_DT_TM,RESULT_VAL_num,reg_dt_tm,disch_dt_tm,age,bs_scr,dtime_hr,scr_rate,AKI_stage,reftime
44,2041-05-21 00:52:00,0.26,2041-05-21 05:42:00,2041-05-21 05:42:00,5.7,0.7,-4.833333,,0.0,2041-05-22 06:36:48.632052663
43,2041-05-21 08:45:49,0.27,2041-05-21 05:42:00,2041-05-21 05:42:00,5.7,0.7,3.050000,,0.0,2041-05-22 06:36:48.632052663
42,2041-05-22 07:10:00,0.35,2041-05-21 05:42:00,2041-05-21 05:42:00,5.7,0.7,25.466667,,0.0,2041-05-22 06:36:48.632052663
22,2041-09-08 07:40:00,0.27,2041-09-08 12:28:00,2041-09-08 12:28:00,0.1,0.4,-4.800000,,0.0,2041-09-09 23:58:15.819718597
20,2041-09-09 06:50:00,0.27,2041-09-08 12:28:00,2041-09-08 12:28:00,0.1,0.4,18.366667,,0.0,2041-09-09 23:58:15.819718597
21,2041-09-10 06:10:00,0.20,2041-09-08 12:28:00,2041-09-08 12:28:00,0.1,0.4,41.700000,,0.0,2041-09-09 23:58:15.819718597
41,2042-01-08 21:01:00,0.71,2042-01-09 02:42:00,2042-01-09 02:42:00,14.4,0.9,-5.683333,,0.0,2042-01-09 22:24:34.088934702
19,2040-10-02 14:15:00,0.29,2040-10-02 18:24:00,2040-10-02 18:24:00,1.1,0.5,-4.150000,,0.0,2040-10-03 10:52:02.879788282
39,2042-01-30 06:03:00,0.26,2042-01-29 13:32:00,2042-01-29 13:32:00,0.4,0.4,16.516667,,0.0,2042-02-01 08:38:36.838702922
40,2042-01-31 06:00:00,0.28,2042-01-29 13:32:00,2042-01-29 13:32:00,0.4,0.4,40.466667,,0.0,2042-02-01 08:38:36.838702922


In [80]:
cr_df_big.dtypes

EVENT_CD                    int64
RESULT_UNITS_CD             int64
ENCNTR_ID                  object
MRN                        object
EVENT_ID                   object
EVENT_NAME                 object
EVENT_MEASURE              object
EVENT_END_DT_TM    datetime64[ns]
ORDER_ID                   object
RESULT_VAL_num            float64
reg_dt_tm          datetime64[ns]
disch_dt_tm        datetime64[ns]
gender                     object
race                       object
deceased_flag               int64
age                       float64
LOSm                      float64
bs_scr                    float64
dtime_hr                  float64
scr_rate                  float64
AKI_stage                 float64
reftime            datetime64[ns]
dtype: object

In [79]:
cr_df_big.to_pickle(os.path.join(fileDir, 'item_df_banner', 'banner_creatinine_df.pkl'))