In [1]:
import os
import pandas as pd
import numpy as np
import pickle

In [2]:
fileDir = os.path.dirname('__file__')
f_dict_banner = os.path.join(fileDir, 'pickle_files_banner', 'feature_dict_banner.pkl')
item_dic_banner = pickle.load(open(f_dict_banner, 'rb'))

In [3]:
features = ['albumin', 'creatinine', 'glucose', 
            'hemoglobin', 'hr', 'lactic_acid', 
            'ndbp', 'nsbp', 'platelet', 'potassium', 
            'spo2', 'temperature', 'urine', 'wbc']

item_dic_pedAKI = {item: item_dic_banner[item] for item in features}

feature_stats = pickle.load(open(os.path.join(fileDir, "pickle_files_banner", "feature_stats_banner.pkl"), 'rb'))

In [4]:
timelag_all = list(-1*np.arange(25))
timewin_all = [12, 6]

combination = [(x,y) for x in timelag_all for y in timewin_all if abs(x)>=abs(y)]

In [5]:
def getLastVal(_series):
    tmp_series = _series[~pd.isnull(_series)]
    try:
        return tmp_series.values[-1]
    except:
        return np.float64(np.nan)

In [6]:
def getIODF(cr_df, feature_ids, feature_stats, timelag, timewin=6):
    
    cr_df.rename(columns={'ENCNTR_ID': 'encounter_id'}, inplace=True)
    
    fileDir = os.path.dirname('__file__')
    enc_df = pd.read_csv(os.path.join(fileDir, 'csv_banner', 'Banner_encounters.csv'))
    enc_df.reg_dt_tm = pd.to_datetime(enc_df.reg_dt_tm)
    enc_df.disch_dt_tm = pd.to_datetime(enc_df.reg_dt_tm)
    enc_df.gender[enc_df.gender=='Female'] = 'F'
    enc_df.gender[enc_df.gender=='Male'] = 'M'
    enc_df = enc_df.loc[:, ['encntr_id', 'reg_dt_tm', 'disch_dt_tm', 
                            'gender', 'race', 'deceased_flag', 'age', 
                            'LOSm']]
    enc_df.drop_duplicates(keep='first', inplace=True)
    enc_df.rename(columns={'encntr_id': 'encounter_id', 'gender': 'sex'}, inplace=True)
    
    encounter_ids = cr_df.encounter_id.unique()
    

    default_entry = ['encounter_id', 'age', 'sex']
    glob_mat = enc_df.loc[np.in1d(enc_df.encounter_id, encounter_ids), default_entry]

    out_col = pd.DataFrame(cr_df.groupby('encounter_id')['AKI_stage'].max())
    out_col.reset_index(inplace=True)
    out_col.columns = ['encounter_id', 'AKI_stage']
    glob_mat = glob_mat.merge(out_col, on="encounter_id", how="left")
    glob_mat.age = glob_mat.age*365.


    for feature in feature_ids:
        print("{}: {}".format(feature, feature_ids[feature]))
        f_df = os.path.join(fileDir, 'item_df_banner', 'banner_{}_df.pkl'.format(feature))
        item_df = pd.read_pickle(f_df)
        item_df.rename(columns={'ENCNTR_ID': 'encounter_id'}, inplace=True)
        item_df = item_df.loc[np.in1d(item_df.encounter_id, encounter_ids), :]        

        if 'age' not in item_df.columns:
            item_df = item_df.merge(enc_df, how='inner', on='encounter_id')
        
        item_df.loc[:, 'age'] = item_df.loc[:, 'age'] * 365.
        item_df.rename(columns={'EVENT_END_DT_TM': 'charttime',
                                'reg_dt_tm': 'intime',
                                'disch_dt_tm': 'outtime'}, inplace=True)

        item_df.sort_values(by=['encounter_id', 'charttime'], inplace=True)
            
        if 'reftime' not in item_df.columns:
            enc_reft = cr_df.groupby('encounter_id')['reftime'].unique().to_frame()
            enc_reft = enc_reft.reset_index()
            enc_reft['reftime'] = np.hstack(enc_reft.reftime)
            item_df = item_df.merge(enc_reft, on='encounter_id', how='inner')

        item_df['fromtime'] = item_df.reftime + np.timedelta64(int(timelag * 60), 'm')
        item_df['totime'] = item_df.reftime + np.timedelta64(int((timelag + timewin) * 60), 'm')        
        time_mask = (item_df.charttime > item_df.fromtime) & (item_df.charttime < item_df.totime)
        item_df = item_df.loc[time_mask, :]
        item_df_grouped = item_df.groupby('encounter_id')['RESULT_VAL_num']

        for stats in feature_stats[feature]:
            col_inmat = pd.Series()
            label_full = feature + "_" + stats            
            if stats == 'mean':
                col_inmat = item_df_grouped.mean()
            elif stats == 'median':
                col_inmat = item_df_grouped.median()
            elif stats == 'max':
                col_inmat = item_df_grouped.max()
            elif stats == 'min':
                col_inmat = item_df_grouped.min()
            elif stats == 'unique':
                col_inmat = np.hstack(item_df_grouped.unique())
            elif stats == 'last':
                col_inmat = item_df_grouped.apply(getLastVal)

            col_inmat = pd.DataFrame(col_inmat)
            col_inmat.reset_index(inplace=True)
            col_inmat.columns = ['encounter_id', label_full]

            glob_mat = glob_mat.merge(col_inmat, on='encounter_id', how="left")

    return glob_mat

In [7]:
if not os.path.exists(os.path.join(fileDir, 'scr_banner')):
        os.makedirs(os.path.join(fileDir, 'scr_banner'))
if not os.path.exists(os.path.join(fileDir, 'io_banner')):
    os.makedirs(os.path.join(fileDir, 'io_banner'))

cr_df_big = pd.read_pickle(os.path.join(fileDir, 'item_df_banner', 'banner_creatinine_df.pkl'))
stime = 12
for timelag, timewin in combination:
    print("time lag:{}, time window: {}".format(timelag, timewin))
    fname_banner_scr_tot = "banner_onset_scr_tlag{:03d}_stime{:03d}_tot.pkl".format(int(abs(timelag)), int(stime))
    fname_banner_scr_tot = os.path.join(fileDir, "scr_banner", fname_banner_scr_tot)
    fname_banner_scr_aki = "banner_onset_scr_tlag{:03d}_stime{:03d}_aki.pkl".format(int(abs(timelag)), int(stime))
    fname_banner_scr_aki = os.path.join(fileDir, "scr_banner", fname_banner_scr_aki)
    fname_banner_scr_con = "banner_onset_scr_tlag{:03d}_stime{:03d}_con.pkl".format(int(abs(timelag)), int(stime))
    fname_banner_scr_con = os.path.join(fileDir, "scr_banner", fname_banner_scr_con)
    
    stableTime = max(abs(timelag), 12)
    try:
        banner_scr_tot = pd.read_pickle(fname_banner_scr_tot)
    except:
        banner_scr_tot = cr_df_big.groupby('ENCNTR_ID'). \
                         filter(lambda group: group.loc[group.dtime_hr<=stableTime, 'AKI_stage'].sum()==0)
        banner_scr_tot.to_pickle(fname_banner_scr_tot)
        
    try:
        banner_scr_aki = pd.read_pickle(fname_banner_scr_aki)
    except:
        banner_scr_aki = banner_scr_tot.groupby('ENCNTR_ID'). \
                         filter(lambda group: group.AKI_stage.sum()>0)
        banner_scr_aki.to_pickle(fname_banner_scr_aki)
        
    try:
        banner_scr_con = pd.read_pickle(fname_banner_scr_con)
    except:
        banner_scr_con = banner_scr_tot.loc[~np.in1d(banner_scr_tot.ENCNTR_ID, banner_scr_aki.ENCNTR_ID.unique()),:]
        banner_scr_con.to_pickle(fname_banner_scr_con)
        
    fname_banner_io_aki = "banner_onset_io_tlag{:03d}_twin{:03d}_aki.pkl".format(int(abs(timelag)), int(timewin))
    fname_banner_io_aki = os.path.join(fileDir, "io_banner", fname_banner_io_aki)
    fname_banner_io_con = "banner_onset_io_tlag{:03d}_twin{:03d}_con.pkl".format(int(abs(timelag)), int(timewin))
    fname_banner_io_con = os.path.join(fileDir, "io_banner", fname_banner_io_con)
    
    if os.path.isfile(fname_banner_io_aki):
        print("AKI group dataframe already exists..")        
    else:
        print("creating AKI group io dataframe")
        banner_onset_io_aki = getIODF(banner_scr_aki, item_dic_pedAKI, feature_stats, 
                                      timelag, timewin=timewin)
        banner_onset_io_aki.to_pickle(fname_banner_io_aki)
    
    if os.path.isfile(fname_banner_io_con):
        print("Control group dataframe already exists..")
    else:
        print("creating Control group io dataframe")
        banner_onset_io_con = getIODF(banner_scr_con, item_dic_pedAKI, feature_stats, 
                                      timelag, timewin=timewin)
        banner_onset_io_con.to_pickle(fname_banner_io_con)    

time lag:-6, time window: 6
creating AKI group io dataframe


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


platelet: 682006
wbc: 681997
hemoglobin: 463628023
urine: 3141699
potassium: 681988
ndbp: 3059689
nsbp: 3059679
hr: 3049471
lactic_acid: 245357455
temperature: 3144901
creatinine: 681994
albumin: 705919
spo2: 3053467
glucose: 15616636
creating Control group io dataframe
platelet: 682006
wbc: 681997
hemoglobin: 463628023
urine: 3141699
potassium: 681988
ndbp: 3059689
nsbp: 3059679
hr: 3049471
lactic_acid: 245357455
temperature: 3144901
creatinine: 681994
albumin: 705919
spo2: 3053467
glucose: 15616636
time lag:-7, time window: 6
creating AKI group io dataframe
platelet: 682006
wbc: 681997
hemoglobin: 463628023
urine: 3141699
potassium: 681988
ndbp: 3059689
nsbp: 3059679
hr: 3049471
lactic_acid: 245357455
temperature: 3144901
creatinine: 681994
albumin: 705919
spo2: 3053467
glucose: 15616636
creating Control group io dataframe
platelet: 682006
wbc: 681997
hemoglobin: 463628023
urine: 3141699
potassium: 681988
ndbp: 3059689
nsbp: 3059679
hr: 3049471
lactic_acid: 245357455
temperature: 314