### Add composite variables (SI, OSI, OI) to I/O dataframe for STM
* Query or load the dataframes that are required to calculate the composite variables
* Calculate the composite variables
* Add the composite variables to the original I/O dataframe

In [1]:
import stm_utilities as stm
import numpy as np
import pandas as pd
import pickle
import os

# Shared database handle. Later cells use it as the fallback data source in
# queryDF4composites (getFeatureData) and to tag reference times onto the
# creatinine dataframe (getScrDF).
stmdb = stm.queryDB()

In [2]:
def _loadOrQueryDF(item_df, item_name, feature_ids, encounter_ids, stm_instance=None):
    """Return the rows of one item dataframe that belong to `encounter_ids`.

    Resolution order:
      1. use `item_df` when the caller supplied it,
      2. otherwise load ``item_df_stm/stm_<item_name>_df.pkl`` under `fileDir`,
      3. if loading or filtering fails, query the database with `feature_ids`.
    """
    try:
        if item_df is None:
            # NOTE: pickles are only safe to load from trusted, locally
            # produced files.
            item_df = pd.read_pickle(
                os.path.join(fileDir, 'item_df_stm',
                             'stm_{}_df.pkl'.format(item_name)))
        return item_df.loc[item_df.encounter_id.isin(encounter_ids), :]
    except Exception:
        # Deliberate best-effort fallback (missing pickle, unexpected frame
        # layout, ...). `Exception` rather than a bare `except` so that
        # KeyboardInterrupt / SystemExit still propagate.
        db = stmdb if stm_instance is None else stm_instance
        return db.getFeatureData(encounter_ids=encounter_ids,
                                 feature_ids=feature_ids)


def queryDF4composites(encounter_ids, stm_instance=None,
                       hr_df=None, nsbp_df=None,
                       map_df=None, fio2_df=None,
                       spo2_df=None, pao2_df=None):
    """Collect the six item dataframes needed for the composite variables.

    Parameters
    ----------
    encounter_ids : list-like
        Encounters to keep in every returned dataframe.
    stm_instance : optional
        Database handle exposing ``getFeatureData``; used for the fallback
        query. When None the module-level ``stmdb`` is used.
    hr_df, nsbp_df, map_df, fio2_df, spo2_df, pao2_df : DataFrame, optional
        Pre-loaded item dataframes (each needs an ``encounter_id`` column).
        When omitted, the matching pickle under ``item_df_stm`` is loaded.

    Returns
    -------
    tuple of DataFrame
        ``(hr_df, nsbp_df, map_df, fio2_df, spo2_df, pao2_df)`` restricted to
        `encounter_ids`.
    """
    # (pickle stem, supplied frame, attr_concept_code, intv_concept_code)
    item_specs = [
        ('hr', hr_df, 364075005, 364075005),
        ('nsbp', nsbp_df, 271649006, 17146006),
        ('map', map_df, 259010008, 284019003),
        ('fio2', fio2_df, 250774007, 250774007),
        ('spo2', spo2_df, 250554003, 250554003),
        ('pao2', pao2_df, 250546000, 250546000),
    ]
    return tuple(
        _loadOrQueryDF(item_df, item_name,
                       {'attr_concept_code': [attr_code],
                        'intv_concept_code': [intv_code]},
                       encounter_ids, stm_instance)
        for item_name, item_df, attr_code, intv_code in item_specs
    )

def filtObservation(item_df, scr_df, timelag, timewin):
    """Keep the observations that fall inside the window around `reftime`.

    Parameters
    ----------
    item_df : DataFrame
        Observations with ``encounter_id`` and ``tstamp`` columns.
    scr_df : DataFrame
        Provides one ``reftime`` per ``encounter_id`` (repeated rows allowed).
    timelag : number
        Offset in hours relative to ``reftime``. When positive the window is
        ``(reftime, reftime + timelag)`` and `timewin` is not used; otherwise
        it is ``(reftime + timelag, reftime + timelag + timewin)``.
    timewin : number
        Window length in hours (used only when ``timelag <= 0``).

    Returns
    -------
    DataFrame
        An independent copy of the matching rows with ``tstamp`` renamed to
        ``charttime`` and ``reftime``/``fromtime``/``totime`` columns added.
        Both window bounds are exclusive.

    Raises
    ------
    ValueError
        If an encounter has more than one distinct ``reftime``.
    """
    # One (encounter_id, reftime) row per encounter. Rows without an
    # encounter id cannot be matched to an observation and are dropped.
    enc_reft = (scr_df.loc[:, ['encounter_id', 'reftime']]
                .dropna(subset=['encounter_id'])
                .drop_duplicates())
    if enc_reft.encounter_id.duplicated().any():
        raise ValueError('filtObservation expects a single reftime per encounter_id')

    item_df = item_df.merge(enc_reft, on='encounter_id', how='inner')
    item_df = item_df.rename(columns={'tstamp': 'charttime'})

    lag = np.timedelta64(int(timelag * 60), 'm')
    if timelag > 0:
        item_df['fromtime'] = item_df.reftime
        item_df['totime'] = item_df.reftime + lag
    else:
        item_df['fromtime'] = item_df.reftime + lag
        item_df['totime'] = item_df.reftime + np.timedelta64(int((timelag + timewin) * 60), 'm')

    time_mask = (item_df.charttime > item_df.fromtime) & (item_df.charttime < item_df.totime)
    # Copy so that callers can modify the result without touching a slice.
    return item_df.loc[time_mask, :].copy()

In [3]:
def convertMapUOM(map_df):
    """Return a copy of `map_df` with rows recorded in mmHg converted to cmH2O.

    Rows whose ``valueUOM`` is ``'mmHg'`` have ``value`` multiplied by 1.36
    and the unit relabelled ``'cmH2O'``; all other rows are left untouched.
    The conversion is decided row by row, so empty frames and frames with
    mixed units are handled. The input frame is not modified.
    """
    map_df = map_df.copy()
    is_mmhg = map_df['valueUOM'] == 'mmHg'
    if is_mmhg.any():
        # `where` keeps rows outside the mask and upcasts the column if needed.
        map_df['value'] = map_df['value'].where(~is_mmhg, map_df['value'] * float(1.36))
        map_df['valueUOM'] = map_df['valueUOM'].where(~is_mmhg, 'cmH2O')
    return map_df

def convertFio2UOM(fio2_df):
    """Return a copy of `fio2_df` with percentage rows converted to fractions.

    Rows whose ``valueUOM`` is ``'%'`` have ``value`` multiplied by 0.01 and
    the unit relabelled ``'fraction'``; all other rows are left untouched.
    The conversion is decided row by row, so empty frames and frames with
    mixed units are handled. The input frame is not modified.
    """
    fio2_df = fio2_df.copy()
    is_percent = fio2_df['valueUOM'] == '%'
    if is_percent.any():
        # `where` keeps rows outside the mask and upcasts the column if needed.
        fio2_df['value'] = fio2_df['value'].where(~is_percent, fio2_df['value'] * 0.01)
        fio2_df['valueUOM'] = fio2_df['valueUOM'].where(~is_percent, 'fraction')
    return fio2_df

def convertPao2UOM(pao2_df):
    """Return a copy of `pao2_df` with rows recorded in kPa converted to mmHg.

    Rows whose ``valueUOM`` is ``'kPa'`` have ``value`` multiplied by 7.5006
    and the unit relabelled ``'mmHg'``; all other rows are left untouched.
    The conversion is decided row by row, so empty frames and frames with
    mixed units are handled. The input frame is not modified.
    """
    pao2_df = pao2_df.copy()
    is_kpa = pao2_df['valueUOM'] == 'kPa'
    if is_kpa.any():
        # `where` keeps rows outside the mask and upcasts the column if needed.
        pao2_df['value'] = pao2_df['value'].where(~is_kpa, pao2_df['value'] * 7.5006)
        pao2_df['valueUOM'] = pao2_df['valueUOM'].where(~is_kpa, 'mmHg')
    return pao2_df

In [4]:
# si_df_filtered = hr_df_filtered.groupby('encounter_id').apply(getSI, nsbp_df_filtered)
# si_df_filtered = si_df_filtered.loc[:, ['encounter_id', 'charttime', 'reftime', 'fromtime', 'totime', 'si']]
# si_df_filtered.rename(columns={'si':'value'}, inplace=True)
# si_df_filtered['valueUOM'] = 'bpm/mmHg'

def getSI(hr_df_group, nsbp_df_filtered):
    """Add an ``si`` column (HR value / nearest-in-time NSBP value) to one encounter.

    Parameters
    ----------
    hr_df_group : DataFrame
        Heart-rate rows of a single encounter (``encounter_id``,
        ``charttime``, ``value``).
    nsbp_df_filtered : DataFrame
        NSBP rows for any number of encounters, same column layout.

    Returns
    -------
    DataFrame
        A copy of `hr_df_group` with an ``si`` column. A row stays NaN when
        the encounter has no NSBP measurement within 60 minutes of its
        ``charttime`` or when the ratio cannot be computed.
    """
    hr_df_group = hr_df_group.copy()
    hr_df_group['si'] = np.nan
    if hr_df_group.empty:
        return hr_df_group

    enc_id = hr_df_group.encounter_id.iloc[0]
    nsbp_df_group = nsbp_df_filtered.loc[nsbp_df_filtered.encounter_id == enc_id,
                                         ['charttime', 'value']]
    # Rows without a timestamp can never be the nearest measurement.
    nsbp_df_group = nsbp_df_group.dropna(subset=['charttime'])
    if nsbp_df_group.empty:
        return hr_df_group

    max_gap = pd.Timedelta(minutes=60)
    # Filled positionally and assigned once at the end, which avoids the
    # chained assignment `df.si[idx] = ...` and is independent of index labels.
    si_values = np.full(len(hr_df_group), np.nan)
    for pos, (charttime, hr_value) in enumerate(zip(hr_df_group.charttime,
                                                    hr_df_group.value)):
        if pd.isnull(charttime):
            continue
        gaps = (nsbp_df_group.charttime - charttime).abs()
        nearest = gaps.to_numpy().argmin()
        if gaps.iloc[nearest] > max_gap:
            continue
        try:
            si_values[pos] = hr_value / float(nsbp_df_group.value.iloc[nearest])
        except (TypeError, ValueError, ZeroDivisionError):
            # Non-numeric or zero denominator: leave this row as NaN.
            pass

    hr_df_group['si'] = si_values
    return hr_df_group

def getSIDF(hr_df, nsbp_df):
    """Build the SI dataframe for every encounter in `hr_df`.

    `getSI` is run once per ``encounter_id`` group. Groups are iterated
    explicitly rather than through ``groupby.apply`` so that each group keeps
    its ``encounter_id`` column and the result carries no extra index level,
    independent of the pandas version.

    Returns
    -------
    DataFrame
        Columns ``encounter_id``, ``charttime``, ``reftime``, ``fromtime``,
        ``totime``, ``value`` (the SI) and ``valueUOM`` (``'bpm/mmHg'``).
        Infinite values are replaced by NaN. Rows come back grouped by
        encounter. Empty input yields an empty frame with these columns.
    """
    columns = ['encounter_id', 'charttime', 'reftime', 'fromtime', 'totime', 'si']
    per_encounter = [getSI(group, nsbp_df) for _, group in hr_df.groupby('encounter_id')]
    if per_encounter:
        si_df = pd.concat(per_encounter).loc[:, columns]
    else:
        # Nothing to compute: keep the expected column layout.
        si_df = hr_df.iloc[0:0].reindex(columns=columns)
    si_df = si_df.rename(columns={'si': 'value'})
    si_df['valueUOM'] = 'bpm/mmHg'
    si_df = si_df.replace([np.inf, -np.inf], np.nan)
    return si_df
    
def getOSI(spo2_df_group, map_df_filtered, fio2_df_filtered):
    """Add an ``osi`` column to the SpO2 rows of one encounter.

    For each SpO2 row the nearest-in-time MAP and FiO2 measurements of the
    same encounter are looked up and
    ``osi = MAP value * FiO2 value * 100 / SpO2 value`` is stored.

    Parameters
    ----------
    spo2_df_group : DataFrame
        SpO2 rows of a single encounter (``encounter_id``, ``charttime``,
        ``value``).
    map_df_filtered, fio2_df_filtered : DataFrame
        MAP / FiO2 rows for any number of encounters, same column layout.

    Returns
    -------
    DataFrame
        A copy of `spo2_df_group` with an ``osi`` column. A row stays NaN
        unless both a MAP and a FiO2 measurement lie within 60 minutes of its
        ``charttime`` and the expression can be evaluated.
    """
    spo2_df_group = spo2_df_group.copy()
    spo2_df_group['osi'] = np.nan
    if spo2_df_group.empty:
        return spo2_df_group

    enc_id = spo2_df_group.encounter_id.iloc[0]

    def _rows_for_encounter(df):
        # Rows without a timestamp can never be the nearest measurement.
        rows = df.loc[df.encounter_id == enc_id, ['charttime', 'value']]
        return rows.dropna(subset=['charttime'])

    map_df_group = _rows_for_encounter(map_df_filtered)
    fio2_df_group = _rows_for_encounter(fio2_df_filtered)
    if map_df_group.empty or fio2_df_group.empty:
        return spo2_df_group

    max_gap = pd.Timedelta(minutes=60)

    def _nearest_value(ref_df, charttime):
        # Returns (found, value) for the measurement closest to `charttime`.
        gaps = (ref_df.charttime - charttime).abs()
        nearest = gaps.to_numpy().argmin()
        if gaps.iloc[nearest] > max_gap:
            return False, None
        return True, ref_df.value.iloc[nearest]

    # Filled positionally and assigned once at the end, which avoids the
    # chained assignment `df.osi[idx] = ...`.
    osi_values = np.full(len(spo2_df_group), np.nan)
    for pos, (charttime, spo2_value) in enumerate(zip(spo2_df_group.charttime,
                                                      spo2_df_group.value)):
        if pd.isnull(charttime):
            continue
        has_map, map_value = _nearest_value(map_df_group, charttime)
        has_fio2, fio2_value = _nearest_value(fio2_df_group, charttime)
        if not (has_map and has_fio2):
            continue
        try:
            osi_values[pos] = (map_value * fio2_value * 100) / float(spo2_value)
        except (TypeError, ValueError, ZeroDivisionError):
            # Non-numeric or zero denominator: leave this row as NaN.
            pass

    spo2_df_group['osi'] = osi_values
    return spo2_df_group

def getOSIDF(spo2_df, map_df, fio2_df):
    """Build the OSI dataframe for every encounter in `spo2_df`.

    `getOSI` is run once per ``encounter_id`` group. Groups are iterated
    explicitly rather than through ``groupby.apply`` so that each group keeps
    its ``encounter_id`` column and the result carries no extra index level,
    independent of the pandas version.

    Returns
    -------
    DataFrame
        Columns ``encounter_id``, ``charttime``, ``reftime``, ``fromtime``,
        ``totime``, ``value`` (the OSI) and ``valueUOM`` (``'cmH2O'``).
        Infinite values are replaced by NaN. Rows come back grouped by
        encounter. Empty input yields an empty frame with these columns.
    """
    columns = ['encounter_id', 'charttime', 'reftime', 'fromtime', 'totime', 'osi']
    per_encounter = [getOSI(group, map_df, fio2_df)
                     for _, group in spo2_df.groupby('encounter_id')]
    if per_encounter:
        osi_df = pd.concat(per_encounter).loc[:, columns]
    else:
        # Nothing to compute: keep the expected column layout.
        osi_df = spo2_df.iloc[0:0].reindex(columns=columns)
    osi_df = osi_df.rename(columns={'osi': 'value'})
    osi_df['valueUOM'] = 'cmH2O'
    osi_df = osi_df.replace([np.inf, -np.inf], np.nan)
    return osi_df

def getOI(pao2_df_group, map_df_filtered, fio2_df_filtered):
    """Add an ``oi`` column to the PaO2 rows of one encounter.

    For each PaO2 row the nearest-in-time MAP and FiO2 measurements of the
    same encounter are looked up and
    ``oi = MAP value * FiO2 value * 100 / (PaO2 value * 1.36)`` is stored.

    Parameters
    ----------
    pao2_df_group : DataFrame
        PaO2 rows of a single encounter (``encounter_id``, ``charttime``,
        ``value``).
    map_df_filtered, fio2_df_filtered : DataFrame
        MAP / FiO2 rows for any number of encounters, same column layout.

    Returns
    -------
    DataFrame
        A copy of `pao2_df_group` with an ``oi`` column. A row stays NaN
        unless both a MAP and a FiO2 measurement lie within 60 minutes of its
        ``charttime`` and the expression can be evaluated.
    """
    pao2_df_group = pao2_df_group.copy()
    pao2_df_group['oi'] = np.nan
    if pao2_df_group.empty:
        return pao2_df_group

    enc_id = pao2_df_group.encounter_id.iloc[0]

    def _rows_for_encounter(df):
        # Rows without a timestamp can never be the nearest measurement.
        rows = df.loc[df.encounter_id == enc_id, ['charttime', 'value']]
        return rows.dropna(subset=['charttime'])

    map_df_group = _rows_for_encounter(map_df_filtered)
    fio2_df_group = _rows_for_encounter(fio2_df_filtered)
    if map_df_group.empty or fio2_df_group.empty:
        return pao2_df_group

    max_gap = pd.Timedelta(minutes=60)

    def _nearest_value(ref_df, charttime):
        # Returns (found, value) for the measurement closest to `charttime`.
        gaps = (ref_df.charttime - charttime).abs()
        nearest = gaps.to_numpy().argmin()
        if gaps.iloc[nearest] > max_gap:
            return False, None
        return True, ref_df.value.iloc[nearest]

    # Filled positionally and assigned once at the end, which avoids the
    # chained assignment `df.oi[idx] = ...`.
    oi_values = np.full(len(pao2_df_group), np.nan)
    for pos, (charttime, pao2_value) in enumerate(zip(pao2_df_group.charttime,
                                                      pao2_df_group.value)):
        if pd.isnull(charttime):
            continue
        has_map, map_value = _nearest_value(map_df_group, charttime)
        has_fio2, fio2_value = _nearest_value(fio2_df_group, charttime)
        if not (has_map and has_fio2):
            continue
        try:
            # NOTE(review): the 1.36 divisor is kept from the original code;
            # convertMapUOM already scales mmHg MAP values by 1.36, so confirm
            # the intended units of this ratio.
            oi_values[pos] = (map_value * fio2_value * 100) / (pao2_value * 1.36)
        except (TypeError, ValueError, ZeroDivisionError):
            # Non-numeric or zero denominator: leave this row as NaN.
            pass

    pao2_df_group['oi'] = oi_values
    return pao2_df_group

def getOIDF(pao2_df, map_df, fio2_df):
    """Build the OI dataframe for every encounter in `pao2_df`.

    `getOI` is run once per ``encounter_id`` group. Groups are iterated
    explicitly rather than through ``groupby.apply`` so that each group keeps
    its ``encounter_id`` column and the result carries no extra index level,
    independent of the pandas version.

    Returns
    -------
    DataFrame
        Columns ``encounter_id``, ``charttime``, ``reftime``, ``fromtime``,
        ``totime``, ``value`` (the OI) and ``valueUOM`` (``'%'``).
        Infinite values are replaced by NaN. Rows come back grouped by
        encounter. Empty input yields an empty frame with these columns.
    """
    columns = ['encounter_id', 'charttime', 'reftime', 'fromtime', 'totime', 'oi']
    per_encounter = [getOI(group, map_df, fio2_df)
                     for _, group in pao2_df.groupby('encounter_id')]
    if per_encounter:
        oi_df = pd.concat(per_encounter).loc[:, columns]
    else:
        # Nothing to compute: keep the expected column layout.
        oi_df = pao2_df.iloc[0:0].reindex(columns=columns)
    oi_df = oi_df.rename(columns={'oi': 'value'})
    oi_df['valueUOM'] = '%'
    oi_df = oi_df.replace([np.inf, -np.inf], np.nan)
    return oi_df

In [5]:
def getLastValDF(group):
    """Return the non-null ``value`` with the latest ``charttime`` in `group`.

    Rows with a null ``value`` or a null ``charttime`` are ignored. When no
    usable row remains, NaN is returned. If several rows share the latest
    ``charttime`` the first of them (in row order) wins.
    """
    usable = group.loc[group.value.notnull() & group.charttime.notnull(), :]
    if usable.empty:
        # `np.nan`: the `np.NaN` alias no longer exists in NumPy 2.
        return np.nan
    # Positional lookup so duplicate index labels cannot return a Series.
    latest = usable.charttime.to_numpy().argmax()
    return usable.value.iloc[latest]
    
def add2IO(io_df, feature_dfs, stats):
    """Left-merge per-encounter feature statistics onto the I/O dataframe.

    Parameters
    ----------
    io_df : DataFrame
        I/O matrix with an ``encounter_id`` column.
    feature_dfs : dict
        Maps a feature name to a dataframe with ``encounter_id`` and
        ``value`` columns (``charttime`` is also needed for ``'last'``).
    stats : iterable of str
        Statistic names. ``'last'`` is computed with `getLastValDF`; any
        other name is called as a method of the grouped ``value`` column
        (for example ``'min'``, ``'max'``, ``'mean'``, ``'median'``).

    Returns
    -------
    DataFrame
        `io_df` with one ``<feature>_<stat>`` column added per combination;
        encounters without data for a feature get NaN.
    """
    for feature, feature_df in feature_dfs.items():
        grouped = feature_df.groupby('encounter_id')
        for stat in stats:
            ft_name = feature + '_' + stat
            if stat != 'last':
                # Attribute lookup replaces building and eval-ing source text.
                ft_stat_sr = getattr(grouped['value'], stat)()
            else:
                last_by_enc = {enc_id: getLastValDF(group) for enc_id, group in grouped}
                # An explicit dtype is only needed for the empty case.
                ft_stat_sr = pd.Series(last_by_enc,
                                       dtype=None if last_by_enc else float)
                ft_stat_sr.index.name = 'encounter_id'

            # Name the column before merging so that an existing `value`
            # column in io_df cannot collide with the statistic.
            ft_stat_df = ft_stat_sr.rename(ft_name).reset_index()
            io_df = io_df.merge(ft_stat_df, on='encounter_id', how='left')
    return io_df

In [6]:
# The argument is the string literal "__file__", so this evaluates to '' and
# every path built from fileDir is relative to the working directory.
fileDir = os.path.dirname("__file__")

# Time lags (hours before the reference time): -6, -7, ..., -24.
timelag = np.arange(-6, -25, -1)
# Observation window lengths (hours).
timewin = [6, 12]
# Stable-time values (used in the creatinine pickle file names).
stabletime = [12]

# Every (timelag, timewin, stabletime) triple whose window is no longer than
# the lag.
combination = [
    (tlag, twin, stime)
    for tlag in timelag
    for twin in timewin
    for stime in stabletime
    if twin <= abs(tlag)
]
# combination = combination[16:]
# combination

In [7]:
# Driver cell: for every (timelag, timewindow, stabletime) combination, compute
# the composite variables (SI, OSI, OI) for the AKI and control cohorts and
# save the extended I/O dataframes under 'io_stm2'.

# Output directory for the extended I/O dataframes.
if not os.path.exists(os.path.join(fileDir, 'io_stm2')):
    os.makedirs(os.path.join(fileDir, 'io_stm2'))

# Load the six item dataframes once; they are reused for every combination.
hr_df = pd.read_pickle(os.path.join(fileDir, 'item_df_stm', 'stm_hr_df.pkl'))
nsbp_df = pd.read_pickle(os.path.join(fileDir, 'item_df_stm', 'stm_nsbp_df.pkl'))
map_df = pd.read_pickle(os.path.join(fileDir, 'item_df_stm', 'stm_map_df.pkl'))
fio2_df = pd.read_pickle(os.path.join(fileDir, 'item_df_stm', 'stm_fio2_df.pkl'))
spo2_df = pd.read_pickle(os.path.join(fileDir, 'item_df_stm', 'stm_spo2_df.pkl'))
pao2_df = pd.read_pickle(os.path.join(fileDir, 'item_df_stm', 'stm_pao2_df.pkl'))


for tlag, twin, stime in combination:
    print('timelag: {}, timewindow: {}'.format(tlag, twin))
    
    # Input file names; the lag is encoded as its absolute value, zero-padded
    # to three digits.
    fname_scr_df = os.path.join(fileDir, 'scr_stm', 
                                'stm_onset_scr_tlag{:03d}_stime{:03d}_tot.pkl'.format(abs(tlag), stime))
    fname_io_df_aki = os.path.join(fileDir, 'io_stm', 
                                   'stm_onset_io_tlag{:03d}_twin{:03d}_aki.pkl'.format(abs(tlag), twin))
    fname_io_df_con = os.path.join(fileDir, 'io_stm', 
                                   'stm_onset_io_tlag{:03d}_twin{:03d}_con.pkl'.format(abs(tlag), twin))
    
    # Tag reference time to the creatinine dataframe
    # (when 'reftime' is missing it is computed via stmdb.getScrDF and the
    # pickle on disk is overwritten with the tagged version).
    scr_df = pd.read_pickle(fname_scr_df)
    if 'reftime' not in scr_df.columns:
        scr_df = stmdb.getScrDF(pre_scr_df=scr_df, ex_age=True, ex_los=True, 
                                ex_aki_adm=True, aki_adm_hr=12, enc_per_pat=True)
        scr_df.to_pickle(fname_scr_df)
    
    # Load I/O matrix
    io_df_aki = pd.read_pickle(fname_io_df_aki)
    io_df_con = pd.read_pickle(fname_io_df_con)
    
    # Query dataframes to calculate the composite predictors
    encounter_ids_aki = io_df_aki.encounter_id.unique()
    encounter_ids_con = io_df_con.encounter_id.unique()
    
    # The pre-loaded item dataframes are passed in, so queryDF4composites
    # starts from them instead of reading the pickles again.
    (hr_df_aki, nsbp_df_aki, map_df_aki, 
     fio2_df_aki, spo2_df_aki, pao2_df_aki) = queryDF4composites(encounter_ids_aki,
                                                                 hr_df=hr_df, nsbp_df=nsbp_df,
                                                                 map_df=map_df, fio2_df=fio2_df, 
                                                                 spo2_df=spo2_df, pao2_df=pao2_df)
    (hr_df_con, nsbp_df_con, map_df_con, 
     fio2_df_con, spo2_df_con, pao2_df_con) = queryDF4composites(encounter_ids_con,
                                                                 hr_df=hr_df, nsbp_df=nsbp_df, 
                                                                 map_df=map_df, fio2_df=fio2_df, 
                                                                 spo2_df=spo2_df, pao2_df=pao2_df)
    print('Queried the dataframes ...')
    
    # Filter dataframes with the observation window
    hr_df_filtered_aki = filtObservation(hr_df_aki, scr_df, tlag, twin)
    nsbp_df_filtered_aki = filtObservation(nsbp_df_aki, scr_df, tlag, twin)
    map_df_filtered_aki = filtObservation(map_df_aki, scr_df, tlag, twin)
    fio2_df_filtered_aki = filtObservation(fio2_df_aki, scr_df, tlag, twin)
    spo2_df_filtered_aki = filtObservation(spo2_df_aki, scr_df, tlag, twin)
    pao2_df_filtered_aki = filtObservation(pao2_df_aki, scr_df, tlag, twin)
    
    hr_df_filtered_con = filtObservation(hr_df_con, scr_df, tlag, twin)
    nsbp_df_filtered_con = filtObservation(nsbp_df_con, scr_df, tlag, twin)
    map_df_filtered_con = filtObservation(map_df_con, scr_df, tlag, twin)
    fio2_df_filtered_con = filtObservation(fio2_df_con, scr_df, tlag, twin)
    spo2_df_filtered_con = filtObservation(spo2_df_con, scr_df, tlag, twin)
    pao2_df_filtered_con = filtObservation(pao2_df_con, scr_df, tlag, twin)
    print('Required dataframes filtered by observation window ...')
    
    # Convert UOM to be consistent with ISM
    map_df_filtered_aki = convertMapUOM(map_df_filtered_aki)
    map_df_filtered_con = convertMapUOM(map_df_filtered_con)
    
    fio2_df_filtered_aki = convertFio2UOM(fio2_df_filtered_aki)
    fio2_df_filtered_con = convertFio2UOM(fio2_df_filtered_con)
    
    pao2_df_filtered_aki = convertPao2UOM(pao2_df_filtered_aki)
    pao2_df_filtered_con = convertPao2UOM(pao2_df_filtered_con)
    print('UOM converted ...')
    
    # Get Shock index (SI) dataframe
    si_df_filtered_aki = getSIDF(hr_df_filtered_aki, nsbp_df_filtered_aki)
    si_df_filtered_con = getSIDF(hr_df_filtered_con, nsbp_df_filtered_con)
    print('Got Shock Index ...')
    
    # Get Oxygenation Saturation Index (OSI) dataframe
    osi_df_filtered_aki = getOSIDF(spo2_df_filtered_aki, map_df_filtered_aki, 
                                   fio2_df_filtered_aki)
    osi_df_filtered_con = getOSIDF(spo2_df_filtered_con, map_df_filtered_con, 
                                   fio2_df_filtered_con)
    print('Got Oxygenation Saturation Index ...')
    
    # Get Oxygenation Index (OI) dataframe
    oi_df_filtered_aki = getOIDF(pao2_df_filtered_aki, map_df_filtered_aki,
                                 fio2_df_filtered_aki)
    oi_df_filtered_con = getOIDF(pao2_df_filtered_con, map_df_filtered_con,
                                 fio2_df_filtered_con)
    print('Got Oxygenation Index ...')
    
    # Add feature statistics to the I/O dataframe
    # (one '<feature>_<stat>' column per feature/statistic pair).
    feature_dfs_aki = {'si': si_df_filtered_aki, 'osi': osi_df_filtered_aki, 'oi': oi_df_filtered_aki}
    feature_dfs_con = {'si': si_df_filtered_con, 'osi': osi_df_filtered_con, 'oi': oi_df_filtered_con}
    stats = ['min', 'max', 'mean', 'last', 'median']
    
    io_df_full_aki = add2IO(io_df_aki, feature_dfs_aki, stats)
    io_df_full_con = add2IO(io_df_con, feature_dfs_con, stats)
    print('Added to the I/O dataframe')    
    
    # Save under 'io_stm2' with the same file names as the inputs in 'io_stm'.
    fname_io_full_aki2 = os.path.join(fileDir, 'io_stm2', 
                                      'stm_onset_io_tlag{:03d}_twin{:03d}_aki.pkl'.format(abs(tlag), twin))
    fname_io_full_con2 = os.path.join(fileDir, 'io_stm2', 
                                      'stm_onset_io_tlag{:03d}_twin{:03d}_con.pkl'.format(abs(tlag), twin))
    io_df_full_aki.to_pickle(fname_io_full_aki2)
    io_df_full_con.to_pickle(fname_io_full_con2)
    
    print('timelag: {}, timewindow: {} finished!! \n'.format(tlag, twin))
    

timelag: -6, timewindow: 6
Queried the dataframes ...
Required dataframes filtered by observation window ...
UOM converted ...
Got Shock Index ...
Got Oxygenation Saturation Index ...
Got Oxygenation Index ...
Added to the I/O dataframe
timelag: -6, timewindow: 6 finished!! 

timelag: -7, timewindow: 6
Queried the dataframes ...
Required dataframes filtered by observation window ...
UOM converted ...
Got Shock Index ...
Got Oxygenation Saturation Index ...
Got Oxygenation Index ...
Added to the I/O dataframe
timelag: -7, timewindow: 6 finished!! 

timelag: -8, timewindow: 6
Queried the dataframes ...
Required dataframes filtered by observation window ...
UOM converted ...
Got Shock Index ...
Got Oxygenation Saturation Index ...
Got Oxygenation Index ...
Added to the I/O dataframe
timelag: -8, timewindow: 6 finished!! 

timelag: -9, timewindow: 6
Queried the dataframes ...
Required dataframes filtered by observation window ...
UOM converted ...
Got Shock Index ...
Got Oxygenation Satura

### Tag reference time to the creatinine dataframe

### Load previous I/O matrix

In [8]:
# io_df_aki = pd.read_pickle(fname_io_df_aki)
# io_df_con = pd.read_pickle(fname_io_df_con)

### Query dataframes

In [9]:
# hr_df_filtered = filtObservation(hr_df, scr_df, timelag, timewin)
# nsbp_df_filtered = filtObservation(nsbp_df, scr_df, timelag, timewin)
# map_df_filtered = filtObservation(map_df, scr_df, timelag, timewin)
# fio2_df_filtered = filtObservation(fio2_df, scr_df, timelag, timewin)
# spo2_df_filtered = filtObservation(spo2_df, scr_df, timelag, timewin)
# pao2_df_filtered = filtObservation(pao2_df, scr_df, timelag, timewin)

In [10]:
# print(pao2_df_filtered.valueUOM.unique()[0])
# print(fio2_df_filtered.valueUOM.unique()[0])
# print(map_df_filtered.valueUOM.unique()[0])

In [11]:
# si_df_filtered

In [12]:
# osi_df_filtered.columns

In [13]:
# oi_df_filtered.columns

In [14]:
# oi_df_filtered.groupby('encounter_id')['value'].mean().reset_index().columns

In [15]:
# feature_dfs = {'si': si_df_filtered, 'osi': osi_df_filtered, 'oi': oi_df_filtered}
# stats = ['min', 'max', 'mean', 'last', 'median']
# feature_stats = [(feature, stat) for feature in feature_dfs for stat in stats]


In [16]:
# io_df_full_aki = add2IO(io_df_aki, feature_dfs, stats)
# io_df_full_con = add2IO(io_df_con, feature_dfs, stats)