In [None]:
import pandas as pd
# import modin.pandas as pd
import numpy as np
import pickle as pkl
from os import listdir, walk
from os.path import isfile, join
from dask.distributed import Client
import dask.dataframe as dd

import datetime

import json
import time

import matplotlib.pyplot as plt
# Silence pandas' SettingWithCopyWarning for the whole notebook; later cells
# add columns to frames that were produced by boolean filtering.
pd.options.mode.chained_assignment = None

%matplotlib inline

In [None]:
# Load the study cohort table. Later cells read its enc_id, admit_time,
# end_time and hospital columns.
cohort = pd.read_csv("/data/adarsh/fda_project_data/study_cohort_with_outcome_truncated_15days.csv")

In [None]:
# Show the cohort frame as the cell output for a quick visual check.
cohort

In [None]:
# Open cdm_t.csv as a dask dataframe; rows are only pulled into pandas by the
# .compute() call in the filtering cell below.
df = dd.read_csv('/data/deidentified_trews_deployment_data/cdm/cdm_t.csv')

In [None]:
# Distinct encounter ids that make up the cohort.
cohort_enc_ids = pd.unique(cohort['enc_id'])

In [None]:
# Runs on dask's default scheduler; the wall-clock time is reported in minutes.
start = time.time()

# keep only the cdm_t rows whose enc_id belongs to our patient cohort
in_cohort = df.enc_id.isin(cohort_enc_ids)
cdm_t = df[in_cohort].compute()

end = time.time()
elapsed_minutes = (end - start) / 60.
print(elapsed_minutes)

In [None]:
# List the distinct fid values present for the cohort, in sorted order.
sorted(cdm_t.fid.unique())

### Extracting lab test data

In [None]:
# Feature families to extract: lab tests, vital signs, composite indices,
# and administrative info.
# Lab tests come first: the most recent value within the last 72 hours.
lab_vars = [
    'anion_gap',
    'bicarbonate',
    'glucose',
    'hematocrit',
    'lactate',
    'bun',
    'creatinine',
    'sodium',
    'troponin',
    'wbc',
]

# Keep only the cdm_t rows whose fid is one of the lab tests in lab_vars.
lab_df = cdm_t.loc[cdm_t.fid.isin(lab_vars)]

In [None]:
# Parse measurement timestamps into datetimes so later cells can compare them
# with end_time / obs_time and subtract them to get elapsed time.
lab_df['tsp'] = pd.to_datetime(lab_df.tsp)

In [None]:
# Attach each encounter's admit_time, end_time and hospital to its lab rows.
# how='left' keeps every lab row even if its enc_id had no cohort match.
lab_df = lab_df.merge(cohort[['enc_id', 'admit_time', 'end_time', 'hospital']], on='enc_id', how='left')

In [None]:
# The cohort time columns were merged in unparsed; convert both to datetimes.
for time_col in ('admit_time', 'end_time'):
    lab_df[time_col] = pd.to_datetime(lab_df[time_col])

# keep only lab measurements taken at or before the last observation time
lab_df = lab_df[lab_df['tsp'] <= lab_df['end_time']]
lab_df

In [None]:
# Create hourly prediction times: one timestamp per hour starting at
# admit_time and stopping at or before end_time, stored per cohort row as a
# DatetimeIndex in the new 'obs_time' column.
# The step is given as a Timedelta rather than the "1H" string alias because
# the upper-case "H" alias is deprecated in recent pandas releases; a Timedelta
# yields the same hourly grid on every pandas version.
cohort['obs_time'] = cohort.apply(
    lambda x: pd.date_range(x['admit_time'], x['end_time'], freq=pd.Timedelta(hours=1)),
    axis=1,
)

In [None]:
def convert_datetime_index_to_rows(row):
    """Expand one cohort row into a frame with one line per prediction time.

    Parameters
    ----------
    row : row-like object (e.g. the Series passed by ``DataFrame.apply(axis=1)``)
        Must expose ``row.obs_time`` (a DatetimeIndex) and ``row['enc_id']``.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``enc_id`` (the row's enc_id repeated on every line) and
        ``obs_time`` (the timestamps), on a fresh 0..n-1 index.
    """
    return pd.DataFrame({'enc_id': row['enc_id'], 'obs_time': row.obs_time})

# Turn every cohort row into its own (enc_id, obs_time) frame, then stack the
# per-encounter frames into one long table of prediction times.
per_encounter_frames = cohort.apply(convert_datetime_index_to_rows, axis=1)
pred_df = pd.concat(list(per_encounter_frames))

In [None]:
# Order the prediction rows by encounter, then chronologically.
pred_df = pred_df.sort_values(['enc_id', 'obs_time'])

In [None]:
# Extract lab measurements: for every prediction time keep the most recent
# value of each lab test recorded in the preceding 72 hours. Each lab test
# becomes one new column of pred_df (NaN where no value qualifies).

lab_lookback = datetime.timedelta(hours=72)
key_cols = ['enc_id', 'obs_time']

# Only the join keys are needed on the prediction side. Joining the whole
# pred_df would drag every previously extracted lab column through the
# many-to-many merge below for no benefit.
obs_times = pred_df[key_cols]

for var in lab_vars:
    # get the lab value rows, sort them, and combine them with the prediction times
    temp_df = (lab_df
               .query('fid == @var')
               .sort_values(['enc_id', 'tsp'])
               .merge(obs_times, on='enc_id', how='inner'))
    # only care about values before prediction time
    temp_df = temp_df.query('tsp <= obs_time')
    temp_df = temp_df.assign(elapsed_time=temp_df['obs_time'] - temp_df['tsp'])
    # Rows are ordered by tsp within each (enc_id, obs_time) group, so the
    # last row of a group is the most recent measurement in the window.
    temp_values = (temp_df
                   .loc[temp_df['elapsed_time'] <= lab_lookback]
                   .groupby(key_cols)
                   .tail(1))
    temp_values = temp_values[key_cols + ['value']].rename(columns={'value': var})

    # merge into the pred df
    pred_df = pred_df.merge(temp_values, on=key_cols, how="left")

    print("Completed extraction of {}".format(var))

In [None]:
# missingness per lab test, printed as the fraction (0-1) of prediction rows with no value
for var in lab_vars:
    missing_fraction = pred_df[var].isnull().astype(np.double).mean()
    print(var, missing_fraction)

In [None]:
# The troponin missingness indicator is the one that is required; indicators
# for the other lab tests are added as well.
# An indicator is 1.0 when the lab test is missing and 0.0 otherwise.
for var in lab_vars:
    is_missing = pred_df[var].isna()
    pred_df["{}_missing".format(var)] = is_missing.astype(np.double)

In [None]:
# save our work so far
# pred_df.to_csv("/data/adarsh/fda_project_data/lab_features.csv", index=False)

In [None]:
# Drop the name lab_df so the lab measurement frame can be garbage collected;
# the cells that follow do not reference it.
del lab_df

### Extracting vital signs

In [None]:
# pred_df = pred_df[['enc_id', 'obs_time', 'anion_gap', 'bicarbonate', 'glucose',
#        'hematocrit', 'lactate', 'bun', 'creatinine', 'sodium', 'troponin',
#        'wbc', 'anion_gap_missing', 'bicarbonate_missing', 'glucose_missing',
#        'hematocrit_missing', 'lactate_missing', 'bun_missing',
#        'creatinine_missing', 'sodium_missing', 'troponin_missing',
#        'wbc_missing']]

In [None]:
vital_vars = [
    'nbp_dias',
    'nbp_sys',
    'heart_rate',
    'spo2',
    'resp_rate',
    'temperature',
    'gcs',
]
# Not included: (anion gap / serum bicarbonate) x 1000, which is computed from the extracted lab tests.
# Not included: shock index (latest heart rate / latest systolic bp), which is computed from the extracted vitals.

In [None]:
# Keep only the cdm_t rows whose fid is one of the vital signs in vital_vars.
vital_df = cdm_t.loc[cdm_t.fid.isin(vital_vars)]

In [None]:
# Parse measurement timestamps into datetimes, then attach each encounter's
# admit_time, end_time and hospital to its vital-sign rows. how='left' keeps
# every vital row even if its enc_id had no cohort match.
vital_df['tsp'] = pd.to_datetime(vital_df.tsp)
vital_df = vital_df.merge(cohort[['enc_id', 'admit_time', 'end_time', 'hospital']], on='enc_id', how='left')

In [None]:
# The cohort time columns were merged in unparsed; convert both to datetimes.
for time_col in ('admit_time', 'end_time'):
    vital_df[time_col] = pd.to_datetime(vital_df[time_col])

# keep only vital measurements taken at or before the last observation time
vital_df = vital_df[vital_df['tsp'] <= vital_df['end_time']]

In [None]:
# Extract vital measurements.
#
# For every (enc_id, obs_time) prediction row this adds four columns per vital
# sign, all computed over the 24 hours preceding obs_time:
#   <var>_high   - highest value
#   <var>_low    - lowest value
#   <var>_worst  - whichever of high/low lies farther from the reference value
#                  (ties go to the low value)
#   <var>_latest - most recent value
# Prediction rows with no qualifying measurement get NaN in all four columns.

vital_lookback = datetime.timedelta(hours=24)
key_cols = ['enc_id', 'obs_time']

# Per-vital (lowest accepted value, highest accepted value, reference value).
# None means "no bound on that side". Values are filtered according to the AAM
# paper. A vital sign missing from this table fails immediately with a
# KeyError instead of silently running with no reference value.
vital_rules = {
    'nbp_sys':     (None, 300, 100),
    'nbp_dias':    (None, None, 70),
    'heart_rate':  (None, 300, 75),
    'resp_rate':   (None, 80, 11),
    'spo2':        (50, None, 100),
    'temperature': (85, 108, 98),
    # best possible gcs is 15, so interested in lowest value (i.e. farthest from 15)
    'gcs':         (None, None, 15),
}

for var in vital_vars:
    lower_bound, upper_bound, reference_val = vital_rules[var]

    # get the vital value rows, sort them, and combine them with the prediction times
    temp_df = (vital_df
               .query('fid == @var')
               .sort_values(['enc_id', 'tsp'])
               .merge(pred_df[key_cols], on='enc_id', how='inner'))

    # only care about values before prediction time
    temp_df = temp_df.query('tsp <= obs_time')
    temp_df = temp_df.assign(elapsed_time=temp_df['obs_time'] - temp_df['tsp'])

    # only care about values in the last 24 hours
    temp_df = temp_df.loc[temp_df['elapsed_time'] <= vital_lookback]
    temp_df = temp_df.assign(value=temp_df['value'].astype(np.double))

    # drop implausible readings
    if lower_bound is not None:
        temp_df = temp_df.loc[temp_df['value'] >= lower_bound]
    if upper_bound is not None:
        temp_df = temp_df.loc[temp_df['value'] <= upper_bound]

    # highest and lowest value in the preceding 24 hours
    value_by_obs = temp_df.groupby(key_cols, as_index=False)['value']
    highest_values = value_by_obs.max().rename(columns={'value': var + '_high'})
    lowest_values = value_by_obs.min().rename(columns={'value': var + '_low'})

    # Most recent value in the preceding 24 hours. Rows are ordered by tsp
    # within each (enc_id, obs_time) group, so the last row is the latest
    # measurement. The key columns are kept so the result can be joined on
    # them rather than relying on row position.
    latest_values = (temp_df
                     .groupby(key_cols)
                     .tail(1)[key_cols + ['value']]
                     .rename(columns={'value': var + '_latest'}))

    vital_features = (highest_values
                      .merge(lowest_values, on=key_cols, how='left')
                      .merge(latest_values, on=key_cols, how='left'))

    # get the most "deranged" value in the preceding 24 hours:
    # pick the high value only when it is strictly farther from the reference
    high_dev = (vital_features[var + '_high'] - reference_val).abs()
    low_dev = (vital_features[var + '_low'] - reference_val).abs()
    vital_features[var + '_worst'] = np.where(high_dev > low_dev,
                                              vital_features[var + '_high'],
                                              vital_features[var + '_low'])

    vital_features = vital_features[key_cols + [var + '_high', var + '_low',
                                                var + '_worst', var + '_latest']]

    # merge into the pred df
    pred_df = pred_df.merge(vital_features, on=key_cols, how="left")

    print("Completed extraction of {}".format(var))

In [None]:
# List the feature columns accumulated so far.
pred_df.columns

In [None]:
# Preview the first rows of the feature table.
pred_df.head()

In [None]:
# pred_df.to_hdf('/data/adarsh/fda_project_data/lab_and_features.h5', key='s')

### Computing the other features

In [None]:
# season, time of day

In [None]:
# Calendar month number of every prediction time, in pred_df row order.
extracted_months = pd.DatetimeIndex(pred_df.obs_time).month

In [None]:
# Month number -> season code:
#   1 = Nov-Feb, 2 = Mar-Jun, 3 = Jul-Oct
month_to_season_map = {
    month: season
    for season, months in (
        (1, (1, 2, 11, 12)),
        (2, (3, 4, 5, 6)),
        (3, (7, 8, 9, 10)),
    )
    for month in months
}

# Translate each prediction time's month into its season code and store it as
# a new column.
pred_df['season'] = extracted_months.map(month_to_season_map)

In [None]:
# time of day

In [None]:
# Hour of day (0-23) -> time-of-day code:
#   1 for hours 1-6, 2 for hours 7-11, 3 for everything else (0 and 12-23)
hour_to_timeframe_map = {
    hour: 1 if 1 <= hour <= 6 else 2 if 7 <= hour <= 11 else 3
    for hour in range(24)
}

# Translate each prediction time's hour into its time-of-day code and store it
# as a new column.
pred_df['time_of_day'] = pd.DatetimeIndex(pred_df.obs_time).hour.map(hour_to_timeframe_map)

In [None]:
# Cast every lab column to float64.
# NOTE(review): the lab values are presumably still strings here because they
# come straight from cdm_t's shared 'value' column - confirm.
for var in lab_vars:
    pred_df[var] = pred_df[var].astype(np.double)

In [None]:
# Show the finished feature table as the cell output before saving it.
pred_df

In [None]:
# Persist the lab + vital feature table to an HDF5 file under key 's'.
pred_df.to_hdf('/data/adarsh/fda_project_data/aam_lab_and_vital_features_with_spo2.h5', key='s')

In [None]:
# Show the feature table again after it has been written to disk.
pred_df

In [None]:
# Shape of the array of distinct enc_id values, i.e. how many encounters the
# feature table covers.
pred_df.enc_id.unique().shape