In [None]:
import pandas as pd
# import modin.pandas as pd
import numpy as np
import pickle as pkl
from os import listdir, walk
from os.path import isfile, join
from dask.distributed import Client
import dask.dataframe as dd

import datetime

import json
import time

import matplotlib.pyplot as plt
pd.options.mode.chained_assignment = None

%matplotlib inline

In [None]:
pred_df = pd.read_hdf('/data/adarsh/fda_project_data/aam_lab_and_vital_features_with_spo2.h5')

In [None]:
pred_df.columns

In [None]:
cohort = pd.read_csv("/data/adarsh/fda_project_data/study_cohort_with_outcome_truncated_15days.csv")

In [None]:
cohort_event_times = pd.read_csv("/data/adarsh/fda_project_data/study_cohort_with_outcome.csv")
cohort_event_times = cohort_event_times[['enc_id', 'event_time']]

In [None]:
cohort = cohort.merge(cohort_event_times, on=['enc_id'], how='left')

In [None]:
cdm_s = pd.read_csv("/data/deidentified_trews_deployment_data/cdm/cdm_s.csv")

In [None]:
cdm_g = pd.read_csv("/data/adarsh/fda_project_data/cdm_g.csv")

## Extract demographic features

In [None]:
age_df = cdm_s.loc[cdm_s.enc_id.isin(pred_df.enc_id)].query("fid == 'age'")[['enc_id', 'value']]
age_df = age_df.rename(columns={'value':'age'})

In [None]:
gender_df = cdm_s.loc[cdm_s.enc_id.isin(pred_df.enc_id)].query("fid == 'gender'")[['enc_id', 'value']]
gender_df = gender_df.rename(columns={'value':'gender'})

In [None]:
pred_df = pred_df.merge(age_df, on=['enc_id'], how='left')

In [None]:
pred_df = pred_df.merge(gender_df, on=['enc_id'], how='left')

## Length-of-stay (LOS) feature

In [None]:
first_time_df = pred_df.groupby('enc_id', as_index=False).nth(0)[['enc_id', 'obs_time']].rename(columns={'obs_time':'first_time'})
pred_df = pred_df.merge(first_time_df, on=['enc_id'], how='left')

In [None]:
# transpired length of stay in hours
pred_df['first_time'] = ((pred_df.obs_time.astype('int64') // 10**9) - (pred_df['first_time'].astype('int64') // 10**9)) / 60 / 60

In [None]:
pred_df = pred_df.rename(columns={'first_time':'los'})

In [None]:
# Add hospital feature
pred_df = pred_df.merge(cohort[['enc_id', 'hospital']], on='enc_id', how='left')

## Add labels to this dataset

In [None]:
pred_df = pred_df.merge(cohort[['enc_id', 'outcome', 'event_time']], on='enc_id', how='left')

In [None]:
pred_df['event_time'] = pd.to_datetime(pred_df.event_time)

In [None]:
# label is 1 if outcome is non-zero and event time is <= 12 hours from obs time

In [None]:
pred_df['label'] = (pred_df['event_time'] - pred_df['obs_time'] <= datetime.timedelta(hours = 12)) & (pred_df['outcome'] != 0)

In [None]:
pred_df['label'] = pred_df.label.astype(np.int)

In [None]:
pred_df = pred_df.drop(columns=['event_time', 'outcome'])

## Admit type feature

In [None]:
# TODO: add surgical categories, for now will just use emergency/not emergency

In [None]:
ed_admit_types = ['Trauma: Emergency Room Visit in which the Trauma Team is Activated.', 
                  'Urgent: The patient required immediate attention for the care and treatment of a physical disorder.', 
                  'Emergency: The patient required immediate medical intervention as a result of severe, life threatening, or potentially disabling conditions.']

In [None]:
admit_df = cdm_s.loc[cdm_s.enc_id.isin(pred_df.enc_id)].query('fid == "admit_type"')

In [None]:
admit_df['ed_admit'] = admit_df.value.isin(ed_admit_types).astype(np.int)

In [None]:
admit_df = admit_df[['enc_id', 'ed_admit']]

In [None]:
pred_df = pred_df.merge(admit_df, on='enc_id', how='left')

In [None]:
pred_df.to_hdf('/data/adarsh/fda_project_data/raw_complete_dataset.h5', key='s', mode='w')

# Imputation and computing AAM model features

In [None]:
vital_vars = ['nbp_dias', 'nbp_sys', 'heart_rate', 'spo2', 'resp_rate', 'temperature', 'gcs']
lab_vars = ['anion_gap', 'bicarbonate', 'glucose', 'hematocrit', 'lactate', 'bun', 'creatinine', 'sodium',
            'troponin', 'wbc']

In [None]:
aam_df.head()

In [None]:
pred_df.columns

In [None]:
# impute vitals and labs
suffixes = ['_latest', '_high', '_low', '_worst']
for var in vital_vars + lab_vars:
    query_string = var+"_popmean"
    default_value = cdm_g.query('fid == @query_string')['value'].values[0]
    if var in vital_vars:
        for suffix in suffixes:
            pred_df[var + suffix] = pred_df[var+suffix].fillna(default_value)
    else:
        pred_df[var] = pred_df[var].fillna(default_value)

In [None]:
def sigmoid(x):
    """Logistic function 1 / (1 + exp(-x)), applied element-wise by numpy."""
    denominator = 1 + np.exp(-x)
    return 1. / denominator

In [None]:
def invert_feature_transforms(frame):
    """Undo the AAM feature transforms and return the original-scale columns.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame holding the transformed columns read below (`bicarbonate_sq`,
        `hematocrit_cu`, `bun_log`, `creatinine_log_sq`, ...).

    Returns
    -------
    pandas.DataFrame
        A copy of `frame` with one added (or overwritten) column per inverted
        feature. `frame` itself is not modified.

    Notes
    -----
    The squared-log features (`*_log_sq`) lose the sign of the logarithm, so
    their inverse is only exact when the untransformed value was >= 1.
    """
    nf = frame.copy()
    nf['bicarbonate'] = np.sqrt(frame.bicarbonate_sq)
    # Cube roots use np.cbrt: `x ** (1./3)` evaluates to NaN for negative x.
    nf['hematocrit'] = np.cbrt(frame.hematocrit_cu)
    nf['bun'] = np.exp(frame.bun_log)
    nf['creatinine'] = np.exp(np.sqrt(frame.creatinine_log_sq))
    nf['nbp_sys_latest'] = np.cbrt(frame.nbp_sys_latest_cu)
    nf['heart_rate_latest'] = np.cbrt(frame.heart_rate_latest_cu)
    nf['heart_rate_instability'] = np.exp(np.sqrt(frame.heart_rate_instability_log_sq))
    nf['spo2_instability'] = np.exp(frame.spo2_instability_log)
    # The forward transform takes the logit of spo2 / 100, so the sigmoid
    # output is scaled back to percent.
    nf['spo2_latest'] = sigmoid(np.cbrt(frame.spo2_latest_logit_cu)) * 100.
    nf['spo2_worst'] = sigmoid(frame.spo2_worst_logit) * 100.
    nf['resp_rate_instability'] = np.exp(frame.resp_rate_instability_log)
    nf['temperature_instability'] = np.exp(np.sqrt(frame.temperature_instability_log_sq))
    nf['temperature_latest'] = np.sqrt(frame.temperature_latest_sq)
    nf['resp_rate_latest'] = np.cbrt(frame.resp_rate_latest_cu)
    # los was transformed as log(los + 1)
    nf['los'] = np.exp(frame.los_log) - 1
    nf['age'] = np.exp(frame.age_log)
    return nf

In [None]:
aam_df = pred_df[['enc_id', 'obs_time', 'label']].copy()

In [None]:
aam_df['anion_gap'] = pred_df.anion_gap

In [None]:
aam_df['bicarbonate_sq'] = pred_df.bicarbonate**2

In [None]:
aam_df['glucose'] = pred_df.glucose

In [None]:
aam_df['hematocrit_cu'] = pred_df.hematocrit**3

In [None]:
aam_df['lactate'] = pred_df.lactate

In [None]:
aam_df['bun_log'] = np.log(pred_df.bun)

In [None]:
aam_df['creatinine_log_sq'] = np.log(pred_df.creatinine)**2

In [None]:
aam_df['sodium'] = pred_df.sodium

In [None]:
aam_df['troponin'] = pred_df.troponin

In [None]:
aam_df['troponin_missing'] = pred_df.troponin_missing

In [None]:
aam_df['wbc'] = pred_df.wbc

In [None]:
aam_df['nbp_dias_latest_sq'] = pred_df.nbp_dias_latest**2

In [None]:
aam_df['nbp_sys_instability'] = pred_df.nbp_sys_high - pred_df.nbp_sys_low

In [None]:
aam_df['nbp_sys_latest_cu'] = pred_df.nbp_sys_latest**3

In [None]:
aam_df['heart_rate_latest_cu'] = pred_df.heart_rate_latest**3

In [None]:
aam_df['heart_rate_instability_log_sq'] = np.log((pred_df.heart_rate_high - pred_df.heart_rate_low)
                                                 .replace({0:1}))**2

In [None]:
aam_df['spo2_instability_log'] = np.log((pred_df.spo2_high - pred_df.spo2_low).replace({0:1}))

In [None]:
def logit(x):
    """Log-odds of `x`, i.e. log(x / (1 - x)), applied element-wise by numpy."""
    odds = x / (1. - x)
    return np.log(odds)

In [None]:
aam_df['spo2_latest_logit_cu'] = logit(pred_df.spo2_latest.replace({0:1, 100:99})/100.)**3

In [None]:
aam_df['spo2_worst_logit'] = logit(pred_df.spo2_worst.replace({0:1, 100:99})/100.)

In [None]:
aam_df['resp_rate_instability_log'] = np.log((pred_df.resp_rate_high - pred_df.resp_rate_low).replace({0:1}))

In [None]:
aam_df['temperature_instability_log_sq'] = np.log((pred_df.temperature_high - pred_df.temperature_low)
                                                  .replace({0:1}))**2

In [None]:
aam_df['temperature_latest_sq'] = pred_df.temperature_latest**2

In [None]:
aam_df['resp_rate_latest_cu'] = pred_df.resp_rate_latest**3

In [None]:
aam_df['resp_rate_worst'] = pred_df.resp_rate_worst 

In [None]:
aam_df['gcs_latest'] = pred_df.gcs_latest

In [None]:
aam_df['anion_gap_bicarbonate_ratio'] = (pred_df.anion_gap / pred_df.bicarbonate)*1000

In [None]:
aam_df['shock_index'] = pred_df.heart_rate_latest / pred_df.nbp_sys_latest

In [None]:
aam_df['los_log'] = np.log(pred_df.los + 1)

In [None]:
aam_df['age_log'] = np.log(pred_df.age.replace({'>= 90':90}).astype(np.double))

In [None]:
aam_df['sex'] = pred_df['gender'].astype(np.int)

In [None]:
season_dummies = pd.get_dummies(pred_df['season'])

In [None]:
aam_df['season_1'] = season_dummies[1]
aam_df['season_2'] = season_dummies[2]
aam_df['season_3'] = season_dummies[3]

In [None]:
time_of_day_dummies = pd.get_dummies(pred_df['time_of_day'])

In [None]:
aam_df['time_of_day_1'] = time_of_day_dummies[1]
aam_df['time_of_day_2'] = time_of_day_dummies[2]
aam_df['time_of_day_3'] = time_of_day_dummies[3]

In [None]:
aam_df['ed_admit'] = pred_df.ed_admit

In [None]:
hospital_dummies = pd.get_dummies(pred_df.hospital)

In [None]:
hospital_dummies

In [None]:
aam_df['hospital_bmc'] = hospital_dummies['BMC']
aam_df['hospital_hcgh'] = hospital_dummies['HCGH']
aam_df['hospital_jhh'] = hospital_dummies['JHH']
aam_df['hospital_sh'] = hospital_dummies['SH']
aam_df['hospital_smh'] = hospital_dummies['SMH']

In [None]:
aam_df.to_hdf('/data/adarsh/fda_project_data/aam_model_dataset.h5', key='s', mode='w')

In [None]:
# aam_df.to_csv("/data/adarsh/fda_project_data/aam_model_dataset.csv", index=False)

In [None]:
cdm_g

In [None]:
pred_df.query('label==1').groupby(['enc_id'], as_index=False).nth(0).groupby('hospital', as_index=False).count()