In [None]:
import pandas as pd
# import modin.pandas as pd
import numpy as np
import pickle as pkl
from os import listdir, walk
from os.path import isfile, join
from dask.distributed import Client
import dask.dataframe as dd

import datetime

import json
import time

import matplotlib.pyplot as plt
pd.options.mode.chained_assignment = None

%matplotlib inline

In [None]:
care_defs = pd.read_csv(filepath+'/care_unit_defs.csv')
# SMH 3A L&D is a maternity unit.
care_defs.at[care_defs.loc[care_defs.care_unit == 'SMH 3A L&D'].index[0], 'maternity'] = True

In [None]:
pat_enc = pd.read_csv(filepath+'/pat_enc.csv')

In [None]:
df = dd.read_csv('/data/deidentified_trews_deployment_data/cdm/cdm_t.csv')

In [None]:
cohort = pd.read_csv("/data/adarsh/fda_project_data/study_cohort.csv")

In [None]:
cohort['admit_time'] = pd.to_datetime(cohort.admit_time)

In [None]:
cohort_enc_ids = cohort.enc_id.unique()

In [None]:
# use dask default scheduling
start = time.time()

# keep only the cdm_t rows whose enc_id belongs to the study cohort, then
# materialise the filtered result as an in-memory pandas DataFrame
in_cohort = df['enc_id'].isin(cohort_enc_ids)
cdm_t = df[in_cohort].compute()

end = time.time()
# elapsed wall-clock time in minutes
print((end - start) / 60.)

In [None]:
# Outcome definition
# a) transfer to the ICU, where patient stayed in the ICU for >=6 hours or died in the ICU, 1
# b) death outside the ICU, 2
# c) transfer to the ICU where patient stayed for <6 hours and patient went to operating room afterwards, 3

# Strategy: compute each outcome separately
# Combine results and pick first outcome for each patient

In [None]:
# Things to compute: death and time of death
# time spent in ICU

In [None]:
# compute table of patients who died and their time of death
discharge_df = cdm_t.query("fid == 'discharge'")

In [None]:
# convert json value to columns
json_df = pd.json_normalize(discharge_df.value.apply(json.loads))

In [None]:
discharge_df['department'] = json_df.department.values
discharge_df['disposition'] = json_df.disposition.values

In [None]:
discharge_df = discharge_df[['dataset_id', 'enc_id', 'tsp', 'department', 'disposition']]

In [None]:
mortality_df = discharge_df.loc[discharge_df.disposition.str.contains('Expired')]
mortality_df['tsp'] = pd.to_datetime(mortality_df.tsp)
mortality_df = mortality_df.rename(columns = {'tsp': 'death_time'}).drop(['disposition', 'dataset_id'], axis=1)

In [None]:
# was the unit the patient died in the ICU?
mortality_df = mortality_df.merge(care_defs.rename(columns={'care_unit':'department'})[['department', 'level_of_care']])

In [None]:
mortality_df['died_in_icu'] = mortality_df.level_of_care == 'icu'
mortality_df = mortality_df.drop('level_of_care', axis=1)

In [None]:
mortality_df = mortality_df.rename(columns={'death_time':'event_time'})

In [None]:
outcome_df = mortality_df.query('died_in_icu == False')[['enc_id', 'event_time']]

In [None]:
# death outside the icu; outcome label 2
outcome_df['outcome'] = 2

In [None]:
# death outside the icu; outcome label 1
icu_death_df = mortality_df.query('died_in_icu == True')[['enc_id', 'event_time']]
icu_death_df['outcome'] = 1

In [None]:
outcome_df = pd.concat([outcome_df, icu_death_df])

## Now computing outcomes related to length of stay in ICU

In [None]:
# now we need to find out how long a patient's ICU stay was
care_unit_rows = cdm_t.query("fid == 'care_unit'")

In [None]:
care_unit_rows = (care_unit_rows
                  .rename(columns={'value':'care_unit'})
                  .merge(care_defs[['hospital', 'care_unit', 'level_of_care']], on='care_unit', how='left'))

In [None]:
care_unit_rows['tsp'] = pd.to_datetime(care_unit_rows.tsp)

In [None]:
care_unit_rows = care_unit_rows.sort_values(['enc_id', 'tsp'])

In [None]:
# all the times patients were in the ICU
icu_times = care_unit_rows.query("level_of_care == 'icu'")

In [None]:
icu_times['icu_start_tsp'] = icu_times['tsp']

In [None]:
max_icu_times = np.max(icu_times.groupby('enc_id', as_index=False).size()['size'])

In [None]:
non_icu_care_unit_rows = care_unit_rows.query("level_of_care != 'icu'")

In [None]:
# for each patient's first icu stay: we will make the dataframe containing their information
# get first icu stay information
nth_icu_stay = icu_times.groupby('enc_id', as_index=False).nth(0)[['enc_id', 'icu_start_tsp']]

# get first unit after the icu stay
next_unit_df = (non_icu_care_unit_rows
                .merge(nth_icu_stay, on='enc_id', how='left')
                .query('tsp > icu_start_tsp')
                .groupby('enc_id', as_index=False).nth(0)
                .rename(columns={'tsp':'next_unit_tsp', 
                                 'care_unit':'next_care_unit', 
                                 'level_of_care':'next_level_of_care'})
               [['enc_id', 'next_unit_tsp', 'next_care_unit', 'next_level_of_care', 'icu_start_tsp']])
icu_los_df = icu_times.merge(next_unit_df, on=['enc_id', 'icu_start_tsp'], how='inner')

In [None]:
# now, for the remaining stays
# do this for each icu stay
for icu_count in range(1, max_icu_times):

    nth_icu_stay = icu_times.groupby('enc_id', as_index=False).nth(icu_count)[['enc_id', 'icu_start_tsp']]

    # get first unit after the icu stay
    next_unit_df = (non_icu_care_unit_rows
                    .merge(nth_icu_stay, on='enc_id', how='left')
                    .query('tsp > icu_start_tsp')
                    .groupby('enc_id', as_index=False).nth(0)
                    .rename(columns={'tsp':'next_unit_tsp', 
                                     'care_unit':'next_care_unit', 
                                     'level_of_care':'next_level_of_care'})
                   [['enc_id', 'next_unit_tsp', 'next_care_unit', 'next_level_of_care', 'icu_start_tsp']])
    icu_los_df = pd.concat([icu_los_df, icu_times.merge(next_unit_df, on=['enc_id', 'icu_start_tsp'], how='inner')])

In [None]:
# compute duration of each icu stay
icu_los_df['icu_los'] = icu_los_df['next_unit_tsp'] - icu_los_df['icu_start_tsp']
icu_los_df = icu_los_df.rename(columns={'tsp':'event_time'})

In [None]:
# icu stays of at least 6 hours are outcome 1
long_icu_stay = icu_los_df.loc[icu_los_df.icu_los >= datetime.timedelta(hours=6)]
long_icu_stay['outcome'] = 1
outcome_df = pd.concat([outcome_df, long_icu_stay[['enc_id', 'event_time', 'outcome']]])

In [None]:
# icu stays of less than 6 hours and next unit was surgery
short_icu_stay = (icu_los_df
                  .loc[icu_los_df.icu_los < datetime.timedelta(hours=6)]
                  .query("next_level_of_care == 'surgery'"))
short_icu_stay['outcome'] = 3
outcome_df = pd.concat([outcome_df, short_icu_stay[['enc_id', 'event_time', 'outcome']]])

In [None]:
# get first instance of outcome for each patient
outcome_df = outcome_df.sort_values(['enc_id', 'event_time']).groupby('enc_id', as_index=False).nth(0)

In [None]:
# add outcome information to cohort df
cohort = cohort.merge(outcome_df, on='enc_id', how='left')

In [None]:
# everyone else did not experience the outcome
cohort['outcome'] = cohort['outcome'].fillna(0)

In [None]:
discharge_times = (care_unit_rows
                   .groupby('enc_id', as_index=False)
                   .nth(-1)
                   .rename(columns={'tsp':'discharge_time'})
                   [['enc_id', 'discharge_time']])

In [None]:
cohort = cohort.merge(discharge_times)

In [None]:
cohort

In [None]:
# if eventtime is null then use the discharge time, otherwise use the existing event time
cohort['event_time'] = np.where(np.isnan(cohort.event_time.values), 
                                cohort.discharge_time, 
                                cohort.event_time)

In [None]:
cohort.to_csv("/data/adarsh/fda_project_data/study_cohort_with_outcome.csv", index=False)

In [None]:
### Dataset with max LOS of 15 days
cohort = pd.read_csv("/data/adarsh/fda_project_data/study_cohort_with_outcome.csv")

In [None]:
cohort['admit_time'] = pd.to_datetime(cohort['admit_time'])
cohort['event_time'] = pd.to_datetime(cohort['event_time'])
cohort['discharge_time'] = pd.to_datetime(cohort['discharge_time'])
# reduce LOS to 15 days
cohort['los'] = cohort['event_time'] - cohort['admit_time']

# max time we would make a prediction
cohort['last_obs_time'] = cohort['admit_time'] + datetime.timedelta(days=15)

# if patient had outcome, but it is more than 15 days + 12 hours from admission, then we won't predict it
# occurring in the next 12 hours so outcome is 0
cohort.loc[(cohort.outcome != 0) 
           & (cohort.event_time > (cohort.last_obs_time + datetime.timedelta(hours=12))), ['outcome']] = 0

In [None]:
# last observation time is:
# min(event time, admit + 15 days) for people with outcome not 0

# min(admit + 15 days, discharge time) for people with outcome 0
cohort['end_time'] = np.where(cohort.outcome == 0, np.minimum(cohort.discharge_time, cohort.last_obs_time), cohort.last_obs_time)

# if you have outcome, then last observation time is min(event_time, admit + 15 days)
cohort['end_time'] = np.where(cohort.outcome != 0, np.minimum(cohort.event_time, cohort.last_obs_time), cohort.end_time)

In [None]:
cohort = cohort.drop(columns=['los', 'discharge_time', 'last_obs_time'])

In [None]:
cohort

In [None]:
# patient cohort where LOS (and labels) updated to be less than 15 days
cohort.to_csv("/data/adarsh/fda_project_data/study_cohort_with_outcome_truncated_15days.csv", index=False)