In [None]:
import pandas as pd
# import modin.pandas as pd
import numpy as np
import pickle as pkl
from os import listdir, walk
from os.path import isfile, join
from dask.distributed import Client
import dask.dataframe as dd

import time

import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
# Load the care-unit definitions table; its care_unit and maternity columns
# are used below.
care_defs = pd.read_csv('/data/deidentified_trews_deployment_data/cdm/care_unit_defs.csv')

# SMH 3A L&D is a maternity unit, but the file does not mark it as one.
# A boolean mask updates every row for that unit and is a no-op when the unit
# is absent; the earlier `.index[0]` lookup changed only the first match and
# raised IndexError when there was none.
care_defs.loc[care_defs.care_unit == 'SMH 3A L&D', 'maternity'] = True

In [None]:
# Encounter/patient lookup table; its enc_id and pat_id columns are used later
# to attach a patient id to each care-unit row.
pat_enc = pd.read_csv(
    '/data/deidentified_trews_deployment_data/cdm/pat_enc.csv',
)

In [None]:
# Shape of the array of distinct pat_id values, i.e. (number of patients,).
pat_enc['pat_id'].unique().shape

In [None]:
# Display the encounter/patient lookup table for a visual check.
pat_enc

In [None]:
# Open cdm_t.csv as a dask dataframe (dd) rather than a pandas one.
df = dd.read_csv(
    '/data/deidentified_trews_deployment_data/cdm/cdm_t.csv',
)

In [None]:
# Inclusion criteria: age > 18,
# not admitted to hospital for childbirth,
# and admitted to hospital (so have a department after the ED).
# First, find childbirth enc_ids and exclude them.
# Second, find admits to non-ED departments. Get the admit time as well.
# Third, find ED patients who went to non-ED, non-discharge departments. Get the admit time.
# Finally, combine the step 2 and step 3 dataframes.
# NOTE(review): no age > 18 filter appears in the cells shown here; confirm where it is applied.

In [None]:
# cdm_s table; its fid, value and enc_id columns are used to look up each
# encounter's admit_type.
cdm_s = pd.read_csv(
    '/data/deidentified_trews_deployment_data/cdm/cdm_s.csv',
)

In [None]:
# admit_type values that mark an encounter as a childbirth admission.
childbirth_admit_types = ['Delivery - Emergent: L&D Patients treated in the ED',
                          'Delivery: Patients Who Are Assessed on L&D for a desired pregnancy',
                          'Newborn: Born during this patient admission']

# All admit_type rows recorded in cdm_s.
admit_type_rows = cdm_s.loc[cdm_s.fid == 'admit_type']

# Encounters with at least one childbirth admit_type row.
childbirth_encids = (admit_type_rows
                     .loc[admit_type_rows['value'].isin(childbirth_admit_types)]
                     .enc_id.unique())

# Find enc_ids not associated with childbirth admissions: encounters that have
# an admit_type row and whose enc_id never appears with a childbirth type.
# Selecting the non-childbirth rows directly would keep an encounter that has
# both a childbirth and a non-childbirth admit_type row.
non_childbirth_encids = (admit_type_rows
                         .loc[~admit_type_rows.enc_id.isin(childbirth_encids)]
                         .enc_id.unique())

In [None]:
# Runs on dask's default scheduler; wall-clock time is measured around the
# computation and printed in minutes.
start = time.time()

# Materialise every cdm_t row whose enc_id is in the non-childbirth set.
non_childbirth_df = df[df['enc_id'].isin(non_childbirth_encids)].compute()

end = time.time()
print((end - start) / 60.)

In [None]:
# Keep only the rows whose fid is 'care_unit'.
ncb_care_unit_rows = non_childbirth_df[non_childbirth_df['fid'] == 'care_unit']

In [None]:
# Attach pat_id to the non-childbirth care-unit rows via enc_id (the left join
# keeps every care-unit row). A pat_id column left over from an earlier run of
# this cell is dropped first, so re-running the cell does not produce
# pat_id_x / pat_id_y columns that would break the cells below.
ncb_care_unit_rows = (ncb_care_unit_rows
                      .drop(columns=['pat_id'], errors='ignore')
                      .merge(pat_enc[['enc_id', 'pat_id']], on='enc_id', how='left'))

In [None]:
# Rename value -> care_unit and left-join the care-unit definitions so each row
# carries hospital, level_of_care and maternity. Columns left over from an
# earlier run of this cell are dropped first, so re-running does not create
# suffixed duplicates (hospital_x, hospital_y, ...). The rename is a no-op when
# the column has already been renamed.
ncb_care_unit_rows = (ncb_care_unit_rows
                      .rename(columns={'value':'care_unit'})
                      .drop(columns=['hospital', 'level_of_care', 'maternity'],
                            errors='ignore')
                      .merge(care_defs[['hospital',
                                        'care_unit',
                                        'level_of_care',
                                        'maternity']], on='care_unit', how='left'))

In [None]:
# Parse the tsp column into datetimes so rows can be ordered chronologically.
ncb_care_unit_rows = ncb_care_unit_rows.assign(
    tsp=lambda rows: pd.to_datetime(rows['tsp']),
)

In [None]:
# Order rows by patient, then by timestamp, so the first row of each patient
# group is that patient's earliest care-unit row.
ncb_care_unit_rows = ncb_care_unit_rows.sort_values(by=['pat_id', 'tsp'], ascending=True)

In [None]:
# Display the sorted care-unit rows for a visual check.
ncb_care_unit_rows

In [None]:
# Exclude patients whose first care-unit row (in the current row order) is a
# maternity unit: collect the pat_ids whose first row has maternity == False,
# then keep only the rows of those patients.
# NOTE(review): a first row whose maternity flag is missing does not compare
# equal to False, so that patient is excluded as well; confirm this is intended.
ncb_pat_ids = (ncb_care_unit_rows
               .groupby('pat_id', as_index=False)
               .nth(0)
               .loc[lambda first_rows: first_rows['maternity'].eq(False), 'pat_id']
               .values)
ncb_care_unit_rows = ncb_care_unit_rows[ncb_care_unit_rows['pat_id'].isin(ncb_pat_ids)]

In [None]:
# Exclude patients whose first care-unit row is an ICU, because the target is
# unplanned ICU transfer: collect the pat_ids whose first row is not 'icu',
# then keep only the rows of those patients.
ncb_pat_ids = (ncb_care_unit_rows
               .groupby('pat_id', as_index=False)
               .nth(0)
               .loc[lambda first_rows: first_rows['level_of_care'].ne('icu'), 'pat_id']
               .values)
ncb_care_unit_rows = ncb_care_unit_rows[ncb_care_unit_rows['pat_id'].isin(ncb_pat_ids)]

In [None]:
# Display the care-unit rows that remain after the maternity and ICU exclusions.
ncb_care_unit_rows

In [None]:
# Patients whose first care-unit row is neither ICU nor emergency.
# The tsp of that first row is kept as the admission time.
non_ed_pts = (ncb_care_unit_rows
              .groupby('pat_id', as_index=False)
              .nth(0)
              .loc[lambda first_rows: ~first_rows['level_of_care'].isin(['icu', 'emergency']),
                   ['enc_id', 'pat_id', 'tsp', 'hospital']])

In [None]:
# pat_ids of patients whose first care-unit row has level_of_care 'emergency'.
ed_admit_pat_ids = (ncb_care_unit_rows
                    .groupby('pat_id', as_index=False)
                    .nth(0)
                    .loc[lambda first_rows: first_rows['level_of_care'].eq('emergency'), 'pat_id']
                    .values)

In [None]:
# enc_id of the first care-unit row for each patient who started in the ED.
ed_admit_enc_ids = (ncb_care_unit_rows[ncb_care_unit_rows['pat_id'].isin(ed_admit_pat_ids)]
                    .groupby('pat_id', as_index=False)
                    .nth(0)
                    .enc_id)

In [None]:
# ED patients admitted to a main ward: within each ED-start encounter, drop the
# "Discharge" and emergency rows, take the first remaining row, and keep it
# only when that ward is not an ICU. The tsp of that row is the admission time.
ed_pts = (ncb_care_unit_rows[ncb_care_unit_rows['enc_id'].isin(ed_admit_enc_ids)]
          .query('care_unit != "Discharge" and level_of_care != "emergency"')
          .groupby('enc_id', as_index=False)
          .nth(0)
          .loc[lambda first_ward: first_ward['level_of_care'].ne('icu'),
               ['enc_id', 'pat_id', 'tsp', 'hospital']])

In [None]:
# Stack the direct-admit patients and the ED-admit patients into one cohort.
cohort = pd.concat(objs=(non_ed_pts, ed_pts), axis=0)

In [None]:
# The retained tsp is the time of admission, so name the column accordingly.
cohort = cohort.rename({'tsp': 'admit_time'}, axis='columns')

In [None]:
# 304986 patient encounters!

In [None]:
# Write the study cohort to CSV, without the DataFrame index.
cohort.to_csv(
    path_or_buf="/data/adarsh/fda_project_data/study_cohort.csv",
    index=False,
)