## Import Necessary Libraries

In [1]:
import argparse
from datetime import datetime
import numpy as np
import os
import pandas as pd
import pickle
import random
import sys

In [6]:
from tqdm.auto import tqdm

## Preprocess Grady Data

In [9]:
def _select_patient_rows(super_table, sepsis_patient, start_time, first_sep3_time):
    # Rows to export: the [start_time, first_sep3_time) window for sepsis encounters, every row otherwise.
    if sepsis_patient == 1:
        after_start = super_table.index >= start_time
        before_sepsis = super_table.index < first_sep3_time
        return super_table.loc[after_start & before_sepsis].values
    return super_table.values


def create_csv(years, max_patients=100, data_dir="/opt/bmi-585r/Grady_Data",
               super_table_dict_path='super_table_data_dictionary.csv'):
    """Write one ``gr_prep_<year>.csv`` per year, pairing encounters with their super-table rows.

    The encounter summary CSV is read from ``data_dir`` and each encounter is
    labelled ``sepsis_patient`` = 1 when ``first_sep3_time`` is present, 0 otherwise.
    For every selected encounter whose ``start_index`` falls in the year,
    ``<data_dir>/<year>/<csn>.pickle`` is loaded and each kept row of its
    ``'super_table'`` frame is appended to a copy of the encounter row. For sepsis
    encounters only rows whose index lies in ``[start_index, first_sep3_time)``
    are kept; for the others every row is kept.

    An encounter is skipped (with a printed message) when its pickle file is
    missing, when its super table has a different number of columns than the
    data dictionary, or when no rows remain after filtering. A year with no
    exported rows produces no file.

    Parameters
    ----------
    years : iterable of int
        Calendar years to export.
    max_patients : int or None, default 100
        Only the first ``max_patients`` encounters of each year are processed.
        The default keeps the cap that used to be hard-coded; pass ``None`` to
        process every encounter.
    data_dir : str
        Directory holding the encounter summary CSV and the per-year pickle
        folders; the output CSVs are written here as well.
    super_table_dict_path : str
        CSV with a ``super_table_col`` column naming the super-table columns.

    Returns
    -------
    None
    """
    print("Creating csv for Grady Patients...")
    csv_path = os.path.join(data_dir, "encounter_summary_all_07_02_2021.csv")

    # load the encounter summary
    encs_df = pd.read_csv(csv_path)

    # convert sep3 time and start index to datetime columns
    for time_col in ('start_index', 'first_sep3_time'):
        encs_df[time_col] = pd.to_datetime(encs_df[time_col], format='%Y-%m-%d %H:%M:%S')

    # add sepsis label column
    encs_df['sepsis_patient'] = np.where(encs_df['first_sep3_time'].isnull(), 0, 1)

    # get columns from super_table
    super_table_df = pd.read_csv(super_table_dict_path)
    super_table_cols = super_table_df['super_table_col'].str.strip().values.tolist()

    for year in years:

        print("Looking at patients for year: ", year)

        # look at patients for certain year
        enc_df = encs_df.loc[encs_df['start_index'].dt.year == year].reset_index(drop=True)
        enc_cols = list(enc_df.columns)

        nonsep_patients = len(enc_df.loc[enc_df['sepsis_patient'] == 0])
        sep_patients = len(enc_df.loc[enc_df['sepsis_patient'] == 1])

        print("Grady patients who had sepsis in %i: %i" % (year, sep_patients) )
        print("Grady patients who did not have sepsis in %i: %i" % (year, nonsep_patients) )

        # positions of the fields read from each raw encounter row
        csn_idx = enc_df.columns.get_loc('csn')
        start_time_idx = enc_df.columns.get_loc('start_index')
        first_sep3_time_idx = enc_df.columns.get_loc('first_sep3_time')
        sepsis_patient_idx = enc_df.columns.get_loc('sepsis_patient')

        # slicing with None keeps every row, so max_patients=None lifts the cap
        rows = enc_df.values[:max_patients]
        # report progress against the rows actually processed, not the whole year
        n_rows = len(rows)

        patient_list = []

        for i, row in enumerate(tqdm(rows, desc="Grady %s" % year), start=1):
            csn = row[csn_idx]
            start_time = row[start_time_idx]
            first_sep3_time = row[first_sep3_time_idx]
            sepsis_patient = row[sepsis_patient_idx]

            pkl_filename = os.path.join(data_dir, str(year), str(csn) + '.pickle')

            # check if pickle file exists
            if not os.path.isfile(pkl_filename):
                print("Pickle file does not exist. Skipping patient %i of %i...\n" % (i, n_rows) )
                continue

            # NOTE(security): pickle.load can execute arbitrary code; only point
            # data_dir at files from a trusted source.
            with open(pkl_filename, 'rb') as pf:
                enc_dict = pickle.load(pf)

            # grab patient's super table
            super_table = enc_dict['super_table']

            # only the column count is compared; column names/order are not verified
            if len(super_table.columns) != len(super_table_cols):
                print("Incorrect super table columns. Skipping patient %i of %i...\n" % (i, n_rows) )
                continue

            patient_vals = _select_patient_rows(super_table, sepsis_patient, start_time, first_sep3_time)

            # skip for patients with no values
            if patient_vals.size == 0:
                print("No values for patient. Skipping patient %i of %i...\n" % (i, n_rows) )
                continue

            # one output row per super-table row, prefixed with the encounter fields
            patient_list.extend(np.concatenate((row, patient_val)) for patient_val in patient_vals)

        # create dataframe and save to csv file
        if patient_list:
            # np.vstack replaces np.row_stack, which is deprecated in NumPy 2.0
            patient_df = pd.DataFrame(np.vstack(patient_list), columns=enc_cols + super_table_cols)
            patient_df.to_csv(os.path.join(data_dir, "gr_prep_" + str(year) + ".csv"), index=False)
        else:
            print("No recorded patients for %s. Skipping year..." % year)

## Gather All Grady Patients For Available Years

In [10]:
# Encounter years to preprocess (the range end is exclusive, so only 2014 is selected).
years = [*range(2014, 2015)]
print("years: ", years)

years:  [2014]


In [11]:
%%time
# Build gr_prep_<year>.csv for every year in `years`. The recorded run of this cell
# stopped with PermissionError while opening the encounter summary CSV, so read
# access to /opt/bmi-585r/Grady_Data is required before it can complete.
create_csv(years)

Creating csv for Grady Patients...


PermissionError: [Errno 13] Permission denied: '/opt/bmi-585r/Grady_Data/encounter_summary_all_07_02_2021.csv'

## Print Dataframe Heads for Confirmation

In [4]:
# Preview the first rows of each per-year CSV written by create_csv.
years = list(range(2016,2021))
for year in years:
    print("Looking at year: ", year)
    prep_path = "/opt/bmi-585r/Grady_Data/gr_prep_" + str(year) + ".csv"
    if os.path.isfile(prep_path):
        # Only the head is printed, so read just those rows instead of the whole file.
        df = pd.read_csv(prep_path, nrows=5)
        print(df.head())
    else:
        # create_csv writes no file for a year without recorded patients; skip such
        # years rather than aborting the whole preview with FileNotFoundError.
        print("No gr_prep csv found for %s. Skipping year..." % year)
    print("*******************************************")

Looking at year:  2016


  df = pd.read_csv("gr_prep_" + str(year) + ".csv")


          csn     pt_id  y_vent_rows  y_vent_start_time  y_vent_end_time  \
0  1017101607  Z1360733            0                  0                0   
1  1017101607  Z1360733            0                  0                0   
2  1017101607  Z1360733            0                  0                0   
3  1017101607  Z1360733            0                  0                0   
4  1017101607  Z1360733            0                  0                0   

  vent_start_time  ed_wait_time  worst_pf_pa worst_pf_pa_time  worst_pf_sp  \
0             NaN          14.0          NaN              NaN          NaN   
1             NaN          14.0          NaN              NaN          NaN   
2             NaN          14.0          NaN              NaN          NaN   
3             NaN          14.0          NaN              NaN          NaN   
4             NaN          14.0          NaN              NaN          NaN   

   ... procedure best_map norepinephrine_dose_weight epinephrine_dose_weig

  df = pd.read_csv("gr_prep_" + str(year) + ".csv")


          csn     pt_id  y_vent_rows  y_vent_start_time  y_vent_end_time  \
0  1019440753  Z1722910            0                  0                0   
1  1019440753  Z1722910            0                  0                0   
2  1019440753  Z1722910            0                  0                0   
3  1019440753  Z1722910            0                  0                0   
4  1019440753  Z1722910            0                  0                0   

  vent_start_time  ed_wait_time  worst_pf_pa worst_pf_pa_time  worst_pf_sp  \
0             NaN           7.0          NaN              NaN          NaN   
1             NaN           7.0          NaN              NaN          NaN   
2             NaN           7.0          NaN              NaN          NaN   
3             NaN           7.0          NaN              NaN          NaN   
4             NaN           7.0          NaN              NaN          NaN   

   ... procedure best_map norepinephrine_dose_weight epinephrine_dose_weig

  df = pd.read_csv("gr_prep_" + str(year) + ".csv")


          csn     pt_id  y_vent_rows  y_vent_start_time  y_vent_end_time  \
0  1025222642  Z1723215            0                  0                0   
1  1025222642  Z1723215            0                  0                0   
2  1025222642  Z1723215            0                  0                0   
3  1025222642  Z1723215            0                  0                0   
4  1025222642  Z1723215            0                  0                0   

  vent_start_time  ed_wait_time  worst_pf_pa worst_pf_pa_time  worst_pf_sp  \
0             NaN         349.0          NaN              NaN          NaN   
1             NaN         349.0          NaN              NaN          NaN   
2             NaN         349.0          NaN              NaN          NaN   
3             NaN         349.0          NaN              NaN          NaN   
4             NaN         349.0          NaN              NaN          NaN   

   ... procedure best_map norepinephrine_dose_weight epinephrine_dose_weig

  df = pd.read_csv("gr_prep_" + str(year) + ".csv")


          csn     pt_id  y_vent_rows  y_vent_start_time  y_vent_end_time  \
0  1027990195  Z1723215            0                  0                0   
1  1027990195  Z1723215            0                  0                0   
2  1027990195  Z1723215            0                  0                0   
3  1027990195  Z1723215            0                  0                0   
4  1027990195  Z1723215            0                  0                0   

  vent_start_time  ed_wait_time  worst_pf_pa worst_pf_pa_time  worst_pf_sp  \
0             NaN          29.0          NaN              NaN          NaN   
1             NaN          29.0          NaN              NaN          NaN   
2             NaN          29.0          NaN              NaN          NaN   
3             NaN          29.0          NaN              NaN          NaN   
4             NaN          29.0          NaN              NaN          NaN   

   ... procedure best_map norepinephrine_dose_weight epinephrine_dose_weig

  df = pd.read_csv("gr_prep_" + str(year) + ".csv")


          csn     pt_id  y_vent_rows  y_vent_start_time  y_vent_end_time  \
0  1035996185  Z1722864            0                  0                0   
1  1035996185  Z1722864            0                  0                0   
2  1035996185  Z1722864            0                  0                0   
3  1035996185  Z1722864            0                  0                0   
4  1035996185  Z1722864            0                  0                0   

  vent_start_time  ed_wait_time  worst_pf_pa worst_pf_pa_time  worst_pf_sp  \
0             NaN          -9.0          NaN              NaN          NaN   
1             NaN          -9.0          NaN              NaN          NaN   
2             NaN          -9.0          NaN              NaN          NaN   
3             NaN          -9.0          NaN              NaN          NaN   
4             NaN          -9.0          NaN              NaN          NaN   

   ... procedure best_map norepinephrine_dose_weight epinephrine_dose_weig