## Import Necessary Libraries

In [1]:
import argparse
from datetime import datetime
import numpy as np
import os
import pandas as pd
import pickle
import random
import sys

## Preprocess Grady Data

In [2]:
def _select_patient_rows(super_table, is_sepsis, start_time, first_sep3_time):
    """Return the super-table rows to export for one encounter.

    For a sepsis encounter only the rows whose index falls in the half-open
    window [start_time, first_sep3_time) are kept; for any other encounter
    every row is kept. The result is the underlying 2-D value array, so the
    super-table index itself is not part of what is returned.
    """
    if not is_sepsis:
        return super_table.values
    in_window = (super_table.index >= start_time) & (super_table.index < first_sep3_time)
    return super_table.loc[in_window].values


def create_csv(years, data_dir="/opt/bmi-585r/Grady_Data", encounter_csv=None,
               super_table_dict_csv="super_table_data_dictionary.csv", verbose=True):
    """Write one flattened csv per year combining encounter rows with super-table rows.

    The encounter summary csv is loaded once, its 'start_index' and
    'first_sep3_time' columns are parsed as datetimes, and a 'sepsis_patient'
    column is added (1 when 'first_sep3_time' is present, else 0). For each
    requested year the encounters whose 'start_index' falls in that year are
    visited; each encounter's pickle (``<data_dir>/<year>/<csn>.pickle``) is
    loaded and its 'super_table' frame is flattened so that every kept
    super-table row is prefixed with the encounter's summary row. The result
    is saved as ``<data_dir>/gr_prep_<year>.csv``.

    Encounters are skipped when the pickle file is missing, when the super
    table's column count differs from the data dictionary, or when no rows
    remain after the time-window selection. Years with no exported rows do not
    produce a file.

    Args:
        years: iterable of integer years to process.
        data_dir: directory holding the per-year pickle folders; also where the
            output csv files are written.
        encounter_csv: path of the encounter summary csv. Defaults to
            ``encounter_summary_all_07_02_2021.csv`` inside ``data_dir``.
        super_table_dict_csv: path of the data dictionary csv; its
            'super_table_col' column supplies the output column names.
        verbose: when False, the per-patient progress and skip messages are
            suppressed. Year-level messages are always printed.

    Returns:
        None. Output is written to disk.
    """
    print("Creating csv for Grady Patients...")

    if encounter_csv is None:
        encounter_csv = os.path.join(data_dir, "encounter_summary_all_07_02_2021.csv")

    # per-patient messages can number in the tens of thousands per year
    log = print if verbose else (lambda *args, **kwargs: None)

    # load the encounter summary
    encs_df = pd.read_csv(encounter_csv)

    # convert sep3 time and start index to datetime columns
    encs_df['start_index'] = pd.to_datetime(encs_df['start_index'], format='%Y-%m-%d %H:%M:%S')
    encs_df['first_sep3_time'] = pd.to_datetime(encs_df['first_sep3_time'], format='%Y-%m-%d %H:%M:%S')

    # add sepsis label column
    encs_df['sepsis_patient'] = np.where(encs_df['first_sep3_time'].isnull(), 0, 1)

    # get columns from super_table
    super_table_df = pd.read_csv(super_table_dict_csv)
    super_table_cols = super_table_df['super_table_col'].str.strip().values.tolist()

    # The per-year slices keep the same columns as encs_df, so the column
    # list and positional lookups only need to be computed once.
    enc_cols = list(encs_df.columns)
    csn_idx = encs_df.columns.get_loc('csn')
    start_time_idx = encs_df.columns.get_loc('start_index')
    first_sep3_time_idx = encs_df.columns.get_loc('first_sep3_time')
    sepsis_patient_idx = encs_df.columns.get_loc('sepsis_patient')

    for year in years:

        print("Looking at patients for year: ", year)

        # look at patients for certain year
        enc_df = encs_df.loc[encs_df['start_index'].dt.year == year].reset_index(drop=True)
        total = len(enc_df)

        nonsep_patients = len(enc_df.loc[enc_df['sepsis_patient'] == 0])
        sep_patients = len(enc_df.loc[enc_df['sepsis_patient'] == 1])

        print("Grady patients who had sepsis in %i: %i" % (year, sep_patients) )
        print("Grady patients who did not have sepsis in %i: %i" % (year, nonsep_patients) )

        year_dir = os.path.join(data_dir, str(year))
        patient_list = []

        for i, row in enumerate(enc_df.values, start=1):
            log("Looking at patient %i of %i...\n" % (i, total) )

            csn = row[csn_idx]
            start_time = row[start_time_idx]
            first_sep3_time = row[first_sep3_time_idx]
            sepsis_patient = row[sepsis_patient_idx]

            pkl_filename = os.path.join(year_dir, str(csn) + '.pickle')

            # check if pickle file exists
            if not os.path.isfile(pkl_filename):
                log("Pickle file does not exist. Skipping patient %i of %i...\n" % (i, total) )
                continue

            # NOTE: pickle.load can execute arbitrary code; only point
            # data_dir at pickle files from a trusted source.
            with open(pkl_filename, 'rb') as pf:
                enc_dict = pickle.load(pf)

            # grab patient's super table
            super_table = enc_dict['super_table']

            # NOTE(review): only the column COUNT is validated; the pickle's
            # column names/order are not compared with the data dictionary,
            # whose names are used to label the output. Confirm they agree.
            if len(super_table.columns) != len(super_table_cols):
                log("Incorrect super table columns. Skipping patient %i of %i...\n" % (i, total) )
                continue

            # sepsis patients: values before the first sepsis time only;
            # everyone else: all values
            patient_vals = _select_patient_rows(
                super_table, sepsis_patient == 1, start_time, first_sep3_time
            )

            # skip for patients with no values
            if patient_vals.size == 0:
                log("No values for patient. Skipping patient %i of %i...\n" % (i, total) )
                continue

            # one output row per kept super-table row, prefixed by the encounter row
            patient_list.extend(
                np.concatenate((row, patient_val)) for patient_val in patient_vals
            )

        # create dataframe and save to csv file
        if patient_list:
            # np.vstack replaces np.row_stack, which is deprecated in NumPy 2.0
            patient_df = pd.DataFrame(np.vstack(patient_list), columns=enc_cols + super_table_cols)
            patient_df.to_csv(os.path.join(data_dir, "gr_prep_" + str(year) + ".csv"), index=False)
        else:
            print("No recorded patients for %s. Skipping year..." % year)

## Gather All Grady Patients For Available Years

In [3]:
# Candidate years 2014 through 2020 inclusive; echo them, then build one
# preprocessed csv per year.
years = [*range(2014, 2021)]
print("years: ", years)
create_csv(years)

years:  [2014, 2015, 2016, 2017, 2018, 2019, 2020]
Creating csv for Grady Patients...
Looking at patients for year:  2014
Grady patients who had sepsis in 2014: 0
Grady patients who did not have sepsis in 2014: 0
No recorded patients for 2014. Skipping year...
Looking at patients for year:  2015
Grady patients who had sepsis in 2015: 0
Grady patients who did not have sepsis in 2015: 0
No recorded patients for 2015. Skipping year...
Looking at patients for year:  2016
Grady patients who had sepsis in 2016: 4156
Grady patients who did not have sepsis in 2016: 20787
Looking at patient 1 of 24943...

Looking at patient 2 of 24943...

Looking at patient 3 of 24943...

Looking at patient 4 of 24943...

Looking at patient 5 of 24943...

Looking at patient 6 of 24943...

Looking at patient 7 of 24943...

Looking at patient 8 of 24943...

Looking at patient 9 of 24943...

Looking at patient 10 of 24943...

Looking at patient 11 of 24943...

Looking at patient 12 of 24943...

Looking at patient 

  enc_dict = pickle.load(pf)


Looking at patient 59 of 24943...

Looking at patient 60 of 24943...

Looking at patient 61 of 24943...

Looking at patient 62 of 24943...

Looking at patient 63 of 24943...

Looking at patient 64 of 24943...

Looking at patient 65 of 24943...

Looking at patient 66 of 24943...

Looking at patient 67 of 24943...

Looking at patient 68 of 24943...

Looking at patient 69 of 24943...

Looking at patient 70 of 24943...

Looking at patient 71 of 24943...

Looking at patient 72 of 24943...

Looking at patient 73 of 24943...

Looking at patient 74 of 24943...

Looking at patient 75 of 24943...

Looking at patient 76 of 24943...

Looking at patient 77 of 24943...

Looking at patient 78 of 24943...

Looking at patient 79 of 24943...

Looking at patient 80 of 24943...

Looking at patient 81 of 24943...

Looking at patient 82 of 24943...

Looking at patient 83 of 24943...

Looking at patient 84 of 24943...

Looking at patient 85 of 24943...

Looking at patient 86 of 24943...

Looking at patient 8

  enc_dict = pickle.load(pf)


Looking at patient 25 of 26337...

Looking at patient 26 of 26337...

Looking at patient 27 of 26337...

Looking at patient 28 of 26337...

Looking at patient 29 of 26337...

Looking at patient 30 of 26337...

Looking at patient 31 of 26337...

Looking at patient 32 of 26337...

Looking at patient 33 of 26337...

Looking at patient 34 of 26337...

Looking at patient 35 of 26337...

Looking at patient 36 of 26337...

Looking at patient 37 of 26337...

Looking at patient 38 of 26337...

Looking at patient 39 of 26337...

Looking at patient 40 of 26337...

Looking at patient 41 of 26337...

No values for patient. Skipping patient 41 of 26337...

Looking at patient 42 of 26337...

Looking at patient 43 of 26337...

Looking at patient 44 of 26337...

Looking at patient 45 of 26337...

Looking at patient 46 of 26337...

Looking at patient 47 of 26337...

Looking at patient 48 of 26337...

Looking at patient 49 of 26337...

Looking at patient 50 of 26337...

Looking at patient 51 of 26337...


  enc_dict = pickle.load(pf)


No values for patient. Skipping patient 21 of 27844...

Looking at patient 22 of 27844...

Looking at patient 23 of 27844...

Looking at patient 24 of 27844...

Looking at patient 25 of 27844...

Looking at patient 26 of 27844...

Looking at patient 27 of 27844...

Looking at patient 28 of 27844...

Looking at patient 29 of 27844...

Looking at patient 30 of 27844...

Looking at patient 31 of 27844...

Looking at patient 32 of 27844...

Looking at patient 33 of 27844...

Looking at patient 34 of 27844...

Looking at patient 35 of 27844...

Looking at patient 36 of 27844...

Looking at patient 37 of 27844...

Looking at patient 38 of 27844...

Looking at patient 39 of 27844...

Looking at patient 40 of 27844...

Looking at patient 41 of 27844...

Looking at patient 42 of 27844...

Looking at patient 43 of 27844...

Looking at patient 44 of 27844...

Looking at patient 45 of 27844...

Looking at patient 46 of 27844...

Looking at patient 47 of 27844...

Looking at patient 48 of 27844...


  enc_dict = pickle.load(pf)


Looking at patient 21 of 28881...

Looking at patient 22 of 28881...

Looking at patient 23 of 28881...

Looking at patient 24 of 28881...

Looking at patient 25 of 28881...

Looking at patient 26 of 28881...

Looking at patient 27 of 28881...

Looking at patient 28 of 28881...

Looking at patient 29 of 28881...

Looking at patient 30 of 28881...

Looking at patient 31 of 28881...

Looking at patient 32 of 28881...

Looking at patient 33 of 28881...

Looking at patient 34 of 28881...

Looking at patient 35 of 28881...

Looking at patient 36 of 28881...

Looking at patient 37 of 28881...

Looking at patient 38 of 28881...

Looking at patient 39 of 28881...

Looking at patient 40 of 28881...

Looking at patient 41 of 28881...

Looking at patient 42 of 28881...

Looking at patient 43 of 28881...

Looking at patient 44 of 28881...

Looking at patient 45 of 28881...

Looking at patient 46 of 28881...

Looking at patient 47 of 28881...

Looking at patient 48 of 28881...

Looking at patient 4

  enc_dict = pickle.load(pf)


Looking at patient 24 of 11918...

Looking at patient 25 of 11918...

Looking at patient 26 of 11918...

Looking at patient 27 of 11918...

Looking at patient 28 of 11918...

Looking at patient 29 of 11918...

Looking at patient 30 of 11918...

Looking at patient 31 of 11918...

Looking at patient 32 of 11918...

Looking at patient 33 of 11918...

Looking at patient 34 of 11918...

Looking at patient 35 of 11918...

Looking at patient 36 of 11918...

Looking at patient 37 of 11918...

Looking at patient 38 of 11918...

Looking at patient 39 of 11918...

Looking at patient 40 of 11918...

Looking at patient 41 of 11918...

Looking at patient 42 of 11918...

Looking at patient 43 of 11918...

Looking at patient 44 of 11918...

Looking at patient 45 of 11918...

Looking at patient 46 of 11918...

Looking at patient 47 of 11918...

Looking at patient 48 of 11918...

Looking at patient 49 of 11918...

Looking at patient 50 of 11918...

Looking at patient 51 of 11918...

Looking at patient 5

## Print Dataframe Heads for Confirmation

In [4]:
# Read back the per-year csv for 2016 through 2020 and print the first rows
# of each as a quick sanity check.
years = [*range(2016, 2021)]
for year in years:
    print("Looking at year: ", year)
    csv_path = "/opt/bmi-585r/Grady_Data/gr_prep_" + str(year) + ".csv"
    df = pd.read_csv(csv_path)
    print(df.head())
    print("*******************************************")

Looking at year:  2016


  df = pd.read_csv("gr_prep_" + str(year) + ".csv")


          csn     pt_id  y_vent_rows  y_vent_start_time  y_vent_end_time  \
0  1017101607  Z1360733            0                  0                0   
1  1017101607  Z1360733            0                  0                0   
2  1017101607  Z1360733            0                  0                0   
3  1017101607  Z1360733            0                  0                0   
4  1017101607  Z1360733            0                  0                0   

  vent_start_time  ed_wait_time  worst_pf_pa worst_pf_pa_time  worst_pf_sp  \
0             NaN          14.0          NaN              NaN          NaN   
1             NaN          14.0          NaN              NaN          NaN   
2             NaN          14.0          NaN              NaN          NaN   
3             NaN          14.0          NaN              NaN          NaN   
4             NaN          14.0          NaN              NaN          NaN   

   ... procedure best_map norepinephrine_dose_weight epinephrine_dose_weig

  df = pd.read_csv("gr_prep_" + str(year) + ".csv")


          csn     pt_id  y_vent_rows  y_vent_start_time  y_vent_end_time  \
0  1019440753  Z1722910            0                  0                0   
1  1019440753  Z1722910            0                  0                0   
2  1019440753  Z1722910            0                  0                0   
3  1019440753  Z1722910            0                  0                0   
4  1019440753  Z1722910            0                  0                0   

  vent_start_time  ed_wait_time  worst_pf_pa worst_pf_pa_time  worst_pf_sp  \
0             NaN           7.0          NaN              NaN          NaN   
1             NaN           7.0          NaN              NaN          NaN   
2             NaN           7.0          NaN              NaN          NaN   
3             NaN           7.0          NaN              NaN          NaN   
4             NaN           7.0          NaN              NaN          NaN   

   ... procedure best_map norepinephrine_dose_weight epinephrine_dose_weig

  df = pd.read_csv("gr_prep_" + str(year) + ".csv")


          csn     pt_id  y_vent_rows  y_vent_start_time  y_vent_end_time  \
0  1025222642  Z1723215            0                  0                0   
1  1025222642  Z1723215            0                  0                0   
2  1025222642  Z1723215            0                  0                0   
3  1025222642  Z1723215            0                  0                0   
4  1025222642  Z1723215            0                  0                0   

  vent_start_time  ed_wait_time  worst_pf_pa worst_pf_pa_time  worst_pf_sp  \
0             NaN         349.0          NaN              NaN          NaN   
1             NaN         349.0          NaN              NaN          NaN   
2             NaN         349.0          NaN              NaN          NaN   
3             NaN         349.0          NaN              NaN          NaN   
4             NaN         349.0          NaN              NaN          NaN   

   ... procedure best_map norepinephrine_dose_weight epinephrine_dose_weig

  df = pd.read_csv("gr_prep_" + str(year) + ".csv")


          csn     pt_id  y_vent_rows  y_vent_start_time  y_vent_end_time  \
0  1027990195  Z1723215            0                  0                0   
1  1027990195  Z1723215            0                  0                0   
2  1027990195  Z1723215            0                  0                0   
3  1027990195  Z1723215            0                  0                0   
4  1027990195  Z1723215            0                  0                0   

  vent_start_time  ed_wait_time  worst_pf_pa worst_pf_pa_time  worst_pf_sp  \
0             NaN          29.0          NaN              NaN          NaN   
1             NaN          29.0          NaN              NaN          NaN   
2             NaN          29.0          NaN              NaN          NaN   
3             NaN          29.0          NaN              NaN          NaN   
4             NaN          29.0          NaN              NaN          NaN   

   ... procedure best_map norepinephrine_dose_weight epinephrine_dose_weig

  df = pd.read_csv("gr_prep_" + str(year) + ".csv")


          csn     pt_id  y_vent_rows  y_vent_start_time  y_vent_end_time  \
0  1035996185  Z1722864            0                  0                0   
1  1035996185  Z1722864            0                  0                0   
2  1035996185  Z1722864            0                  0                0   
3  1035996185  Z1722864            0                  0                0   
4  1035996185  Z1722864            0                  0                0   

  vent_start_time  ed_wait_time  worst_pf_pa worst_pf_pa_time  worst_pf_sp  \
0             NaN          -9.0          NaN              NaN          NaN   
1             NaN          -9.0          NaN              NaN          NaN   
2             NaN          -9.0          NaN              NaN          NaN   
3             NaN          -9.0          NaN              NaN          NaN   
4             NaN          -9.0          NaN              NaN          NaN   

   ... procedure best_map norepinephrine_dose_weight epinephrine_dose_weig