In [None]:
import numpy as np
import matplotlib.pyplot as plt # version > 3.0
import pandas as pd
from datetime import datetime
from google.colab import drive
import math
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*' and later
# dropped the old names; prefer the new name and fall back for older installs.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

In [None]:
# Load the three input tables: the single-stay sepsis table that will be filled in,
# and the two per-stay average tables (labs and vitals) used as fill sources.
path_to_sepsis3_single_stay_csv = "/path/to/Sepsis_Labs_Vitals_Demographic.csv"
sepsis_original = pd.read_csv(path_to_sepsis3_single_stay_csv)
path_to_sepsis3_many_stays_csv = "/path/to/sepsis_labs_per_stay_avg.csv"
path_to_vitals_stays_csv = "/content/vitals_per_stay_avg.csv"
sepsis_many_stays_df = pd.read_csv(path_to_sepsis3_many_stays_csv)
vitals_df = pd.read_csv(path_to_vitals_stays_csv)

# Column names of each per-stay table, used later for `<column>_avg` membership checks.
# `DataFrame.iteritems()` was removed in pandas 2.0; the column index yields the same
# names in the same order.
vitals_list = list(vitals_df.columns)

print(vitals_list)

sepsis_lab_list = list(sepsis_many_stays_df.columns)

print(sepsis_lab_list)

def find_first_non_null_sepsis_lab(subject_id, stay_id, column_name):
  """
  For given params, find first non null value for that column.

  Rows of the module-level `sepsis_many_stays_df` that match both `subject_id`
  and `stay_id` are scanned in order, reading the column `column_name + "_avg"`.
  A value counts as non null when it is neither a string nor null/NaN.

  subject_id : Patient is identified using subject_id
  stay_id : Patient's stay_id for a particular stay in the ICU
  column_name: Name of lab value that is used in determining sepsis and is also part of patient's covariates
               (passed without the "_avg" suffix).

  Returns the first non null value, or None when no row matches or every
  matching row holds a string or a null.
  """
  avg_column = column_name + "_avg"
  subject_rows = sepsis_many_stays_df[sepsis_many_stays_df["subject_id"] == subject_id]
  stay_rows = subject_rows[subject_rows["stay_id"] == stay_id]
  for _, row in stay_rows.iterrows():
    value = row[avg_column]
    # Skip unusable rows and keep scanning; returning None from inside the loop
    # would stop at the first matching row even when a later one has a value.
    if not isinstance(value, str) and not pd.isna(value):
      return value
  return None

def find_first_non_null_vital(subject_id, stay_id, column_name):
  """
  For given params, find first non null value for that column.

  Rows of the module-level `vitals_df` that match both `subject_id` and
  `stay_id` are scanned in order, reading the column `column_name + "_avg"`.
  A value counts as non null when it is neither a string nor null/NaN.

  subject_id : Patient is identified using subject_id
  stay_id : Patient's stay_id for a particular stay in the ICU
  column_name: Name of vital value that is part of patient's covariates
               (passed without the "_avg" suffix).

  Returns the first non null value, or None when no row matches or every
  matching row holds a string or a null.
  """
  avg_column = column_name + "_avg"
  subject_rows = vitals_df[vitals_df["subject_id"] == subject_id]
  stay_rows = subject_rows[subject_rows["stay_id"] == stay_id]
  for _, row in stay_rows.iterrows():
    value = row[avg_column]
    # Skip unusable rows and keep scanning; returning None from inside the loop
    # would stop at the first matching row even when a later one has a value.
    if not isinstance(value, str) and not pd.isna(value):
      return value
  return None

sepsis_non_null_df = sepsis_original.copy(deep=True)

# Fill NaNs in each column with the average value for that column taken over the entire
# stay (stay_id) of the patient (subject_id) in the ICU.
# `DataFrame.iteritems()` was removed in pandas 2.0, so iterate over the column index.
for column in sepsis_original.columns:
  avg_column = column + "_avg"
  # Pick the lookup once per column instead of once per cell; labs are checked
  # before vitals. Columns without an "_avg" counterpart in either table are skipped
  # without touching their rows.
  if avg_column in sepsis_lab_list:
    find_stay_value = find_first_non_null_sepsis_lab
  elif avg_column in vitals_list:
    find_stay_value = find_first_non_null_vital
  else:
    continue

  # Only rows whose value is missing in this column need a lookup.
  missing_rows = sepsis_original[sepsis_original[column].isna()]
  for i, row in missing_rows.iterrows():
    non_null_value = find_stay_value(row["subject_id"], row["stay_id"], column)
    if non_null_value is not None:
      print("Supposed to be non null: ",non_null_value)
      # When no per-stay value exists the cell is left as it is (still missing).
      sepsis_non_null_df.at[i, column] = non_null_value

sepsis_non_null_df.info()
# NOTE(review): `to_csv` writes the index by default, so reading this file back produces
# an extra "Unnamed: 0" column - confirm whether `index=False` is wanted here.
sepsis_non_null_df.to_csv("/path/to/Sepsis_Lab_and_Vital_with_Stay_Avg.csv")
  


In [None]:
path_to_filled_with_stay_avg_csv = "/path/to/Sepsis_Lab_and_Vital_with_Stay_Avg.csv"
# Read through the variable assigned on the line above; the name used previously
# (`path_to_filled_with_avg_csv`) was never defined and raised NameError.
sepsis_original_with_avgs = pd.read_csv(path_to_filled_with_stay_avg_csv)
path_to_csv_with_both_sepsis_and_not_sepsis_icu_patients = "/path/to/Avg_sepsis_labs_both_Sepsis_and_not_Sepsis.csv"
sepsis_labs_with_all_icu_patient_avgs_df = pd.read_csv(path_to_csv_with_both_sepsis_and_not_sepsis_icu_patients)


def _zero_score_values(score_column, value_column):
  """Return `value_column` from the all-ICU averages table, restricted to rows where `score_column` equals 0."""
  all_icu = sepsis_labs_with_all_icu_patient_avgs_df
  return all_icu[all_icu[score_column] == 0][value_column]


# Compute parameters to generate distributions for sampling corresponding values.
# Note that several `*_avg` names below hold a median rather than a mean.
pao2_novent_avg = _zero_score_values("respiration_avg", "pao2fio2ratio_novent_avg").median()
pao2_vent_avg = _zero_score_values("respiration_avg", "pao2fio2ratio_vent_avg").median()
meanbp_avg = _zero_score_values("cardiovascular_avg", "meanbp_min_avg").mean()
meanbp_std = _zero_score_values("cardiovascular_avg", "meanbp_min_avg").std()
uo_24hr_avg = _zero_score_values("renal_avg", "uo_24hr_avg").median()
bilr_avg = _zero_score_values("liver_avg", "bilirubin_max_avg").mean()
bilr_std = _zero_score_values("liver_avg", "bilirubin_max_avg").std()
creatinine_max_avg = _zero_score_values("renal_avg", "creatinine_max_avg").mean()
creatinine_max_std = _zero_score_values("renal_avg", "creatinine_max_avg").std()
platelet_min_avg = _zero_score_values("coagulation_avg", "platelet_min_avg").median()
heart_rate_avg = vitals_df["heart_rate_avg"].mean()
heart_rate_std = vitals_df["heart_rate_avg"].std()
sbp_avg = vitals_df["sbp_avg"].mean()
sbp_std = vitals_df["sbp_avg"].std()
mbp_avg = vitals_df["mbp_avg"].mean()
mbp_std = vitals_df["mbp_avg"].std()
dbp_avg = vitals_df["dbp_avg"].mean()
dbp_std = vitals_df["dbp_avg"].std()
sbp_ni_avg = vitals_df["sbp_ni_avg"].mean()
sbp_ni_std = vitals_df["sbp_ni_avg"].std()
mbp_ni_avg = vitals_df["mbp_ni_avg"].mean()
mbp_ni_std = vitals_df["mbp_ni_avg"].std()
dbp_ni_avg = vitals_df["dbp_ni_avg"].mean()
dbp_ni_std = vitals_df["dbp_ni_avg"].std()
temperature_avg = vitals_df["temperature_avg"].mean()
temperature_std = vitals_df["temperature_avg"].std()
glucose_avg = vitals_df["glucose_avg"].median()
spo2_avg = vitals_df["spo2_avg"].mean()
spo2_std = vitals_df["spo2_avg"].std()
resp_rate_avg = vitals_df["resp_rate_avg"].mean()
resp_rate_std = vitals_df["resp_rate_avg"].std()


def _sample_fill_values():
  """
  Draw one fresh set of replacement values, keyed by column name.

  Median-based parameters are sampled uniformly in a fixed window around the median,
  mean/std-based parameters are sampled from a normal distribution, and the
  vasopressor rates and `gcs_min` use fixed constants.
  """
  return {
      "pao2fio2ratio_novent": float(np.random.uniform(pao2_novent_avg - 50, pao2_novent_avg + 50)),
      "pao2fio2ratio_vent": float(np.random.uniform(pao2_vent_avg - 50, pao2_vent_avg + 50)),
      "rate_dobutamine": 0,
      "rate_epinephrine": 0,
      "rate_norepinephrine": 0,
      "rate_dopamine": 0,
      "meanbp_min": float(np.random.normal(meanbp_avg, meanbp_std)),
      "gcs_min": 15.0,
      "uo_24hr": float(np.random.uniform(uo_24hr_avg - 100, uo_24hr_avg + 100)),
      "bilirubin_max": float(np.random.normal(bilr_avg, bilr_std)),
      "creatinine_max": float(np.random.normal(creatinine_max_avg, creatinine_max_std)),
      "platelet_min": float(np.random.uniform(platelet_min_avg - 50, platelet_min_avg + 50)),
      "heart_rate": float(np.random.normal(heart_rate_avg, heart_rate_std)),
      "sbp": float(np.random.normal(sbp_avg, sbp_std)),
      "mbp": float(np.random.normal(mbp_avg, mbp_std)),
      "dbp": float(np.random.normal(dbp_avg, dbp_std)),
      "sbp_ni": float(np.random.normal(sbp_ni_avg, sbp_ni_std)),
      "mbp_ni": float(np.random.normal(mbp_ni_avg, mbp_ni_std)),
      "dbp_ni": float(np.random.normal(dbp_ni_avg, dbp_ni_std)),
      "temperature": float(np.random.normal(temperature_avg, temperature_std)),
      "glucose": float(np.random.uniform(glucose_avg - 10, glucose_avg + 10)),
      "spo2": float(np.random.normal(spo2_avg, spo2_std)),
      "resp_rate": float(np.random.normal(resp_rate_avg, resp_rate_std)),
  }


sepsis_original_with_avgs_and_sampled_data = pd.read_csv("/path/to/sampled_template.csv")

# Fill NaNs with values sampled from constructed distributions across patients with similar
# scores (in the case of lab values, e.g. respiration 0), as a 0 score implies NaN values and
# vice versa. Vitals are sampled around the mean/median.
# Values are drawn anew for every row, so each row receives its own samples.
filled_rows = []
for row_idx, row in sepsis_original_with_avgs.iterrows():
  print(row_idx)
  filled_rows.append(row.fillna(value=_sample_fill_values()))

# `DataFrame.append` was removed in pandas 2.0 and copied the whole frame on every call;
# collect the rows and concatenate once. Each row keeps its original index label.
if filled_rows:
  sepsis_original_with_avgs_and_sampled_data = pd.concat(
      [sepsis_original_with_avgs_and_sampled_data, pd.DataFrame(filled_rows)],
      sort=False,
  )


sepsis_original_with_avgs_and_sampled_data.to_csv("/path/to/No_NaN_Values_Newest_14May_1.csv")
