In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
X = pd.read_csv("Table1_data.csv").iloc[:, 1:]
y = pd.read_csv("ydata.csv").iloc[:, 1:]

In [2]:
df = pd.read_csv("Xdata_non.csv")
df = df.iloc[:,1:]

# Create indicator columns
unique_values = df['EthnicGroupDSC'].unique()
for value in unique_values:
    df[value] = df['EthnicGroupDSC'].apply(lambda x: 1 if x == value else 0)
df = df.drop('EthnicGroupDSC', axis=1)

# Create indicator columns
unique_values = df['PatientRaceDSC'].unique()
for value in unique_values:
    df[value] = df['PatientRaceDSC'].apply(lambda x: 1 if x == value else 0)
df = df.drop('PatientRaceDSC', axis=1)

In [3]:
# function to calculate SOFA score
def calc_sofa(components):
    sub_scr = np.zeros(6)
    # coagulation
    c = 0
    x = components["Platelets_10_3_per_ml"]
    if (x > 150) or np.isnan(x):
        c = 0
    elif (x > 100): 
        c = 1
    elif (x > 50): # removed =
        c = 2
    elif (x > 20): # removed =
        c = 3
    else:
        c = 4
    sub_scr[0] = c
    
    # liver
    c = 0
    x = components["Total_Bilirubin_mg_per_dL"]
    if (x < 1.2) or np.isnan(x):
        c = 0
    elif (x <= 1.9):
        c = 1
    elif (x < 6):
        c = 2
    elif (x < 12):
        c = 3
    else:
        c = 4
    sub_scr[1] = c
    
    # GCS
    c = 0
    x = components["GCS"]
    #if (x > 1) or np.isnan(x):
    if (x > 14) or np.isnan(x): # added
        c = 0
    elif (x > 12):
        c = 1
    elif (x > 9):
        c = 2
    elif (x > 5):
        c = 3
    else:
        c = 4
    sub_scr[2] = c
    
    # renal
    c = 0
    x = components["Creatinine_mg_per_dL"]
    u = components["Urine_output_mL"] # added
    # if (x < 1.2) or np.isnan(x):
    if (x < 1.2):  # added
        c = 0
    elif (x < 2):
        c = 1
    elif (x < 3.5):
        c = 2
    #elif (x < 5):
    elif (x < 5) or (u < 500):     # added  
        c = 3
    # else:
    elif (x >= 5) or (u < 200):     # added
        c = 4
    else:              
        c = 0          # added
    sub_scr[3] = c

    # Cardiovascular
    c = 0
    MAP = components["MAP"]
    n = components["Norepinephrine_μg_kg_min"]
    e = components["Epinephrine_μg_kg_min"]
    dobute = components["Dobutamine_any_dose"]
    dope = components["Dopamine_μg_kg_min"]
    # if np.isnan(MAP) or np.isnan(n) or np.isnan(e) or np.isnan(dobute) or np.isnan(dope):
    if np.isnan(MAP) and np.isnan(n) and np.isnan(e) and np.isnan(dobute) and np.isnan(dope): # added
        c = 0
    elif (dope > 15 or e > 0.1 or n > 0.1):
        c = 4
    elif (dope > 5 or (e < 0.1 and e > 0) or (n < 0.1 and n > 0)):
        c = 3
    elif ((dope <= 5 and dope > 0) or dobute > 0):
        c = 2
    elif (MAP < 70):
        c = 1
    else:
        c = 0
    sub_scr[4] = c
    
    # respiratory
    c = 0
    p = components["PaO2_FiO2"]
    v = components["mech_vent_cpap"]
    
    if (p < 100) and (v==1): # added
        c = 4
    elif (p < 200) and (v==1): # added
        c = 3
    elif (p < 300):
        c = 2
    elif (p < 400):
        c = 1
    else:
        c = 0
    sub_scr[5] = c
    
    sofa_scr = 0
    for i in range(0, len(sub_scr)):
        sofa_scr += sub_scr[i]
        
    return sofa_scr

In [4]:
# function to do median imputation for SOFA scores
def median_impute(dataframe, medians):
    temp_df = dataframe.copy()
    sofas = np.zeros(temp_df.shape[0])
    for i in tqdm(range(dataframe.shape[0])):
        for j in range(dataframe.shape[1]):
            if np.isnan(dataframe.iloc[i, j]):    
                temp_df.iloc[i, j] = medians.iloc[j, 0]
        sofas[i] = calc_sofa(temp_df.iloc[i, :])
    return sofas

In [5]:
# get SOFA scores - do median imputation for missing values 
medians = pd.DataFrame(df.median())
sofas_median = median_impute(df, medians)
sofas_median = sofas_median.reshape(len(sofas_median), 1)

100%|███████████████████████████████████| 37739/37739 [00:25<00:00, 1452.64it/s]


In [6]:
df["SOFA"] = sofas_median
X = df

In [7]:
num_pts = X.shape[0]
num_cols = X.shape[1]

In [8]:
# percent missing values
num_nans = np.array([np.sum(np.isnan(X.iloc[:, i])) for i in range(num_cols)])
percent_nans = num_nans/num_pts*100

In [9]:
# Tara -- what does this do?
comp_sum_dead = np.zeros(num_cols)
comp_sum_alive = np.zeros(num_cols)
num_dead = np.zeros(num_cols)
num_alive = np.zeros(num_cols)
for i in tqdm(range(num_pts)):
    for j in range(num_cols):
        if not np.isnan(X.iloc[i, j]):
            if y.iloc[i, 0]:
                comp_sum_dead[j] += X.iloc[i, j]
                num_dead[j] += 1
            else:
                comp_sum_alive[j] += X.iloc[i, j]
                num_alive[j] += 1
        
avg_dead_comp = [comp_sum_dead[i]/num_dead[i] for i in range(num_cols)]
avg_alive_comp = [comp_sum_alive[i]/num_alive[i] for i in range(num_cols)]
avg = np.array(X.mean())

100%|███████████████████████████████████| 37739/37739 [00:36<00:00, 1048.15it/s]


In [10]:
# look at the column names
X.columns

Index(['Age', 'SexDSC', 'GCS', 'PaO2_FiO2', 'mech_vent_cpap',
       'Total_Bilirubin_mg_per_dL', 'Platelets_10_3_per_ml',
       'Creatinine_mg_per_dL', 'Urine_output_mL', 'MAP', 'Dobutamine_any_dose',
       'Dopamine_μg_kg_min', 'Epinephrine_μg_kg_min',
       'Norepinephrine_μg_kg_min', 'Not Hispanic', 'Hispanic', 'Unavailable',
       'Unknown', 'White', 'Asian', 'Black or African American',
       'American Indian or Alaska Native', 'SOFA'],
      dtype='object')

In [11]:
# get standard deviations of variables
stdDev = X.std()

In [12]:
# update variable names for table 1
data = [avg, avg_dead_comp, avg_alive_comp, stdDev, percent_nans]

table1 = pd.DataFrame(data, 
                      index=["All", 
                             "Died", 
                             "Survived", 
                             "SD", 
                             "% missing"],
                      columns=X.columns)

table1.rename(columns={'Age': 'Age (years)'}, inplace=True)
table1.rename(columns={'SexDSC': 'Sex (% male)'}, inplace=True)
table1.rename(columns={'PaO2_FiO2': 'PaO2/FiO2'}, inplace=True)
table1.rename(columns={'Total_Bilirubin_mg_per_dL': 'Total Bili (mg/dL)'}, inplace=True)
table1.rename(columns={'Platelets_10_3_per_ml': 'Plts (10^3/ml)'}, inplace=True)
table1.rename(columns={'mech_vent_cpap': 'MV or CPAP (%)'}, inplace=True)
table1.rename(columns={'Creatinine_mg_per_dL': 'Creatinine (mg/dL)'}, inplace=True)
table1.rename(columns={'Urine_output_mL': 'Urine output (mL/day)'}, inplace=True)
table1.rename(columns={'MAP': 'MAP (mm Hg)'}, inplace=True)
table1.rename(columns={'Dobutamine_any_dose': 'On Dobutamine (%)'}, inplace=True)
table1.rename(columns={'Dopamine_μg_kg_min': 'Dopamine (μg/kg/min)'}, inplace=True)
table1.rename(columns={'Epinephrine_μg_kg_min': 'Epinephrine (μg/kg/min)'}, inplace=True)
table1.rename(columns={'Norepinephrine_μg_kg_min': 'Norepi. (μg/kg/min)'}, inplace=True)

table1['Sex (% male)']['All'] *=100
table1['Sex (% male)']['Died'] *=100
table1['Sex (% male)']['Survived'] *=100

table1['On Dobutamine (%)']['All'] *=100
table1['On Dobutamine (%)']['Died'] *=100
table1['On Dobutamine (%)']['Survived'] *=100

table1['MV or CPAP (%)']['All'] *=100
table1['MV or CPAP (%)']['Died'] *=100
table1['MV or CPAP (%)']['Survived'] *=100

formatted_df = (table1.T).applymap(lambda x: '{:.2f}'.format(x))

formatted_df.loc['Sex (% male)', 'SD'] = '--'
formatted_df.loc['MV or CPAP (%)', 'SD'] = '--'
formatted_df.loc['On Dobutamine (%)', 'SD'] = '--'

formatted_df

Unnamed: 0,All,Died,Survived,SD,% missing
Age (years),62.22,68.14,61.6,16.53,0.0
Sex (% male),60.16,57.41,60.45,--,0.0
GCS,11.25,8.09,11.63,4.60,38.36
PaO2/FiO2,190.53,198.92,189.32,117.91,63.2
MV or CPAP (%),34.14,55.42,31.92,--,0.0
Total Bili (mg/dL),1.11,1.65,1.04,1.70,20.23
Plts (10^3/ml),183.85,174.61,184.79,84.76,2.73
Creatinine (mg/dL),1.37,1.96,1.31,1.09,2.68
Urine output (mL/day),1381.69,1249.87,1395.24,1085.23,8.66
MAP (mm Hg),73.48,66.28,74.22,15.30,2.32


In [13]:
formatted_df.to_csv("table1.csv")