# Patients exploration

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import sklearn.ensemble
import sklearn.model_selection

import sys
# Our util file is in the parent directory
sys.path.append('../')
import utils as mutil

from IPython.display import Markdown, display
def printmd(string):
    display(Markdown(string))

import tabulate
def print_table(df):
    # Print the stuff
    data = [["Column", "dtype", "# NaN"]]
    for t, c in zip(df.dtypes, df.columns.values):
        if t == 'object':
            print("\x1b[31m", t, "\t", c, "\x1b[0m")
        # print(t, "\t", c)
        data.append([c, t, df[c].isna().sum()])

    table = tabulate.tabulate(data, tablefmt='html')
    display(table)

## Load dataset
* Source https://www.kaggle.com/datasets/mitishaagarwal/patient

In [2]:
patients_ds = pd.read_csv("../patient/dataset.csv")
patients_ds = patients_ds.drop(columns=["Unnamed: 83"])

## Layouttitanic_train.head()

In [3]:
patients_ds.head()


Unnamed: 0,encounter_id,patient_id,hospital_id,age,bmi,elective_surgery,ethnicity,gender,height,icu_admit_source,...,cirrhosis,diabetes_mellitus,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis,apache_3j_bodysystem,apache_2_bodysystem,hospital_death
0,66154,25312,118,68.0,22.73,0,Caucasian,M,180.3,Floor,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Sepsis,Cardiovascular,0
1,114252,59342,81,77.0,27.42,0,Caucasian,F,160.0,Floor,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Respiratory,Respiratory,0
2,119783,50777,118,25.0,31.95,0,Caucasian,F,172.7,Accident & Emergency,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Metabolic,Metabolic,0
3,79267,46918,118,81.0,22.64,1,Caucasian,F,165.1,Operating Room / Recovery,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Cardiovascular,Cardiovascular,0
4,92056,34377,33,19.0,,0,Caucasian,M,188.0,Accident & Emergency,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Trauma,Trauma,0


In [4]:
patients_ds.describe(include='all')

Unnamed: 0,encounter_id,patient_id,hospital_id,age,bmi,elective_surgery,ethnicity,gender,height,icu_admit_source,...,cirrhosis,diabetes_mellitus,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis,apache_3j_bodysystem,apache_2_bodysystem,hospital_death
count,91713.0,91713.0,91713.0,87485.0,88284.0,91713.0,90318,91688,90379.0,91601,...,90998.0,90998.0,90998.0,90998.0,90998.0,90998.0,90998.0,90051,90051,91713.0
unique,,,,,,,6,2,,5,...,,,,,,,,11,10,
top,,,,,,,Caucasian,M,,Accident & Emergency,...,,,,,,,,Cardiovascular,Cardiovascular,
freq,,,,,,,70684,49469,,54060,...,,,,,,,,29999,38816,
mean,65606.07928,65537.131464,105.669262,62.309516,29.185818,0.183736,,,169.641588,,...,0.015693,0.225192,0.012989,0.026165,0.007066,0.004132,0.020638,,,0.086302
std,37795.088538,37811.252183,62.854406,16.775119,8.275142,0.387271,,,10.795378,,...,0.124284,0.417711,0.113229,0.159628,0.083763,0.064148,0.142169,,,0.280811
min,1.0,1.0,2.0,16.0,14.844926,0.0,,,137.2,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,0.0
25%,32852.0,32830.0,47.0,52.0,23.641975,0.0,,,162.5,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,0.0
50%,65665.0,65413.0,109.0,65.0,27.654655,0.0,,,170.1,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,0.0
75%,98342.0,98298.0,161.0,75.0,32.930206,0.0,,,177.8,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,0.0


In [5]:
# print("Columns:", patients_ds.columns.values)

# Convert Categorical to numeric
for t, c in zip(patients_ds.dtypes, patients_ds.columns.values):
    if t == 'object':
        print("\x1b[31m", t, "\t", c, "\x1b[0m")
        patients_ds[c] = pd.Categorical(patients_ds[c])
        patients_ds[c] = patients_ds[c].cat.codes

# Convert problematic types
float64_cols = list(patients_ds.select_dtypes(include='float64'))
patients_ds[float64_cols] = patients_ds[float64_cols].astype('float32')

int64_cols = list(patients_ds.select_dtypes(include='int64'))
patients_ds[int64_cols] = patients_ds[int64_cols].astype('int32')

print_table(patients_ds)

[31m object 	 ethnicity [0m
[31m object 	 gender [0m
[31m object 	 icu_admit_source [0m
[31m object 	 icu_stay_type [0m
[31m object 	 icu_type [0m
[31m object 	 apache_3j_bodysystem [0m
[31m object 	 apache_2_bodysystem [0m


0,1,2
Column,dtype,# NaN
encounter_id,int32,0
patient_id,int32,0
hospital_id,int32,0
age,float32,4228
bmi,float32,3429
elective_surgery,int32,0
ethnicity,int8,0
gender,int8,0
height,float32,1334


In [6]:

        
def replace_random(df, c):
    avg = df[c].mean()
    std = df[c].std()
    null_count = df[c].isnull().sum()
    
    null_random_list = np.random.randint(avg - std, avg + std, size=null_count)
    df[c][np.isnan(df[c])] = null_random_list
    
def replace_zero(df, c):
    df[c] = df[c].fillna(0)

# To keep the distribution properties, we replace NaN in these columns by distribution preserving random values
random_replace_columns = [ "age", "bmi", "height","weight" ]
# Having no information on these values can have semantic meaning therefore we set them to 0 (in essence introducing an unknown class)
zero_replace_columns = [
    "apache_2_diagnosis",
    "apache_3j_diagnosis",
    "arf_apache",
    "gcs_eyes_apache",
    "gcs_motor_apache",
    "gcs_unable_apache",
    "gcs_verbal_apache",
    "heart_rate_apache",
    "intubated_apache",
    "map_apache",
    "resprate_apache",
    "temp_apache",
    "ventilated_apache",
    "d1_diasbp_max",
    "d1_diasbp_min",
    "d1_diasbp_noninvasive_max",
    "d1_diasbp_noninvasive_min",
    "d1_heartrate_max",
    "d1_heartrate_min",
    "d1_mbp_max",
    "d1_mbp_min",
    "d1_mbp_noninvasive_max",
    "d1_mbp_noninvasive_min",
    "d1_resprate_max",
    "d1_resprate_min",
    "d1_spo2_max",
    "d1_spo2_min",
    "d1_sysbp_max",
    "d1_sysbp_min",
    "d1_sysbp_noninvasive_max",
    "d1_sysbp_noninvasive_min",
    "d1_temp_max",
    "d1_temp_min",
    "h1_diasbp_max",
    "h1_diasbp_min",
    "h1_diasbp_noninvasive_max",
    "h1_diasbp_noninvasive_min",
    "h1_heartrate_max",
    "h1_heartrate_min",
    "h1_mbp_max",
    "h1_mbp_min",
    "h1_mbp_noninvasive_max",
    "h1_mbp_noninvasive_min",
    "h1_resprate_max",
    "h1_resprate_min",
    "h1_spo2_max",
    "h1_spo2_min",
    "h1_sysbp_max",
    "h1_sysbp_min",
    "h1_sysbp_noninvasive_max",
    "h1_sysbp_noninvasive_min",
    "d1_glucose_max",
    "d1_glucose_min",
    "d1_potassium_max",
    "d1_potassium_min",
    # "apache_4a_hospital_death_prob",
    # "apache_4a_icu_death_prob",
    "aids",
    "cirrhosis",
    "diabetes_mellitus",
    "hepatic_failure",
    "immunosuppression",
    "leukemia",
    "lymphoma",
    "solid_tumor_with_metastasis",
]

# Remove the Appache III score for testing
# https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4893757/
patients_ds = patients_ds.drop(columns=["apache_4a_hospital_death_prob", "apache_4a_icu_death_prob"])

# Remove ids as they do not contain relevant information
patients_ds = patients_ds.drop(columns=["encounter_id", "patient_id"])

for c in random_replace_columns:
    replace_random(patients_ds, c)

for c in zero_replace_columns:
    replace_zero(patients_ds, c)

print_table(patients_ds)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[c][np.isnan(df[c])] = null_random_list


0,1,2
Column,dtype,# NaN
encounter_id,int32,0
patient_id,int32,0
hospital_id,int32,0
age,float32,0
bmi,float32,0
elective_surgery,int32,0
ethnicity,int8,0
gender,int8,0
height,float32,0


# Notes:
* maybe to complex, switch to https://www.kaggle.com/datasets/saurabhshahane/in-hospital-mortality-prediction ???

## Train, Test, Validation split

In [7]:
train, val, test = mutil.split_train_val_test(patients_ds, validation_size=0.1, test_size=0.2)

In [8]:
# No missing labels
train['hospital_death'].isna().sum()
train.groupby("hospital_death")['hospital_death'].count()

hospital_death
0    58654
1     5544
Name: hospital_death, dtype: int64

In [9]:

train_x = train.drop(columns=['hospital_death'])
train_y = train['hospital_death']

val_x = val.drop(columns=['hospital_death'])
val_y = val['hospital_death']

test_x = test.drop(columns=['hospital_death'])
test_y = test['hospital_death']