# Patients exploration

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import utils as mutil
import sklearn
import sklearn.ensemble
import sklearn.model_selection

from IPython.display import Markdown, display
def printmd(string):
    display(Markdown(string))

## Load dataset

In [39]:
patients_ds = pd.read_csv("patient/dataset.csv")
patients_ds = patients_ds.drop(columns=["Unnamed: 83"])

## Layouttitanic_train.head()

In [40]:
patients_ds.head()


Unnamed: 0,encounter_id,patient_id,hospital_id,age,bmi,elective_surgery,ethnicity,gender,height,icu_admit_source,...,cirrhosis,diabetes_mellitus,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis,apache_3j_bodysystem,apache_2_bodysystem,hospital_death
0,66154,25312,118,68.0,22.73,0,Caucasian,M,180.3,Floor,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Sepsis,Cardiovascular,0
1,114252,59342,81,77.0,27.42,0,Caucasian,F,160.0,Floor,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Respiratory,Respiratory,0
2,119783,50777,118,25.0,31.95,0,Caucasian,F,172.7,Accident & Emergency,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Metabolic,Metabolic,0
3,79267,46918,118,81.0,22.64,1,Caucasian,F,165.1,Operating Room / Recovery,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Cardiovascular,Cardiovascular,0
4,92056,34377,33,19.0,,0,Caucasian,M,188.0,Accident & Emergency,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Trauma,Trauma,0


In [5]:
patients_ds.describe(include='all')

Unnamed: 0,encounter_id,patient_id,hospital_id,age,bmi,elective_surgery,ethnicity,gender,height,icu_admit_source,...,diabetes_mellitus,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis,apache_3j_bodysystem,apache_2_bodysystem,Unnamed: 83,hospital_death
count,91713.0,91713.0,91713.0,87485.0,88284.0,91713.0,90318,91688,90379.0,91601,...,90998.0,90998.0,90998.0,90998.0,90998.0,90998.0,90051,90051,0.0,91713.0
unique,,,,,,,6,2,,5,...,,,,,,,11,10,,
top,,,,,,,Caucasian,M,,Accident & Emergency,...,,,,,,,Cardiovascular,Cardiovascular,,
freq,,,,,,,70684,49469,,54060,...,,,,,,,29999,38816,,
mean,65606.07928,65537.131464,105.669262,62.309516,29.185818,0.183736,,,169.641588,,...,0.225192,0.012989,0.026165,0.007066,0.004132,0.020638,,,,0.086302
std,37795.088538,37811.252183,62.854406,16.775119,8.275142,0.387271,,,10.795378,,...,0.417711,0.113229,0.159628,0.083763,0.064148,0.142169,,,,0.280811
min,1.0,1.0,2.0,16.0,14.844926,0.0,,,137.2,,...,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0
25%,32852.0,32830.0,47.0,52.0,23.641975,0.0,,,162.5,,...,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0
50%,65665.0,65413.0,109.0,65.0,27.654655,0.0,,,170.1,,...,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0
75%,98342.0,98298.0,161.0,75.0,32.930206,0.0,,,177.8,,...,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0


In [41]:
# print("Columns:", patients_ds.columns.values)

# Convert Categorical to numeric
for t, c in zip(patients_ds.dtypes, patients_ds.columns.values):
    if t == 'object':
        print("\x1b[31m", t, "\t", c, "\x1b[0m")
        patients_ds[c] = pd.Categorical(patients_ds[c])
        patients_ds[c] = patients_ds[c].cat.codes

# Convert problematic types
float64_cols = list(patients_ds.select_dtypes(include='float64'))
patients_ds[float64_cols] = patients_ds[float64_cols].astype('float32')

int64_cols = list(patients_ds.select_dtypes(include='int64'))
patients_ds[int64_cols] = patients_ds[int64_cols].astype('int32')

# Print the stuff
import tabulate
data = [["Column", "dtype", "# NaN"]]
for t, c in zip(patients_ds.dtypes, patients_ds.columns.values):
    if t == 'object':
        print("\x1b[31m", t, "\t", c, "\x1b[0m")
    # print(t, "\t", c)
    data.append([c, t, patients_ds[c].isna().sum()])
        
table = tabulate.tabulate(data, tablefmt='html')
table

[31m object 	 ethnicity [0m
[31m object 	 gender [0m
[31m object 	 icu_admit_source [0m
[31m object 	 icu_stay_type [0m
[31m object 	 icu_type [0m
[31m object 	 apache_3j_bodysystem [0m
[31m object 	 apache_2_bodysystem [0m


0,1,2
Column,dtype,# NaN
encounter_id,int32,0
patient_id,int32,0
hospital_id,int32,0
age,float32,4228
bmi,float32,3429
elective_surgery,int32,0
ethnicity,int8,0
gender,int8,0
height,float32,1334


# Notes:
* maybe to complex, switch to https://www.kaggle.com/datasets/saurabhshahane/in-hospital-mortality-prediction ???

## Train, Test, Validation split

In [35]:
train, val, test = mutil.split_train_val_test(patients_ds, validation_size=0.1, test_size=0.2)

In [36]:
# No missing labels
train['hospital_death'].isna().sum()

0

In [37]:
# Create training dataset
train_x = train.drop(columns=['hospital_death'])
train_y = train['hospital_death']
# Train model based on best estimated configuration from previous tests
model = sklearn.ensemble.RandomForestClassifier(min_samples_split= 10, n_estimators= 50, criterion='gini', min_samples_leaf= 1, class_weight="balanced")
model = model.fit(X=train_x, y=train_y)
# Calculate train accuracy
train_accuracy = model.score(X=train_x, y=train_y)
print(f"Train accuracy {train_accuracy}")
print(f"Training set utilisation = {100 * len(titanic_train_clean) / len(titanic_train)}%")

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').