In [1]:
import pandas as pd
import numpy as np
from tensorflow import keras

In [2]:
name = "ICU Mortality Prediction"

## Inputting Our Data

In [3]:
path = "./data/train_x.csv"
# dtypes = {"admissionheight": float, "admissionweight": float, "age": str, "gender": str, "nursingchartcelltypevalname": str, "nursingchartvalue": float, "offset": int, "patientunitstayid": int, "unitvisitnumber": int }
df = pd.read_csv(path, dtype=str, keep_default_na=False, na_values=[""], engine="python")
print("Reading data from: {}".format(path))

Reading data from: ./data/train_x.csv


## Preprocessing

In [4]:
df.rename(columns={df.columns[0]: "id"}, inplace=True)
df.drop("cellattributevalue", axis=1, inplace=True)
df.drop("celllabel", axis=1, inplace=True)
df.drop("labmeasurenamesystem", axis=1, inplace=True)
df.drop("labname", axis=1, inplace=True)
df.drop("labresult", axis=1, inplace=True)

In [5]:
df["id"] = df["id"].astype("int64")
df["admissionheight"] = df["admissionheight"].astype(float)
df["admissionweight"] = df["admissionweight"].astype(float)
df.replace(["> 89"], 90, inplace=True)
df["age"] = df["age"].astype(float)
df["gender"] = df["gender"].astype(str)
df["ethnicity"] = df["ethnicity"].astype(str)
df["nursingchartcelltypevalname"] = df["nursingchartcelltypevalname"].astype(str)
df.replace(["Unable to score due to medication"], np.nan, inplace=True)
df["nursingchartvalue"] = df["nursingchartvalue"].astype(float)
df["offset"] = df["offset"].astype(float)
df["patientunitstayid"] = df["patientunitstayid"].astype(float)
df["unitvisitnumber"] = df["unitvisitnumber"].astype(float)
df.shape, df.dtypes

((791936, 11),
 id                               int64
 admissionheight                float64
 admissionweight                float64
 age                            float64
 ethnicity                       object
 gender                          object
 nursingchartcelltypevalname     object
 nursingchartvalue              float64
 offset                         float64
 patientunitstayid              float64
 unitvisitnumber                float64
 dtype: object)

In [7]:
nonnumerical = df.select_dtypes(exclude=["int64", "float64"])
categorical = pd.get_dummies(nonnumerical)
numerical =  df.select_dtypes(include=["int64", "float64"])
final_features = pd.concat([numerical, categorical], axis=1)


In [8]:
labels = pd.read_csv("./data/train_y.csv", dtype=str, keep_default_na=False, na_values=[""], engine="python")

In [9]:
# import os  
# df.to_csv("./data/filtered.csv")  
# features.to_csv("./data/pivoted.csv")

## Analyzing Dataset

In [10]:
display(final_features)
display(final_features.describe().T)

Unnamed: 0,id,admissionheight,admissionweight,age,nursingchartvalue,offset,patientunitstayid,unitvisitnumber,ethnicity_African American,ethnicity_Asian,...,nursingchartcelltypevalname_Heart Rate,nursingchartcelltypevalname_Invasive BP Diastolic,nursingchartcelltypevalname_Invasive BP Mean,nursingchartcelltypevalname_Invasive BP Systolic,nursingchartcelltypevalname_Non-Invasive BP Diastolic,nursingchartcelltypevalname_Non-Invasive BP Mean,nursingchartcelltypevalname_Non-Invasive BP Systolic,nursingchartcelltypevalname_O2 Saturation,nursingchartcelltypevalname_Respiratory Rate,nursingchartcelltypevalname_nan
0,0,157.5,,87.0,,0.0,141764.0,2.0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,1,157.5,46.5,87.0,,0.0,141765.0,1.0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,2,167.0,77.5,76.0,,0.0,143870.0,1.0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,3,172.7,60.3,34.0,,0.0,144815.0,1.0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,4,177.8,91.7,61.0,,0.0,145427.0,1.0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
791931,791931,,,,,931.0,2715137.0,,0,0,...,0,0,0,0,0,0,0,0,0,1
791932,791932,,,,,1951.0,2715137.0,,0,0,...,0,0,0,0,0,0,0,0,0,1
791933,791933,,,,,1001.0,2715137.0,,0,0,...,0,0,0,0,0,0,0,0,0,1
791934,791934,,,,,37.0,2754778.0,,0,0,...,0,0,0,0,0,0,0,0,0,1


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,791936.0,395967.5,228612.4,0.0,197983.75,395967.5,593951.25,791935.0
admissionheight,1957.0,169.9399,16.62301,0.3,162.6,170.2,177.8,600.0
admissionweight,1856.0,82.68857,24.35354,0.5,65.7,78.91,96.125,227.7
age,2012.0,63.29523,17.6729,15.0,53.0,66.0,77.0,90.0
nursingchartvalue,773272.0,76.84195,37.65256,-90.0,55.0,81.0,99.0,1114.0
offset,791936.0,5107.248,8347.077,-68394.0,646.0,2217.0,5893.0,76852.0
patientunitstayid,791936.0,1784650.0,1019373.0,141764.0,931649.0,1705630.0,2755766.0,3353113.0
unitvisitnumber,2016.0,1.198909,0.5163518,1.0,1.0,1.0,1.0,5.0
ethnicity_African American,791936.0,0.0002348675,0.0153236,0.0,0.0,0.0,0.0,1.0
ethnicity_Asian,791936.0,2.651729e-05,0.00514943,0.0,0.0,0.0,0.0,1.0


In [11]:
corr = final_features.corr() 
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,id,admissionheight,admissionweight,age,nursingchartvalue,offset,patientunitstayid,unitvisitnumber,ethnicity_African American,ethnicity_Asian,ethnicity_Caucasian,ethnicity_Hispanic,ethnicity_Native American,ethnicity_Other/Unknown,ethnicity_nan,gender_Female,gender_Male,gender_nan,nursingchartcelltypevalname_GCS Total,nursingchartcelltypevalname_Heart Rate,nursingchartcelltypevalname_Invasive BP Diastolic,nursingchartcelltypevalname_Invasive BP Mean,nursingchartcelltypevalname_Invasive BP Systolic,nursingchartcelltypevalname_Non-Invasive BP Diastolic,nursingchartcelltypevalname_Non-Invasive BP Mean,nursingchartcelltypevalname_Non-Invasive BP Systolic,nursingchartcelltypevalname_O2 Saturation,nursingchartcelltypevalname_Respiratory Rate,nursingchartcelltypevalname_nan
id,1.0,0.020002,0.064146,0.058388,-0.018496,0.024892,0.944042,-0.112992,-0.026476,-0.008895,-0.077782,-0.018421,-0.006732,-0.016122,0.086493,-0.054885,-0.067681,0.087191,-0.010427,-0.018079,-0.008577,0.012528,-0.008564,-0.005782,0.034491,-0.005801,-0.076496,0.009095,0.165792
admissionheight,0.020002,1.0,0.248388,-0.0773,,,0.016849,0.006658,0.017099,-0.10664,-0.015257,0.038114,0.03041,-0.023173,0.048064,-0.434163,0.433361,0.006429,,,,,,,,,,,
admissionweight,0.064146,0.248388,1.0,-0.104969,,,0.063573,-0.00217,0.025173,-0.054419,0.01108,-0.018955,0.020837,-0.053813,0.045823,-0.252875,0.250737,0.023707,,,,,,,,,,,
age,0.058388,-0.0773,-0.104969,1.0,,,0.050316,0.047184,-0.137424,0.01849,0.147845,-0.031513,-0.056095,-0.044266,-0.022312,0.051056,-0.050771,-0.004098,,,,,,,,,,,
nursingchartvalue,-0.018496,,,,1.0,0.001846,-0.018712,,,,,,,,,,,,-0.279816,0.12394,-0.070485,0.016296,0.197684,-0.110608,0.049708,0.521049,0.204147,-0.681788,
offset,0.024892,,,,0.001846,1.0,0.021854,,-0.009378,-0.003151,-0.027547,-0.006523,-0.002382,-0.005712,0.030633,-0.019432,-0.023975,0.03088,-0.004977,0.021652,-0.014207,-0.008694,-0.014212,0.005919,-0.001009,0.005914,-0.014743,0.008556,-0.024692
patientunitstayid,0.944042,0.016849,0.063573,0.050316,-0.018712,0.021854,1.0,-0.105198,-0.001305,0.000197,-0.005639,-0.002507,-0.003114,0.000528,0.006133,-0.010078,-0.000229,0.00653,-0.011093,-0.006905,-0.004949,0.016671,-0.004937,0.004718,0.044604,0.0047,-0.068341,0.022267,0.004493
unitvisitnumber,-0.112992,0.006658,-0.00217,0.047184,,,-0.105198,1.0,-0.016592,-0.030068,0.074566,-0.060029,-0.029817,-0.024971,-0.022933,0.008392,-0.006823,-0.01718,,,,,,,,,,,
ethnicity_African American,-0.026476,0.017099,0.025173,-0.137424,,-0.009378,-0.001305,-0.016592,1.0,-7.9e-05,-0.00069,-0.000163,-6e-05,-0.000143,-0.306148,0.202098,0.226879,-0.303697,-0.002531,-0.00709,-0.00249,-0.002349,-0.00249,-0.006282,-0.005527,-0.006282,-0.005939,-0.006852,0.099928
ethnicity_Asian,-0.008895,-0.10664,-0.054419,0.01849,,-0.003151,0.000197,-0.030068,-7.9e-05,1.0,-0.000232,-5.5e-05,-2e-05,-4.8e-05,-0.102858,0.061667,0.081281,-0.102035,-0.00085,-0.002382,-0.000837,-0.000789,-0.000837,-0.002111,-0.001857,-0.002111,-0.001996,-0.002302,0.033573


## Splitting Our Data

In [13]:
from sklearn.model_selection import train_test_split
train_x, test_x, train_y, test_y = train_test_split(x, y, train_size=.8, random_state=42)

NameError: name 'x' is not defined