## Data preparation

This notebook is used to explore issues with the raw data and transform it for use in modeling and visualization tasks.

In [289]:
import datetime as dt
import os
from functools import wraps
from pathlib import Path

import altair as alt
import numpy as np
import polars as pl
import polars.selectors as cs

In [230]:
# Project root. Defaults to the original local checkout, but can be redirected
# with the WIDS_DATATHON_DIR environment variable so the notebook runs on other
# machines without editing this cell.
competition_path = Path(
    os.environ.get(
        "WIDS_DATATHON_DIR",
        "/Users/zacklarsen/Documents/Projects/kaggle-wids-datathon-2020/",
    )
)
# Everything the notebook reads or writes lives under <project root>/data.
data_path = competition_path / "data"
training_v2_path = data_path / "training_v2.csv"

In [290]:
def reduce_memory_usage_pl(df):
    """Downcast the columns of a polars DataFrame to smaller dtypes.

    * Signed integer columns are cast to the narrowest of Int8/Int16/Int32
      whose range contains the column's min and max (bounds inclusive).
    * Float columns are cast to Float32 when min and max lie strictly inside
      the Float32 range. This is lossy: Float32 keeps fewer significant digits.
    * Utf8 columns are cast to Categorical.
    * Every other dtype (including unsigned integers) is left unchanged, as
      are numeric columns with no non-null values.

    The estimated size before and after is printed.

    Parameters
    ----------
    df : polars.DataFrame
        Frame to shrink. It is not modified; a new frame is returned.

    Returns
    -------
    polars.DataFrame
        Frame with the same columns, in the same order, with reduced dtypes.
    """
    start_mem = df.estimated_size("mb")
    print(f'Memory usage of dataframe is {start_mem:.2f} MB')

    # Unsigned types (pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64) are not handled.
    int_types = (pl.Int8, pl.Int16, pl.Int32, pl.Int64)
    float_types = (pl.Float32, pl.Float64)

    # Candidate integer targets, narrowest first. Int64 is omitted because
    # every signed integer column already fits in it (the cast would be a no-op).
    int_targets = (
        (pl.Int8, np.iinfo(np.int8)),
        (pl.Int16, np.iinfo(np.int16)),
        (pl.Int32, np.iinfo(np.int32)),
    )
    f32_limits = np.finfo(np.float32)

    casts = []
    for col, col_type in zip(df.columns, df.dtypes):
        is_int = col_type in int_types
        if is_int or col_type in float_types:
            # Min/max are only meaningful (and only needed) for numeric columns.
            c_min = df[col].min()
            c_max = df[col].max()
            if c_min is None or c_max is None:
                # Empty or all-null column: nothing to range-check, and
                # comparing None with a number would raise TypeError.
                continue
            if is_int:
                for target, limits in int_targets:
                    # Inclusive bounds: the limit values themselves are
                    # representable, so e.g. 127 must still fit in Int8
                    # rather than widening the column to Int16.
                    if limits.min <= c_min and c_max <= limits.max:
                        casts.append(pl.col(col).cast(target))
                        break
            elif c_min > f32_limits.min and c_max < f32_limits.max:
                casts.append(pl.col(col).cast(pl.Float32))
        elif col_type == pl.Utf8:
            casts.append(pl.col(col).cast(pl.Categorical))

    # Apply every cast in a single pass instead of rebuilding the frame per column.
    if casts:
        df = df.with_columns(casts)

    mem_usg = round(df.estimated_size("mb"), 2)
    print(f"Memory usage became: {mem_usg} MB")

    return df

# Transformations

In [293]:
# Lazily read the raw CSV, fix up the dtypes that inference gets wrong, then
# materialise the frame and shrink it with reduce_memory_usage_pl.
train_cleaned_pl = (
    pl.scan_csv(training_v2_path, infer_schema_length=100000, null_values="NA")
    .with_columns(
        # Death-probability columns are forced to floats.
        pl.col(
            ["apache_4a_hospital_death_prob", "apache_4a_icu_death_prob"]
        ).cast(pl.Float64),
        # Demographic labels and identifier columns become categoricals
        # (via Utf8, since the identifiers are read as integers).
        pl.col(
            [
                "ethnicity",
                "gender",
                "encounter_id",
                "patient_id",
                "hospital_id",
                "icu_id",
            ]
        )
        .cast(pl.Utf8)
        .cast(pl.Categorical),
        # 0/1 indicator columns fit in an unsigned byte.
        pl.col(["elective_surgery", "hospital_death"]).cast(pl.UInt8),
    )
    .collect()
    .pipe(reduce_memory_usage_pl)
)

Memory usage of dataframe is 137.11 MB
Memory usage became:  53.199461936950684  MB


In [294]:
# Summary statistics (count, null_count, mean, std, min, quartiles, max) for every column of the cleaned frame.
train_cleaned_pl.describe()

describe,encounter_id,patient_id,hospital_id,hospital_death,age,bmi,elective_surgery,ethnicity,gender,height,hospital_admit_source,icu_admit_source,icu_id,icu_stay_type,icu_type,pre_icu_los_days,readmission_status,weight,albumin_apache,apache_2_diagnosis,apache_3j_diagnosis,apache_post_operative,arf_apache,bilirubin_apache,bun_apache,creatinine_apache,fio2_apache,gcs_eyes_apache,gcs_motor_apache,gcs_unable_apache,gcs_verbal_apache,glucose_apache,heart_rate_apache,hematocrit_apache,intubated_apache,map_apache,…,h1_lactate_min,h1_platelets_max,h1_platelets_min,h1_potassium_max,h1_potassium_min,h1_sodium_max,h1_sodium_min,h1_wbc_max,h1_wbc_min,d1_arterial_pco2_max,d1_arterial_pco2_min,d1_arterial_ph_max,d1_arterial_ph_min,d1_arterial_po2_max,d1_arterial_po2_min,d1_pao2fio2ratio_max,d1_pao2fio2ratio_min,h1_arterial_pco2_max,h1_arterial_pco2_min,h1_arterial_ph_max,h1_arterial_ph_min,h1_arterial_po2_max,h1_arterial_po2_min,h1_pao2fio2ratio_max,h1_pao2fio2ratio_min,apache_4a_hospital_death_prob,apache_4a_icu_death_prob,aids,cirrhosis,diabetes_mellitus,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis,apache_3j_bodysystem,apache_2_bodysystem
str,str,str,str,f64,f64,f64,f64,str,str,f64,str,str,str,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,str
"""count""","""91713""","""91713""","""91713""",91713.0,91713.0,91713.0,91713.0,"""91713""","""91713""",91713.0,"""91713""","""91713""","""91713""","""91713""","""91713""",91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,…,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,"""91713""","""91713"""
"""null_count""","""0""","""0""","""0""",0.0,4228.0,3429.0,0.0,"""1395""","""25""",1334.0,"""21409""","""112""","""0""","""0""","""0""",0.0,0.0,2720.0,54379.0,1662.0,1101.0,0.0,715.0,58134.0,19262.0,18853.0,70868.0,1901.0,1901.0,1037.0,1901.0,11036.0,878.0,19878.0,715.0,994.0,…,84369.0,75673.0,75673.0,72102.0,72102.0,72617.0,72617.0,75953.0,75953.0,59271.0,59271.0,60123.0,60123.0,59262.0,59262.0,66008.0,66008.0,75959.0,75959.0,76424.0,76424.0,75945.0,75945.0,80195.0,80195.0,7947.0,7947.0,715.0,715.0,715.0,715.0,715.0,715.0,715.0,715.0,"""1662""","""1662"""
"""mean""",,,,0.086302,62.309516,29.185818,0.183736,,,169.641586,,,,,,0.835766,0.0,84.028343,2.902968,185.401739,558.21637,0.201106,0.027979,1.147721,25.825331,1.480013,0.595752,3.465049,5.471195,0.009528,3.994778,160.326828,99.707932,32.988739,0.151223,88.015873,…,3.021543,196.09601,195.481546,4.201144,4.153007,138.239609,137.901947,13.460212,13.423475,45.248924,38.433861,7.389236,7.32453,165.913986,103.511353,285.667084,223.523041,44.668446,43.383453,7.338363,7.327883,163.841354,144.154221,244.404984,235.933044,0.086787,0.043955,0.000857,0.015693,0.225192,0.012989,0.026165,0.007066,0.004132,0.020638,,
"""std""",,,,0.280811,16.775119,8.275142,0.387271,,,10.795379,,,,,,2.487756,0.0,25.011497,0.681863,86.050882,463.266998,0.400829,0.164912,2.165538,20.672979,1.525787,0.263238,0.951715,1.288376,0.097148,1.560166,90.79055,30.870502,6.873585,0.358268,42.032412,…,2.884303,92.646583,92.779494,0.763201,0.752529,5.745875,5.676796,6.979339,6.965104,14.669777,10.944916,0.084735,0.111561,108.005936,61.848053,128.218948,117.552498,14.630907,14.113107,0.105821,0.107873,113.455742,98.464546,129.96431,126.458504,0.247569,0.217341,0.029265,0.124284,0.417711,0.113229,0.159628,0.083763,0.064148,0.142169,,
"""min""",,,,0.0,16.0,14.844926,0.0,,,137.199997,,,,,,-24.947222,0.0,38.599998,1.2,0.0,0.01,0.0,0.0,0.1,4.0,0.3,0.21,1.0,1.0,0.0,1.0,39.0,0.0,16.200001,0.0,0.0,…,0.4,0.0,0.0,2.5,2.5,114.0,114.0,1.1,1.0898,18.4,14.9,7.05428,6.89,39.0,28.0,54.799999,36.0,15.0,14.997,6.93,6.9,34.0,31.0,42.0,38.0,-1.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
"""25%""",,,,0.0,52.0,23.641975,0.0,,,162.5,,,,,,0.035417,0.0,66.800003,2.4,113.0,203.009995,0.0,0.0,0.4,13.0,0.72,0.4,3.0,6.0,0.0,4.0,97.0,86.0,28.0,0.0,54.0,…,1.3,133.0,132.0,3.7,3.7,136.0,135.0,8.6,8.6,36.0,32.0,7.341,7.27,88.099998,69.0,192.285721,132.5,36.0,35.0,7.29,7.28,80.699997,77.0,142.0,136.0,0.02,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
"""50%""",,,,0.0,65.0,27.654655,0.0,,,170.100006,,,,,,0.138889,0.0,80.300003,2.9,122.0,409.019989,0.0,0.0,0.6,19.0,0.98,0.5,4.0,6.0,0.0,5.0,133.0,104.0,33.200001,0.0,67.0,…,2.0,181.0,181.0,4.1,4.1,139.0,138.0,12.12,12.1,42.799999,37.0,7.392,7.34,127.0,85.0,272.666656,205.0,42.099998,41.0,7.35,7.34,120.0,107.0,223.333328,214.0,0.05,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
"""75%""",,,,0.0,75.0,32.930374,0.0,,,177.800003,,,,,,0.409028,0.0,97.099998,3.4,301.0,703.030029,0.0,0.0,1.1,32.0,1.53,0.85,4.0,6.0,0.0,5.0,196.0,120.0,37.900002,0.0,125.0,…,3.6,241.0,240.0,4.6,4.5,141.0,141.0,16.799999,16.700001,50.0,43.0,7.44,7.4,206.0,116.0,365.0,300.0,49.200001,48.0,7.41,7.4,216.0,178.0,328.0,317.5,0.13,0.06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
"""max""",,,,1.0,89.0,67.814987,1.0,,,195.589996,,,,,,159.090973,0.0,186.0,4.6,308.0,2201.050049,1.0,1.0,51.0,127.0,11.18,1.0,4.0,6.0,1.0,5.0,598.700012,178.0,51.400002,1.0,200.0,…,18.019501,585.0,585.0,7.2,7.1,157.0,157.0,44.102001,44.102001,111.0,85.912003,7.62,7.55786,540.86499,448.891998,834.804993,604.227783,111.504997,107.0,7.57,7.563,534.905029,514.905029,720.0,654.813782,0.99,0.97,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,,


In [295]:
# Null count per column of the cleaned frame, in long form, most nulls first.
(
    train_cleaned_pl
    .null_count()
    .melt()
    .sort("value", descending=True)
)

variable,value
str,u32
"""h1_bilirubin_m…",84619
"""h1_bilirubin_m…",84619
"""h1_lactate_max…",84369
"""h1_lactate_min…",84369
"""h1_albumin_max…",83824
"""h1_albumin_min…",83824
"""h1_pao2fio2rat…",80195
"""h1_pao2fio2rat…",80195
"""h1_arterial_ph…",76424
"""h1_arterial_ph…",76424


In [296]:
# Column -> dtype mapping after the explicit casts and the memory downcast.
train_cleaned_pl.schema

{'encounter_id': Categorical,
 'patient_id': Categorical,
 'hospital_id': Categorical,
 'hospital_death': UInt8,
 'age': Int8,
 'bmi': Float32,
 'elective_surgery': UInt8,
 'ethnicity': Categorical,
 'gender': Categorical,
 'height': Float32,
 'hospital_admit_source': Categorical,
 'icu_admit_source': Categorical,
 'icu_id': Categorical,
 'icu_stay_type': Categorical,
 'icu_type': Categorical,
 'pre_icu_los_days': Float32,
 'readmission_status': Int8,
 'weight': Float32,
 'albumin_apache': Float32,
 'apache_2_diagnosis': Int16,
 'apache_3j_diagnosis': Float32,
 'apache_post_operative': Int8,
 'arf_apache': Int8,
 'bilirubin_apache': Float32,
 'bun_apache': Float32,
 'creatinine_apache': Float32,
 'fio2_apache': Float32,
 'gcs_eyes_apache': Int8,
 'gcs_motor_apache': Int8,
 'gcs_unable_apache': Int8,
 'gcs_verbal_apache': Int8,
 'glucose_apache': Float32,
 'heart_rate_apache': Int16,
 'hematocrit_apache': Float32,
 'intubated_apache': Int8,
 'map_apache': Int16,
 'paco2_apache': Float32

In [297]:
# Estimated in-memory size of the cleaned frame in megabytes, rounded to 2 decimals.
round(train_cleaned_pl.estimated_size('megabytes'), 2)

53.2

In [299]:
# Persist the cleaned frame next to the raw data as train.parquet.
train_cleaned_pl.write_parquet(data_path / "train.parquet")

# Analysis

In [280]:
# Eagerly load the raw CSV with inferred dtypes; the literal string "NA" is read as null
# and up to 100,000 rows are scanned for schema inference.
train_pl = pl.read_csv(training_v2_path, infer_schema_length=100000, null_values="NA")

In [284]:
# Estimated in-memory size of the raw frame in megabytes, rounded to 2 decimals.
round(train_pl.estimated_size('megabytes'), 2)

139.07

In [253]:
# Distinct dtypes present in the raw frame.
set(train_pl.dtypes)

{Float64, Int64, Utf8}

In [251]:
# Column -> dtype mapping as inferred from the CSV.
train_pl.schema

{'encounter_id': Int64,
 'patient_id': Int64,
 'hospital_id': Int64,
 'hospital_death': Int64,
 'age': Int64,
 'bmi': Float64,
 'elective_surgery': Int64,
 'ethnicity': Utf8,
 'gender': Utf8,
 'height': Float64,
 'hospital_admit_source': Utf8,
 'icu_admit_source': Utf8,
 'icu_id': Int64,
 'icu_stay_type': Utf8,
 'icu_type': Utf8,
 'pre_icu_los_days': Float64,
 'readmission_status': Int64,
 'weight': Float64,
 'albumin_apache': Float64,
 'apache_2_diagnosis': Int64,
 'apache_3j_diagnosis': Float64,
 'apache_post_operative': Int64,
 'arf_apache': Int64,
 'bilirubin_apache': Float64,
 'bun_apache': Float64,
 'creatinine_apache': Float64,
 'fio2_apache': Float64,
 'gcs_eyes_apache': Int64,
 'gcs_motor_apache': Int64,
 'gcs_unable_apache': Int64,
 'gcs_verbal_apache': Int64,
 'glucose_apache': Float64,
 'heart_rate_apache': Int64,
 'hematocrit_apache': Float64,
 'intubated_apache': Int64,
 'map_apache': Int64,
 'paco2_apache': Float64,
 'paco2_for_ph_apache': Float64,
 'pao2_apache': Float6

In [237]:
# Summary statistics restricted to the string-typed columns.
train_pl.select(cs.string()).describe()

describe,ethnicity,gender,hospital_admit_source,icu_admit_source,icu_stay_type,icu_type,apache_3j_bodysystem,apache_2_bodysystem
str,str,str,str,str,str,str,str,str
"""count""","""91713""","""91713""","""91713""","""91713""","""91713""","""91713""","""91713""","""91713"""
"""null_count""","""1395""","""25""","""21409""","""112""","""0""","""0""","""1662""","""1662"""
"""mean""",,,,,,,,
"""std""",,,,,,,,
"""min""","""African Americ…","""F""","""Acute Care/Flo…","""Accident & Eme…","""admit""","""CCU-CTICU""","""Cardiovascular…","""Cardiovascular…"
"""25%""",,,,,,,,
"""50%""",,,,,,,,
"""75%""",,,,,,,,
"""max""","""Other/Unknown""","""M""","""Step-Down Unit…","""Other ICU""","""transfer""","""SICU""","""Trauma""","""Undefined diag…"


In [242]:
# One line per string column showing its dtype and leading values.
train_pl.select(cs.string()).glimpse()

Rows: 91713
Columns: 8
$ ethnicity             <str> 'Caucasian', 'Caucasian', 'Caucasian', 'Caucasian', 'Caucasian', 'Caucasian', 'Caucasian', 'Caucasian', 'Caucasian', None
$ gender                <str> 'M', 'F', 'F', 'F', 'M', 'M', 'F', 'M', 'M', 'M'
$ hospital_admit_source <str> 'Floor', 'Floor', 'Emergency Department', 'Operating Room', None, 'Direct Admit', 'Operating Room', 'Emergency Department', 'Other Hospital', 'Direct Admit'
$ icu_admit_source      <str> 'Floor', 'Floor', 'Accident & Emergency', 'Operating Room / Recovery', 'Accident & Emergency', 'Accident & Emergency', 'Accident & Emergency', 'Accident & Emergency', 'Other Hospital', 'Accident & Emergency'
$ icu_stay_type         <str> 'admit', 'admit', 'admit', 'admit', 'admit', 'admit', 'admit', 'admit', 'admit', 'admit'
$ icu_type              <str> 'CTICU', 'Med-Surg ICU', 'Med-Surg ICU', 'CTICU', 'Med-Surg ICU', 'Med-Surg ICU', 'Med-Surg ICU', 'Med-Surg ICU', 'CCU-CTICU', 'CCU-CTICU'
$ apache_3j_bodysystem  <str> 'Se

In [235]:
# Summary statistics restricted to the numeric columns.
train_pl.select(cs.numeric()).describe()

describe,encounter_id,patient_id,hospital_id,hospital_death,age,bmi,elective_surgery,height,icu_id,pre_icu_los_days,readmission_status,weight,albumin_apache,apache_2_diagnosis,apache_3j_diagnosis,apache_post_operative,arf_apache,bilirubin_apache,bun_apache,creatinine_apache,fio2_apache,gcs_eyes_apache,gcs_motor_apache,gcs_unable_apache,gcs_verbal_apache,glucose_apache,heart_rate_apache,hematocrit_apache,intubated_apache,map_apache,paco2_apache,paco2_for_ph_apache,pao2_apache,ph_apache,resprate_apache,sodium_apache,…,h1_inr_min,h1_lactate_max,h1_lactate_min,h1_platelets_max,h1_platelets_min,h1_potassium_max,h1_potassium_min,h1_sodium_max,h1_sodium_min,h1_wbc_max,h1_wbc_min,d1_arterial_pco2_max,d1_arterial_pco2_min,d1_arterial_ph_max,d1_arterial_ph_min,d1_arterial_po2_max,d1_arterial_po2_min,d1_pao2fio2ratio_max,d1_pao2fio2ratio_min,h1_arterial_pco2_max,h1_arterial_pco2_min,h1_arterial_ph_max,h1_arterial_ph_min,h1_arterial_po2_max,h1_arterial_po2_min,h1_pao2fio2ratio_max,h1_pao2fio2ratio_min,apache_4a_hospital_death_prob,apache_4a_icu_death_prob,aids,cirrhosis,diabetes_mellitus,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""count""",91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,…,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0,91713.0
"""null_count""",0.0,0.0,0.0,0.0,4228.0,3429.0,0.0,1334.0,0.0,0.0,0.0,2720.0,54379.0,1662.0,1101.0,0.0,715.0,58134.0,19262.0,18853.0,70868.0,1901.0,1901.0,1037.0,1901.0,11036.0,878.0,19878.0,715.0,994.0,70868.0,70868.0,70868.0,70868.0,1234.0,18600.0,…,57941.0,84369.0,84369.0,75673.0,75673.0,72102.0,72102.0,72617.0,72617.0,75953.0,75953.0,59271.0,59271.0,60123.0,60123.0,59262.0,59262.0,66008.0,66008.0,75959.0,75959.0,76424.0,76424.0,75945.0,75945.0,80195.0,80195.0,7947.0,7947.0,715.0,715.0,715.0,715.0,715.0,715.0,715.0,715.0
"""mean""",65606.07928,65537.131464,105.669262,0.086302,62.309516,29.185818,0.183736,169.641588,508.357692,0.835766,0.0,84.02834,2.902968,185.401739,558.216377,0.201106,0.027979,1.147721,25.82533,1.480014,0.595751,3.465049,5.471195,0.009528,3.994778,160.326822,99.707932,32.988739,0.151223,88.015873,42.183238,42.183238,131.148467,7.353895,25.811007,137.966373,…,1.482979,3.06886,3.021543,196.09601,195.481546,4.201144,4.153006,138.23961,137.901948,13.460212,13.423475,45.248924,38.433862,7.389236,7.32453,165.91398,103.511349,285.667079,223.523037,44.668444,43.383453,7.338363,7.327883,163.841354,144.154224,244.404982,235.93305,0.086787,0.043955,0.000857,0.015693,0.225192,0.012989,0.026165,0.007066,0.004132,0.020638
"""std""",37795.088538,37811.252183,62.854406,0.280811,16.775119,8.275142,0.387271,10.795378,228.989661,2.487756,0.0,25.011497,0.681863,86.050882,463.266985,0.400829,0.164912,2.165538,20.67298,1.525787,0.263238,0.951715,1.288376,0.097148,1.560166,90.790551,30.870502,6.873585,0.358268,42.032412,12.382412,12.382412,83.607292,0.097755,15.106312,5.279418,…,0.748557,2.926545,2.884303,92.646583,92.779494,0.763201,0.752529,5.745875,5.676796,6.979339,6.965104,14.669776,10.944916,0.084735,0.111561,108.005939,61.848052,128.218956,117.552497,14.630907,14.113107,0.105821,0.107873,113.455738,98.464543,129.964308,126.458507,0.247569,0.217341,0.029265,0.124284,0.417711,0.113229,0.159628,0.083763,0.064148,0.142169
"""min""",1.0,1.0,2.0,0.0,16.0,14.844926,0.0,137.2,82.0,-24.947222,0.0,38.6,1.2,101.0,0.01,0.0,0.0,0.1,4.0,0.3,0.21,1.0,1.0,0.0,1.0,39.0,30.0,16.2,0.0,40.0,18.0,18.0,31.0,6.96054,4.0,117.0,…,0.9,0.4,0.4,20.0,20.0,2.5,2.5,114.0,114.0,1.1,1.0898,18.4,14.9,7.05428,6.89,39.0,28.0,54.8,36.0,15.0,14.997,6.93,6.9,34.0,31.0,42.0,38.0,-1.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""25%""",32852.0,32830.0,47.0,0.0,52.0,23.641975,0.0,162.5,369.0,0.035417,0.0,66.8,2.4,113.0,203.01,0.0,0.0,0.4,13.0,0.72,0.4,3.0,6.0,0.0,4.0,97.0,86.0,28.0,0.0,54.0,34.4,34.4,77.5,7.308,11.0,135.0,…,1.1,1.3,1.3,133.0,132.0,3.7,3.7,136.0,135.0,8.6,8.6,36.0,32.0,7.341,7.27,88.1,69.0,192.285714,132.5,36.0,35.0,7.29,7.28,80.7,77.0,142.0,136.0,0.02,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""50%""",65665.0,65413.0,109.0,0.0,65.0,27.654655,0.0,170.1,504.0,0.138889,0.0,80.3,2.9,122.0,409.02,0.0,0.0,0.6,19.0,0.98,0.5,4.0,6.0,0.0,5.0,133.0,104.0,33.2,0.0,67.0,40.0,40.0,103.5,7.36,28.0,138.0,…,1.21,2.05,2.0,181.0,181.0,4.1,4.1,139.0,138.0,12.12,12.1,42.8,37.0,7.392,7.34,127.0,85.0,272.666667,205.0,42.1,41.0,7.35,7.34,120.0,107.0,223.333333,214.0,0.05,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""75%""",98342.0,98298.0,161.0,0.0,75.0,32.930374,0.0,177.8,679.0,0.409028,0.0,97.1,3.4,301.0,703.03,0.0,0.0,1.1,32.0,1.53,0.85,4.0,6.0,0.0,5.0,196.0,120.0,37.9,0.0,125.0,47.0,47.0,153.0,7.419,36.0,141.0,…,1.5,3.6,3.6,241.0,240.0,4.6,4.5,141.0,141.0,16.8,16.7,50.0,43.0,7.44,7.4,206.0,116.0,365.0,300.0,49.2,48.0,7.41,7.4,216.0,178.0,328.0,317.5,0.13,0.06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""max""",131051.0,131051.0,204.0,1.0,89.0,67.81499,1.0,195.59,927.0,159.090972,0.0,186.0,4.6,308.0,2201.05,1.0,1.0,51.0,127.0,11.18,1.0,4.0,6.0,1.0,5.0,598.7,178.0,51.4,1.0,200.0,95.0,95.0,498.0,7.59,60.0,158.0,…,6.127,18.1,18.0195,585.0,585.0,7.2,7.1,157.0,157.0,44.102,44.102,111.0,85.912,7.62,7.55786,540.865,448.892,834.805,604.227778,111.505,107.0,7.57,7.563,534.905,514.905,720.0,654.813793,0.99,0.97,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [243]:
# One line per numeric column showing its dtype and leading values.
train_pl.select(cs.numeric()).glimpse()

Rows: 91713
Columns: 178
$ encounter_id                  <i64> 66154, 114252, 119783, 79267, 92056, 33181, 82208, 120995, 80471, 42871
$ patient_id                    <i64> 25312, 59342, 50777, 46918, 34377, 74489, 49526, 50129, 10577, 90749
$ hospital_id                   <i64> 118, 81, 118, 118, 33, 83, 83, 33, 118, 118
$ hospital_death                <i64> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0
$ age                           <i64> 68, 77, 25, 81, 19, 67, 59, 70, 45, 50
$ bmi                           <f64> 22.73, 27.42, 31.95, 22.64, None, 27.56, 57.45, None, None, 25.71
$ elective_surgery              <i64> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0
$ height                        <f64> 180.3, 160.0, 172.7, 165.1, 188.0, 190.5, 165.1, 165.0, 170.2, 175.3
$ icu_id                        <i64> 92, 90, 93, 92, 91, 95, 95, 91, 114, 114
$ pre_icu_los_days              <f64> 0.541666667, 0.927777778, 0.000694444, 0.000694444, 0.073611111, 0.000694444, 0.000694444, 0.002083333, 0.009027778, 0.060416667
$ readmi

# Null values

In [267]:
# Null count per column of the raw frame, in long form, most nulls first.
(
    train_pl
    .null_count()
    .melt()
    .sort("value", descending=True)
)

variable,value
str,u32
"""h1_bilirubin_m…",84619
"""h1_bilirubin_m…",84619
"""h1_lactate_max…",84369
"""h1_lactate_min…",84369
"""h1_albumin_max…",83824
"""h1_albumin_min…",83824
"""h1_pao2fio2rat…",80195
"""h1_pao2fio2rat…",80195
"""h1_arterial_ph…",76424
"""h1_arterial_ph…",76424


In [268]:
# Shape after dropping every row that has at least one null, i.e. how many fully complete rows exist.
train_pl.drop_nulls().shape

(25, 186)

In [249]:
# Frequency of each hospital admit source (the null group included), most common first.
(
    train_pl["hospital_admit_source"]
    .value_counts(sort=True)
    .to_pandas()
)

Unnamed: 0,hospital_admit_source,counts
0,Emergency Department,36962
1,,21409
2,Operating Room,9787
3,Floor,8055
4,Direct Admit,6441
5,Recovery Room,2896
6,Acute Care/Floor,1910
7,Other Hospital,1641
8,Step-Down Unit (SDU),1131
9,PACU,1017


In [239]:
# Sanity check: after NaNs are converted to nulls and every row containing a
# null is dropped, each column's null count is zero.
(
    train_pl.fill_nan(None).drop_nulls()
    .null_count()
    .melt()
    .sort("value", descending=True)
)

variable,value
str,u32
"""encounter_id""",0
"""patient_id""",0
"""hospital_id""",0
"""hospital_death…",0
"""age""",0
"""bmi""",0
"""elective_surge…",0
"""ethnicity""",0
"""gender""",0
"""height""",0


In [244]:
def log_step(func):
    """Decorator that prints a one-line report after a pipeline step runs.

    The report contains the wrapped function's name, the ``shape`` of its
    result and the elapsed wall-clock time (a ``datetime.timedelta`` rendered
    as a string). The wrapped function's return value is passed through
    unchanged.

    Parameters
    ----------
    func : callable
        The step to wrap. Its result normally exposes ``.shape`` (e.g. a
        DataFrame); if it does not, ``n/a`` is reported instead of raising.

    Returns
    -------
    callable
        Wrapper with the same signature and metadata as ``func``.
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        tic = dt.datetime.now()
        result = func(*args, **kwargs)
        time_taken = str(dt.datetime.now() - tic)
        # A step may return something without a shape. Logging must not raise
        # AttributeError after the work is done and discard the result.
        shape = getattr(result, "shape", "n/a")
        print(f"just ran step {func.__name__} shape={shape} took {time_taken}s")
        return result
    return wrapper

In [245]:
@log_step
def count_na(df):
    """Return the null count of every column that has at least one null.

    The result is in long form: ``variable`` holds the column name and
    ``value`` its null count, ordered from most to fewest nulls.
    """
    null_counts = df.null_count().melt()
    with_nulls = null_counts.filter(pl.col("value") > 0)
    return with_nulls.sort("value", descending=True)

In [246]:
# Columns with nulls, shown as a pandas frame.
count_na(train_pl).to_pandas()

just ran step count_na shape=(175, 2) took 0:00:00.001923s


Unnamed: 0,variable,value
0,h1_bilirubin_max,84619
1,h1_bilirubin_min,84619
2,h1_lactate_max,84369
3,h1_lactate_min,84369
4,h1_albumin_max,83824
...,...,...
170,d1_sysbp_min,159
171,d1_heartrate_max,145
172,d1_heartrate_min,145
173,icu_admit_source,112


In [247]:
# Keep the per-column null counts as a pandas frame for plotting.
null_cols = count_na(train_pl).to_pandas()

just ran step count_na shape=(175, 2) took 0:00:00.001648s


In [248]:
# Bar chart of null counts, one bar per column, ordered from most to fewest nulls.
alt.Chart(null_cols).mark_bar().encode(
    x='value:Q',
    y=alt.Y('variable:N', sort=alt.EncodingSortField(field="value", op="sum", order='descending')),
)

In [219]:
# Subset of label columns whose value distributions are inspected below.
null_col_vals = train_pl.select(
    [
        "hospital_admit_source",
        "apache_3j_bodysystem",
        "apache_2_bodysystem",
        "ethnicity",
        "icu_admit_source",
        "gender",
    ]
)

In [220]:
# Display the selected columns.
null_col_vals

hospital_admit_source,apache_3j_bodysystem,apache_2_bodysystem,ethnicity,icu_admit_source,gender
str,str,str,cat,str,cat
"""Floor""","""Sepsis""","""Cardiovascular…","""Caucasian""","""Floor""","""M"""
"""Floor""","""Respiratory""","""Respiratory""","""Caucasian""","""Floor""","""F"""
"""Emergency Depa…","""Metabolic""","""Metabolic""","""Caucasian""","""Accident & Eme…","""F"""
"""Operating Room…","""Cardiovascular…","""Cardiovascular…","""Caucasian""","""Operating Room…","""F"""
,"""Trauma""","""Trauma""","""Caucasian""","""Accident & Eme…","""M"""
"""Direct Admit""","""Neurological""","""Neurologic""","""Caucasian""","""Accident & Eme…","""M"""
"""Operating Room…","""Respiratory""","""Respiratory""","""Caucasian""","""Accident & Eme…","""F"""
"""Emergency Depa…","""Sepsis""","""Cardiovascular…","""Caucasian""","""Accident & Eme…","""M"""
"""Other Hospital…","""Cardiovascular…","""Cardiovascular…","""Caucasian""","""Other Hospital…","""M"""
"""Direct Admit""","""Cardiovascular…","""Cardiovascular…",,"""Accident & Eme…","""M"""


In [221]:
# Frequency of each hospital admit source, most common first, as a pandas frame.
admit_sources = null_col_vals["hospital_admit_source"].value_counts(sort=True).to_pandas()

In [222]:
# Display the admit-source counts.
admit_sources

Unnamed: 0,hospital_admit_source,counts
0,Emergency Department,36962
1,,21409
2,Operating Room,9787
3,Floor,8055
4,Direct Admit,6441
5,Recovery Room,2896
6,Acute Care/Floor,1910
7,Other Hospital,1641
8,Step-Down Unit (SDU),1131
9,PACU,1017


In [223]:
# Count hospital admit sources and plot them, most common first.
admit_sources = (
    null_col_vals["hospital_admit_source"]
    .value_counts(sort=True)
    .to_pandas()
)

alt.Chart(admit_sources).mark_bar().encode(
    x=alt.X('counts'),
    y=alt.Y('hospital_admit_source:N', sort=alt.EncodingSortField(field="counts", order='descending')),
)

In [224]:
# Count APACHE 3J body-system labels and plot them, most common first.
apache_3j_bodysystem_counts = (
    null_col_vals["apache_3j_bodysystem"]
    .value_counts(sort=True)
    .to_pandas()
)

alt.Chart(apache_3j_bodysystem_counts).mark_bar().encode(
    x=alt.X('counts'),
    y=alt.Y('apache_3j_bodysystem:N', sort=alt.EncodingSortField(field="counts", order='descending')),
)

In [225]:
# Count APACHE 2 body-system labels and plot them, most common first.
apache_2_bodysystem_counts = (
    null_col_vals["apache_2_bodysystem"]
    .value_counts(sort=True)
    .to_pandas()
)

alt.Chart(apache_2_bodysystem_counts).mark_bar().encode(
    x=alt.X('counts'),
    y=alt.Y('apache_2_bodysystem:N', sort=alt.EncodingSortField(field="counts", order='descending')),
)

In [226]:
# Count ethnicity labels and plot them, most common first.
# Stored under its own name: the copy-pasted original assigned these counts to
# `apache_3j_bodysystem_counts`, silently overwriting the body-system counts.
ethnicity_counts = null_col_vals.select(pl.col("ethnicity")).to_series().value_counts(sort=True).to_pandas()

alt.Chart(ethnicity_counts).mark_bar().encode(
    alt.Y('ethnicity:N', sort=alt.EncodingSortField(field="counts", order='descending')),
    alt.X('counts')
)

In [227]:
# Count ICU admit sources and plot them, most common first.
icu_admit_source = (
    null_col_vals["icu_admit_source"]
    .value_counts(sort=True)
    .to_pandas()
)

alt.Chart(icu_admit_source).mark_bar().encode(
    x=alt.X('counts'),
    y=alt.Y('icu_admit_source:N', sort=alt.EncodingSortField(field="counts", order='descending')),
)

In [228]:
# Count gender labels and plot them, most common first.
gender = (
    null_col_vals["gender"]
    .value_counts(sort=True)
    .to_pandas()
)

alt.Chart(gender).mark_bar().encode(
    x=alt.X('counts'),
    y=alt.Y('gender:N', sort=alt.EncodingSortField(field="counts", order='descending')),
)