In [86]:
import polars as pl
import pandas as pd
import hydra
from hydra import compose, initialize
from prefect import flow, get_run_logger
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [5]:
with initialize(version_base="1.3.2",
                    config_path="../src/config",
                    job_name="test_flow"):
    cfg = compose(config_name="config")

In [9]:
cfg.numeric_features

['pre_icu_los_days']

In [23]:
list(cfg.numeric_features)

['pre_icu_los_days']

In [22]:
type(cfg.numeric_features)

omegaconf.listconfig.ListConfig

In [10]:
cfg.categorical_features

['gender', 'ethnicity', 'hospital_admit_source', 'icu_admit_source']

In [41]:
numeric_features = list(cfg.numeric_features)

categorical_features = list(cfg.categorical_features)

In [42]:
numeric_features

['pre_icu_los_days']

In [43]:
categorical_features

['gender', 'ethnicity', 'hospital_admit_source', 'icu_admit_source']

In [44]:
all_features = numeric_features + categorical_features

In [45]:
all_features

['pre_icu_los_days',
 'gender',
 'ethnicity',
 'hospital_admit_source',
 'icu_admit_source']

In [13]:
X_train = pl.read_parquet(cfg.paths.data.X_train)

In [14]:
y_train = pl.read_parquet(cfg.paths.data.y_train)

In [61]:
X_train.select(all_features).null_count()

pre_icu_los_days,gender,ethnicity,hospital_admit_source,icu_admit_source
u32,u32,u32,u32,u32
0,13,841,12841,73


In [67]:
# Defining numerical and categorical pipelines
numeric_features = list(cfg.numeric_features)
numeric_transformer = Pipeline(steps=[
    #('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

In [68]:
categorical_features = list(cfg.categorical_features)
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [69]:
# Combining transformers into a ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [70]:
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

In [88]:
pipeline

In [89]:
# Fitting the pipeline on the training data
pipeline.fit(X_train.to_pandas(), y_train.to_pandas())

In [100]:
pipeline.get_feature_names_out()

array(['num__pre_icu_los_days', 'cat__gender_F', 'cat__gender_M',
       'cat__gender_None', 'cat__ethnicity_African American',
       'cat__ethnicity_Asian', 'cat__ethnicity_Caucasian',
       'cat__ethnicity_Hispanic', 'cat__ethnicity_Native American',
       'cat__ethnicity_Other/Unknown', 'cat__ethnicity_None',
       'cat__hospital_admit_source_Acute Care/Floor',
       'cat__hospital_admit_source_Chest Pain Center',
       'cat__hospital_admit_source_Direct Admit',
       'cat__hospital_admit_source_Emergency Department',
       'cat__hospital_admit_source_Floor',
       'cat__hospital_admit_source_ICU',
       'cat__hospital_admit_source_ICU to SDU',
       'cat__hospital_admit_source_Observation',
       'cat__hospital_admit_source_Operating Room',
       'cat__hospital_admit_source_Other',
       'cat__hospital_admit_source_Other Hospital',
       'cat__hospital_admit_source_Other ICU',
       'cat__hospital_admit_source_PACU',
       'cat__hospital_admit_source_Recovery Room'

In [90]:
X_train_preprocessed = pipeline.transform(X_train.to_pandas())

In [92]:
X_train_preprocessed

<55027x33 sparse matrix of type '<class 'numpy.float64'>'
	with 275135 stored elements in Compressed Sparse Row format>

In [102]:
X_train_preprocessed = pl.DataFrame(X_train_preprocessed.toarray())

In [105]:
X_train_preprocessed.columns = list(pipeline.get_feature_names_out())

In [107]:
X_train_preprocessed.limit(5)

num__pre_icu_los_days,cat__gender_F,cat__gender_M,cat__gender_None,cat__ethnicity_African American,cat__ethnicity_Asian,cat__ethnicity_Caucasian,cat__ethnicity_Hispanic,cat__ethnicity_Native American,cat__ethnicity_Other/Unknown,cat__ethnicity_None,cat__hospital_admit_source_Acute Care/Floor,cat__hospital_admit_source_Chest Pain Center,cat__hospital_admit_source_Direct Admit,cat__hospital_admit_source_Emergency Department,cat__hospital_admit_source_Floor,cat__hospital_admit_source_ICU,cat__hospital_admit_source_ICU to SDU,cat__hospital_admit_source_Observation,cat__hospital_admit_source_Operating Room,cat__hospital_admit_source_Other,cat__hospital_admit_source_Other Hospital,cat__hospital_admit_source_Other ICU,cat__hospital_admit_source_PACU,cat__hospital_admit_source_Recovery Room,cat__hospital_admit_source_Step-Down Unit (SDU),cat__hospital_admit_source_None,cat__icu_admit_source_Accident & Emergency,cat__icu_admit_source_Floor,cat__icu_admit_source_Operating Room / Recovery,cat__icu_admit_source_Other Hospital,cat__icu_admit_source_Other ICU,cat__icu_admit_source_None
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
-0.069488,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
-0.115009,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
-0.300766,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
-0.247047,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
0.089409,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
