In [22]:
import sys
from pathlib import Path
from pprint import pprint

import polars as pl
import polars.selectors as cs
from hydra import compose, initialize
from omegaconf import OmegaConf, DictConfig

sys.path.append('/Users/zacklarsen/Documents/Projects/kaggle-wids-datathon-2020/src/')

from ingest_data import ingest_raw_data
from clean_data import clean
from split_data import split, save_splits
from prefect_hydra_test import run_flow

In [2]:
with initialize(version_base="1.3.2",
                config_path="../src/config",
                job_name="test_flow"):
    cfg = compose(config_name="config")

In [3]:
print(OmegaConf.to_yaml(cfg))

target_var: hospital_death
train_test_split:
  random_state: 42
  train_ratio: 0.6
  validation_ratio: 0.15
  test_ratio: 0.25
base_path: /Users/zacklarsen/Documents/Projects/kaggle-wids-datathon-2020
data_path: ${base_path}/data
model_path: ${base_path}/models
paths:
  mlflow:
    mlruns: ${base_path}/mlruns
  data:
    raw: ${data_path}/training_v2.csv
    clean: ${data_path}/clean.parquet
    X_train: ${data_path}/X_train.parquet
    X_test: ${data_path}/X_test.parquet
    X_validation: ${data_path}/X_validation.parquet
    y_train: ${data_path}/y_train.parquet
    y_test: ${data_path}/y_test.parquet
    y_validation: ${data_path}/y_validation.parquet
  models:
    xgboost: ${model_path}/xgboost
    lightgbm: ${model_path}/lightgbm
    catboost: ${model_path}/catboost
    pycaret: ${model_path}/pycaret
    tpot: ${model_path}/tpot



In [4]:
raw_path = cfg.paths.data.raw
clean_path = cfg.paths.data.clean
X_train_path = cfg.paths.data.X_train
X_test_path = cfg.paths.data.X_test
X_validation_path = cfg.paths.data.X_validation
y_train_path = cfg.paths.data.y_train
y_test_path = cfg.paths.data.y_test
y_validation_path = cfg.paths.data.y_validation

In [5]:
raw_path

'/Users/zacklarsen/Documents/Projects/kaggle-wids-datathon-2020/data/training_v2.csv'

In [6]:
Path(raw_path)

PosixPath('/Users/zacklarsen/Documents/Projects/kaggle-wids-datathon-2020/data/training_v2.csv')

In [7]:
Path(cfg.paths.data.raw)

PosixPath('/Users/zacklarsen/Documents/Projects/kaggle-wids-datathon-2020/data/training_v2.csv')

In [8]:
cfg.target_var

'hospital_death'

In [9]:
pl.read_csv(raw_path, infer_schema_length=10000).glimpse()

Rows: 91713
Columns: 186
$ encounter_id                  <i64> 66154, 114252, 119783, 79267, 92056, 33181, 82208, 120995, 80471, 42871
$ patient_id                    <i64> 25312, 59342, 50777, 46918, 34377, 74489, 49526, 50129, 10577, 90749
$ hospital_id                   <i64> 118, 81, 118, 118, 33, 83, 83, 33, 118, 118
$ hospital_death                <i64> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0
$ age                           <str> '68', '77', '25', '81', '19', '67', '59', '70', '45', '50'
$ bmi                           <str> '22.73', '27.42', '31.95', '22.64', 'NA', '27.56', '57.45', 'NA', 'NA', '25.71'
$ elective_surgery              <i64> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0
$ ethnicity                     <str> 'Caucasian', 'Caucasian', 'Caucasian', 'Caucasian', 'Caucasian', 'Caucasian', 'Caucasian', 'Caucasian', 'Caucasian', None
$ gender                        <str> 'M', 'F', 'F', 'F', 'M', 'M', 'F', 'M', 'M', 'M'
$ height                        <str> '180.3', '160', '172.7', '165.1', '188', '

In [10]:
run_flow(cfg)

[Completed(message=None, type=COMPLETED, result=UnpersistedResult(type='unpersisted', artifact_type='result', artifact_description='Unpersisted result of type `DataFrame`')),
 Completed(message=None, type=COMPLETED, result=UnpersistedResult(type='unpersisted', artifact_type='result', artifact_description='Unpersisted result of type `dict`')),
 Completed(message=None, type=COMPLETED, result=UnpersistedResult(type='unpersisted', artifact_type='result', artifact_description='Unpersisted result of type `NoneType`'))]

In [12]:
pl.read_parquet(X_train_path).shape

(55027, 185)

In [13]:
pl.read_parquet(y_train_path).shape

(55027, 1)

In [17]:
pl.read_parquet(X_validation_path).shape

(13757, 185)

In [18]:
pl.read_parquet(y_validation_path).shape

(13757, 1)

In [15]:
pl.read_parquet(X_test_path).shape

(22929, 185)

In [16]:
pl.read_parquet(y_test_path).shape

(22929, 1)

In [19]:
X_train = pl.read_parquet(X_train_path)

In [20]:
X_train.glimpse()

Rows: 55027
Columns: 185
$ encounter_id                  <i64> 67688, 67652, 83860, 80945, 130209, 51233, 66562, 14035, 55295, 81111
$ patient_id                    <i64> 26357, 3653, 79991, 33799, 52998, 41619, 90962, 40021, 70389, 72100
$ hospital_id                   <i64> 32, 116, 128, 27, 27, 161, 68, 200, 35, 80
$ age                           <str> '44', '36', '48', '25', '82', '82', '76', '66', '75', '76'
$ bmi                           <str> '40.49584016', '31.62105548', '22.28160021', '17.67744633', '32.83746556', '22.18364198', '26.2109375', '25.80896017', '23.70531749', '21.94714431'
$ elective_surgery              <i64> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1
$ ethnicity                     <str> 'Native American', 'African American', 'Other/Unknown', 'Caucasian', 'Caucasian', 'Caucasian', None, 'Native American', 'Caucasian', 'Caucasian'
$ gender                        <str> 'M', 'F', 'F', 'M', 'M', 'F', 'F', 'M', 'M', 'F'
$ height                        <str> '173', '170.2', '149.8

In [25]:
X_train.select(cs.numeric()).glimpse()

Rows: 55027
Columns: 8
$ encounter_id          <i64> 67688, 67652, 83860, 80945, 130209, 51233, 66562, 14035, 55295, 81111
$ patient_id            <i64> 26357, 3653, 79991, 33799, 52998, 41619, 90962, 40021, 70389, 72100
$ hospital_id           <i64> 32, 116, 128, 27, 27, 161, 68, 200, 35, 80
$ elective_surgery      <i64> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1
$ icu_id                <i64> 369, 307, 335, 608, 608, 324, 134, 516, 895, 620
$ pre_icu_los_days      <f64> 0.666666667, 0.554861111, 0.098611111, 0.230555556, 1.056944444, 1.979861111, 0.120833333, 0.306944444, 0.015277778, 0.279861111
$ readmission_status    <i64> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
$ apache_post_operative <i64> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1



In [24]:
X_train.select(cs.string()).glimpse()

Rows: 55027
Columns: 177
$ age                           <str> '44', '36', '48', '25', '82', '82', '76', '66', '75', '76'
$ bmi                           <str> '40.49584016', '31.62105548', '22.28160021', '17.67744633', '32.83746556', '22.18364198', '26.2109375', '25.80896017', '23.70531749', '21.94714431'
$ ethnicity                     <str> 'Native American', 'African American', 'Other/Unknown', 'Caucasian', 'Caucasian', 'Caucasian', None, 'Native American', 'Caucasian', 'Caucasian'
$ gender                        <str> 'M', 'F', 'F', 'M', 'M', 'F', 'F', 'M', 'M', 'F'
$ height                        <str> '173', '170.2', '149.8', '183', '165', '144', '160', '180.3', '182.9', '154.9'
$ hospital_admit_source         <str> 'Step-Down Unit (SDU)', 'Floor', 'Emergency Department', 'Emergency Department', 'Emergency Department', 'Floor', 'Emergency Department', 'Emergency Department', 'Direct Admit', 'Recovery Room'
$ icu_admit_source              <str> 'Floor', 'Floor', 'Accident & Emerg

In [28]:
X_train.limit(5).to_dicts()

[{'encounter_id': 67688,
  'patient_id': 26357,
  'hospital_id': 32,
  'age': '44',
  'bmi': '40.49584016',
  'elective_surgery': 0,
  'ethnicity': 'Native American',
  'gender': 'M',
  'height': '173',
  'hospital_admit_source': 'Step-Down Unit (SDU)',
  'icu_admit_source': 'Floor',
  'icu_id': 369,
  'icu_stay_type': 'admit',
  'icu_type': 'Med-Surg ICU',
  'pre_icu_los_days': 0.666666667,
  'readmission_status': 0,
  'weight': '121.2',
  'albumin_apache': '2.3',
  'apache_2_diagnosis': '113',
  'apache_3j_diagnosis': '501.06',
  'apache_post_operative': 0,
  'arf_apache': '0',
  'bilirubin_apache': '2.6',
  'bun_apache': '24',
  'creatinine_apache': '0.77',
  'fio_2_apache': '1',
  'gcs_eyes_apache': '2',
  'gcs_motor_apache': '1',
  'gcs_unable_apache': '0',
  'gcs_verbal_apache': '1',
  'glucose_apache': '260',
  'heart_rate_apache': '139',
  'hematocrit_apache': '17.1',
  'intubated_apache': '1',
  'map_apache': '188',
  'paco_2_apache': '31',
  'paco_2_for_ph_apache': '31',
  'p