In [1]:
from pathlib import Path
from pprint import pprint

import polars as pl
from hydra import compose, initialize
from omegaconf import OmegaConf, DictConfig

In [2]:
with initialize(version_base="1.3.2",
                config_path="../src/config",
                job_name="test_flow"):
    cfg = compose(config_name="config")

In [3]:
print(OmegaConf.to_yaml(cfg))

target_var: hospital_death
train_test_split:
  random_state: 42
  train_ratio: 0.7
  validation_ratio: 0.15
  test_ratio: 0.15
base_path: /Users/zacklarsen/Documents/Projects/kaggle-wids-datathon-2020
data_path: ${base_path}/data
model_path: ${base_path}/models
paths:
  mlflow:
    mlruns: ${base_path}/mlruns
  data:
    raw: ${data_path}/training_v2.csv
    clean: ${data_path}/clean.parquet
    X_train: ${data_path}/X_train.parquet
    X_test: ${data_path}/X_test.parquet
    X_validation: ${data_path}/X_validation.parquet
    y_train: ${data_path}/y_train.parquet
    y_test: ${data_path}/y_test.parquet
    y_validation: ${data_path}/y_validation.parquet
  models:
    xgboost: ${model_path}/xgboost
    lightgbm: ${model_path}/lightgbm
    catboost: ${model_path}/catboost
    pycaret: ${model_path}/pycaret
    tpot: ${model_path}/tpot



In [4]:
raw_path = cfg.paths.data.raw
clean_path = cfg.paths.data.clean
X_train_path = cfg.paths.data.X_train
X_test_path = cfg.paths.data.X_test
X_validation_path = cfg.paths.data.X_validation
y_train_path = cfg.paths.data.y_train
y_test_path = cfg.paths.data.y_test
y_validation_path = cfg.paths.data.y_validation

In [5]:
raw_path

'/Users/zacklarsen/Documents/Projects/kaggle-wids-datathon-2020/data/training_v2.csv'

In [6]:
Path(raw_path)

PosixPath('/Users/zacklarsen/Documents/Projects/kaggle-wids-datathon-2020/data/training_v2.csv')

In [7]:
Path(cfg.paths.data.raw)

PosixPath('/Users/zacklarsen/Documents/Projects/kaggle-wids-datathon-2020/data/training_v2.csv')

In [8]:
cfg.target_var

'hospital_death'

In [9]:
pl.read_csv(raw_path, infer_schema_length=10000).glimpse()

Rows: 91713
Columns: 186
$ encounter_id                  <i64> 66154, 114252, 119783, 79267, 92056, 33181, 82208, 120995, 80471, 42871
$ patient_id                    <i64> 25312, 59342, 50777, 46918, 34377, 74489, 49526, 50129, 10577, 90749
$ hospital_id                   <i64> 118, 81, 118, 118, 33, 83, 83, 33, 118, 118
$ hospital_death                <i64> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0
$ age                           <str> '68', '77', '25', '81', '19', '67', '59', '70', '45', '50'
$ bmi                           <str> '22.73', '27.42', '31.95', '22.64', 'NA', '27.56', '57.45', 'NA', 'NA', '25.71'
$ elective_surgery              <i64> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0
$ ethnicity                     <str> 'Caucasian', 'Caucasian', 'Caucasian', 'Caucasian', 'Caucasian', 'Caucasian', 'Caucasian', 'Caucasian', 'Caucasian', None
$ gender                        <str> 'M', 'F', 'F', 'F', 'M', 'M', 'F', 'M', 'M', 'M'
$ height                        <str> '180.3', '160', '172.7', '165.1', '188', '