In [None]:
import os
import shutil
import pandas as pd

# import pipeline components
from pipeline import (
    _merge,
    _count,
    _find_unique_ids,
    _ingest_data,
    _retrieve_col_val,
    _merge_on_date,
    _engineer_feature,
    _feature_selection,
    _expand_feature,
    _date_to_numeric,
    _train_test_split,
    _one_hot,
    _to_numeric,
    _drop_nan,
    _ml_experiment,
    _standardize,
)

# clear/create dummy volume mount
# All intermediate parquet files of this notebook are written below this
# directory, so it is wiped and recreated to start every run from scratch.
VOLUME_MOUNT = "volume_mount"
# Only delete when something is there: a first run (nothing to delete) still
# works, while a genuine deletion failure (permissions, locked file, path is
# not a directory) now raises at its source instead of being swallowed and
# resurfacing as a confusing FileExistsError from os.makedirs below.
if os.path.exists(VOLUME_MOUNT):
    shutil.rmtree(VOLUME_MOUNT)
os.makedirs(VOLUME_MOUNT)

In [None]:
# ingest diagnoses
# Requests only the hadm_id and icd_code columns of the MIMIC-IV diagnoses
# table and targets diagnoses.parquet inside the volume mount (presumably
# _ingest_data converts the gzipped CSV to parquet — confirm in pipeline).
_ingest_data(
    "mimiciv/2.0/hosp/diagnoses_icd.csv.gz",
    f"{VOLUME_MOUNT}/diagnoses.parquet",
    usecols=["hadm_id", "icd_code"],
)

In [None]:
# select cohort
# Builds cohort.parquet from the diagnoses file, using icd_code as the column
# to match on and hadm_id as the id column.
# Presumably keeps the unique hadm_ids whose icd_code starts (first 3
# characters, per match_on_first=3) with "I50" or "428" — these look like the
# ICD-10 / ICD-9 heart-failure prefixes. TODO confirm against
# _find_unique_ids and the intended cohort definition.
_find_unique_ids(
    f"{VOLUME_MOUNT}/diagnoses.parquet",
    f"{VOLUME_MOUNT}/cohort.parquet",
    match_on=["I50", "428"],
    target_col="icd_code",
    id_cols=["hadm_id"],
    match_on_first=3,
)

In [None]:
# ingest medications/prescriptions
# Only the two id columns are requested; the following cell groups this file
# by hadm_id to count prescriptions, so no drug details are loaded.
_ingest_data(
    "mimiciv/2.0/hosp/prescriptions.csv.gz",
    f"{VOLUME_MOUNT}/prescriptions.parquet",
    usecols=["hadm_id", "subject_id"],
)

In [None]:
# count medications
# Groups the prescriptions file by hadm_id and names the result column
# "n_medications" (presumably one row per admission holding its prescription
# count — confirm in _count).
# NOTE(review): both paths are identical, so the ingested prescriptions file
# is replaced by the counted version. Re-running this cell without
# re-running the ingest would then operate on already-counted data.
_count(
    f"{VOLUME_MOUNT}/prescriptions.parquet",
    f"{VOLUME_MOUNT}/prescriptions.parquet",
    groupby="hadm_id",
    output_colname="n_medications",
)

In [None]:
# merge #medications to cohort
# Joins the per-admission medication counts onto the cohort by hadm_id. The
# first two paths are identical, so cohort.parquet is read and then
# overwritten with the merged result.
# how="left" presumably keeps every cohort admission, leaving n_medications
# empty where no prescriptions exist — confirm in _merge.
# The key is passed as a plain string, the same form every other _merge call
# in this notebook uses.
_merge(
    f"{VOLUME_MOUNT}/cohort.parquet",
    f"{VOLUME_MOUNT}/cohort.parquet",
    input_path_merge=f"{VOLUME_MOUNT}/prescriptions.parquet",
    merge_on="hadm_id",
    how="left",
)

In [None]:
# ingest admissions
# Columns requested from the admissions table: both id columns, the
# admission/discharge timestamps, the two admission descriptors, and
# hospital_expire_flag (used as the prediction target further down).
admissions_columns = [
    "subject_id",
    "hadm_id",
    "admittime",
    "dischtime",
    "admission_type",
    "admission_location",
    "hospital_expire_flag",
]
_ingest_data(
    "mimiciv/2.0/hosp/admissions.csv.gz",
    f"{VOLUME_MOUNT}/admissions.parquet",
    usecols=admissions_columns,
)

In [None]:
# merge admissions info to cohort
# Joins the admissions file onto the cohort by hadm_id (how="left") and
# writes the result back to cohort.parquet. The admissions file carries
# subject_id, which the later patient and OMR merges use as their key.
_merge(
    f"{VOLUME_MOUNT}/cohort.parquet",
    f"{VOLUME_MOUNT}/cohort.parquet",
    f"{VOLUME_MOUNT}/admissions.parquet",
    merge_on="hadm_id",
    how="left",
)

In [None]:
# ingest patient demographics
# Requests subject_id (the join key for the next cell) plus the two
# demographic columns later used as model features: anchor_age and gender.
_ingest_data(
    "mimiciv/2.0/hosp/patients.csv.gz",
    f"{VOLUME_MOUNT}/patients.parquet",
    usecols=[
        "subject_id",
        "anchor_age",
        "gender",
    ],
)

In [None]:
# merge patient demographics to cohort
# Patient-level join: keyed on subject_id rather than hadm_id, so the same
# demographics are presumably attached to every admission of a patient.
# cohort.parquet is overwritten with the merged result.
_merge(
    f"{VOLUME_MOUNT}/cohort.parquet",
    f"{VOLUME_MOUNT}/cohort.parquet",
    f"{VOLUME_MOUNT}/patients.parquet",
    merge_on="subject_id",
    how="left",
)

In [None]:
# ingesting diagnoses data
# Second ingest of the same diagnoses CSV, this time requesting hadm_id and
# seq_num so that diagnoses can be counted per admission.
# NOTE(review): this targets the same diagnoses.parquet path as the first
# ingest (hadm_id, icd_code). Assuming _ingest_data overwrites its output,
# the icd_code version is lost, so the "select cohort" cell cannot be re-run
# after this one without repeating the first ingest. A separate output file
# would avoid the clobbering.
_ingest_data(
    "mimiciv/2.0/hosp/diagnoses_icd.csv.gz",
    f"{VOLUME_MOUNT}/diagnoses.parquet",
    usecols=[
        "hadm_id",
        "seq_num",
    ],
)

In [None]:
# count diagnoses
# Groups the diagnoses file by hadm_id and names the result column
# "n_diagnoses" (presumably the number of diagnosis rows per admission —
# confirm in _count). Input and output paths are identical, so the file is
# replaced by its counted version.
_count(
    f"{VOLUME_MOUNT}/diagnoses.parquet",
    f"{VOLUME_MOUNT}/diagnoses.parquet",
    groupby="hadm_id",
    output_colname="n_diagnoses",
)

In [None]:
# merge #diagnoses to cohort
# Joins the per-admission diagnosis counts onto the cohort by hadm_id
# (how="left") and writes the result back to cohort.parquet.
_merge(
    f"{VOLUME_MOUNT}/cohort.parquet",
    f"{VOLUME_MOUNT}/cohort.parquet",
    f"{VOLUME_MOUNT}/diagnoses.parquet",
    merge_on="hadm_id",
    how="left",
)

In [None]:
# ingest omr data
# The OMR table is requested in long form: one (result_name, result_value)
# pair per subject_id and chartdate. The next cell extracts the BMI and blood
# pressure records from it.
_ingest_data(
    "mimiciv/2.0/hosp/omr.csv.gz",
    f"{VOLUME_MOUNT}/omr.parquet",
    usecols=["subject_id", "chartdate", "result_name", "result_value"],
)

In [None]:
# retrieve BMI and BP from omr data
# Both extractions read the same OMR file and differ only in the search
# string and the output file, so the shared lookup settings are stated once.
omr_source = f"{VOLUME_MOUNT}/omr.parquet"
omr_lookup = {
    "match_type": "startswith",
    "name_col": "result_name",
    "val_col": "result_value",
}

# BMI records -> bmi.parquet
_retrieve_col_val(
    omr_source,
    f"{VOLUME_MOUNT}/bmi.parquet",
    retrieve_string="BMI",
    **omr_lookup,
)

# blood pressure records -> bp.parquet
_retrieve_col_val(
    omr_source,
    f"{VOLUME_MOUNT}/bp.parquet",
    retrieve_string="Blood Pressure",
    **omr_lookup,
)

In [None]:
# merge bmi to cohort
# Date-aware merge: within the same subject_id, the cohort's admittime is
# matched against the BMI record's chartdate. cohort.parquet is overwritten
# with the result.
# NOTE(review): direction="nearest" presumably allows a measurement charted
# after admission to be picked (as in pandas.merge_asof) — confirm that
# post-admission values are acceptable as model features.
_merge_on_date(
    f"{VOLUME_MOUNT}/cohort.parquet",
    f"{VOLUME_MOUNT}/cohort.parquet",
    input_path_merge=f"{VOLUME_MOUNT}/bmi.parquet",
    date_col="admittime",
    date_col_merge="chartdate",
    merge_on="subject_id",
    direction="nearest",
)

In [None]:
# merge bp to cohort
# Same date-aware merge as for BMI, here with the blood pressure records:
# admittime is matched against chartdate within the same subject_id, and
# cohort.parquet is overwritten with the result.
# NOTE(review): direction="nearest" presumably may select a reading charted
# after admission — confirm this is intended.
_merge_on_date(
    f"{VOLUME_MOUNT}/cohort.parquet",
    f"{VOLUME_MOUNT}/cohort.parquet",
    input_path_merge=f"{VOLUME_MOUNT}/bp.parquet",
    date_col="admittime",
    date_col_merge="chartdate",
    merge_on="subject_id",
    direction="nearest",
)

In [None]:
# create length-of-stay feature (first need to turn dates into numeric format)
los_cohort_file = f"{VOLUME_MOUNT}/cohort.parquet"
discharge_col, admit_col = "dischtime", "admittime"

# step 1: make both timestamps numeric so they can be subtracted
_date_to_numeric(
    los_cohort_file,
    los_cohort_file,
    columns=[discharge_col, admit_col],
)

# step 2: discharge minus admission; later cells refer to the result column
# as "dischtime_subtract_admittime"
_engineer_feature(
    los_cohort_file,
    los_cohort_file,
    feature1=discharge_col,
    feature2=admit_col,
    operation="subtract",
)

In [None]:
# split BP to systolic/diastolic
# Splits the "Blood Pressure" column on "/". Later cells refer to the parts
# as "Blood Pressure_0" and "Blood Pressure_1", so the helper presumably
# names its outputs <feature>_<index> (0 = systolic, 1 = diastolic for a
# "systolic/diastolic" reading — confirm in _expand_feature).
_expand_feature(
    f"{VOLUME_MOUNT}/cohort.parquet",
    f"{VOLUME_MOUNT}/cohort.parquet",
    feature="Blood Pressure",
    expand_on="/",
)

In [None]:
# select required features
# The column list is assembled from themed groups; concatenation order is
# the order in which the columns are passed to the helper.
demographic_cols = ["anchor_age", "gender"]
measurement_cols = ["BMI", "Blood Pressure_0", "Blood Pressure_1"]
count_cols = ["n_medications", "n_diagnoses"]
admission_cols = [
    "admission_type",
    "admission_location",
    "dischtime_subtract_admittime",
]
outcome_cols = ["hospital_expire_flag"]

_feature_selection(
    f"{VOLUME_MOUNT}/cohort.parquet",
    f"{VOLUME_MOUNT}/cohort.parquet",
    columns=(
        demographic_cols
        + measurement_cols
        + count_cols
        + admission_cols
        + outcome_cols
    ),
)

In [None]:
# numerically encode all features
encoding_cohort_file = f"{VOLUME_MOUNT}/cohort.parquet"

# columns passed to the numeric conversion (includes the target flag)
numeric_cols = [
    "anchor_age",
    "BMI",
    "Blood Pressure_0",
    "Blood Pressure_1",
    "n_medications",
    "n_diagnoses",
    "dischtime_subtract_admittime",
    "hospital_expire_flag",
]
# categorical columns passed to the one-hot encoder
categorical_cols = ["gender", "admission_type", "admission_location"]

_to_numeric(encoding_cohort_file, encoding_cohort_file, columns=numeric_cols)

_one_hot(encoding_cohort_file, encoding_cohort_file, columns=categorical_cols)

In [None]:
# drop all NaN values
# NOTE(review): the _merge calls above all use how="left", so admissions
# without e.g. prescriptions or a BMI/BP record presumably carry NaN and are
# removed here rather than imputed — confirm this cohort restriction is
# intended (a missing medication count could arguably be 0 instead).
_drop_nan(
    f"{VOLUME_MOUNT}/cohort.parquet",
    f"{VOLUME_MOUNT}/cohort.parquet",
)

In [None]:
# standardize numerical columns
# Lists the continuous features only; the target (hospital_expire_flag) and
# the one-hot encoded categorical columns are not included.
# NOTE(review): this runs on the full cohort before the train/test split
# further down. If _standardize derives its scaling statistics from the data
# it is given, test rows influence the scaling of the training data (data
# leakage). Confirm, and consider fitting the scaling on the training split
# only.
_standardize(
    f"{VOLUME_MOUNT}/cohort.parquet",
    f"{VOLUME_MOUNT}/cohort.parquet",
    columns=[
        "anchor_age",
        "BMI",
        "Blood Pressure_0",
        "Blood Pressure_1",
        "n_medications",
        "n_diagnoses",
        "dischtime_subtract_admittime",
    ],
)

In [None]:
# load and print current cohort
# Sanity check before modelling: reads the fully processed cohort back into
# a DataFrame; the bare `cohort` expression on the last line makes the
# notebook render it.
cohort = pd.read_parquet(f"{VOLUME_MOUNT}/cohort.parquet")
cohort

In [None]:
# split cohort into train and test set
# Takes cohort.parquet and targets train.parquet / test.parquet.
# train_size=0.7 and stratify="hospital_expire_flag" presumably give a 70/30
# split that keeps the outcome ratio equal in both parts, and seed=0
# presumably fixes the shuffle for reproducibility — confirm in
# _train_test_split.
_train_test_split(
    f"{VOLUME_MOUNT}/cohort.parquet",
    f"{VOLUME_MOUNT}/train.parquet",
    output_path_test=f"{VOLUME_MOUNT}/test.parquet",
    train_size=0.7,
    stratify="hospital_expire_flag",
    seed=0,
)

In [None]:
# perform ML experiment
# Runs a "LogisticRegression" model with hospital_expire_flag as the target,
# given train.parquet and test.parquet; the second path (ml_results.parquet)
# is read back in the next cell as the experiment's result.
# model_hparams={} supplies no hyperparameter overrides, so the model
# presumably runs with its library defaults — confirm in _ml_experiment.
_ml_experiment(
    f"{VOLUME_MOUNT}/train.parquet",
    f"{VOLUME_MOUNT}/ml_results.parquet",
    input_path_test=f"{VOLUME_MOUNT}/test.parquet",
    target_col="hospital_expire_flag",
    model_name="LogisticRegression",
    model_hparams={},
)

In [None]:
# load and print ML results
# Reads the results file written by the ML experiment cell; the bare `score`
# expression on the last line makes the notebook render it.
score = pd.read_parquet(f"{VOLUME_MOUNT}/ml_results.parquet")
score