In [3]:
import os
import shutil
import pandas as pd

# import pipeline components
from pipeline import (
    _merge,
    _count,
    _find_unique_ids,
    _ingest_data,
    _retrieve_col_val,
    _merge_on_date,
    _engineer_feature,
    _feature_selection,
    _expand_feature,
    _date_to_numeric,
    _train_test_split,
    _one_hot,
    _to_numeric,
    _drop_nan,
    _ml_experiment,
    _standardize,
)

# Clear/create the dummy volume mount: the local directory that every pipeline
# step in this notebook reads its inputs from and writes its outputs to.
VOLUME_MOUNT = "volume_mount"
# Best-effort removal of whatever a previous run left behind.
shutil.rmtree(VOLUME_MOUNT, ignore_errors=True)
# exist_ok=True because the removal above swallows errors: if the directory
# could not be (fully) deleted it still exists, and a plain makedirs() would
# then abort the notebook with FileExistsError.
os.makedirs(VOLUME_MOUNT, exist_ok=True)

In [4]:
# Ingest the diagnoses table from the MIMIC-IV hosp folder into the volume
# mount. Only the admission id (`hadm_id`) and the diagnosis code (`icd_code`)
# are requested; these are the two columns the cohort selection cell uses.
_ingest_data(
    "mimiciv/2.0/hosp/diagnoses_icd.csv.gz",
    f"{VOLUME_MOUNT}/diagnoses.parquet",
    usecols=["hadm_id", "icd_code"],
)

In [5]:
# Select the cohort: read the ingested diagnoses and write the matching
# admission ids (`hadm_id`) to cohort.parquet.
# NOTE(review): "I50" / "428" are presumably the ICD-10 / ICD-9 heart-failure
# code prefixes, and `match_on_first=3` presumably compares only the first
# three characters of `icd_code` -- confirm against the pipeline helper.
_find_unique_ids(
    f"{VOLUME_MOUNT}/diagnoses.parquet",
    f"{VOLUME_MOUNT}/cohort.parquet",
    match_on=["I50", "428"],
    target_col="icd_code",
    id_cols=["hadm_id"],
    match_on_first=3,
)

In [6]:
# Ingest the medications/prescriptions table. Only the id columns are
# requested because the table is used solely to count rows per admission in
# the next cell.
_ingest_data(
    "mimiciv/2.0/hosp/prescriptions.csv.gz",
    f"{VOLUME_MOUNT}/prescriptions.parquet",
    usecols=["hadm_id", "subject_id"],
)

In [7]:
# Count medications per admission and store the result as `n_medications`.
# NOTE(review): input and output are the same file, so the ingested
# prescriptions are replaced by the counts; re-running this cell on its own
# would presumably count the already-aggregated file -- re-run the ingest
# cell first.
_count(
    f"{VOLUME_MOUNT}/prescriptions.parquet",
    f"{VOLUME_MOUNT}/prescriptions.parquet",
    groupby="hadm_id",
    output_colname="n_medications",
)

In [8]:
# Merge the per-admission medication counts into the cohort (left merge on
# `hadm_id`). The cohort file is both input and output, i.e. it is updated
# in place.
_merge(
    f"{VOLUME_MOUNT}/cohort.parquet",
    f"{VOLUME_MOUNT}/cohort.parquet",
    input_path_merge=f"{VOLUME_MOUNT}/prescriptions.parquet",
    merge_on=["hadm_id"],
    how="left",
)

In [9]:
# Ingest the admissions table. The requested columns supply the patient id
# (needed for later merges on `subject_id`), the admit/discharge timestamps
# (used for the length-of-stay feature), two categorical admission descriptors
# and `hospital_expire_flag`, which is the prediction target further down.
_ingest_data(
    "mimiciv/2.0/hosp/admissions.csv.gz",
    f"{VOLUME_MOUNT}/admissions.parquet",
    usecols=[
        "subject_id",
        "hadm_id",
        "admittime",
        "dischtime",
        "admission_type",
        "admission_location",
        "hospital_expire_flag",
    ],
)

In [10]:
# Attach the ingested admission columns to the cohort: left merge keyed on the
# admission id, with the cohort file rewritten in place.
_merge(
    f"{VOLUME_MOUNT}/cohort.parquet",
    f"{VOLUME_MOUNT}/cohort.parquet",
    input_path_merge=f"{VOLUME_MOUNT}/admissions.parquet",
    merge_on="hadm_id",
    how="left",
)

In [11]:
# Ingest patient demographics: the patient id used as merge key plus the two
# demographic columns (`anchor_age`, `gender`) that end up as model features.
_ingest_data(
    "mimiciv/2.0/hosp/patients.csv.gz",
    f"{VOLUME_MOUNT}/patients.parquet",
    usecols=[
        "subject_id",
        "anchor_age",
        "gender",
    ],
)

In [12]:
# Attach the demographic columns to the cohort: left merge keyed on the
# patient id (available in the cohort since the admissions merge), with the
# cohort file rewritten in place.
_merge(
    f"{VOLUME_MOUNT}/cohort.parquet",
    f"{VOLUME_MOUNT}/cohort.parquet",
    input_path_merge=f"{VOLUME_MOUNT}/patients.parquet",
    merge_on="subject_id",
    how="left",
)

In [13]:
# Ingest the diagnoses table a second time, now with `hadm_id` and `seq_num`,
# as input for the per-admission diagnosis count in the next cell.
# NOTE(review): this targets the same diagnoses.parquet path that the first
# ingest cell wrote (with `icd_code`), so that earlier file is presumably
# replaced; cohort selection cannot be re-run after this cell without
# re-ingesting.
_ingest_data(
    "mimiciv/2.0/hosp/diagnoses_icd.csv.gz",
    f"{VOLUME_MOUNT}/diagnoses.parquet",
    usecols=[
        "hadm_id",
        "seq_num",
    ],
)

In [14]:
# Count diagnoses per admission and store the result as `n_diagnoses`.
# Input and output are the same file, so the ingested rows are replaced by
# the counts.
_count(
    f"{VOLUME_MOUNT}/diagnoses.parquet",
    f"{VOLUME_MOUNT}/diagnoses.parquet",
    groupby="hadm_id",
    output_colname="n_diagnoses",
)

In [15]:
# Attach the per-admission diagnosis counts to the cohort: left merge keyed on
# the admission id, with the cohort file rewritten in place.
_merge(
    f"{VOLUME_MOUNT}/cohort.parquet",
    f"{VOLUME_MOUNT}/cohort.parquet",
    input_path_merge=f"{VOLUME_MOUNT}/diagnoses.parquet",
    merge_on="hadm_id",
    how="left",
)

In [16]:
# Ingest the omr table in long format: one named measurement
# (`result_name` / `result_value`) per patient and chart date. BMI and blood
# pressure are extracted from it in the next cell.
_ingest_data(
    "mimiciv/2.0/hosp/omr.csv.gz",
    f"{VOLUME_MOUNT}/omr.parquet",
    usecols=["subject_id", "chartdate", "result_name", "result_value"],
)

In [17]:
# Retrieve BMI from the omr data: rows whose `result_name` starts with "BMI"
# are written to bmi.parquet. The measurement is later referenced as a column
# named "BMI".
_retrieve_col_val(
    f"{VOLUME_MOUNT}/omr.parquet",
    f"{VOLUME_MOUNT}/bmi.parquet",
    retrieve_string="BMI",
    match_type="startswith",
    name_col="result_name",
    val_col="result_value",
)

# Retrieve blood pressure from the omr data in the same way; the measurement
# is later referenced as a column named "Blood Pressure".
_retrieve_col_val(
    f"{VOLUME_MOUNT}/omr.parquet",
    f"{VOLUME_MOUNT}/bp.parquet",
    retrieve_string="Blood Pressure",
    match_type="startswith",
    name_col="result_name",
    val_col="result_value",
)

In [18]:
# Merge BMI into the cohort by patient (`subject_id`), pairing each admission
# time with a BMI chart date.
# NOTE(review): `direction="nearest"` presumably picks the closest chart date
# before OR after admission, so a post-admission measurement may be used --
# confirm this is intended.
_merge_on_date(
    f"{VOLUME_MOUNT}/cohort.parquet",
    f"{VOLUME_MOUNT}/cohort.parquet",
    input_path_merge=f"{VOLUME_MOUNT}/bmi.parquet",
    date_col="admittime",
    date_col_merge="chartdate",
    merge_on="subject_id",
    direction="nearest",
)

In [19]:
# Merge blood pressure into the cohort with the same settings as the BMI
# merge: per patient (`subject_id`), admission time against chart date,
# `direction="nearest"`.
_merge_on_date(
    f"{VOLUME_MOUNT}/cohort.parquet",
    f"{VOLUME_MOUNT}/cohort.parquet",
    input_path_merge=f"{VOLUME_MOUNT}/bp.parquet",
    date_col="admittime",
    date_col_merge="chartdate",
    merge_on="subject_id",
    direction="nearest",
)

In [20]:
# Create the length-of-stay feature in two steps, both rewriting the cohort
# file in place.
# Step 1: turn the two timestamp columns into a numeric format so they can be
# subtracted.
_date_to_numeric(
    f"{VOLUME_MOUNT}/cohort.parquet",
    f"{VOLUME_MOUNT}/cohort.parquet",
    columns=["dischtime", "admittime"],
)
# Step 2: discharge time minus admission time. The result is referenced
# downstream under the column name "dischtime_subtract_admittime".
_engineer_feature(
    f"{VOLUME_MOUNT}/cohort.parquet",
    f"{VOLUME_MOUNT}/cohort.parquet",
    feature1="dischtime",
    feature2="admittime",
    operation="subtract",
)

In [21]:
# Split the "Blood Pressure" column on "/" into its systolic and diastolic
# parts. The parts are referenced downstream as "Blood Pressure_0" and
# "Blood Pressure_1".
_expand_feature(
    f"{VOLUME_MOUNT}/cohort.parquet",
    f"{VOLUME_MOUNT}/cohort.parquet",
    feature="Blood Pressure",
    expand_on="/",
)

In [22]:
# Select the columns required for modelling: the model features plus the
# target `hospital_expire_flag`. Id and raw timestamp columns are not listed.
_feature_selection(
    f"{VOLUME_MOUNT}/cohort.parquet",
    f"{VOLUME_MOUNT}/cohort.parquet",
    columns=[
        "anchor_age",
        "gender",
        "BMI",
        "Blood Pressure_0",
        "Blood Pressure_1",
        "n_medications",
        "n_diagnoses",
        "admission_type",
        "admission_location",
        "dischtime_subtract_admittime",
        "hospital_expire_flag",
    ],
)

In [23]:
# Numerically encode all features.
# The quantitative columns and the target are converted to numeric ...
_to_numeric(
    f"{VOLUME_MOUNT}/cohort.parquet",
    f"{VOLUME_MOUNT}/cohort.parquet",
    columns=[
        "anchor_age",
        "BMI",
        "Blood Pressure_0",
        "Blood Pressure_1",
        "n_medications",
        "n_diagnoses",
        "dischtime_subtract_admittime",
        "hospital_expire_flag",
    ],
)

# ... and the three categorical columns are one-hot encoded (the displayed
# cohort shows the resulting `gender_*` / `admission_location_*` columns).
_one_hot(
    f"{VOLUME_MOUNT}/cohort.parquet",
    f"{VOLUME_MOUNT}/cohort.parquet",
    columns=[
        "gender",
        "admission_type",
        "admission_location",
    ],
)

In [24]:
# Drop all NaN values from the cohort (file rewritten in place). No column
# subset is passed.
# NOTE(review): presumably removes every row with a missing value in any
# column -- confirm against the pipeline helper.
_drop_nan(
    f"{VOLUME_MOUNT}/cohort.parquet",
    f"{VOLUME_MOUNT}/cohort.parquet",
)

In [25]:
# Standardize the numerical feature columns. The target
# `hospital_expire_flag` and the one-hot columns are not listed and so are
# left as they are.
# NOTE(review): this runs on the full cohort BEFORE the train/test split
# further down, so the scaling statistics presumably include rows that later
# land in the test set (mild data leakage). Consider fitting the scaler on the
# training split only.
_standardize(
    f"{VOLUME_MOUNT}/cohort.parquet",
    f"{VOLUME_MOUNT}/cohort.parquet",
    columns=[
        "anchor_age",
        "BMI",
        "Blood Pressure_0",
        "Blood Pressure_1",
        "n_medications",
        "n_diagnoses",
        "dischtime_subtract_admittime",
    ],
)

In [26]:
# Load the current cohort from the volume mount and display it as a sanity
# check; the bare last expression makes the notebook render the frame.
cohort = pd.read_parquet(f"{VOLUME_MOUNT}/cohort.parquet")
cohort

Unnamed: 0,anchor_age,BMI,Blood Pressure_0,Blood Pressure_1,n_medications,n_diagnoses,dischtime_subtract_admittime,hospital_expire_flag,gender_F,gender_M,...,admission_location_CLINIC REFERRAL,admission_location_EMERGENCY ROOM,admission_location_INFORMATION NOT AVAILABLE,admission_location_INTERNAL TRANSFER TO OR FROM PSYCH,admission_location_PACU,admission_location_PHYSICIAN REFERRAL,admission_location_PROCEDURE SITE,admission_location_TRANSFER FROM HOSPITAL,admission_location_TRANSFER FROM SKILLED NURSING FACILITY,admission_location_WALK-IN/SELF REFERRAL
0,1.168329,-0.140452,-0.399656,-0.652037,0.407204,1.648869,0.389772,0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,1.023764,-0.124341,1.099227,0.280191,2.262284,0.959449,1.954466,0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.602026,-0.154084,-1.383299,-1.154006,0.563093,1.235217,0.748262,0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.084088,0.003308,0.396625,0.136771,-0.465774,-1.936117,-0.507900,0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.168329,-0.140452,-0.399656,-0.652037,0.672216,0.407912,0.214962,0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43451,0.011805,-0.103272,-0.259136,-0.795457,-0.434597,-0.557276,-0.497276,0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
43452,0.156370,0.098734,1.473948,-0.221778,-0.294297,-0.281508,0.016971,0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
43453,0.156370,0.101213,0.818186,-0.293488,-0.263119,1.373101,-0.319129,0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
43454,0.156370,-0.031393,0.818186,-0.293488,-0.278708,-0.143624,-0.297881,0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [27]:
# Split the cohort into a train set (train.parquet, train_size=0.7) and a
# test set (test.parquet). The split is stratified on the target column and
# uses a fixed seed so it can be reproduced.
_train_test_split(
    f"{VOLUME_MOUNT}/cohort.parquet",
    f"{VOLUME_MOUNT}/train.parquet",
    output_path_test=f"{VOLUME_MOUNT}/test.parquet",
    train_size=0.7,
    stratify="hospital_expire_flag",
    seed=0,
)

In [28]:
# Perform the ML experiment: a LogisticRegression predicting
# `hospital_expire_flag`, given the train and test files. An empty
# `model_hparams` dict is passed, i.e. no hyperparameters are overridden here.
# Results are written to ml_results.parquet and read back in the next cell.
_ml_experiment(
    f"{VOLUME_MOUNT}/train.parquet",
    f"{VOLUME_MOUNT}/ml_results.parquet",
    input_path_test=f"{VOLUME_MOUNT}/test.parquet",
    target_col="hospital_expire_flag",
    model_name="LogisticRegression",
    model_hparams={},
)

In [29]:
# Load the ML results written by the experiment and display them; the
# rendered output shows a single `ROCAUC` column.
score = pd.read_parquet(f"{VOLUME_MOUNT}/ml_results.parquet")
score

Unnamed: 0,ROCAUC
0,0.84468
