In [None]:
import os
import shutil
from pipeline import _merge, _count, _find_unique_ids, _ingest_data # import the pipeline components
# Scratch directory through which the pipeline steps exchange their parquet files.
VOLUME_MOUNT = "volume_mount"

# Start every run from an empty directory. A missing directory (first run) is
# fine; any other removal failure is raised here with its real cause instead of
# being swallowed and resurfacing as a confusing FileExistsError from makedirs.
try:
    shutil.rmtree(VOLUME_MOUNT)
except FileNotFoundError:
    pass
os.makedirs(VOLUME_MOUNT)

In [None]:
"""
This cell runs the whole pipeline: several modular components in the INSAFEDARE format, executed in succession and passing data to each other as parquet files under VOLUME_MOUNT.
"""

# Parquet artefacts handed from one step to the next; all live under the volume mount.
diagnoses_parquet = f"{VOLUME_MOUNT}/diagnoses.parquet"
cohort_parquet = f"{VOLUME_MOUNT}/cohort.parquet"
prescriptions_parquet = f"{VOLUME_MOUNT}/prescriptions.parquet"

# Step 1 - ingest the diagnoses table from the raw CSV.
print("ingest diagnoses")
_ingest_data("mimiciv/2.0/hosp/diagnoses_icd.csv.gz", diagnoses_parquet)

# Step 2 - select the cohort from the diagnoses, keyed on hadm_id.
# NOTE(review): presumably keeps the unique hadm_id values whose icd_code starts
# with "I50" or "428", comparing only the first three characters
# (match_on_first=3) - confirm against _find_unique_ids.
print("select cohort")
cohort_selection = dict(
    match_on=["I50", "428"],
    target_col="icd_code",
    id_cols=["hadm_id"],
    match_on_first=3,
)
_find_unique_ids(diagnoses_parquet, cohort_parquet, **cohort_selection)

# Step 3 - ingest the prescriptions table, restricted to the two id columns.
print("ingest medications")
_ingest_data(
    "mimiciv/2.0/hosp/prescriptions.csv.gz",
    prescriptions_parquet,
    usecols=["hadm_id", "subject_id"],
)

# Step 4 - count prescriptions per hadm_id into the "n_medications" column.
# The same path is passed twice, so the ingested table is presumably replaced
# by the counts - TODO confirm that _count treats the second argument as output.
print("count #medications")
_count(
    prescriptions_parquet,
    prescriptions_parquet,
    groupby="hadm_id",
    output_colname="n_medications",
)

# Step 5 - left-merge the counts onto the cohort on hadm_id.
# The cohort path is likewise passed twice, so the cohort file is presumably
# updated in place - TODO confirm against _merge.
print("merging #medications to cohort")
_merge(
    cohort_parquet,
    cohort_parquet,
    input_path_merge=prescriptions_parquet,
    merge_on=["hadm_id"],
    how="left",
)
