In [1]:
from pathlib import Path

# Root of the uncompressed source data, resolved against the current working directory.
data_dir = Path.cwd().joinpath("data", "uncompressed")

# Dataset name -> directory of that dataset's release below data_dir.
# The keys are looked up by config name when the tables are processed.
dir_mapping = {}
dir_mapping["eicu"] = data_dir.joinpath("eicu-crd", "2.0")
dir_mapping["mimic-iv"] = data_dir.joinpath("mimiciv", "3.1")

# Destination folder used by the later cells, also relative to the working directory.
output_dir = Path.cwd().joinpath("output")

In [2]:
from open_icu.config.dataset.meds import MEDSDataset

# Set up the MEDS dataset project rooted at output_dir.
# NOTE(review): overwrite=True presumably replaces an existing project at that
# path on every run — confirm against MEDSDataset before re-running this cell
# on output you want to keep.
project = MEDSDataset(project_path=output_dir, overwrite=True)

# Write the project metadata, passing an empty mapping.
project.write_metadata(dict())

In [None]:
from pathlib import Path

from open_icu.config.dataset.source.registry import DatasetConfigRegistry

# Build the registry of dataset configs from <parent of cwd>/config/dataset.
# NOTE(review): configs are resolved from the PARENT of the working directory,
# whereas data_dir and output_dir are resolved from the working directory
# itself — confirm this difference is intended.
registry = DatasetConfigRegistry.from_config_path(
    Path.cwd().parent.joinpath("config", "dataset")
)

# The registered configs; the processing and row-count loops iterate over these.
configs = registry.values()

In [None]:
from open_icu.transform.processor import process_table

# Call process_table once for every table of every registered dataset config.
# Each call receives the table, the dataset's source directory from
# dir_mapping, output_dir, and the dataset name.
# NOTE(review): process_table presumably writes its results below output_dir
# (a later cell reads parquet files from there) — confirm.
for config in configs:
    # The source directory depends only on the dataset, so look it up once
    # per config instead of once per table.
    source_dir = dir_mapping.get(config.name)
    for table in config.tables:
        print(f"{config.name} - Processing table: {table.name}")
        # Explicit check rather than `assert`: assertions are removed when
        # Python runs with -O, and this error names the dataset at fault.
        if source_dir is None:
            raise KeyError(
                f"No source directory configured for dataset {config.name!r}; "
                f"known datasets: {sorted(dir_mapping)}"
            )
        process_table(
            table,
            source_dir,
            output_dir,
            config.name,
        )
# 4:16
# NOTE(review): "4:16" is presumably the wall-clock duration of a full run — confirm.

eicu - Processing table: patient
eicu - Processing table: vitalPeriodic
eicu - Processing table: infusionDrug
mimic-iv - Processing table: outputevents
mimic-iv - Processing table: emar
mimic-iv - Processing table: patients
mimic-iv - Processing table: omr
mimic-iv - Processing table: icustays
mimic-iv - Processing table: datetimeevent
mimic-iv - Processing table: hcpcsevents
mimic-iv - Processing table: procedureevents
mimic-iv - Processing table: chartevents
mimic-iv - Processing table: ingredientevents
mimic-iv - Processing table: medications
mimic-iv - Processing table: admissions


In [9]:
import polars as pl

# Report the number of rows in each event's parquet file, located at
# output_dir/data/<dataset>/<table>/<event>.parquet.
for config in configs:
    for table in config.tables:
        table_dir = output_dir / "data" / config.name / table.name
        for event in table.events:
            # Lazy scan: only the row count is collected, not the rows themselves.
            n_rows = (
                pl.scan_parquet(table_dir / f"{event.name}.parquet")
                .select(pl.len())
                .collect()
                .item()
            )
            print(f"{config.name} - {table.name} - {event.name}: {n_rows:,}")

# NOTE(review): the counts below lack the thousands separators this cell now
# prints, so they presumably come from an earlier run — confirm or remove.
# eicu - patient - icu_admission: 200859
# eicu - patient - icu_discharge: 200859
# eicu - vitalPeriodic - heartrate: 146671642
# eicu - infusionDrug - drugamount: 4803719
# mimic-iv - icustays - icu_admission: 94458
# mimic-iv - icustays - icu_discharge: 94458
# mimic-iv - icustays - icu_length_of_stay: 94458
# mimic-iv - chartevents - chartevent: 432997491
# mimic-iv - medications - dosage: 10953713
# mimic-iv - medications - rate: 6056482

eicu - patient - icu_admission: 200,859
eicu - patient - icu_discharge: 200,859
eicu - patient - gender: 200,859
eicu - patient - age: 200,859
eicu - vitalPeriodic - heartrate: 146,671,642
eicu - infusionDrug - drugamount: 4,803,719
mimic-iv - outputevents - caregiver_store: 5,359,395
mimic-iv - outputevents - dosage: 5,359,395
mimic-iv - emar - diagnoses: 42,808,593
mimic-iv - emar - scheduletime: 42,808,593
mimic-iv - patients - dob: 364,627
mimic-iv - patients - dod: 364,627
mimic-iv - patients - gender: 364,627
mimic-iv - omr - value: 7,753,027
mimic-iv - icustays - icu_admission: 94,458
mimic-iv - icustays - icu_discharge: 94,458
mimic-iv - icustays - icu_length_of_stay: 94,458
mimic-iv - datetimeevent - caregiver_store: 9,979,761
mimic-iv - datetimeevent - datetimeevent: 9,979,761
mimic-iv - hcpcsevents - hpcsevent: 186,074
mimic-iv - procedureevents - caregiver_store: 808,706
mimic-iv - procedureevents - event_start: 808,706
mimic-iv - procedureevents - event_end: 808,706
mimic-

In [None]:
# Preview the first rows of metadata/codes.parquet under output_dir.
# Only the head of the lazy scan is collected.
(
    pl.scan_parquet(output_dir.joinpath("metadata", "codes.parquet"))
    .head()
    .collect()
)

In [None]:
# Preview the first five rows of the mimic-iv chartevents "chartevent" file.
# Only those rows are collected from the lazy scan.
(
    pl.scan_parquet(
        output_dir.joinpath("data", "mimic-iv", "chartevents", "chartevent.parquet")
    )
    .head(5)
    .collect()
)