# Convert NEUROVASC into MEDS-OWL

In [None]:
import os
import joblib

# Root folder of the Neurovasc pipeline; each stage below gets its own sub-folder.
NEUROVASC = "NEUROVASC"
ETL_INPUT = f"{NEUROVASC}/raw_input"
ETL_INTERMEDIATE = f"{NEUROVASC}/pre_MEDS"
ETL_OUTPUT = f"{NEUROVASC}/MEDS_cohort"
ETL_GRAPH = f"{NEUROVASC}/graph"

# Create the working folders up front; folders that already exist are left alone.
for _stage_dir in (ETL_INPUT, ETL_INTERMEDIATE, ETL_OUTPUT, ETL_GRAPH):
    os.makedirs(_stage_dir, exist_ok=True)

# Number of source rows kept for the cohort, and the tag embedded in output file names.
NUM_PATIENTS = 10000
TIME_OPT = "NT"
# Location of the synthetic Neurovasc CSV that is loaded in step a).
SYN_NEUROVASC_DATA = "https://raw.githubusercontent.com/TeamHeKA/neurovasc/refs/heads/main/exp/data/syn_data_10000.csv"

### a) Load/Generate source dataset

In [None]:
import pandas as pd

# Kept for reference: an alternative path that appears to generate the data locally
# instead of reading the published CSV.
#from NEUROVASC.utils.synthetic_generator import generate_synthetic_dataset
#df_input = generate_synthetic_dataset(NUM_PATIENTS, output_csv=f"{ETL_INPUT}/syn_data.csv")

# Read the CSV (its first column is the index), shuffle all rows with a fixed seed so
# the order is reproducible, renumber the rows from 0 and rename the label column.
df_input = (
    pd.read_csv(SYN_NEUROVASC_DATA, index_col=0)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
    .rename(columns={'output': 'outcome'})
)

### b) Preprocess the source dataset

In [None]:
from NEUROVASC.utils.pre_MEDS import generate_meds_preprocessed

# Select the cohort first: the first NUM_PATIENTS rows of the shuffled table.
# Both the saved outcomes and the preprocessed features are derived from this
# single slice, so the outcome list always lines up row-for-row with the cohort
# (previously the outcomes of *all* rows were saved, whatever NUM_PATIENTS was).
df_input_outcome = df_input.iloc[0:NUM_PATIENTS]

# Store the labels separately as a plain list of ints.
joblib.dump(
    df_input_outcome["outcome"].astype(int).to_list(),
    f"{ETL_GRAPH}/outcomes_meds_{TIME_OPT}_{NUM_PATIENTS}.joblib",
)

# The label column is removed before the data is handed to the pre-MEDS step.
df_input_no_outcome = df_input_outcome.drop(columns=["outcome"])
generate_meds_preprocessed(df_input_no_outcome, output_path=ETL_INTERMEDIATE)

print("Neurovasc data ready for MEDS-Extract")

### c) Run MEDS_Extract ETL to convert source into MEDS

In [None]:
import shutil

from MEDS_transforms.runner import main

# Start from a clean output folder. A folder that is already gone (for example when
# this cell is re-run after a previous run removed it without recreating it) is not
# an error; any other deletion problem still raises.
try:
    shutil.rmtree(ETL_OUTPUT)
except FileNotFoundError:
    pass

# Run the MEDS-Extract pipeline on the pre-MEDS files, using the event conversion
# configuration stored next to the data.
main([
    "pkg://MEDS_extract.configs._extract.yaml",
    "--overrides",
    f"input_dir={ETL_INTERMEDIATE}",
    f"output_dir={ETL_OUTPUT}",
    f"event_conversion_config_fp={NEUROVASC}/MESSY.yaml",
    "dataset.name=Neurovasc",
    "dataset.version=1.0",
])

### d) Convert NEUROVASC_MEDS into KG through MEDS2RDF

In [None]:
from meds2rdf import MedsRDFConverter

# URL of the MEDS SHACL shapes file; it is only defined here, not used in this cell.
shacl_graph = "https://raw.githubusercontent.com/TeamHeKA/meds-ontology/refs/heads/main/shacl/meds-shapes.ttl"

# Build the RDF graph from the MEDS cohort, leaving out dataset metadata, splits and labels.
converter = MedsRDFConverter(ETL_OUTPUT)
graph = converter.convert(
    include_dataset_metadata=False,
    include_splits=False,
    include_labels=False,
)

In [None]:
# Only the N-Triples export is active; the Turtle and RDF/XML variants are kept for reference.
# graph.serialize(destination=f"{ETL_GRAPH}/meds_{TIME_OPT}_{NUM_PATIENTS}.ttl", format="turtle", encoding="UTF-8")
#graph.serialize(destination=f"{ETL_GRAPH}/meds_{TIME_OPT}_{NUM_PATIENTS}.xml", format="xml", encoding="UTF-8")
graph.serialize(
    format="nt",
    destination=f"{ETL_GRAPH}/meds_{TIME_OPT}_{NUM_PATIENTS}.nt",
)

In [None]:
from rdflib import Graph

# Read the N-Triples export of the Neurovasc graph back into a fresh rdflib graph.
g = Graph()
g.parse(
    source=f"{ETL_GRAPH}/meds_{TIME_OPT}_{NUM_PATIENTS}.nt",
    format="nt",
)

### e) Compute metrics [OPTIONAL]

In [None]:
from pathlib import Path

from NEUROVASC.utils.metrics import compute_MEDS_graph_metrics_for_neurovasc, save_stats_json

# The metrics helper receives the intermediate folder as a Path, the MEDS folder as a string.
intermediate_dir = Path(ETL_INTERMEDIATE)

# Gather statistics over the tabular input, the MEDS output and the RDF graph.
stats = compute_MEDS_graph_metrics_for_neurovasc(
    graph=g,
    tabular_data=df_input_no_outcome,
    MEDS_ETL_output_path=ETL_OUTPUT,
    MEDS_intermediate=intermediate_dir,
)

# Write the statistics as JSON inside the MEDS output folder.
save_stats_json(stats, f"{ETL_OUTPUT}/metrics/all_metrics.json")

### f) Compute consistency checks [OPTIONAL]

In [None]:
import polars as pl
from NEUROVASC.utils.neurovasc_meta import EVENTS_COLUMNS
from NEUROVASC.utils.transformers import build_neurovasc_meds_dt, build_neurovasc_medskg_dt, check_dts_consistency

# Reference table: the same first NUM_PATIENTS rows that were sent through the ETL
# in step b), so it has exactly the rows the MEDS and graph tables are built from.
synt_df = df_input.iloc[0:NUM_PATIENTS].copy()
# Event columns are reduced to 0/1 flags: 1 where the source value is greater than -1.
synt_df[EVENTS_COLUMNS] = (synt_df[EVENTS_COLUMNS] > -1).astype(int)

# NOTE: despite its name, this is the list of Neurovasc outcomes saved in step b).
# It is cut to the cohort size so the column assignments below cannot fail on a
# length mismatch if the stored list covers more rows than the cohort.
mimic_outcomes = joblib.load(f"{ETL_GRAPH}/outcomes_meds_{TIME_OPT}_{NUM_PATIENTS}.joblib")[:NUM_PATIENTS]

# Table rebuilt from the MEDS parquet shards, cast to the reference dtypes.
meds_data = pl.read_parquet(f"{ETL_OUTPUT}/data/**/*.parquet").to_dicts()
meds_df = build_neurovasc_meds_dt(meds_data)
meds_df = meds_df.astype(synt_df.dtypes.to_dict())
meds_df["outcome"] = mimic_outcomes

# Table rebuilt from the RDF graph, cast to the reference dtypes.
graph_df = build_neurovasc_medskg_dt(g)
graph_df = graph_df.astype(synt_df.dtypes.to_dict())
graph_df["outcome"] = mimic_outcomes

def remove_digits(_df: pd.DataFrame):
    """Round selected numeric columns of ``_df`` in place.

    ``hospital_stay_length``, ``nb_acte`` and ``age`` are rounded to whole
    numbers and ``gcs`` to two decimals. The frame passed in is modified
    directly; nothing is returned.
    """
    decimals_by_column = {
        "hospital_stay_length": 0,
        "nb_acte": 0,
        "age": 0,
        "gcs": 2,
    }
    for column, decimals in decimals_by_column.items():
        _df[column] = _df[column].round(decimals)

# Bring the three tables to the same numeric precision before comparing them.
for table in (synt_df, meds_df, graph_df):
    remove_digits(table)

# Pairwise comparison: MEDS vs. source, MEDS vs. graph, graph vs. source.
check_dts_consistency(meds_df, synt_df)
check_dts_consistency(meds_df, graph_df)
check_dts_consistency(graph_df, synt_df)


# Convert MIMIC-IV Demo into MEDS-OWL
[Link](https://physionet.org/content/mimic-iv-demo-meds/0.0.1/) to the PhysioNet repository.


In [None]:
# IPython shell escape: runs the MIMIC/run.sh script with bash.
# NOTE(review): presumably this produces the MIMIC/MEDS_cohort folder used below — confirm in the script.
!bash MIMIC/run.sh

In [None]:
import os 
import polars as pl

# Folders used by the MIMIC part of the notebook.
MIMIC_ETL_OUTPUT = "MIMIC/MEDS_cohort"
MIMIC_TASKS_PATH = "MIMIC/tasks"
MIMIC_ETL_GRAPH = f"{MIMIC_ETL_OUTPUT}/graph"

# Tag embedded in the output file names of this section (replaces the earlier value).
TIME_OPT = "TS"

# The graph folder lives inside the MEDS cohort folder; create it if it is missing.
os.makedirs(MIMIC_ETL_GRAPH, exist_ok=True)

In [None]:

# Run aces-cli once for every entry found in MIMIC_TASKS_PATH.
# NOTE(review): every directory entry is used, whatever its type or extension —
# confirm the folder only contains task configuration files.
for f in os.listdir(MIMIC_TASKS_PATH):
    # The cohort is named after the task file, without its extension.
    f_name = os.path.splitext(f)[0]
    # IPython shell escape; the {…} placeholders are filled from the Python variables above.
    # Labels are written under "<MIMIC_ETL_OUTPUT>/labels" and data is read from "<MIMIC_ETL_OUTPUT>/data".
    # NOTE(review): `expand_shards` is expected to be available in the shell and `-m` is
    # presumably the multi-run switch of the CLI — confirm against the aces-cli documentation.
    !aces-cli \
        config_path="{MIMIC_TASKS_PATH}/{f}" \
        cohort_name="{f_name}" \
        cohort_dir="{MIMIC_ETL_OUTPUT}/labels" \
        data=sharded \
        data.standard=meds \
        data.root="{MIMIC_ETL_OUTPUT}/data" \
        data.shard=$(expand_shards train/1 tuning/1 held_out/1) \
        -m

In [None]:
import polars as pl
import joblib

# Task labels: only the columns needed to pick one outcome per subject.
df_mimic_labels = pl.read_parquet(f"{MIMIC_ETL_OUTPUT}/labels/long_term_reccurrence/**/*.parquet")[["subject_id", "prediction_time", "boolean_value"]].to_pandas()

# One label row per subject: the row with the latest prediction_time.
patient_outcomes = df_mimic_labels.loc[df_mimic_labels.groupby('subject_id')['prediction_time'].idxmax()]

df_mimic_data = pl.read_parquet(f"{MIMIC_ETL_OUTPUT}/data/**/*.parquet").to_pandas()[["subject_id", "time", "code", "numeric_value"]] #, "text_value"]] TODO

# Inner join: events of subjects without a label are dropped, and every remaining
# event row carries its subject's prediction_time.
merged = df_mimic_data.merge(
    patient_outcomes[['subject_id', 'prediction_time']],
    on='subject_id',
    how='inner'
)

# Keep only the events recorded up to the prediction time. The explicit copy makes
# this an independent frame, so rewriting subject_id below neither touches `merged`
# nor triggers pandas' SettingWithCopyWarning.
# NOTE(review): rows whose `time` is null fail this comparison and are dropped — confirm intended.
# NOTE(review): the `prediction_time` column added by the merge is kept and written
# to the parquet file below — confirm intended.
filtered_mimic_data = merged[merged['time'] <= merged['prediction_time']].copy()

# Outcomes ordered by subject_id, matching the 0..N-1 renumbering below.
mimic_outcomes = (patient_outcomes.set_index('subject_id').sort_index())['boolean_value']

MIMIC_ETL_FILTERED = f"{MIMIC_ETL_OUTPUT}/filtered_data"
NUM_PATIENTS = len(mimic_outcomes)

os.makedirs(f"{MIMIC_ETL_FILTERED}/data/train", exist_ok=True)

# Renumber subjects 0..N-1 in ascending subject_id order, so that the i-th stored
# outcome belongs to the subject with the new id i.
subject_to_index: dict[int, int] = {subj: i for i, subj in enumerate(patient_outcomes.sort_values("subject_id")["subject_id"])}
filtered_mimic_data["subject_id"] = filtered_mimic_data["subject_id"].map(subject_to_index)
filtered_mimic_data.to_parquet(f"{MIMIC_ETL_FILTERED}/data/train/0.parquet")
joblib.dump(mimic_outcomes.astype(int).to_list(), f"{MIMIC_ETL_GRAPH}/outcomes_meds_{TIME_OPT}_{NUM_PATIENTS}.joblib")

# Every labelled subject should still have at least one event after filtering;
# report it explicitly when that is not the case instead of staying silent.
num_subjects_with_events = filtered_mimic_data["subject_id"].nunique()
if num_subjects_with_events == NUM_PATIENTS:
    print("MIMIC data are ready to be converted!")
else:
    print(
        f"WARNING: only {num_subjects_with_events} of {NUM_PATIENTS} labelled subjects "
        "have events left after filtering; the saved outcomes do not match the filtered data."
    )


In [None]:
# Path of the N-Triples file for the MIMIC graph; the file name embeds the current
# TIME_OPT and NUM_PATIENTS values.
MIMIC_OUTPUT_GRAPH = f"{MIMIC_ETL_GRAPH}/meds_{TIME_OPT}_{NUM_PATIENTS}.nt"

In [None]:
from meds2rdf import MedsRDFConverter

# The graph is built from the filtered cohort; converting the full MEDS output is left disabled.
#converter = MedsRDFConverter(MIMIC_ETL_OUTPUT)
converter = MedsRDFConverter(MIMIC_ETL_FILTERED)
graph = converter.convert(
    include_splits=False,
    include_labels=False,
    include_dataset_metadata=False,
)
# Write the result as N-Triples to the path defined for the MIMIC graph.
graph.serialize(format="nt", destination=MIMIC_OUTPUT_GRAPH)

In [None]:
from rdflib import Graph

from NEUROVASC.utils.metrics import compute_MEDS_graph_metrics, save_stats_json

# Read the serialized MIMIC graph back into a fresh rdflib graph.
g = Graph()
g.parse(source=MIMIC_OUTPUT_GRAPH, format="nt")

# NOTE(review): the graph was converted from MIMIC_ETL_FILTERED, while the metrics
# are computed against MIMIC_ETL_OUTPUT — confirm this is the intended comparison.
stats = compute_MEDS_graph_metrics(
    graph=g,
    MEDS_ETL_output_path=MIMIC_ETL_OUTPUT,
)

# Write the statistics as JSON inside the MIMIC MEDS output folder.
save_stats_json(stats, f"{MIMIC_ETL_OUTPUT}/metrics/all_metrics.json")

In [None]:
# TODO: create a SPARQL query to get (subject_id, time, code, numeric_value)

#import polars as pl
#from NEUROVASC.utils.transformers import build_neurovasc_meds_dt, build_neurovasc_medskg_dt, check_dts_consistency

# mimic_data = pl.read_parquet(str(f"{MIMIC_ETL_OUTPUT}/data/**/*.parquet")).to_dicts()
#check_dts_consistency(meds_df, graph_df)

# NEUROVASC 2.0

In [None]:
import os

# Root folder of the Neurovasc 2.0 pipeline; each stage below gets its own sub-folder.
N2 = "NEUROVASC2"
N2_ETL_INPUT = f"{N2}/raw_input"
N2_ETL_INTERMEDIATE = f"{N2}/pre_MEDS"
N2_ETL_OUTPUT = f"{N2}/MEDS_cohort"
N2_ETL_GRAPH = f"{N2}/graph"

# Create the working folders up front; folders that already exist are left alone.
for _n2_stage_dir in (N2_ETL_INPUT, N2_ETL_INTERMEDIATE, N2_ETL_OUTPUT, N2_ETL_GRAPH):
    os.makedirs(_n2_stage_dir, exist_ok=True)

# Tag embedded in the output file names of this section (replaces the earlier value).
TIME_OPT = "NT"

In [None]:
import pandas as pd

# Kept for reference from the first Neurovasc section; note that it refers to
# ETL_INPUT, not to the N2_* folders.
#from NEUROVASC.utils.synthetic_generator import generate_synthetic_dataset
#df_input = generate_synthetic_dataset(NUM_PATIENTS, output_csv=f"{ETL_INPUT}/syn_data.csv")

# Load the Neurovasc 2.0 source table from the raw input folder.
df_input = pd.read_csv(f"{N2_ETL_INPUT}/synthetic_data_sdv.csv")

# The cohort size is the number of distinct ID_PAT values in the table.
distinct_patient_ids = set(df_input["ID_PAT"])
NUM_PATIENTS = len(distinct_patient_ids)

In [None]:
# Show the loaded source table in the notebook output.
display(df_input)

In [None]:
# This import replaces the function of the same name imported from
# NEUROVASC.utils.pre_MEDS earlier in the notebook.
from NEUROVASC2.pre_MEDS import generate_meds_preprocessed

# File the outcomes are written to; the name embeds the current TIME_OPT and NUM_PATIENTS values.
outcome_file = f"{N2_ETL_GRAPH}/outcomes_meds_{TIME_OPT}_{NUM_PATIENTS}.joblib"

# The helper returns three values, unpacked here as patients, events and outcomes
# (the last one keeps the `mimic_outcomes` name used elsewhere in the notebook).
patients, events, mimic_outcomes = generate_meds_preprocessed(
    df_input,
    outcome_path=outcome_file,
    output_path=N2_ETL_INTERMEDIATE,
)

print("Neurovasc 2 data ready for MEDS-Extract")


In [None]:
import shutil

from MEDS_transforms.runner import main

# Start from a clean output folder. A folder that is already gone (for example when
# this cell is re-run after a previous run removed it without recreating it) is not
# an error; any other deletion problem still raises.
try:
    shutil.rmtree(N2_ETL_OUTPUT)
except FileNotFoundError:
    pass

# Run the MEDS-Extract pipeline on the Neurovasc 2.0 pre-MEDS files, using the event
# conversion configuration stored next to the data.
main([
    "pkg://MEDS_extract.configs._extract.yaml",
    "--overrides",
    f"input_dir={N2_ETL_INTERMEDIATE}",
    f"output_dir={N2_ETL_OUTPUT}",
    f"event_conversion_config_fp={N2}/MESSY.yaml",
    "dataset.name=Neurovasc",
    "dataset.version=2.0",
])

In [None]:
import polars as pl

# Read every parquet file below the Neurovasc 2.0 MEDS data folder and convert the
# rows with `to_dicts()` for inspection.
n2_shards_glob = f"{N2_ETL_OUTPUT}/data/**/*.parquet"
meds_data = pl.read_parquet(n2_shards_glob).to_dicts()

In [None]:
# Show the MEDS rows as a pandas table in the notebook output.
display(pd.DataFrame(meds_data))

In [None]:
from meds2rdf import MedsRDFConverter

# Unlike the conversions above, this one also includes dataset metadata, splits and labels.
converter = MedsRDFConverter(N2_ETL_OUTPUT)
graph = converter.convert(
    include_splits=True,
    include_labels=True,
    include_dataset_metadata=True,
)
# Write the result as N-Triples into the Neurovasc 2.0 graph folder.
graph.serialize(
    format="nt",
    destination=f"{N2_ETL_GRAPH}/neurovasc_graph.nt",
)

In [None]:
from rdflib import Graph

from NEUROVASC.utils.metrics import compute_MEDS_graph_metrics, save_stats_json

# Read the serialized Neurovasc 2.0 graph back into a fresh rdflib graph.
g = Graph()
g.parse(source=f"{N2_ETL_GRAPH}/neurovasc_graph.nt", format="nt")

# Gather statistics over the MEDS output folder and the graph built from it.
stats = compute_MEDS_graph_metrics(
    graph=g,
    MEDS_ETL_output_path=N2_ETL_OUTPUT,
)

# Write the statistics as JSON inside the Neurovasc 2.0 MEDS output folder.
save_stats_json(stats, f"{N2_ETL_OUTPUT}/metrics/all_metrics.json")