# Convert NEUROVASC into MEDS-OWL

In [None]:
import os
import joblib

# Root folder of the NEUROVASC ETL and the folder used by each ETL stage.
NEUROVASC = "NEUROVASC"
ETL_OUTPUT = f"{NEUROVASC}/MEDS_cohort"
ETL_INTERMEDIATE = f"{NEUROVASC}/pre_MEDS"
ETL_INPUT = f"{NEUROVASC}/raw_input"

# Create every stage folder up front; exist_ok keeps the cell re-runnable.
for _etl_dir in (ETL_INPUT, ETL_INTERMEDIATE, ETL_OUTPUT):
    os.makedirs(_etl_dir, exist_ok=True)

# Folder holding the source CSV and receiving the exported labels and graph.
GNN_PATH = "gnn/data"
# Number of leading rows of the shuffled source table sent through the ETL;
# also embedded in the names of the exported files.
NUM_PATIENTS = 10000
# Tag embedded in the names of the exported files.
TIME_OPT = "NT"

### a) Load/Generate source dataset

In [None]:
import pandas as pd
# Alternative: regenerate the dataset instead of loading the stored CSV.
# from NEUROVASC.utils.synthetic_generator import generate_synthetic_dataset
# df_input = generate_synthetic_dataset(NUM_PATIENTS, output_csv=f"{ETL_INPUT}/syn_data.csv")

# Load the stored table, shuffle all rows with a fixed seed so the order is
# reproducible, then renumber the index from 0.
df_input = (
    pd.read_csv(f"{GNN_PATH}/syn_data_10000.csv", index_col=0)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

### b) Preprocess the source dataset

In [None]:
from NEUROVASC.utils.pre_MEDS import generate_meds_preprocessed

# Restrict the cohort to the first NUM_PATIENTS rows *before* saving the
# labels, so the stored outcome list has one entry per patient that is
# actually sent through the ETL (the file name carries NUM_PATIENTS too).
df_input_outcome = df_input.iloc[0:NUM_PATIENTS]

joblib.dump(
    df_input_outcome["outcome"].astype(int).to_list(),
    f"{GNN_PATH}/outcomes_meds_{TIME_OPT}_{NUM_PATIENTS}.joblib",
)

# The outcome column is saved separately above and left out of the pre-MEDS input.
df_input_no_outcome = df_input_outcome.drop(columns=["outcome"])
generate_meds_preprocessed(df_input_no_outcome, output_path=ETL_INTERMEDIATE)

print("Neurovasc data ready for MEDS-Extract")

### c) Run the MEDS-Extract ETL to convert the source data into MEDS

In [None]:
from MEDS_transforms.runner import main
import shutil

# Start from a clean output folder. A missing folder (e.g. the cell is re-run
# after an extraction that failed before recreating it) is not an error:
# there is simply nothing stale to remove.
try:
    shutil.rmtree(ETL_OUTPUT)
except FileNotFoundError:
    pass

# Run the MEDS-Extract pipeline config on the pre-MEDS folder, using the
# project's event conversion file.
main([
    "pkg://MEDS_extract.configs._extract.yaml",
    "--overrides",
    f"input_dir={ETL_INTERMEDIATE}",
    f"output_dir={ETL_OUTPUT}",
    f"event_conversion_config_fp={NEUROVASC}/MESSY.yaml",
    "dataset.name=Neurovasc",
    "dataset.version=1.0",
])

### d) Convert the NEUROVASC MEDS cohort into a knowledge graph (KG) with meds2rdf

In [None]:
from meds2rdf import MedsRDFConverter

# URL of the SHACL shapes file in the meds-ontology repository.
# NOTE(review): this variable is not referenced anywhere else in the visible cells.
shacl_graph = "https://raw.githubusercontent.com/TeamHeKA/meds-ontology/refs/heads/main/shacl/meds-shapes.ttl"

# Build the RDF graph from the MEDS cohort folder, with dataset metadata switched off.
converter = MedsRDFConverter(ETL_OUTPUT)
graph = converter.convert(include_dataset_metadata=False)

In [None]:
# Make sure the export folder exists before writing into it.
os.makedirs(GNN_PATH, exist_ok=True)

# N-Triples is the serialization that gets written; the Turtle and XML
# variants are kept below, disabled, for reference.
_nt_destination = f"{GNN_PATH}/meds_{TIME_OPT}_{NUM_PATIENTS}.nt"
# graph.serialize(destination=f"{GNN_PATH}/meds_{TIME_OPT}_{NUM_PATIENTS}.ttl", format="turtle", encoding="UTF-8")
graph.serialize(destination=_nt_destination, format="nt")
# graph.serialize(destination=f"{GNN_PATH}/meds_{TIME_OPT}_{NUM_PATIENTS}.xml", format="xml", encoding="UTF-8")

In [None]:
from rdflib import Graph

# Read the exported N-Triples file back into a fresh rdflib graph.
_nt_source = f"{GNN_PATH}/meds_{TIME_OPT}_{NUM_PATIENTS}.nt"
g = Graph()
g.parse(_nt_source, format="nt")

### e) Compute metrics [OPTIONAL]

In [None]:
from NEUROVASC.utils.metrics import compute_MEDS_graph_metrics_for_neurovasc, save_stats_json
from pathlib import Path

# Pass the same NUM_PATIENTS rows that were sent through the ETL (without the
# outcome column) rather than the whole source table, so the tabular data and
# the graph describe the same patients. A fresh slice is taken from df_input
# so this cell does not depend on what earlier steps did to their copies.
_tabular_features = df_input.iloc[0:NUM_PATIENTS].drop(columns=["outcome"])

stats = compute_MEDS_graph_metrics_for_neurovasc(
    MEDS_ETL_output_path=ETL_OUTPUT,
    graph=g,
    tabular_data=_tabular_features,
    MEDS_intermediate=Path(ETL_INTERMEDIATE),
)

# Store the metrics inside the MEDS output folder.
save_stats_json(stats, f"{ETL_OUTPUT}/metrics/all_metrics.json")

### f) Compute consistency checks [OPTIONAL]

In [None]:
import polars as pl
from NEUROVASC.utils.neurovasc_meta import EVENTS_COLUMNS
from NEUROVASC.utils.transformers import build_neurovasc_meds_dt, build_neurovasc_medskg_dt, check_dts_consistency

# Reference table: the same NUM_PATIENTS rows that were sent through the ETL.
# Event columns are turned into 0/1 flags (1 when the stored value is > -1).
synt_df = df_input.iloc[0:NUM_PATIENTS].copy()
synt_df[EVENTS_COLUMNS] = (synt_df[EVENTS_COLUMNS] > -1).astype(int)

# Keep at most NUM_PATIENTS labels so the list matches the converted cohort
# even if the stored file was written from a larger table.
outcomes = joblib.load(f"{GNN_PATH}/outcomes_meds_{TIME_OPT}_{NUM_PATIENTS}.joblib")[:NUM_PATIENTS]

# Table rebuilt from the MEDS parquet shards, cast to the reference dtypes.
meds_data = pl.read_parquet(f"{ETL_OUTPUT}/data/**/*.parquet").to_dicts()
meds_df = build_neurovasc_meds_dt(meds_data)
meds_df = meds_df.astype(synt_df.dtypes.to_dict())
# NOTE(review): assumes the rebuilt rows come back in the same order as the saved outcomes — confirm.
meds_df["outcome"] = outcomes

# Table rebuilt from the RDF graph, cast to the reference dtypes.
graph_df = build_neurovasc_medskg_dt(g)
graph_df = graph_df.astype(synt_df.dtypes.to_dict())
# NOTE(review): same row-order assumption as for meds_df — confirm.
graph_df["outcome"] = outcomes

def remove_digits(_df: pd.DataFrame):
    """Round selected numeric columns of ``_df`` in place.

    ``hospital_stay_length``, ``nb_acte`` and ``age`` are rounded to zero
    decimals; ``gcs`` is rounded to two decimals. The frame passed in is
    modified directly and nothing is returned.
    """
    decimals_by_column = {
        "hospital_stay_length": 0,
        "nb_acte": 0,
        "age": 0,
        "gcs": 2,
    }
    for column, decimals in decimals_by_column.items():
        _df[column] = _df[column].round(decimals)

# Bring the three tables to the same numeric precision before comparing them.
for _frame in (synt_df, meds_df, graph_df):
    remove_digits(_frame)

# Pairwise checks: MEDS vs. source, MEDS vs. graph, graph vs. source.
for _left, _right in (
    (meds_df, synt_df),
    (meds_df, graph_df),
    (graph_df, synt_df),
):
    check_dts_consistency(_left, _right)


# Convert MIMIC-IV Demo into MEDS-OWL
[Link](https://physionet.org/content/mimic-iv-demo-meds/0.0.1/) to the PhysioNet repository.


In [None]:
# Folder that the following cells read the MIMIC MEDS cohort from.
MIMIC_ETL_OUTPUT = "MIMIC/MEDS_cohort"
# IPython shell escape: run the project's MIMIC script.
# NOTE(review): presumably this script populates MIMIC_ETL_OUTPUT — confirm in MIMIC/run.sh.
!bash MIMIC/run.sh

In [None]:
from meds2rdf import MedsRDFConverter

# Convert the MIMIC MEDS cohort with dataset metadata, splits and labels all
# switched on, then write the result as N-Triples inside the cohort folder.
converter = MedsRDFConverter(MIMIC_ETL_OUTPUT)
graph = converter.convert(
    include_dataset_metadata=True,
    include_splits=True,
    include_labels=True,
)
_mimic_nt_destination = f"{MIMIC_ETL_OUTPUT}/mimic_graph.nt"
graph.serialize(destination=_mimic_nt_destination, format="nt")

In [None]:
from rdflib import Graph

# Read the exported MIMIC N-Triples file back into a fresh rdflib graph.
_mimic_nt_source = f"{MIMIC_ETL_OUTPUT}/mimic_graph.nt"
g = Graph()
g.parse(_mimic_nt_source, format="nt")

from NEUROVASC.utils.metrics import compute_MEDS_graph_metrics, save_stats_json

# Compute the graph metrics for the MIMIC cohort and store them inside the
# cohort folder.
stats = compute_MEDS_graph_metrics(MEDS_ETL_output_path=MIMIC_ETL_OUTPUT, graph=g)

save_stats_json(stats, f"{MIMIC_ETL_OUTPUT}/metrics/all_metrics.json")

In [None]:
# TODO: write a SPARQL query that returns (subject_id, time, code, numeric_value)

#import polars as pl
#from NEUROVASC.utils.transformers import build_neurovasc_meds_dt, build_neurovasc_medskg_dt, check_dts_consistency

# mimic_data = pl.read_parquet(str(f"{MIMIC_ETL_OUTPUT}/data/**/*.parquet")).to_dicts()
#check_dts_consistency(meds_df, graph_df)