In [None]:
import os

# Root folder of the MEDS ETL results; the RDF export lives in a sub-folder of it.
MEDS_ETL_OUTPUT = "output"
GRAPH_OUTPUT_PATH = f"{MEDS_ETL_OUTPUT}/rdf"

# Create the working directories used by the notebook (no-op when they exist).
for working_dir in ("input", "intermediate", MEDS_ETL_OUTPUT):
    os.makedirs(working_dir, exist_ok=True)

In [None]:
from utils.pre_MEDS import generate_meds_preprocessed
from utils.synthetic_generator import generate_synthetic_dataset

import pandas as pd
# Alternative input: load a previously generated CSV instead of generating a new one.
#df_input = pd.read_csv("gnn/data/syn_data_10000.csv", index_col=0)

# Generate the synthetic dataset and also write it to input/syn_data.csv.
# NOTE(review): the positional 100 is presumably the number of synthetic records — confirm
# against utils.synthetic_generator. No random seed is passed here, so results may
# differ between runs unless the generator seeds itself — TODO confirm.
df_input = generate_synthetic_dataset(100, output_csv="input/syn_data.csv")
# Pre-MEDS step: derive the intermediate representation under "intermediate/",
# which is the input_dir handed to the MEDS extraction later in the notebook.
df_intermediate = generate_meds_preprocessed(df_input, output_path="intermediate")

In [None]:
from MEDS_transforms.runner import main
import shutil

# Start from a clean output directory so artifacts of a previous run cannot
# leak into this one. The existence check keeps the cell re-runnable: a bare
# rmtree raises FileNotFoundError when the directory is already gone (for
# example after an earlier run of this cell failed part-way).
# NOTE: GRAPH_OUTPUT_PATH is located inside MEDS_ETL_OUTPUT, so any previously
# exported RDF files are removed here as well.
if os.path.isdir(MEDS_ETL_OUTPUT):
    shutil.rmtree(MEDS_ETL_OUTPUT)

# Run the MEDS_transforms runner with the MEDS_extract "_extract" pipeline
# config, reading from "intermediate" and writing to MEDS_ETL_OUTPUT.
# The output directory is taken from the constant so that it cannot drift
# from the directory that was just cleaned above.
main([
    "pkg://MEDS_extract.configs._extract.yaml",
    "--overrides",
    "input_dir=intermediate",
    f"output_dir={MEDS_ETL_OUTPUT}",
    "event_conversion_config_fp=MESSY.yaml",
    "dataset.name=Neurovasc",
    "dataset.version=1.0",
])

In [None]:
from meds2rdf import MedsRDFConverter

# URL of the MEDS SHACL shapes file (Turtle) in the TeamHeKA/meds-ontology repository.
# NOTE(review): not referenced by any other cell visible here — presumably kept for
# a validation step; confirm before removing.
shacl_graph = "https://raw.githubusercontent.com/TeamHeKA/meds-ontology/refs/heads/main/shacl/meds-shapes.ttl"

# Initialize the converter with the MEDS dataset directory. The shared constant
# is used (rather than a repeated "output" literal) so the converter always
# reads from the directory the ETL step is configured to write to.
converter = MedsRDFConverter(MEDS_ETL_OUTPUT)

In [None]:
# Convert the MEDS dataset held by `converter` into an RDF graph.
# include_dataset_metadata=False is passed explicitly; presumably this leaves
# dataset-level metadata out of the graph — confirm against the meds2rdf docs.
graph = converter.convert(include_dataset_metadata=False)

In [None]:
# Make sure the RDF export folder exists before writing into it.
os.makedirs(GRAPH_OUTPUT_PATH, exist_ok=True)

# Destination file -> serialization format handed to graph.serialize().
# An RDF/XML export (output_dataset.xml, format "xml") is currently disabled.
rdf_exports = {
    f"{GRAPH_OUTPUT_PATH}/output_dataset.ttl": "turtle",
    f"{GRAPH_OUTPUT_PATH}/output_dataset.nt": "nt",
}
for rdf_destination, rdf_format in rdf_exports.items():
    graph.serialize(destination=rdf_destination, format=rdf_format)

print("Conversion complete! RDF files saved.")

In [None]:
from rdflib import Graph

# Load the exported N-Triples file into a fresh rdflib Graph named `g`.
nt_dataset_path = f"{GRAPH_OUTPUT_PATH}/output_dataset.nt"
g = Graph()
g.parse(nt_dataset_path, format="nt")

In [None]:
from utils.metrics import collect_all_metrics_from_output, save_stats_json

# Collect metrics from the MEDS output directory, the reloaded RDF graph and
# the original tabular input.
stats = collect_all_metrics_from_output(
    output_path=MEDS_ETL_OUTPUT,
    graph=g,
    tabular_data=df_input,
)

# Build the destination from the shared constant so it follows MEDS_ETL_OUTPUT,
# and make sure its parent folder exists: nothing in the visible code creates
# "<output>/metrics", and it is not known whether save_stats_json does.
metrics_json_path = f"{MEDS_ETL_OUTPUT}/metrics/all_metrics.json"
os.makedirs(os.path.dirname(metrics_json_path), exist_ok=True)

save_stats_json(stats, metrics_json_path)

In [None]:
import polars as pl
from utils.neurovasc_meta import EVENTS_COLUMNS
from utils.transformers import build_meds_dt, build_medskg_dt, check_dts_consistency

# Work on a copy so the original synthetic input frame stays untouched.
synt_df = df_input.copy()
# Turn the event columns into 0/1 indicators: values greater than -1 become 1,
# everything else 0 (presumably -1 marks "event absent" — confirm with the generator).
synt_df[EVENTS_COLUMNS] = (synt_df[EVENTS_COLUMNS] > -1).astype(int)

# Column -> dtype of the reference table; both rebuilt tables are cast to it
# so that the later comparisons are not affected by dtype differences.
reference_dtypes = synt_df.dtypes.to_dict()

# Read every parquet file below "<MEDS output>/data" as a list of row dicts.
# The path is derived from MEDS_ETL_OUTPUT instead of a repeated "output" literal.
meds_data = pl.read_parquet(f"{MEDS_ETL_OUTPUT}/data/**/*.parquet").to_dicts()
meds_df = build_meds_dt(meds_data)
meds_df = meds_df.astype(reference_dtypes)

# Same table rebuilt from the RDF graph.
graph_df = build_medskg_dt(g)
graph_df = graph_df.astype(reference_dtypes)

def remove_digits(_df: pd.DataFrame):
    """Round selected numeric columns of ``_df`` in place.

    ``hospital_stay_length``, ``nb_acte`` and ``age`` are rounded to zero
    decimals, ``gcs`` to two decimals. All other columns are left untouched.

    Args:
        _df: Frame containing the four columns above; it is modified in place.

    Returns:
        None.

    Raises:
        KeyError: If one of the expected columns is missing.
    """
    # Column name -> number of decimals to keep (processed in this order).
    decimals_per_column = {
        "hospital_stay_length": 0,
        "nb_acte": 0,
        "age": 0,
        "gcs": 2,
    }
    for column_name, n_decimals in decimals_per_column.items():
        _df[column_name] = _df[column_name].round(n_decimals)

# Apply the same in-place rounding to all three tables so that the comparisons
# below are not affected by differing numeric precision.
for frame in (synt_df, meds_df, graph_df):
    remove_digits(frame)

# Pairwise comparison of the three representations of the data.
# NOTE(review): in a notebook only the value of the last expression is
# displayed; if check_dts_consistency reports via its return value rather than
# by raising or printing, the first two results are silently dropped — confirm.
check_dts_consistency(meds_df, synt_df)
check_dts_consistency(meds_df, graph_df)
check_dts_consistency(graph_df, synt_df)
