In [15]:
import sys, os, rdflib, pickle, urllib.parse, re, random
import pandas as pd
from pathlib import Path
#from sentence_transformers import SentenceTransformer
from pykeen.triples import TriplesFactory
from pykeen.pipeline import pipeline

# Custom EHR Tools 
from EHRPipeline.entity_alignment.invokers import Invoker
from EHRPipeline.entity_alignment.entity_alignement import CrossOntologyAligner
from EHRPipeline.entity_alignment.embedder import SimpleDataEmbedder, ClusterGenerator
from EHRPipeline.entity_linking.linking_validation import LinkingValidator
from EHRPipeline.fact_validation.factValidation import Validator


ModuleNotFoundError: No module named 'invokers'

# General setup of environment and files

In [None]:
tokenizer = SentenceTransformer("all-MiniLM-L6-v2")

snomed_embeddings = Path("data/snomed_embedded.pkl")
clusters = Path("data/cluster.pkl")

if snomed_embeddings.exists():
    with open("/data/snomed_embedded.pkl", "rb") as file:
        data_embedding = pickle.load(file)
else:
    snomed = rdflib.Graph()
    snomed.parse("/data/snomed-ct-20221231-mini.ttl", format="ttl")
    embedder = SimpleDataEmbedder(embeddingModel=tokenizer)
    data_embedding = embedder.encode(data=snomed)

if clusters.exists():
    with open("data/cluster.pkl", "rb") as file1:
        segmentation = pickle.load(file1)
else:
    cluster = ClusterGenerator(data_embedding, n_clusters=50)
    segmentation = cluster.generate_clusters()

In [25]:
PATIENTS_CSV = "data/mimic-iii/PATIENTS.csv"
DIAGNOSES_ICD_CSV = "data/mimic-iii/DIAGNOSES_ICD.csv"
LABEVENTS_CSV = "data/mimic-iii/LABEVENTS.csv"
DLABITEMS_CSV = "data/mimic-iii/D_LABITEMS.csv"

OUTPUT_TTL = "data/schema_mapping_triples.ttl"

# Schema Mapping

In [31]:
random.seed(42)
FRACTION = 0.05
################################################################################
# 1. LOAD DATA
################################################################################

patients_df     = pd.read_csv(PATIENTS_CSV)
diagnoses_icd_df= pd.read_csv(DIAGNOSES_ICD_CSV)
lab_events_df   = pd.read_csv(LABEVENTS_CSV)
lab_items_df    = pd.read_csv(DLABITEMS_CSV)


# Get unique subject_ids in the patients table
all_subjects = patients_df['subject_id'].unique().tolist()
random.shuffle(all_subjects)

# Figure out how many subjects constitute 5%
num_5pct = int(len(all_subjects) * FRACTION)
keep_subjects = set(all_subjects[:num_5pct])

# Filter the dataframes to only those subject_ids
patients_df_small       = patients_df[patients_df['subject_id'].isin(keep_subjects)]
diagnoses_icd_df_small  = diagnoses_icd_df[diagnoses_icd_df['subject_id'].isin(keep_subjects)]
lab_events_df_small     = lab_events_df[lab_events_df['subject_id'].isin(keep_subjects)]

# Next, keep only the itemids that appear in lab_events_df_small
keep_itemids = set(lab_events_df_small['itemid'].dropna().unique())
lab_items_df_small = lab_items_df[lab_items_df['itemid'].isin(keep_itemids)]

# Rebuild the itemid->loinc mapping just for those itemids
itemid_to_loinc = {}
for _, row in lab_items_df_small.iterrows():
    itemid = row['itemid']
    loinc = str(row['loinc_code']).strip()
    if loinc == 'nan' or loinc == '':
        loinc = None
    itemid_to_loinc[itemid] = loinc

################################################################################
# 3. HELPER FUNCTIONS (FOR SANITIZING IRIs)
################################################################################

import urllib.parse

def sanitize_value_for_iri(value):
    if pd.isna(value):
        return "NA"
    val_str = str(value)
    # Percent-encode everything that's not safe in a URL path:
    # Here we allow only a few "safe" characters (adjust to your needs).
    # If you want underscores or dashes to remain, set them safe.
    return urllib.parse.quote(val_str, safe="-._~")

################################################################################
# 4. BUILD THE TRIPLES FROM THE REDUCED DATA
################################################################################

triples_ttl = []

# ---------- A) DIAGNOSES ----------
for idx, row in diagnoses_icd_df_small.iterrows():
    row_id = row['row_id']
    subj_id = row['subject_id']
    icd9_code = str(row['icd9_code']).strip()

    diagnosis_iri         = f"<http://example.org/Diagnosis/{subj_id}/PATIENTS/{row_id}>"
    subject_pseudo_iri    = f"<http://example.org/PATIENTS/{subj_id}>"
    icd9_iri              = f"<http://example.org/Code/icd9#{icd9_code}>"

    # Make the relevant RDF statements
    triples_ttl.append(f"{diagnosis_iri} a sphn:Diagnosis .")
    triples_ttl.append(f"{subject_pseudo_iri} a sphn:SubjectPseudoIdentifier .")
    triples_ttl.append(f"{diagnosis_iri} sphn:hasSubjectPseudoIdentifier {subject_pseudo_iri} .")

    triples_ttl.append(f"{icd9_iri} a sphn:Code .")
    triples_ttl.append(f"{diagnosis_iri} sphn:hasCode {icd9_iri} .")

# ---------- B) LAB EVENTS ----------
for idx, row in lab_events_df_small.iterrows():
    row_id  = row['row_id']
    subj_id = row['subject_id']
    itemid  = row['itemid']
    val     = row['value']

    # If missing crucial data, skip
    if pd.isna(row_id) or pd.isna(subj_id) or pd.isna(itemid):
        continue

    lab_event_iri      = f"<http://example.org/LabTestEvent/{int(subj_id)}/PATIENTS/{int(row_id)}>"
    subject_pseudo_iri = f"<http://example.org/PATIENTS/{int(subj_id)}>"
    lab_test_iri       = f"<http://example.org/LabTest/{int(subj_id)}/PATIENTS/{int(itemid)}>"
    
    # The lab result IRI includes the sanitized 'val' to keep them unique
    value_part         = sanitize_value_for_iri(val)
    lab_result_iri     = f"<http://example.org/LabResult/{int(subj_id)}/PATIENTS/{int(itemid)}/{value_part}>"

    # LabTestEvent + subject
    triples_ttl.append(f"{lab_event_iri} a sphn:LabTestEvent .")
    triples_ttl.append(f"{subject_pseudo_iri} a sphn:SubjectPseudoIdentifier .")
    triples_ttl.append(f"{lab_event_iri} sphn:hasSubjectPseudoIdentifier {subject_pseudo_iri} .")

    # LabTestEvent -> LabTest
    triples_ttl.append(f"{lab_test_iri} a sphn:LabTest .")
    triples_ttl.append(f"{lab_event_iri} sphn:hasLabTest {lab_test_iri} .")

    # LabTest -> LabResult
    triples_ttl.append(f"{lab_result_iri} a sphn:LabResult .")
    triples_ttl.append(f"{lab_test_iri} sphn:hasResult {lab_result_iri} .")

    # Link to LOINC code if available
    loinc_code = itemid_to_loinc.get(itemid, None)
    if loinc_code is not None:
        loinc_iri = f"<http://example.org/Code/loinc#{loinc_code}>"
        triples_ttl.append(f"{loinc_iri} a sphn:Code .")
        triples_ttl.append(f"{lab_test_iri} sphn:hasCode {loinc_iri} .")
        triples_ttl.append(f"{lab_result_iri} sphn:hasCode {loinc_iri} .")

################################################################################
# 5. SAVE THE SAMPLED DATA AS A TURTLE FILE
################################################################################

with open(OUTPUT_TTL, "w", encoding="utf-8") as f:
    for line in triples_ttl:
        f.write(line)
        if not line.endswith("\n"):
            f.write("\n")

print(f"sample of the data has been transformed into '{OUTPUT_TTL}' with {len(triples_ttl)} RDF statements.")

sample of the data has been transformed into 'data/schema_mapping_triples.ttl' with 43344 RDF statements.


In [33]:
def merge_ontology_and_data():
    try:
        g = rdflib.Graph()

        print("Parsing initial ontology...")
        g.parse("data/initialsphn.ttl", format="turtle")

        print("Parsing schema mapping...")
        g.parse("data/schema_mapping_triples.ttl", format="turtle")

        output_file = "data/schema_mapping_ontology.ttl"
        g.serialize(destination=output_file, format="turtle")

        print(f"Merging completed! Saved as '{output_file}'.")
    except Exception as e:
        print(f"An error occurred: {e}")

merge_ontology_and_data()

Parsing initial ontology...
Parsing schema mapping...
Merging completed! Saved as 'data/schema_mapping_ontology.ttl'.


# Cross-Ontology Entity Alignment

In [34]:
# Import RDF file from schema mapping
query = rdflib.Graph()
query.parse("data/schema_mapping_ontology.ttl", format="ttl") 

<Graph identifier=Nf15330bd67cf4c03aa373ea4b464aca3 (<class 'rdflib.graph.Graph'>)>

In [None]:
ontologyaligner = CrossOntologyAligner(dataGraph=data_embedding, clusters=segmentation, embeddingModel=tokenizer)
CrossOntologyAlignedKG = ontologyaligner.merge(query=query, Invoker="icd9tosnomed", Namespace=rdflib.URIRef("https://biomedit.ch/rdf/sphn-schema/sphn#hasCode"))

# Entity Linking Validation Step

In [None]:
LinkingValidator(CrossOntologyAlignedKG)

# TransE Embedding

In [3]:
from pykeen.triples import TriplesFactory
from pykeen.pipeline import pipeline
triples_factory = TriplesFactory.from_path('data/formatted_triples_FINAL_2.txt')

# Andy's code
training, validation, testing = triples_factory.split([0.8, 0.1, 0.1])

result = pipeline(
    training=training,
    validation=validation,
    testing=testing,
    model='TransE',
    model_kwargs={
        'embedding_dim': 20,
    },
    optimizer='Adam',
    optimizer_kwargs={
        'lr': 1e-3,
        'weight_decay': 1e-5
    },
    negative_sampler='basic',
    loss='SoftplusLoss',
    training_loop='sLCWA',
    training_kwargs={
        'num_epochs': 100,
        'batch_size': 32,
        'label_smoothing': 0.0
    },
    evaluator_kwargs=  {
        "filtered": True
    },
    filter_validation_when_testing = True,
)

using automatically assigned random_state=779459635
No random seed is specified. Setting to 407465778.
No cuda devices were available. The model runs on CPU
Training epochs on cpu: 100%|██████████| 100/100 [01:18<00:00,  1.28epoch/s, loss=0.485, prev_loss=0.486]
Evaluating on cpu: 100%|██████████| 2.68k/2.68k [00:05<00:00, 518triple/s]
INFO:pykeen.evaluation.evaluator:Evaluation took 5.22s seconds


In [9]:
from pykeen import predict  # or pykeen.models.predict, depending on version

df_predictions = predict.predict_target(
    model=result.model,
    head="Diagnosis/10033/PATIENTS/112578",
    relation="hasCode",
    triples_factory=result.training
).df

# Inspect the top 10
df_predictions.head(10)

Unnamed: 0,tail_id,score,tail_label
11301,11301,-0.255418,snomed#3238004
11322,11322,-0.264629,snomed#40462002
11248,11248,-0.288456,snomed#10509002
11298,11298,-0.291339,snomed#309773000
11254,11254,-0.295263,snomed#120481000119109
11274,11274,-0.30769,snomed#23719005
11372,11372,-0.315504,snomed#91302008
11262,11262,-0.319369,snomed#139451000119107
9696,9696,-0.333518,icd9#20300
11299,11299,-0.35666,snomed#313436004


### For Fact Validation Part: Generate a txt with the previsions from the cell above

In [12]:
output_file = "predictions.txt"

with open(output_file, "w", encoding="utf-8") as f:
    for idx, row in df_predictions.head(20).iterrows():
        predicted_code = row["tail_label"] 
        subject_uri = "<http://example.org/Diagnosis/10033/PATIENTS/112578>"
        predicate_uri = "<https://biomedit.ch/rdf/sphn-schema/sphn#hasCode>"

        # Decide how to build object_uri based on the prefix of predicted_code
        if predicted_code.startswith("snomed#"):
            # Extract the code and place it under the official SNOMED URI
            code_id = predicted_code.replace("snomed#", "")
            object_uri = f"<http://snomed.info/id/{code_id}>"
        elif predicted_code.startswith("icd9#"):
            # Keep it under your "example.org/Code/icd9#..." namespace
            code_id = predicted_code.replace("icd9#", "")
            object_uri = f"<http://example.org/Code/icd9#{code_id}>"
        else:
            # If there's some fallback or default logic:
            object_uri = f"<http://example.org/Code/{predicted_code}>"

        triple_line = f"{subject_uri}  {predicate_uri}  {object_uri}"
        f.write(triple_line + "\n")

print(f"Wrote top-20 predictions to {output_file}")


Wrote top-20 predictions to predictions.txt


# Fact Validation

In [13]:
from EHRPipeline.fact_validation.factValidation import Validator
def main():
    sparql_endpoint = "http://localhost:7200/repositories/integrationhealthcare" # This is a localhost so has to be configured per machine
    validator = Validator(sparql_endpoint)

    predictions_file = "predictions.txt"
    output_file = "validated_facts.txt"
    
    with open(predictions_file, "r", encoding="utf-8") as f_in, open(output_file, "w", encoding="utf-8") as f_out:
        for line in f_in:
            line = line.strip()
            if not line:
                continue

            # Expect exactly 3 parts: subject, predicate, object
            parts = line.split()
            if len(parts) != 3:
                print(f"Skipping malformed line: {line}")
                continue
            
            subj = parts[0].strip()
            pred = parts[1].strip()
            obj  = parts[2].strip()

            subj_uri = subj.strip("<>")
            pred_uri = pred.strip("<>")
            obj_uri  = obj.strip("<>")

            # Validate
            score = validator.validate_fact(subj_uri, pred_uri, obj_uri, max_length=3)

            print(f"Fact: {subj} {pred} {obj} => Score: {score}")

            # threshold for writing the facts validated
            if score >= 0.5:
                f_out.write(f"{subj} {pred} {obj}\n")

    print(f"Validation complete. Facts with score >= 0.5 are in '{output_file}'.")

if __name__ == "__main__":
    main()


Discovered local paths: []
Fact: <http://example.org/Diagnosis/10033/PATIENTS/112578> <https://biomedit.ch/rdf/sphn-schema/sphn#hasCode> <http://snomed.info/id/3238004> => Score: 0.0
Discovered local paths: []
Fact: <http://example.org/Diagnosis/10033/PATIENTS/112578> <https://biomedit.ch/rdf/sphn-schema/sphn#hasCode> <http://snomed.info/id/40462002> => Score: 0.0
Discovered local paths: []
Fact: <http://example.org/Diagnosis/10033/PATIENTS/112578> <https://biomedit.ch/rdf/sphn-schema/sphn#hasCode> <http://snomed.info/id/10509002> => Score: 0.0
Discovered local paths: []
Fact: <http://example.org/Diagnosis/10033/PATIENTS/112578> <https://biomedit.ch/rdf/sphn-schema/sphn#hasCode> <http://snomed.info/id/309773000> => Score: 0.0
Discovered local paths: []
Fact: <http://example.org/Diagnosis/10033/PATIENTS/112578> <https://biomedit.ch/rdf/sphn-schema/sphn#hasCode> <http://snomed.info/id/120481000119109> => Score: 0.0
Discovered local paths: []
Fact: <http://example.org/Diagnosis/10033/PATI