# OPSI Ontology Matching Pipeline

## Setup & Config

### Import required packages

In [1]:
from rdflib import Graph, URIRef, RDFS, RDF, OWL, Literal, Namespace
import os
import json
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
import yaml
from typing import List, Dict, Optional
from sentence_transformers import SentenceTransformer
from rapidfuzz import fuzz


  from .autonotebook import tqdm as notebook_tqdm


#### Load paths and constans from configuration

In [2]:
SOURCE_PATH = "./notebook/source/pn2024_ontology_source.json"
SOURCE_INDEX_PATH = "./notebook/source/source_index.faiss"
SOURCE_ID_TRACKER_PATH = "./notebook/source/source_tracker.faiss"
TARGET_PATH = "./notebook/target/preglednica_zahtevkov_zavarovanj_target.json"
TARGET_INDEX_PATH = "./notebook/target/target_index.faiss"
TARGET_ID_TRACKER_PATH = "./notebook/target/target_tracker.faiss"
OUTPUT_PATH = "./notebook/result/csvw_alignment_results.json"
EMBEDDING_MODEL_NAME = "intfloat/multilingual-e5-large"
TOP_K = 3
SIMILARITY_THRESHOLD = 0.6
HCB_ENABLED = True

#### Utils functions for ontologies

In [3]:
def load_json(path):
    with open(path, encoding="utf-8") as f:
        return json.load(f)

In [4]:
def load_graph(path):
    graph = Graph()
    graph.parse(path)
    return graph

In [5]:
def get_label(graph, uri):
    label = graph.value(uri, RDFS.label)
    return str(label) if isinstance(label, Literal) else None

In [6]:
def extract_related_uris(graph, subject, predicate):
    """Dereferences URIs linked by the predicate and returns their rdfs:label."""
    values = []
    for obj in graph.objects(subject, predicate):
        label = get_label(graph, obj)
        if label:
            values.append(label)
    return values

In [7]:
def extract_superclass_labels(graph, subject):
    """Get human-readable labels of direct superclasses."""
    super_labels = []
    for superclass in graph.objects(subject, RDFS.subClassOf):
        if isinstance(superclass, URIRef):
            label = get_label(graph, superclass)
            if label:
                super_labels.append(label)
        elif (superclass, RDF.type, OWL.Restriction) in graph:
            filler = graph.value(superclass, OWL.someValuesFrom)
            if isinstance(filler, URIRef):
                super_labels.append(str(filler).split("#")[-1])
    return super_labels

#### Utils functions for generating terms JSON

In [8]:
def build_text_for_embedding(label, definition=None, synonyms=None, superclasses=None):
    parts = [f"Concept: {label}"]

    if synonyms:
        parts.append(f"Also known as: {', '.join(synonyms)}")

    if superclasses:
        parts.append(f"Part of: {', '.join(superclasses)}")

    if definition:
        parts.append(f"Defined as: {definition}")

    return ". ".join(parts)

In [9]:
def extract_enriched_terms(graph):
    terms = []
    for s in graph.subjects(RDF.type, OWL.Class):
        label = get_label(graph, s)
        if not label:
            continue

        definition = extract_related_uris(graph, s, OBO.hasDefinition)
        synonyms = extract_related_uris(graph, s, OBO.hasRelatedSynonym)
        superclasses = extract_superclass_labels(graph, s)

        enriched_text = build_text_for_embedding(
            label=label,
            definition=definition[0] if definition else None,
            synonyms=synonyms,
            superclasses=superclasses
        )

        terms.append({
            "uri": str(s),
            "label": label,
            "definition": definition[0] if definition else "",
            "synonyms": synonyms,
            "superclasses": superclasses,
            "text_for_embedding": enriched_text
        })

    return terms

In [10]:
def normalize_embeddings(embeddings: np.ndarray) -> np.ndarray:
    return embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)

In [11]:
def ontology_indexing(ontology_json, faiss_index_path, id_tracker_json):

    with open(ontology_json, "r", encoding="utf-8") as f:
            ontology_terms = json.load(f)

    texts = []
    ids = []
    valid_terms = []

    for i, term in enumerate(ontology_terms):
        text = term.get("text_for_embedding")
        if not text:
            raise ValueError("Missing 'text_for_embedding'")
        texts.append(text)
        ids.append(abs(hash(term["uri"])) % (10**12))
        valid_terms.append(term)

    # Embedding
    model = SentenceTransformer(EMBEDDING_MODEL_NAME)
    embeddings = model.encode(texts, batch_size=16, show_progress_bar=True)
    embeddings = normalize_embeddings(np.array(embeddings))

    # FAISS indexing
    dimension = embeddings.shape[1]
    base_index = faiss.IndexFlatIP(dimension)
    index = faiss.IndexIDMap(base_index)
    index.add_with_ids(embeddings, np.array(ids))
    os.makedirs(os.path.dirname(faiss_index_path), exist_ok=True)
    faiss.write_index(index, faiss_index_path)

    # Save ID tracker
    id_map = {str(id_): term for id_, term in zip(ids, valid_terms)}
    with open(id_tracker_json, "w", encoding="utf-8") as f:
        json.dump(id_map, f, indent=4, ensure_ascii=False)

#### Utils for matching

In [12]:
def rerank_by_label_similarity(source_label: str, candidates: List[Dict], weight_faiss: float = 0.7, weight_label: float = 0.3) -> List[Dict]:
    """Combines semantic score and lexical similarity to rerank matches."""
    reranked = []
    for match in candidates:
        label_sim = fuzz.ratio(source_label, match["label"]) / 100
        combined_score = weight_faiss * match["score"] + weight_label * label_sim
        reranked.append({**match, "combined_score": combined_score})
    return sorted(reranked, key=lambda x: x["combined_score"], reverse=True)

In [13]:
def build_index_lookup(index_path: str, id_tracker_path: str):
    index = faiss.read_index(index_path)
    with open(id_tracker_path, "r", encoding="utf-8") as f:
        id_map = {int(k): v for k, v in json.load(f).items()}
    return index, id_map

In [14]:
def faiss_batch_search(embeddings: np.ndarray, index, top_k: int):
    return index.search(embeddings, top_k)

In [15]:
def precompute_reverse_matches(
    target_terms: List[Dict],
    reverse_index,
    reverse_id_map: Dict[int, Dict],
    model,
    top_k: int = 1
) -> Dict[str, str]:
    """
    Computes best reverse matches (target → source).
    Returns dict: target_uri → best source_uri.
    """
    reverse_lookup = {}

    valid_terms = [t for t in target_terms if t.get("text_for_embedding")]
    texts = [t["text_for_embedding"] for t in valid_terms]
    uris = [t["uri"] for t in valid_terms]

    # Batch encoding
    embeddings = model.encode(texts, batch_size=32, show_progress_bar=True)
    embeddings = normalize_embeddings(np.array(embeddings).astype(np.float32))

    # Batch FAISS search
    D, I = reverse_index.search(embeddings, top_k)

    for i, (indices, scores) in enumerate(zip(I, D)):
        best_idx = indices[0]
        if best_idx == -1:
            continue
        match = reverse_id_map.get(best_idx)
        if match:
            reverse_lookup[uris[i]] = match["uri"]

    return reverse_lookup


In [16]:
def precompute_reverse_matches_topk(
    target_terms: list,
    reverse_index,
    reverse_id_map: dict,
    model,
    top_k: int = 5
) -> dict:
    """
    Computes top-k reverse matches (target → source).
    Returns dict: target_uri → list of source_uris.
    """
    reverse_lookup_k = {}

    valid_terms = [t for t in target_terms if t.get("text_for_embedding")]
    texts = [t["text_for_embedding"] for t in valid_terms]
    uris = [t["uri"] for t in valid_terms]

    # Batch encoding
    embeddings = model.encode(texts, batch_size=32, show_progress_bar=True)
    embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)

    # Batch FAISS search
    D, I = reverse_index.search(np.array(embeddings).astype(np.float32), top_k)

    for i, indices in enumerate(I):
        matches = [reverse_id_map[idx]["uri"] for idx in indices if idx != -1 and reverse_id_map.get(idx)]
        reverse_lookup_k[uris[i]] = matches

    return reverse_lookup_k

In [None]:
def embed_terms(terms: List[Dict], model) -> np.ndarray:
    texts = [t["text_for_embedding"] for t in terms if t.get("text_for_embedding")]
    return normalize_embeddings(model.encode(texts, batch_size=32, show_progress_bar=True))


In [18]:
def map_faiss_results(source_terms, D, I, target_id_map):
    matches = []
    for i, (distances, indices) in enumerate(zip(D, I)):
        src = source_terms[i]
        results = []
        for idx, score in zip(indices, distances):
            if idx == -1:
                continue
            t = target_id_map.get(idx)
            if not t:
                continue
            results.append({
                "uri": t["uri"],
                "label": t["label"],
                "score": float(score)
            })
        matches.append({
            "source_uri": src["uri"],
            "source_label": src["label"],
            "top_k_matches": results
        })
    return matches


In [19]:
def apply_label_reranking(matches: List[Dict]):
    reranked = []
    for match in matches:
        ranked = rerank_by_label_similarity(match["source_label"], match["top_k_matches"])
        match["top_k_matches"] = ranked
        match["top_match"] = ranked[0] if ranked else None
        reranked.append(match)
    return reranked


#### Loading RDF into graph

In [20]:
source_graph = load_json(SOURCE_PATH)
target_graph = load_json(TARGET_PATH)

#### Loading embedding model

In [21]:
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)

In [23]:
target_graph

[{'uri': 'n16ce517f066c4ab4961e42cfc37c28b2b2',
  'label': 'datum škodnega dogodka',
  'definition': 'Datum, ko je bila škoda prijavljena zavarovalnici.',
  'synonyms': ['datumPN'],
  'superclasses': [],
  'text_for_embedding': 'Concept: datum škodnega dogodka. Also known as: datumPN. Defined as: Datum, ko je bila škoda prijavljena zavarovalnici.'},
 {'uri': 'n16ce517f066c4ab4961e42cfc37c28b2b3',
  'label': 'vzrok nastanka škode',
  'definition': 'Opis ali klasifikacija primarnega vzroka škode.',
  'synonyms': [],
  'superclasses': [],
  'text_for_embedding': 'Concept: vzrok nastanka škode. Defined as: Opis ali klasifikacija primarnega vzroka škode.'},
 {'uri': 'n16ce517f066c4ab4961e42cfc37c28b2b4',
  'label': 'kraj dogodka',
  'definition': 'Ime kraja ali regije, kjer se je zgodil škodni primer.',
  'synonyms': [],
  'superclasses': [],
  'text_for_embedding': 'Concept: kraj dogodka. Defined as: Ime kraja ali regije, kjer se je zgodil škodni primer.'},
 {'uri': 'n16ce517f066c4ab4961e4

#### Generate enriched JSON of ontology terms

Generate enriched terms for human ontology

In [22]:
target_terms = extract_enriched_terms(target_graph)

with open(TARGET_PATH, "w", encoding="utf-8") as f:
    json.dump(target_terms, f, indent=2, ensure_ascii=False)

AttributeError: 'list' object has no attribute 'subjects'

In [24]:
target_terms = target_graph

In [25]:
len(target_terms)

10

Generate enriched terms for mouse ontology

In [None]:
source_terms = extract_enriched_terms(source_graph)

with open(SOURCE_PATH, "w", encoding="utf-8") as f:
    json.dump(source_terms, f, indent=2, ensure_ascii=False)

In [26]:
source_terms = source_graph

In [27]:
len(source_terms)

35

#### Ontology indexing

Create and populate FAISS with human entities

In [28]:
ontology_indexing(ontology_json=TARGET_PATH, faiss_index_path=TARGET_INDEX_PATH, id_tracker_json=TARGET_ID_TRACKER_PATH)

Batches: 100%|██████████| 1/1 [00:10<00:00, 10.32s/it]


Create and populate FAISS with mouse entities

In [29]:
ontology_indexing(ontology_json=SOURCE_PATH, faiss_index_path=SOURCE_INDEX_PATH, id_tracker_json=SOURCE_ID_TRACKER_PATH)

Batches: 100%|██████████| 3/3 [00:02<00:00,  1.21it/s]


#### Execute matching on mouse human pair of ontologies

In [30]:
target_index = faiss.read_index(TARGET_INDEX_PATH)
with open(TARGET_ID_TRACKER_PATH, "r", encoding="utf-8") as f:
    target_id_map = {int(k): v for k, v in json.load(f).items()}

In [31]:
source_index = faiss.read_index(SOURCE_INDEX_PATH)

In [32]:
embs = embed_terms(source_terms, embedding_model)

Batches: 100%|██████████| 2/2 [00:00<00:00, 13.70it/s]


In [33]:
embs_h = embed_terms(target_terms, embedding_model)

Batches: 100%|██████████| 1/1 [00:00<00:00, 41.03it/s]


In [34]:
len(target_id_map)

10

In [35]:
# Step-by-step matching
D, I = faiss_batch_search(embs, target_index, top_k=5)
matches = map_faiss_results(source_terms, D, I, target_id_map)

output_path = "./notebook/result/mouse_to_human_matches_unranked.json"
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(matches, f, indent=2, ensure_ascii=False)

print(f"len match: {len(matches)}")

# Optional refinements
matches = apply_label_reranking(matches)

output_path = "./notebook/result/mouse_to_human_matches_ranked.json"
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(matches, f, indent=2, ensure_ascii=False)

print(f"len match: {len(matches)}")


len match: 35
len match: 35


In [36]:
mouse_index = faiss.read_index(SOURCE_INDEX_PATH)
with open(SOURCE_ID_TRACKER_PATH, "r", encoding="utf-8") as f:
    mouse_id_map = {int(k): v for k, v in json.load(f).items()}

In [37]:
reverse_lookup = precompute_reverse_matches(
    target_terms=target_terms,
    reverse_index=mouse_index,
    reverse_id_map=mouse_id_map,
    model=embedding_model,
    top_k=5
)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches: 100%|██████████| 1/1 [00:00<00:00, 39.12it/s]


In [38]:
reverse_lookup_topk = precompute_reverse_matches_topk(
    target_terms=target_terms,
    reverse_index=mouse_index,
    reverse_id_map=mouse_id_map,
    model=embedding_model,
    top_k=3
)

Batches: 100%|██████████| 1/1 [00:00<00:00, 40.79it/s]


In [39]:
len(reverse_lookup)

10

In [40]:
output_path = "./notebook/result/reverse_lookup.json"
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(reverse_lookup, f, indent=2, ensure_ascii=False)

In [41]:
def apply_hcb(matches, reverse_lookup, fallback_threshold=0.95):
    filtered = []
    skipped_top_match = 0
    no_predicited_uri = 0
    for m in matches:
        source_uri = m["source_uri"]
        top_match = m.get("top_match")

        if not top_match:
            skipped_top_match += 1
            continue

        predicted_uri = top_match["uri"]
        confidence = top_match.get("combined_score", top_match.get("score", 0))

        if reverse_lookup.get(predicted_uri) == source_uri:
            filtered.append(m)  # standard HCB
        elif confidence >= fallback_threshold:
            filtered.append(m)  # allow fallback based on confidence
        else: 
            no_predicited_uri += 1

    print(f"skipped_top_match {skipped_top_match}")
    print(f"no_predicited_uri {no_predicited_uri}")

    return filtered


In [42]:
def apply_hcb_with_topk(matches, reverse_lookup_top1, reverse_lookup_topk):
    """
    Filters matches using bidirectional match (HCB), extended to top-k reverse lookup and confidence fallback.
    
    Args:
        matches: list of match dicts (with top_match + score)
        reverse_lookup_top1: dict[target_uri → best source_uri]
        reverse_lookup_topk: dict[target_uri → list of top-k source_uris]
        fallback_threshold: minimum score to allow fallback if HCB fails

    Returns:
        List of filtered matches
    """
    filtered = []
    stats = {
        "strict_hcb": 0,
        "semi_hcb": 0,
        "skipped": 0
    }

    for m in matches:
        source_uri = m["source_uri"]
        top_match = m.get("top_match")

        if not top_match:
            stats["skipped"] += 1
            continue

        predicted_uri = top_match["uri"]

        if reverse_lookup_top1.get(predicted_uri) == source_uri:
            filtered.append(m)
            stats["strict_hcb"] += 1
        elif source_uri in reverse_lookup_topk.get(predicted_uri, []):
            filtered.append(m)
            stats["semi_hcb"] += 1
        else:
            stats["skipped"] += 1

    print("HCB Filtering Summary:")
    for key, count in stats.items():
        print(f"  {key}: {count}")

    return filtered


In [43]:
matches = apply_hcb_with_topk(matches, reverse_lookup, reverse_lookup_topk)

len(matches)

HCB Filtering Summary:
  strict_hcb: 10
  semi_hcb: 2
  skipped: 23


12

In [44]:
output_path = "./notebook/result/opsi_match.json"
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(matches, f, indent=2, ensure_ascii=False)

print(f"Matching complete. Results saved to {output_path}")

Matching complete. Results saved to ./notebook/result/opsi_match.json
