# this file is for testing the implementations in easier way with jupyter

In [2]:
from ontology_loader import OntologyLoader

source_address = 'data/mouse-human/source.xml'
target_address = 'data/mouse-human/target.xml'

source_ontology = OntologyLoader(source_address)
target_ontology = OntologyLoader(target_address)

source_ontology.load()
target_ontology.load()

Ontology loaded successfully from data/mouse-human/source.xml
Ontology loaded successfully from data/mouse-human/target.xml


get_ontology("http://human.owl#")

In [133]:
len(list(source_ontology.ontology.classes()))

2744

In [134]:
print(list(source_ontology.ontology.classes())[520].label)
print(list(source_ontology.ontology.classes())[2031].label)

['lobar bronchus']
['median dorsal digital artery for digit 01']


In [137]:
parents1 = source_ontology.ontology.get_parents_of(list(source_ontology.ontology.classes())[520])
parents2 = source_ontology.ontology.get_parents_of(list(source_ontology.ontology.classes())[2031])


print(parents1[0].label, parents2[0].label)

['bronchus'] ['forelimb digital arteries']


In [44]:
source_ontology.ontology.get_triples()

[(303, 6, 80),
 (304, 6, 15),
 (305, 6, 15),
 (306, 6, 15),
 (307, 6, 15),
 (308, 6, 15),
 (40, 6, 15),
 (309, 6, 15),
 (310, 6, 13),
 (311, 6, 13),
 (312, 6, 13),
 (313, 6, 11),
 (314, 6, 11),
 (314, 9, 315),
 (-1, 6, 17),
 (-1, 18, 311),
 (-1, 24, 316),
 (314, 9, -1),
 (317, 6, 11),
 (317, 9, 34),
 (-2, 6, 17),
 (-2, 18, 311),
 (-2, 24, 318),
 (317, 9, -2),
 (319, 6, 11),
 (319, 9, 320),
 (319, 304, 321),
 (322, 6, 11),
 (322, 9, 320),
 (323, 6, 11),
 (323, 9, 320),
 (324, 6, 11),
 (324, 9, 320),
 (324, 304, 325),
 (326, 6, 11),
 (326, 9, 320),
 (327, 6, 11),
 (327, 9, 328),
 (329, 6, 11),
 (329, 9, 317),
 (329, 304, 330),
 (328, 6, 11),
 (328, 9, 317),
 (331, 6, 11),
 (331, 9, 317),
 (332, 6, 11),
 (332, 9, 317),
 (332, 304, 333),
 (334, 6, 11),
 (334, 9, 317),
 (335, 6, 11),
 (335, 9, 34),
 (-3, 6, 17),
 (-3, 18, 311),
 (-3, 24, 336),
 (335, 9, -3),
 (337, 6, 11),
 (337, 9, 317),
 (338, 6, 11),
 (338, 9, 34),
 (-4, 6, 17),
 (-4, 18, 311),
 (-4, 24, 339),
 (338, 9, -4),
 (340, 6, 11

In [3]:
from collections import defaultdict
import itertools

def build_inverted_index(ontology, external_lexicon=None):
    if ontology is None:
        raise ValueError("Ontology is not loaded!")

    index = defaultdict(set)  # Key: frozenset of normalized terms, Value: set of class IDs
    class_id_map = {}         # Map class URIs to numerical IDs
    current_id = 0

    # Assign numerical IDs to classes
    for cls in ontology.classes():
        class_id_map[cls.iri] = current_id
        current_id += 1

    for cls in ontology.classes():
        cls_id = class_id_map[cls.iri]
        labels = cls.label if hasattr(cls, 'label') else []  # Get all labels

        # Process all labels (primary + synonyms)
        for label in labels:
            # Split into components (e.g., "cellular_secretion" → ["cellular", "secretion"])
            components = label.lower().replace('_', ' ').split()
            
            # Add base terms
            term_set = frozenset(components)
            index[term_set].add(cls_id)

            # Add variations from external lexicon (e.g., "secrete" for "secretion")
            if external_lexicon:
                variations = set()
                for term in components:
                    variations.update(external_lexicon.get_variations(term))  # Hypothetical lexicon method
                if variations:
                    expanded_terms = components + list(variations)
                    term_set_variations = frozenset(expanded_terms)
                    index[term_set_variations].add(cls_id)

    return index, class_id_map

source_index = build_inverted_index(source_ontology.ontology)
target_index = build_inverted_index(target_ontology.ontology)

In [4]:
print(source_index)
print(target_index)

(defaultdict(<class 'set'>, {frozenset({'mouse', 'anatomy'}): {1}, frozenset({'grey', 'spinal', 'cord', 'matter'}): {2}, frozenset({'grey', 'matter'}): {3}, frozenset({'spinal', 'cord'}): {4}, frozenset({'organ', 'system'}): {5}, frozenset({'adult', 'mouse'}): {6}, frozenset({'trunk'}): {7}, frozenset({'anatomic', 'region'}): {8}, frozenset({'body', 'cavity/lining'}): {9}, frozenset({'head/neck'}): {10}, frozenset({'limb'}): {11}, frozenset({'tail'}): {12}, frozenset({'adipose', 'tissue'}): {13}, frozenset({'tissue', 'connective'}): {14}, frozenset({'system', 'cardiovascular'}): {15}, frozenset({'system', 'endocrine'}): {16}, frozenset({'system', 'hemolymphoid'}): {17}, frozenset({'system', 'integumental'}): {18}, frozenset({'muscle'}): {19}, frozenset({'system', 'musculoskeletal'}): {20}, frozenset({'system', 'nervous'}): {21}, frozenset({'organ', 'sensory'}): {22}, frozenset({'organ', 'sensory', 'system'}): {23}, frozenset({'system', 'skeletal'}): {24}, frozenset({'organ', 'visceral'

In [24]:
source_index[0].keys()

dict_keys([frozenset({'mouse', 'anatomy'}), frozenset({'matter', 'spinal', 'grey', 'cord'}), frozenset({'matter', 'grey'}), frozenset({'spinal', 'cord'}), frozenset({'system', 'organ'}), frozenset({'adult', 'mouse'}), frozenset({'trunk'}), frozenset({'region', 'anatomic'}), frozenset({'body', 'cavity/lining'}), frozenset({'head/neck'}), frozenset({'limb'}), frozenset({'tail'}), frozenset({'tissue', 'adipose'}), frozenset({'connective', 'tissue'}), frozenset({'cardiovascular', 'system'}), frozenset({'system', 'endocrine'}), frozenset({'system', 'hemolymphoid'}), frozenset({'system', 'integumental'}), frozenset({'muscle'}), frozenset({'system', 'musculoskeletal'}), frozenset({'nervous', 'system'}), frozenset({'organ', 'sensory'}), frozenset({'system', 'organ', 'sensory'}), frozenset({'system', 'skeletal'}), frozenset({'organ', 'system', 'visceral'}), frozenset({'back'}), frozenset({'abdomen/pelvis/perineum'}), frozenset({'thorax'}), frozenset({'head'}), frozenset({'neck'}), frozenset({'f

In [23]:
source_index[1].keys()

dict_keys(['http://www.w3.org/2002/07/owl#Thing', 'http://mouse.owl#MA_0000001', 'http://mouse.owl#MA_0000002', 'http://mouse.owl#MA_0001112', 'http://mouse.owl#MA_0000216', 'http://mouse.owl#MA_0000003', 'http://mouse.owl#MA_0002405', 'http://mouse.owl#MA_0000004', 'http://mouse.owl#MA_0002433', 'http://mouse.owl#MA_0000005', 'http://mouse.owl#MA_0000006', 'http://mouse.owl#MA_0000007', 'http://mouse.owl#MA_0000008', 'http://mouse.owl#MA_0000009', 'http://mouse.owl#MA_0000011', 'http://mouse.owl#MA_0000010', 'http://mouse.owl#MA_0000012', 'http://mouse.owl#MA_0000013', 'http://mouse.owl#MA_0000014', 'http://mouse.owl#MA_0000015', 'http://mouse.owl#MA_0002418', 'http://mouse.owl#MA_0000016', 'http://mouse.owl#MA_0000017', 'http://mouse.owl#MA_0002442', 'http://mouse.owl#MA_0000018', 'http://mouse.owl#MA_0000019', 'http://mouse.owl#MA_0000020', 'http://mouse.owl#MA_0000021', 'http://mouse.owl#MA_0000022', 'http://mouse.owl#MA_0000023', 'http://mouse.owl#MA_0000024', 'http://mouse.owl#MA

ISUB Implementation

In [4]:
def longest_common_substring(s, t):
    """Find the longest common substring between s and t.
    Returns a tuple (lcs, pos_s, pos_t) where lcs is the substring and
    pos_s, pos_t are its start positions in s and t respectively.
    """
    m, n = len(s), len(t)
    dp = [[0] * (n + 1) for _ in range(m + 1)]
    longest = 0
    pos_s = 0
    pos_t = 0
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            if s[i - 1] == t[j - 1]:
                dp[i][j] = dp[i - 1][j - 1] + 1
                if dp[i][j] > longest:
                    longest = dp[i][j]
                    pos_s = i - longest
                    pos_t = j - longest
            else:
                dp[i][j] = 0
    lcs = s[pos_s:pos_s + longest] if longest > 0 else ""
    return lcs, pos_s, pos_t

def _isub_common(s, t):
    """Recursively compute the common part between s and t.
    This function finds the longest common substring and then recurses on
    the parts before and after it. It also adds a bonus for a common prefix.
    """
    # Compute common prefix length (max 4) and bonus factor (0.1 per char)
    prefix = 0
    for i in range(min(len(s), len(t))):
        if s[i] == t[i]:
            prefix += 1
        else:
            break
    prefix = min(prefix, 4)
    prefix_bonus = prefix * 0.1

    lcs, pos_s, pos_t = longest_common_substring(s, t)
    if not lcs:
        return prefix_bonus

    # Recursively process the substrings to the left and right of the LCS
    left_common = _isub_common(s[:pos_s], t[:pos_t])
    right_common = _isub_common(s[pos_s + len(lcs):], t[pos_t + len(lcs):])
    return len(lcs) + left_common + right_common + prefix_bonus

def isub(s, t):
    """
    Compute the ISUB similarity between strings s and t.
    The similarity is given by:
       score = min( 2 * common / (len(s) + len(t)), 1 )
    where 'common' is computed recursively from the longest common substrings
    and includes a bonus for matching prefixes.
    """
    s = s.lower().strip()
    t = t.lower().strip()
    # Handle empty strings: if both are empty, we consider them identical.
    if not s and not t:
        return 1.0
    common = _isub_common(s, t)
    score = 2 * common / (len(s) + len(t))
    return min(score, 1.0)

# alternative implementation
from difflib import SequenceMatcher

def isub_similarity(str1, str2):
    """Compute similarity score between two strings using ISUB (SequenceMatcher)."""
    return SequenceMatcher(None, str1, str2).ratio()

In [116]:
# Example usage:
if __name__ == "__main__":
    examples = [
        ("CellularSecretion", "cellular secretion"),
        ("Trapezoid", "TrapezoidBone"),
        ("ExocrineGlandFluid", "ExocrineGlandFluid")
    ]
    for a, b in examples:
        print(f"ISUB('{a}', '{b}') = {isub(a, b):.4f}")


ISUB('CellularSecretion', 'cellular secretion') = 1.0000
ISUB('Trapezoid', 'TrapezoidBone') = 0.8545
ISUB('ExocrineGlandFluid', 'ExocrineGlandFluid') = 1.0000


In [115]:
a = set("skeletal muscle")
b = set("Muscle")

intersection = len(a & b)
union = len(a | b)
print(intersection / union if union > 0 else 0.0)


0.45454545454545453


In [69]:
print(a)

{' ', 'a', 's', 'u', 't', 'c', 'l', 'm', 'e', 'k'}


In [5]:
from itertools import product
import re

def get_local_name(entity):
    """
    Extract a local name from an entity's IRI.
    For example, 'source.xml.MA_0000165' becomes 'MA_0000165'.
    """
    iri_str = str(entity)
    parts = iri_str.split('.')
    return parts[-1] if parts else iri_str

def compute_anchors(index1, index2, ontology1, ontology2, isub_threshold=0.8):
    """
    Compute initial anchor mappings between two ontologies.
    """
    anchors = {}
    
    # Step 1: Find common term sets between the inverted indexes
    common_term_sets = set(index1.keys()) & set(index2.keys())
    
    # Convert ontology classes to lists for indexing
    ontology1_list = list(ontology1.classes())
    ontology2_list = list(ontology2.classes())
    
    for term_set in common_term_sets:
        # Get class IDs for this term set from both indexes
        classes1 = index1[term_set]
        classes2 = index2[term_set]
        
        # Step 2: Generate candidate pairs and compute ISUB similarity
        for c1_id, c2_id in product(classes1, classes2):
            c1_uri = ontology1_list[c1_id].iri  # Get class URI from ID
            c2_uri = ontology2_list[c2_id].iri
            
            # Get labels for ISUB similarity
            c1_label = ontology1_list[c1_id].label[0] if ontology1_list[c1_id].label else ""
            c2_label = ontology2_list[c2_id].label[0] if ontology2_list[c2_id].label else ""
            
            similarity = isub_similarity(c1_label, c2_label)  # Use ISUB algorithm
            # print(c1_label, c2_label, similarity)
            
            # Step 3: Check if similarity meets the threshold
            if similarity >= isub_threshold:
                # Step 4: Compute structural confidence (optional for initial anchors)
                confidence = compute_structural_confidence(
                    ontology1_list[c1_id], ontology2_list[c2_id], ontology1, ontology2
                )
                
                # Store the anchor mapping with its confidence
                anchors[(f'{c1_id}', f'{c2_id}')] = confidence
    
    return anchors

def compute_structural_confidence(node1, node2, ontology1, ontology2):
    """
    Compute the structural confidence between two ontology nodes.
    """
    def canonical(n):
        """Extract a canonical label for comparison."""
        if hasattr(n, 'label') and n.label:
            return n.label[0]
        elif hasattr(n, 'iri'):
            return get_local_name(n.iri)
        else:
            return str(n)
    
    # Get parents and children of the nodes
    c1_parents = {canonical(p) for p in ontology1.get_parents_of(node1)}
    c1_children = {canonical(c) for c in ontology1.get_children_of(node1)}
    c2_parents = {canonical(p) for p in ontology2.get_parents_of(node2)}
    c2_children = {canonical(c) for c in ontology2.get_children_of(node2)}
    
    # Combine parents and children into a single set for each node
    set1 = c1_parents | c1_children
    set2 = c2_parents | c2_children
    
    # Compute Jaccard similarity on tokenized labels
    return jaccard_similarity_tokens(set1, set2)

def normalize_label(label):
    """
    Normalize a label by lowercasing, removing punctuation, and tokenizing.
    Returns a set of tokens.
    """
    label = label.lower().strip()
    label = re.sub(r'[^\w\s]', '', label)  # Remove punctuation
    tokens = label.split()
    return set(tokens)

def jaccard_similarity_tokens(set1, set2):
    """
    Compute the Jaccard similarity between two sets of tokenized labels.
    """
    tokens1 = set.union(*(normalize_label(s) for s in set1))
    tokens2 = set.union(*(normalize_label(s) for s in set2))
    
    intersection = tokens1 & tokens2
    union = tokens1 | tokens2
    return len(intersection) / len(union) if union else 0.0

anchors = compute_anchors(source_index[0], target_index[0], source_ontology.ontology, target_ontology.ontology)
print(anchors)

{('116', '2442'): 0.0, ('1812', '2870'): 0.0, ('1917', '2862'): 0.0, ('19', '308'): 0.0, ('169', '1056'): 0.0, ('2115', '2457'): 0.0, ('507', '2597'): 0.25, ('2547', '1055'): 0.0, ('2341', '3279'): 0.5, ('813', '2900'): 0.0, ('1417', '1359'): 0.0, ('2507', '1178'): 0.0, ('1403', '1818'): 0.0, ('492', '287'): 0.0, ('1665', '26'): 0.0, ('2087', '1843'): 0.0, ('1430', '719'): 0.0, ('100', '232'): 0.0, ('177', '2974'): 0.0, ('476', '282'): 0.0, ('971', '3040'): 0.0, ('473', '148'): 0.0, ('955', '1426'): 0.0, ('407', '69'): 0.0, ('395', '259'): 0.0, ('503', '453'): 0.0, ('1651', '2884'): 0.0, ('68', '304'): 0.0, ('118', '898'): 0.0, ('2012', '3158'): 0.5, ('1683', '2905'): 0.16666666666666666, ('2249', '1837'): 0.14285714285714285, ('2189', '3245'): 1.0, ('1773', '2934'): 0.0, ('420', '1822'): 0.0, ('1883', '93'): 0.0, ('451', '1054'): 0.0, ('2257', '3253'): 0.0, ('1990', '2557'): 0.0, ('1443', '2244'): 0.0, ('2453', '3244'): 0.5, ('2151', '733'): 0.0, ('410', '1745'): 0.0, ('443', '288'): 

In [19]:
# see anchors values
for key, value in anchors.items():
    print(key, value)

('2342,buccinator', '3280,Buccinator') 0.5
('2440,scalenus dorsalis', '3214,Scalenus_Dorsalis') 0.5
('7,trunk', '122,Trunk') 0.0
('1668,tongue epithelium', '2925,Tongue_Epithelium') 0.0
('2533,cerumen', '1398,Cerumen') 0.0
('393,gastrointestinal system', '244,Gastrointestinal_System') 0.0
('2426,peroneus longus', '3291,Peroneus_Longus') 0.5
('2096,posterior cerebral artery', '747,Posterior_Cerebral_Artery') 0.0
('301,semicircular duct', '1171,Semicircular_Duct') 0.1
('1202,parasympathetic ganglion', '2999,Parasympathetic_Ganglion') 0.0
('1210,enteric ganglion', '3036,Enteric_Ganglion') 0.2
('971,cingulate cortex', '3040,Cingulate_Cortex') 0.0
('420,incisor', '1822,Incisor') 0.0
('2476,exocrine system', '641,Exocrine_System') 0.0
('2536,feces', '1052,Feces') 0.0
('1399,sternal manubrium', '3059,Sternal_Manubrium') 0.0
('2091,perineal artery', '3219,Perineal_Artery') 1.0
('473,myometrium', '148,Myometrium') 0.0
('424,esophagus', '53,Esophagus') 0.16666666666666666
('2056,intercostal arte

In [21]:
def get_neighbors(cls):
    """Retrieve parent and child classes for context expansion."""
    neighbors = set()
    
    # Retrieve superclass and subclass relations
    if hasattr(cls, "is_a"):
        neighbors.update(cls.is_a)  # Direct parent classes

    if hasattr(cls, "subclasses"):
        neighbors.update(cls.subclasses())  # Direct child classes

    return neighbors

def discover_new_mappings(ontology1, ontology2, anchors, threshold=0.8):
    new_mappings = {}

    for (c1, c2) in anchors:
        cls1 = ontology1.search_one(iri=f"*{c1}")  # Find class in ontology 1
        cls2 = ontology2.search_one(iri=f"*{c2}")  # Find class in ontology 2

        if not cls1 or not cls2:
            continue  # Skip if class not found
        
        neighbors1 = get_neighbors(cls1)
        neighbors2 = get_neighbors(cls2)

        for n1 in neighbors1:
            for n2 in neighbors2:
                if hasattr(n1, "label") and hasattr(n2, "label"):
                    label1 = n1.label[0] if n1.label else n1.name
                    label2 = n2.label[0] if n2.label else n2.name
                    sim = isub_similarity(label1, label2)
                    
                    if sim > threshold:
                        new_mappings[(n1.name, n2.name)] = sim  # Store class names

    return new_mappings

new_mappings = discover_new_mappings(source_ontology.ontology, target_ontology.ontology, anchors)
print(new_mappings)

{('Thing', 'Thing'): 1.0, ('MA_0000067', 'MA_0000067'): 1.0, ('MA_0001260', 'MA_0001260'): 1.0, ('MA_0001260', 'MA_0001257'): 0.8636363636363636, ('MA_0001257', 'MA_0001260'): 0.8636363636363636, ('MA_0001257', 'MA_0001257'): 1.0}


In [22]:
def check_unsatisfiability(ontology1, ontology2, mappings):
    """Detect unsatisfiable mappings based on logical conflicts."""
    conflicts = []

    for (c1, c2), confidence in mappings.items():
        cls1 = ontology1.search_one(iri=f"*{c1}")  # Find class in ontology 1
        cls2 = ontology2.search_one(iri=f"*{c2}")  # Find class in ontology 2

        if not cls1 or not cls2:
            continue  # Skip if class not found

        # Check if the classes are explicitly disjoint in their respective ontologies
        if hasattr(cls1, "disjoint_with") and cls2 in cls1.disjoint_with:
            conflicts.append((c1, c2))  # Add to conflicts if disjoint

        # Check if the classes belong to fundamentally different hierarchies
        if hasattr(cls1, "is_a") and hasattr(cls2, "is_a"):
            super1 = {superclass for superclass in cls1.is_a}
            super2 = {superclass for superclass in cls2.is_a}
            
            # If they have no common ancestors, they might be semantically incompatible
            if not super1.intersection(super2):
                conflicts.append((c1, c2))

    return conflicts

conflicts = check_unsatisfiability(source_ontology.ontology, target_ontology.ontology,new_mappings)
print(conflicts)

[('Thing', 'Thing')]


In [23]:
def repair_mappings(mappings, conflicts):
    for conflict in conflicts:
        if conflict in mappings:
            del mappings[conflict]
    return mappings

new_mappings = repair_mappings(new_mappings, conflicts)
print(new_mappings)

{('MA_0000067', 'MA_0000067'): 1.0, ('MA_0001260', 'MA_0001260'): 1.0, ('MA_0001260', 'MA_0001257'): 0.8636363636363636, ('MA_0001257', 'MA_0001260'): 0.8636363636363636, ('MA_0001257', 'MA_0001257'): 1.0}
