# Resnik Similarity Implementation

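This notebook implements the Resnik similarity calculation for Mammalian Phenotype (MP) ontology terms, using the `mp.obo` file. The information content of a term is derived from the number of its descendants in the ontology, and the similarity of two terms is the information content of their most informative common ancestor.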

In [1]:
import re
import math
from collections import defaultdict

In [2]:
# Move to the project root (the nearest ancestor directory that contains LICENSE)
import os
from pathlib import Path

print(f"Current directory: {os.getcwd()}")

# Remember where we started so the working directory can be restored on failure.
start_dir = os.getcwd()

while not Path("LICENSE").exists():
    # At the filesystem root the parent is the directory itself; without this
    # guard the loop would spin forever when no ancestor contains LICENSE.
    if Path.cwd().parent == Path.cwd():
        os.chdir(start_dir)
        raise FileNotFoundError(
            f"No LICENSE file found in {start_dir} or any of its parent directories"
        )
    os.chdir("../")

print(f"Project root: {os.getcwd()}")

Current directory: /mnt/c/Users/akihi/Documents/GitHub/TSUMUGI-dev/notebooks/notebooks-web
Project root: /mnt/c/Users/akihi/Documents/GitHub/TSUMUGI-dev


In [3]:
def parse_obo_file(file_path: str) -> dict[str, dict]:
    """Parse OBO file and extract term information."""
    terms = {}
    current_term = None

    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()

            if line == "[Term]":
                current_term = {}
                continue

            if line.startswith("[") and line.endswith("]") and line != "[Term]":
                current_term = None
                continue

            if current_term is None:
                continue

            if ":" in line:
                key, value = line.split(":", 1)
                key = key.strip()
                value = value.strip()

                if key == "id":
                    current_term["id"] = value
                elif key == "name":
                    current_term["name"] = value
                elif key == "is_a":
                    if "is_a" not in current_term:
                        current_term["is_a"] = []
                    parent_id = value.split("!")[0].strip()
                    current_term["is_a"].append(parent_id)
                elif key == "is_obsolete":
                    current_term["is_obsolete"] = value.lower() == "true"

            if line == "" and current_term and "id" in current_term:
                if not current_term.get("is_obsolete", False):
                    terms[current_term["id"]] = current_term
                current_term = None

    return terms

In [4]:
def build_parent_child_relations(
    terms: dict[str, dict],
) -> tuple[dict[str, list[str]], dict[str, list[str]]]:
    """Derive both directions of the is_a hierarchy from parsed terms.

    Args:
        terms: Mapping of term id to term data; the optional ``"is_a"`` entry
            lists the ids of the term's direct parents.

    Returns:
        ``(parents, children)`` where ``parents`` maps a term id to its direct
        parent ids and ``children`` maps a term id to its direct child ids.
        Terms without any is_a link get no entry in ``parents``.
    """
    parent_map: dict[str, list[str]] = {}
    child_map: dict[str, list[str]] = {}

    for child_id, record in terms.items():
        for parent_id in record.get("is_a", ()):
            # Record the edge once in each direction.
            parent_map.setdefault(child_id, []).append(parent_id)
            child_map.setdefault(parent_id, []).append(child_id)

    return parent_map, child_map

In [5]:
def get_all_ancestors(term_id: str, parents: dict[str, list[str]]) -> set[str]:
    """Collect every ancestor reachable from a term through parent links.

    Args:
        term_id: Term whose ancestors are wanted.
        parents: Mapping of term id to the ids of its direct parents.

    Returns:
        The set of all direct and indirect parents. ``term_id`` itself is
        included only if the parent links lead back to it. An id with no
        entry in ``parents`` yields an empty set.
    """
    ancestors: set[str] = set()
    # A stack gives O(1) pops; list.pop(0) shifts the whole list on every
    # call. Visit order does not affect the resulting set.
    pending = [term_id]

    while pending:
        current = pending.pop()
        for parent in parents.get(current, ()):
            # Each id is expanded at most once, which also ends cycles.
            if parent not in ancestors:
                ancestors.add(parent)
                pending.append(parent)

    return ancestors

In [6]:
def get_all_descendants(term_id: str, children: dict[str, list[str]]) -> set[str]:
    """Collect every descendant reachable from a term through child links.

    Args:
        term_id: Term whose descendants are wanted.
        children: Mapping of term id to the ids of its direct children.

    Returns:
        The set of all direct and indirect children. ``term_id`` itself is
        included only if the child links lead back to it. An id with no
        entry in ``children`` yields an empty set.
    """
    descendants: set[str] = set()
    # A stack gives O(1) pops; list.pop(0) shifts the whole list on every
    # call. Visit order does not affect the resulting set.
    pending = [term_id]

    while pending:
        current = pending.pop()
        for child in children.get(current, ()):
            # Each id is expanded at most once, which also ends cycles.
            if child not in descendants:
                descendants.add(child)
                pending.append(child)

    return descendants

In [7]:
def calculate_information_content(
    term_id: str, children: dict[str, list[str]], total_terms: int
) -> float:
    """Calculate the information content (IC) of a term from its descendants.

    The term's probability is ``(number of descendants + 1) / total_terms``,
    counting the term itself, and the IC is the negative natural logarithm
    of that probability.

    Args:
        term_id: Term to score.
        children: Mapping of term id to the ids of its direct children.
        total_terms: Number of terms used as the denominator.

    Returns:
        The IC value. A term whose count equals ``total_terms`` scores
        exactly ``0.0``.

    Raises:
        ZeroDivisionError: If ``total_terms`` is 0.
    """
    descendants = get_all_descendants(term_id, children)
    # Include the term itself in the count
    term_count = len(descendants) + 1
    probability = term_count / total_terms
    information_content = -math.log(probability)
    # -math.log(1.0) is -0.0, which prints as "-0.0000"; report a plain zero.
    return information_content if information_content != 0.0 else 0.0

In [8]:
def find_common_ancestors(
    term1_id: str, term2_id: str, parents: dict[str, list[str]]
) -> set[str]:
    """Return the terms that lie on the ancestor lines of both inputs.

    Each term counts as a member of its own line, so when one term is an
    ancestor of the other it appears in the result.
    """
    # Lineage = the term itself plus everything above it.
    lineage1 = {term1_id} | get_all_ancestors(term1_id, parents)
    lineage2 = {term2_id} | get_all_ancestors(term2_id, parents)

    return lineage1 & lineage2

In [9]:
def resnik_similarity(
    term1_id: str,
    term2_id: str,
    parents: dict[str, list[str]],
    children: dict[str, list[str]],
    total_terms: int,
) -> float:
    """Compute the Resnik similarity of two terms.

    The score is the information content of the most informative common
    ancestor (MICA). Identical terms score their own information content,
    and terms with no common ancestor score ``0.0``.
    """
    # Identical terms: the term is its own most informative ancestor.
    if term1_id == term2_id:
        return calculate_information_content(term1_id, children, total_terms)

    shared = find_common_ancestors(term1_id, term2_id, parents)
    if not shared:
        return 0.0

    # Score every shared ancestor; the leading 0.0 keeps the result from
    # dropping below zero.
    candidate_ics = [
        calculate_information_content(ancestor, children, total_terms)
        for ancestor in shared
    ]
    return max([0.0, *candidate_ics])

In [10]:
def find_term_by_name(name: str, terms: dict[str, dict]) -> str | None:
    """Find term ID by name."""
    for term_id, term_data in terms.items():
        if term_data.get("name") == name:
            return term_id
    return None

In [11]:
# Parse the ontology from its OBO file; the term count is reused later as the
# denominator for information content.
obo_file = "./data/ontology/mp.obo"
terms = parse_obo_file(obo_file)
total_terms = len(terms)
print(f"Loaded {len(terms)} terms")

# Derive the is_a lookup tables in both directions
parents, children = build_parent_child_relations(terms)

print(f"Built relationships for {len(parents)} terms with parents")
print(f"Built relationships for {len(children)} terms with children")

Loaded 14379 terms
Built relationships for 14378 terms with parents
Built relationships for 5823 terms with children


In [12]:
# Names of the two terms to compare
term1_name, term2_name = (
    "preweaning lethality, complete penetrance",
    "preweaning lethality",
)

# Resolve each name to its term id (None when the name is not in the ontology)
term1_id, term2_id = (
    find_term_by_name(term1_name, terms),
    find_term_by_name(term2_name, terms),
)

print(f"Term 1: {term1_name} -> {term1_id}")
print(f"Term 2: {term2_name} -> {term2_id}")

# Show the parsed record of each term that was found
if term1_id:
    print(f"Term 1 data: {terms[term1_id]}")
if term2_id:
    print(f"Term 2 data: {terms[term2_id]}")

Term 1: preweaning lethality, complete penetrance -> MP:0011100
Term 2: preweaning lethality -> MP:0010770
Term 1 data: {'id': 'MP:0011100', 'name': 'preweaning lethality, complete penetrance', 'is_a': ['MP:0010770']}
Term 2 data: {'id': 'MP:0010770', 'name': 'preweaning lethality', 'is_a': ['MP:0010769']}


In [13]:
# Calculate Resnik similarity (only when both names resolved to a term id)
if term1_id and term2_id:
    similarity = resnik_similarity(term1_id, term2_id, parents, children, total_terms)
    print(
        f"\nResnik similarity between '{term1_name}' and '{term2_name}': {similarity:.4f}"
    )

    # Also calculate individual information content
    ic1 = calculate_information_content(term1_id, children, total_terms)
    ic2 = calculate_information_content(term2_id, children, total_terms)

    print(f"Information content of '{term1_name}': {ic1:.4f}")
    print(f"Information content of '{term2_name}': {ic2:.4f}")

    # Show common ancestors
    common_ancestors = find_common_ancestors(term1_id, term2_id, parents)
    print(f"\nCommon ancestors: {len(common_ancestors)}")
    for ancestor in sorted(common_ancestors):
        # An ancestor id can come from an is_a reference to a term that is
        # not in `terms`; fall back to "Unknown" instead of raising KeyError.
        ancestor_name = terms.get(ancestor, {}).get("name", "Unknown")
        ancestor_ic = calculate_information_content(ancestor, children, total_terms)
        print(f"  {ancestor}: {ancestor_name} (IC: {ancestor_ic:.4f})")
else:
    print("Could not find one or both terms in the ontology")


Resnik similarity between 'preweaning lethality, complete penetrance' and 'preweaning lethality': 5.6417
Information content of 'preweaning lethality, complete penetrance': 6.7403
Information content of 'preweaning lethality': 5.6417

Common ancestors: 4
  MP:0000001: mammalian phenotype (IC: -0.0000)
  MP:0010768: mortality/aging (IC: 4.7137)
  MP:0010769: abnormal survival (IC: 5.3839)
  MP:0010770: preweaning lethality (IC: 5.6417)
