# Resnik Similarity Implementation

This notebook implements Resnik similarity calculation for MP ontology terms using the mp.obo file.

In [None]:
import re
import math
from collections import defaultdict

In [None]:
# Move to the project root
import os
from pathlib import Path

print(f"Current directory: {os.getcwd()}")

# Climb one directory at a time until the working directory contains a
# LICENSE file, which this notebook treats as the project-root marker.
while not Path("LICENSE").exists():
    previous_dir = os.getcwd()
    os.chdir("../")
    if os.getcwd() == previous_dir:
        # chdir("../") does nothing at the filesystem root, so LICENSE was
        # never found; fail loudly instead of looping forever.
        raise FileNotFoundError(
            "Could not locate a LICENSE file in any parent directory"
        )

print(f"Project root: {os.getcwd()}")

In [None]:
def parse_obo_file(file_path: str) -> dict[str, dict]:
    """Parse OBO file and extract term information."""
    terms = {}
    current_term = None

    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()

            if line == "[Term]":
                current_term = {}
                continue

            if line.startswith("[") and line.endswith("]") and line != "[Term]":
                current_term = None
                continue

            if current_term is None:
                continue

            if ":" in line:
                key, value = line.split(":", 1)
                key = key.strip()
                value = value.strip()

                if key == "id":
                    current_term["id"] = value
                elif key == "name":
                    current_term["name"] = value
                elif key == "is_a":
                    if "is_a" not in current_term:
                        current_term["is_a"] = []
                    parent_id = value.split("!")[0].strip()
                    current_term["is_a"].append(parent_id)
                elif key == "is_obsolete":
                    current_term["is_obsolete"] = value.lower() == "true"

            if line == "" and current_term and "id" in current_term:
                if not current_term.get("is_obsolete", False):
                    terms[current_term["id"]] = current_term
                current_term = None

    return terms

In [None]:
def build_parent_child_relations(
    terms: dict[str, dict],
) -> tuple[dict[str, list[str]], dict[str, list[str]]]:
    """Derive both directions of the ``is_a`` graph from parsed terms.

    Args:
        terms: Mapping of term ID to term record; a record may carry an
            ``is_a`` list of parent IDs.

    Returns:
        ``(parents, children)`` where ``parents[term_id]`` lists the direct
        parents of a term and ``children[term_id]`` lists its direct
        children. Terms without the respective relation have no entry.
    """
    parents_of: dict[str, list[str]] = {}
    children_of: dict[str, list[str]] = {}

    for child_id, record in terms.items():
        for parent_id in record.get("is_a", ()):
            # Register the edge once in each direction.
            parents_of.setdefault(child_id, []).append(parent_id)
            children_of.setdefault(parent_id, []).append(child_id)

    return parents_of, children_of

In [None]:
def get_all_ancestors(term_id: str, parents: dict[str, list[str]]) -> set[str]:
    """Collect every term reachable from ``term_id`` by following parents.

    Args:
        term_id: Term whose ancestors are wanted.
        parents: Mapping of term ID to its direct parent IDs.

    Returns:
        The set of all direct and indirect parents. ``term_id`` itself is
        included only if the graph loops back to it.
    """
    seen: set[str] = set()
    pending = [term_id]

    while pending:
        node = pending.pop()
        for parent in parents.get(node, ()):
            if parent in seen:
                continue
            # First visit: record it and explore its own parents later.
            seen.add(parent)
            pending.append(parent)

    return seen

In [None]:
def get_all_descendants(term_id: str, children: dict[str, list[str]]) -> set[str]:
    """Collect every term reachable from ``term_id`` by following children.

    Args:
        term_id: Term whose descendants are wanted.
        children: Mapping of term ID to its direct child IDs.

    Returns:
        The set of all direct and indirect children. ``term_id`` itself is
        included only if the graph loops back to it.
    """
    found: set[str] = set()
    frontier = {term_id}

    # Expand one generation at a time until no unseen terms turn up.
    while frontier:
        next_frontier: set[str] = set()
        for node in frontier:
            for child in children.get(node, ()):
                if child not in found:
                    found.add(child)
                    next_frontier.add(child)
        frontier = next_frontier

    return found

In [None]:
def calculate_information_content(
    term_id: str, children: dict[str, list[str]], total_terms: int
) -> float:
    """Return the information content of a term, ``-ln(p)``.

    ``p`` is the share of the ontology covered by the term: the number of its
    descendants plus the term itself, divided by ``total_terms``.

    Args:
        term_id: Term to score.
        children: Mapping of term ID to its direct child IDs.
        total_terms: Number of terms used as the denominator of ``p``.

    Returns:
        The negative natural logarithm of ``p``.
    """
    # The term itself counts alongside its descendants.
    subtree_size = 1 + len(get_all_descendants(term_id, children))
    return -math.log(subtree_size / total_terms)

In [None]:
def find_common_ancestors(
    term1_id: str, term2_id: str, parents: dict[str, list[str]]
) -> set[str]:
    """Return the terms that lie on the ancestor lineage of both inputs.

    Each term counts as part of its own lineage, so when one term is an
    ancestor of the other it appears in the result.

    Args:
        term1_id: First term ID.
        term2_id: Second term ID.
        parents: Mapping of term ID to its direct parent IDs.

    Returns:
        Intersection of the two lineages (possibly empty).
    """
    lineage1 = {term1_id} | get_all_ancestors(term1_id, parents)
    lineage2 = {term2_id} | get_all_ancestors(term2_id, parents)
    return lineage1 & lineage2

In [None]:
def resnik_similarity(
    term1_id: str,
    term2_id: str,
    parents: dict[str, list[str]],
    children: dict[str, list[str]],
    total_terms: int,
) -> float:
    """Return the Resnik similarity of two terms.

    The score is the information content of the most informative common
    ancestor (MICA). Identical terms score their own information content;
    terms with no common ancestor score ``0.0``.

    Args:
        term1_id: First term ID.
        term2_id: Second term ID.
        parents: Mapping of term ID to its direct parent IDs.
        children: Mapping of term ID to its direct child IDs.
        total_terms: Number of terms used for the information-content
            probability.

    Returns:
        The similarity score, never below ``0.0`` for distinct terms.
    """
    if term1_id == term2_id:
        return calculate_information_content(term1_id, children, total_terms)

    shared = find_common_ancestors(term1_id, term2_id, parents)
    scores = [
        calculate_information_content(ancestor, children, total_terms)
        for ancestor in shared
    ]
    # Seeding with 0.0 covers the "no common ancestor" case and keeps the
    # result from dropping below zero.
    return max([0.0, *scores])

In [None]:
def find_term_by_name(name: str, terms: dict[str, dict]) -> str | None:
    """Find term ID by name."""
    for term_id, term_data in terms.items():
        if term_data.get("name") == name:
            return term_id
    return None

In [None]:
# Parse the MP ontology from disk.
obo_file = "./data/ontology/mp.obo"
mp_terms = parse_obo_file(obo_file)
total_terms = len(mp_terms)  # denominator for information-content probabilities
print(f"Loaded {len(mp_terms)} terms")

# Derive the is_a graph in both directions.
parents, children = build_parent_child_relations(mp_terms)

print(f"Built relationships for {len(parents)} terms with parents")
print(f"Built relationships for {len(children)} terms with children")

In [None]:
# Look up the two example phenotypes by their exact ontology names.
term1_name, term2_name = (
    "preweaning lethality, complete penetrance",
    "preweaning lethality",
)
term1_id = find_term_by_name(term1_name, mp_terms)
term2_id = find_term_by_name(term2_name, mp_terms)

print(f"Term 1: {term1_name} -> {term1_id}")
print(f"Term 2: {term2_name} -> {term2_id}")

# Dump the parsed record of each term that was found.
if term1_id:
    print(f"Term 1 data: {mp_terms[term1_id]}")
if term2_id:
    print(f"Term 2 data: {mp_terms[term2_id]}")

In [None]:
# Report the Resnik similarity of the two example terms, or explain why not.
if not (term1_id and term2_id):
    print("Could not find one or both terms in the ontology")
else:
    similarity = resnik_similarity(term1_id, term2_id, parents, children, total_terms)
    print(
        f"\nResnik similarity between '{term1_name}' and '{term2_name}': {similarity:.4f}"
    )

    # Information content of each term on its own, for comparison.
    ic1 = calculate_information_content(term1_id, children, total_terms)
    ic2 = calculate_information_content(term2_id, children, total_terms)
    print(f"Information content of '{term1_name}': {ic1:.4f}")
    print(f"Information content of '{term2_name}': {ic2:.4f}")

    # List every shared ancestor with its name and information content.
    common_ancestors = find_common_ancestors(term1_id, term2_id, parents)
    print(f"\nCommon ancestors: {len(common_ancestors)}")
    for ancestor in sorted(common_ancestors):
        ancestor_ic = calculate_information_content(ancestor, children, total_terms)
        ancestor_name = mp_terms[ancestor].get("name", "Unknown")
        print(f"  {ancestor}: {ancestor_name} (IC: {ancestor_ic:.4f})")

## アノテーションによる重み付け

実際の入力データは以下のように、基礎表現型のあとに、括弧書きで(genotype, sex, life-stage)の順番でアノテーションがあります。

```python
term1_name = "preweaning lethality, complete penetrance (Homo, Male, Early)"
term2_name = "preweaning lethality (Homo, Female, Early)"
```

genotypeはHomo, Hetero, Hemi
sexはFemale, Male
life-stageはEmbryo, Early, Interval, Late
のカテゴリがあります。

このアノテーションの類似度も、Resnik類似度に加味したいです。
具体的な戦略としては以下のように考えています
1. アノテーションの3項目がすべて同じ → 何もしない（x1.0）
2. アノテーションの3項目のうち、2つが同じ → x0.75
3. アノテーションの3項目のうち、1つが同じ → x0.5
4. アノテーションの3項目がすべて異なる → x0.25



In [None]:
def parse_annotation(term_name: str) -> tuple[str, tuple[str, str, str] | None]:
    """Parse annotation from term name.

    Returns:
        tuple: (base_phenotype, (genotype, sex, life_stage)) or (base_phenotype, None)

    Handles both 3-component and 2-component annotations:
    - 3-component: (genotype, sex, life-stage)
    - 2-component: (genotype, life-stage) -> sex becomes "None"
    """
    # First try 3-component pattern: (genotype, sex, life-stage)
    match = re.search(r"(.+?)\s*\(([^,]+),\s*([^,]+),\s*([^)]+)\)", term_name)

    if match:
        base_phenotype = match.group(1).strip()
        genotype = match.group(2).strip()
        sex = match.group(3).strip()
        life_stage = match.group(4).strip()
        return base_phenotype, (genotype, sex, life_stage)

    # Try 2-component pattern: (genotype, life-stage)
    match = re.search(r"(.+?)\s*\(([^,]+),\s*([^)]+)\)", term_name)

    if match:
        base_phenotype = match.group(1).strip()
        genotype = match.group(2).strip()
        life_stage = match.group(3).strip()
        # Set sex to "None" when missing
        return base_phenotype, (genotype, "None", life_stage)

    # No annotation found
    return term_name, None

In [None]:
def calculate_annotation_weight(
    annotation1: tuple[str, str, str] | None, annotation2: tuple[str, str, str] | None
) -> float:
    """Calculate weight based on annotation similarity.

    Args:
        annotation1: (genotype, sex, life_stage) or None
        annotation2: (genotype, sex, life_stage) or None

    Returns:
        float: Weight multiplier based on similarity
    """
    # If either annotation is missing, use neutral weight
    if annotation1 is None or annotation2 is None:
        return 1.0

    # Count matching annotation items
    matches = sum(1 for a1, a2 in zip(annotation1, annotation2) if a1 == a2)

    # Apply weight based on number of matches
    if matches == 3:
        return 1.0  # All same
    elif matches == 2:
        return 0.75  # 2 same
    elif matches == 1:
        return 0.5  # 1 same
    else:
        return 0.25  # All different

In [None]:
def weighted_resnik_similarity(
    term1_name: str,
    term2_name: str,
    mp_terms: dict[str, dict],
    parents: dict[str, list[str]],
    children: dict[str, list[str]],
    total_terms: int,
) -> tuple[float, float, str, str, dict[str, str | float | None]]:
    """Calculate the annotation-weighted Resnik similarity of two term names.

    Each name is split into base phenotype and annotation, the base
    phenotypes are resolved to term IDs by exact name, their Resnik
    similarity is computed, and the result is multiplied by the annotation
    weight.

    Args:
        term1_name: First term name, optionally with a trailing annotation.
        term2_name: Second term name, optionally with a trailing annotation.
        mp_terms: Mapping of term ID to term record.
        parents: Mapping of term ID to its direct parent IDs.
        children: Mapping of term ID to its direct child IDs.
        total_terms: Number of terms used for the information-content
            probability.

    Returns:
        ``(weighted_similarity, base_similarity, base_phenotype1,
        base_phenotype2, common_ancestors_info)``. The info dict maps each
        common ancestor ID to its name and additionally holds ``"mica_id"``,
        ``"mica_name"`` and ``"mica_ic"`` for the most informative common
        ancestor. It is empty, and both similarities are ``0.0``, when a
        base phenotype cannot be resolved or no common ancestor exists.
    """

    def name_of(term_id: str) -> str:
        # Ancestor IDs come from is_a references and may be absent from
        # mp_terms, so fall back instead of raising KeyError.
        return mp_terms.get(term_id, {}).get("name", "Unknown")

    # Parse annotations
    base_phenotype1, annotation1 = parse_annotation(term1_name)
    base_phenotype2, annotation2 = parse_annotation(term2_name)

    # Find term IDs for base phenotypes
    term1_id = find_term_by_name(base_phenotype1, mp_terms)
    term2_id = find_term_by_name(base_phenotype2, mp_terms)

    if not term1_id or not term2_id:
        return 0.0, 0.0, base_phenotype1, base_phenotype2, {}

    common_ancestors_info: dict[str, str | float | None]
    if term1_id == term2_id:
        # Identical terms: the MICA is the term itself.
        base_similarity = calculate_information_content(term1_id, children, total_terms)
        common_ancestors_info = {
            term1_id: name_of(term1_id),
            "mica_id": term1_id,
            "mica_name": name_of(term1_id),
            "mica_ic": base_similarity,
        }
    else:
        common_ancestors = find_common_ancestors(term1_id, term2_id, parents)

        if not common_ancestors:
            return 0.0, 0.0, base_phenotype1, base_phenotype2, {}

        common_ancestors_info = {}
        mica_id = None
        mica_name = None
        best_ic = 0.0

        # Sorted iteration makes the choice among equally informative
        # ancestors reproducible across runs.
        for ancestor in sorted(common_ancestors):
            ic = calculate_information_content(ancestor, children, total_terms)
            ancestor_name = name_of(ancestor)
            common_ancestors_info[ancestor] = ancestor_name

            # Accept the first ancestor unconditionally so a MICA is always
            # reported, even when the best information content is zero
            # (for example when only the root is shared).
            if mica_id is None or ic > best_ic:
                best_ic = ic
                mica_id = ancestor
                mica_name = ancestor_name

        # Never report a similarity below zero.
        base_similarity = max(0.0, best_ic)

        # Add MICA info to the dictionary
        common_ancestors_info["mica_id"] = mica_id
        common_ancestors_info["mica_name"] = mica_name
        common_ancestors_info["mica_ic"] = base_similarity

    # Scale the ontology-based similarity by the annotation agreement.
    weight = calculate_annotation_weight(annotation1, annotation2)
    weighted_similarity = base_similarity * weight

    return (
        weighted_similarity,
        base_similarity,
        base_phenotype1,
        base_phenotype2,
        common_ancestors_info,
    )

In [None]:
# Demonstrate how annotations lacking the sex component are parsed.
test_2_component = "preweaning lethality (Homo, Early)"
test_cases_2comp = [
    "preweaning lethality (Homo, Early)",
    "preweaning lethality (Hetero, Late)",
    "preweaning lethality, complete penetrance (Hemi, Interval)",
]

print("Testing 2-component annotation parsing:")
base, annotation = parse_annotation(test_2_component)
print(
    f"Term: '{test_2_component}'",
    f"Base phenotype: '{base}'",
    f"Annotation: {annotation}",
    sep="\n",
)

# Run the same parser over several 2-component examples.
print("\nTesting various 2-component cases:")
for term in test_cases_2comp:
    base, ann = parse_annotation(term)
    print(f"  '{term}' -> Base: '{base}', Annotation: {ann}")

In [None]:
# Exercise every annotation-agreement level; the expected weights follow
# calculate_annotation_weight (3 matches -> 1.0, 2 -> 0.75, 1 -> 0.5, 0 -> 0.25).
test_cases = [
    # All same (should get 1.0x)
    (
        "preweaning lethality (Homo, Male, Early)",
        "preweaning lethality, complete penetrance (Homo, Male, Early)",
    ),
    # 2 same (should get 0.75x)
    (
        "preweaning lethality (Homo, Male, Early)",
        "preweaning lethality, complete penetrance (Homo, Female, Early)",
    ),
    # 1 same (should get 0.5x)
    (
        "preweaning lethality (Homo, Male, Early)",
        "preweaning lethality, complete penetrance (Hetero, Female, Early)",
    ),
    # All different (should get 0.25x)
    (
        "preweaning lethality (Homo, Male, Early)",
        "preweaning lethality, complete penetrance (Hetero, Female, Late)",
    ),
    # No sex annotation on one side: sex is parsed as "None" and differs
    # from "Male", so only 2 items match (should get 0.75x)
    (
        "preweaning lethality (Homo, Early)",
        "preweaning lethality, complete penetrance (Homo, Male, Early)",
    ),
]

print("Testing different annotation similarity cases:")
print("=" * 60)

for i, (term1, term2) in enumerate(test_cases, 1):
    weighted_sim, base_sim, base1, base2, ancestors_info = weighted_resnik_similarity(
        term1, term2, mp_terms, parents, children, total_terms
    )

    # Re-derive the annotations and weight for display only.
    _, ann1 = parse_annotation(term1)
    _, ann2 = parse_annotation(term2)
    weight = calculate_annotation_weight(ann1, ann2)

    matches = "N/A"
    if ann1 and ann2:
        matches = sum(1 for a1, a2 in zip(ann1, ann2) if a1 == a2)

    print(f"\nCase {i}:")
    print(f"  Term 1: {term1}")
    print(f"  Term 2: {term2}")
    print(f"  Annotation 1: {ann1}")
    print(f"  Annotation 2: {ann2}")
    print(f"  Matches: {matches}/3")
    print(f"  Weight: {weight:.2f}x")
    print(f"  Base similarity: {base_sim:.4f}")
    print(f"  Weighted similarity: {weighted_sim:.4f}")

    # Display MICA information
    if ancestors_info:
        print("  Most Informative Common Ancestor (MICA):")
        print(f"    ID: {ancestors_info.get('mica_id', 'N/A')}")
        print(f"    Name: {ancestors_info.get('mica_name', 'N/A')}")
        print(f"    IC: {ancestors_info.get('mica_ic', 0):.4f}")

        # Display all common ancestors; the three "mica_*" bookkeeping keys
        # are not ancestors, hence the "- 3" and the prefix filter.
        print(f"  All common ancestors ({len(ancestors_info) - 3}):")
        for ancestor_id, ancestor_name in sorted(ancestors_info.items()):
            if not ancestor_id.startswith("mica_"):
                print(f"    {ancestor_id}: {ancestor_name}")

# TSUMUGIで解析可能なすべての表現型情報について、Resnik類似度のスコアを付与する

In [None]:
# Phenotypes: load the symbol_mptermname.json annotation file.
import json

file_path = Path("data", "annotation", "symbol_mptermname.json")

# Use a context manager so the file handle is closed, and read as UTF-8
# rather than the platform's default encoding.
with open(file_path, encoding="utf-8") as f:
    symbol_mptermname = json.load(f)
print(symbol_mptermname["Dpf2"])

In [None]:
# Merge the term-name collections of all entries into one de-duplicated set.
all_mpternames = set().union(*symbol_mptermname.values())
print(f"Total unique MP term names after deduplication: {len(all_mpternames)}")

In [None]:
# Preview the first ten term names in sorted order.
sorted(all_mpternames)[:10]

In [None]:
# Example pair: the same base phenotype with different genotype annotations.
term1_name = 'abnormal QT variability (Hemi, Early)'
term2_name = 'abnormal QT variability (Hetero, Early)'
# Alternative second term with a different base phenotype; uncomment to use:
# term2_name = 'abnormal adrenal gland morphology (Hemi, Late)'

In [None]:
# Score the example pair and print the result together with its MICA.
weighted_sim, base_sim, base1, base2, ancestors_info = weighted_resnik_similarity(
    term1_name, term2_name, mp_terms, parents, children, total_terms
)
print(
    f"Weighted similarity: {weighted_sim:.4f}",
    f"Base similarity: {base_sim:.4f}",
    f"Base phenotype 1: {base1}",
    f"Base phenotype 2: {base2}",
    sep="\n",
)

# Display common ancestor information
if not ancestors_info:
    print("\nNo common ancestors found.")
else:
    print("\nMost Informative Common Ancestor (MICA):")
    print(f"  ID: {ancestors_info.get('mica_id', 'N/A')}")
    print(f"  Name: {ancestors_info.get('mica_name', 'N/A')}")
    print(f"  IC: {ancestors_info.get('mica_ic', 0):.4f}")

    # The three "mica_*" keys are bookkeeping entries, not ancestors.
    print(f"\nAll common ancestors ({len(ancestors_info) - 3}):")
    real_ancestors = sorted(
        (key, value)
        for key, value in ancestors_info.items()
        if not key.startswith("mica_")
    )
    for ancestor_id, ancestor_name in real_ancestors:
        print(f"  {ancestor_id}: {ancestor_name}")

In [None]:
from itertools import combinations
from tqdm import tqdm

# Score every unordered pair of annotated term names. Keys are frozensets so
# a pair can be looked up in either order.
name_count = len(all_mpternames)
pair_progress = tqdm(
    combinations(all_mpternames, 2),
    desc="Calculating similarities",
    total=name_count * (name_count - 1) // 2,
)

cache_similarities = {}
for term1, term2 in pair_progress:
    weighted_sim, base_sim, base1, base2, ancestors_info = weighted_resnik_similarity(
        term1, term2, mp_terms, parents, children, total_terms
    )
    cache_similarities[frozenset((term1, term2))] = dict(
        weighted_similarity=weighted_sim,
        base_similarity=base_sim,
        base_phenotype1=base1,
        base_phenotype2=base2,
        common_ancestors_info=ancestors_info,
    )

print(f"Cached {len(cache_similarities)} term pairs")

In [None]:
import pickle

# Save the cache to a file, creating the output directory first so the dump
# does not fail with FileNotFoundError on a fresh checkout.
output_path = Path("data/overlap/resnik_similarity.pkl")
output_path.parent.mkdir(parents=True, exist_ok=True)
with output_path.open("wb") as f:
    pickle.dump(cache_similarities, f)

In [None]:
# Spot-check one cached pair. The keys are frozensets, so the lookup does not
# depend on the order of the two names and both prints show the same entry.
term1 = "increased grip strength (Homo, Early)"
term2 = "increased grip strength (Hetero, Early)"
print(cache_similarities[frozenset([term1, term2])])
print(cache_similarities[frozenset([term2, term1])])