# Resnik Similarity for Phenotype Analysis

このノートブックでは、Jaccard indexの代わりにResnik similarityを使用して表現型類似度を計算します。
頻出表現型（preweaning lethalityなど）による偏りを軽減し、アノテーション（Genotype, Sex, Life stage）を考慮した拡張版を実装します。

In [None]:
# Move up to top directory
import os
from pathlib import Path

print(os.getcwd())

# Walk upwards until the repository root (the directory holding LICENSE) is
# the working directory.  At the filesystem root, chdir("../") no longer
# changes the directory, so without the guard below the loop would never end
# when no LICENSE file exists in any ancestor.
_start_dir = Path.cwd()
while not Path("LICENSE").exists():
    if Path.cwd().parent == Path.cwd():
        # Restore the starting directory so a failed search has no side effect.
        os.chdir(_start_dir)
        raise FileNotFoundError(
            f"No LICENSE file found in {_start_dir} or any parent directory"
        )
    os.chdir("../")

print(os.getcwd())

In [None]:
import json
import math
import re
import pandas as pd
import numpy as np
from pathlib import Path
from collections import defaultdict, Counter
from typing import Dict, List, Set, Tuple, Optional
import pickle

## 1. MPO Ontology Parser

In [None]:
class MPOParser:
    """Parser for an MPO (Mammalian Phenotype Ontology) OBO file.

    After :meth:`parse` has run, the instance exposes:

    * ``terms``      -- term id -> parsed term data (obsolete terms excluded)
    * ``name_to_id`` -- term name -> term id
    * ``hierarchy``  -- child id -> set of direct parent ids (``is_a`` links)
    * ``children``   -- parent id -> set of direct child ids
    """

    def __init__(self, obo_file_path: str):
        self.obo_file_path = obo_file_path
        self.terms = {}  # MP_ID -> {name, is_a, ...}
        self.name_to_id = {}  # name -> MP_ID
        self.hierarchy = defaultdict(set)  # child_id -> {parent_ids}
        self.children = defaultdict(set)  # parent_id -> {child_ids}

    def parse(self):
        """Read the OBO file and build the term tables and the hierarchy.

        Returns:
            ``self``, so the call can be chained.
        """
        print(f"Parsing MPO ontology from {self.obo_file_path}...")

        with open(self.obo_file_path, "r", encoding="utf-8") as f:
            content = f.read()

        # Everything before the first "[Term]" is the file header; drop it.
        term_blocks = content.split("[Term]")[1:]

        for block in term_blocks:
            term_data = self._parse_term_block(block)
            if term_data and not term_data.get("is_obsolete", False):
                self.terms[term_data["id"]] = term_data
                self.name_to_id[term_data["name"]] = term_data["id"]

        # Link children and parents once all terms are known.
        self._build_hierarchy()

        print(f"Parsed {len(self.terms)} MP terms")
        return self

    def _parse_term_block(self, block: str) -> Optional[Dict]:
        """Parse the text of one ``[Term]`` stanza.

        Args:
            block: Text following a ``[Term]`` header.  It may run on into
                later non-Term stanzas (e.g. ``[Typedef]``); parsing stops at
                the first such header so their tags cannot overwrite this
                term's ``id``/``name``.

        Returns:
            A dict with ``id``, ``name``, ``is_a``, ``synonyms``,
            ``is_obsolete`` and, when present, ``def``; ``None`` if the
            stanza has no ``id`` or no ``name``.
        """
        term_data = {"is_a": [], "synonyms": [], "is_obsolete": False}

        for raw_line in block.strip().split("\n"):
            line = raw_line.strip()
            if not line:
                continue

            # A bracketed line is the header of the next stanza: this term ends.
            if line.startswith("[") and line.endswith("]"):
                break

            if ":" not in line:
                continue

            key, value = line.split(":", 1)
            key = key.strip()
            value = value.strip()

            if key == "id":
                term_data["id"] = value
            elif key == "name":
                term_data["name"] = value
            elif key == "def":
                # Format: "definition text" [references]
                match = re.match(r'"([^"]+)"', value)
                if match:
                    term_data["def"] = match.group(1)
            elif key == "is_a":
                # Format: MP:0000001 {optional modifiers} ! parent term name
                # Keep only the identifier (the first whitespace-free token).
                tokens = value.split(" !")[0].split()
                if tokens:
                    term_data["is_a"].append(tokens[0])
            elif key == "synonym":
                # Format: "synonym text" EXACT []
                match = re.match(r'"([^"]+)"', value)
                if match:
                    term_data["synonyms"].append(match.group(1))
            elif key == "is_obsolete":
                term_data["is_obsolete"] = value.lower() == "true"

        return term_data if "id" in term_data and "name" in term_data else None

    def _build_hierarchy(self):
        """Fill ``hierarchy`` and ``children`` from each term's ``is_a`` list.

        Parents that are not in ``terms`` (e.g. obsolete ones) are ignored.
        """
        for term_id, term_data in self.terms.items():
            for parent_id in term_data.get("is_a", []):
                if parent_id in self.terms:
                    self.hierarchy[term_id].add(parent_id)
                    self.children[parent_id].add(term_id)

    def get_ancestors(self, term_id: str) -> Set[str]:
        """Return every direct and indirect parent of ``term_id``.

        The term itself is not included.
        """
        ancestors = set()
        stack = [term_id]

        while stack:
            current = stack.pop()
            for parent in self.hierarchy.get(current, set()):
                if parent not in ancestors:
                    ancestors.add(parent)
                    stack.append(parent)

        return ancestors

    def get_term_depth(self, term_id: str) -> int:
        """Return the length of the longest ``is_a`` path from a root.

        Terms without parents (including unknown ids) have depth 0.
        """
        return self._depth(term_id, {})

    def _depth(self, term_id: str, memo: Dict[str, int]) -> int:
        """Depth of ``term_id``; ``memo`` caches depths already computed.

        The cache keeps the walk linear: without it, ancestors shared by
        several parents are re-walked once per path that reaches them.
        """
        cached = memo.get(term_id)
        if cached is not None:
            return cached

        parents = self.hierarchy.get(term_id)
        depth = 1 + max(self._depth(p, memo) for p in parents) if parents else 0
        memo[term_id] = depth
        return depth

    def find_lowest_common_ancestor(self, term_a: str, term_b: str) -> Optional[str]:
        """Return the id of the deepest term that is an ancestor of both.

        Args:
            term_a: Term name or term id.
            term_b: Term name or term id.

        Returns:
            The common ancestor (a term counts as its own ancestor) with the
            greatest depth, or ``None`` if either term is unknown or they
            share no ancestor.
        """
        # Accept names as well as ids; unknown names fall through unchanged.
        id_a = self.name_to_id.get(term_a, term_a)
        id_b = self.name_to_id.get(term_b, term_b)

        if id_a not in self.terms or id_b not in self.terms:
            return None

        ancestors_a = self.get_ancestors(id_a)
        ancestors_a.add(id_a)  # a term is its own ancestor here

        ancestors_b = self.get_ancestors(id_b)
        ancestors_b.add(id_b)

        common_ancestors = ancestors_a & ancestors_b

        if not common_ancestors:
            return None

        # Deepest means most specific; one memo is shared across candidates.
        depth_memo = {}
        return max(common_ancestors, key=lambda t: self._depth(t, depth_memo))

In [None]:
# Parse the MPO ontology
mpo_parser = MPOParser("data/ontology/mp.obo")
mpo_parser.parse()

print(f"Total terms: {len(mpo_parser.terms)}")
print("\nSample terms:")
# Show the first five terms as a quick sanity check
for term_id, term_data in list(mpo_parser.terms.items())[:5]:
    print(f"  {term_id}: {term_data['name']}")

## 2. 既存データから頻度計算

In [None]:
# Compute phenotype frequencies from the existing TSUMUGI data
def calculate_phenotype_frequencies(symbol_mptermname_path: str) -> Dict[str, int]:
    """Count how often each phenotype occurs in a gene -> phenotypes mapping.

    Args:
        symbol_mptermname_path: Path to a JSON file mapping each gene symbol
            to a list of phenotype strings (an empty or null list is allowed).

    Returns:
        Mapping from phenotype string (surrounding whitespace removed) to the
        number of times it appears across all genes.  Blank entries are
        skipped.
    """
    # Read as UTF-8 explicitly instead of relying on the platform default.
    with open(symbol_mptermname_path, "r", encoding="utf-8") as f:
        symbol_mptermname = json.load(f)

    # Count every non-blank phenotype; `or ()` tolerates empty/null lists.
    phenotype_counts = Counter(
        name
        for phenotypes in symbol_mptermname.values()
        for name in map(str.strip, phenotypes or ())
        if name
    )

    print(f"Found {len(phenotype_counts)} unique phenotypes")
    print(f"Total phenotype observations: {sum(phenotype_counts.values())}")

    return dict(phenotype_counts)


# Build the frequency table
phenotype_frequencies = calculate_phenotype_frequencies(
    "data/annotation/symbol_mptermname.json"
)

# Inspect the ten most frequent phenotypes
top_phenotypes = sorted(
    phenotype_frequencies.items(), key=lambda item: -item[1]
)[:10]
print("\nTop 10 most frequent phenotypes:")
for name, occurrences in top_phenotypes:
    print(f"  {occurrences:4d}: {name}")

## 3. Information Content計算

In [None]:
def calculate_information_content(
    phenotype_frequencies: Dict[str, int], mpo_parser: "MPOParser"
) -> Dict[str, float]:
    """Compute the Information Content (IC) of every phenotype.

    IC is ``-log(p)`` (natural logarithm), where ``p`` is the phenotype's
    share of all observations.  Counts are used as given; they are not
    propagated to ontology ancestors.

    Args:
        phenotype_frequencies: Phenotype -> number of observations.
        mpo_parser: Accepted for interface compatibility; not used here.

    Returns:
        Phenotype -> IC.  A phenotype with zero observations gets 0.0.
    """
    total_observations = sum(phenotype_frequencies.values())

    # With no observations at all, no probability is defined; use the same
    # 0.0 fallback as for a single zero-count phenotype instead of dividing
    # by zero.
    if total_observations == 0:
        return {phenotype: 0.0 for phenotype in phenotype_frequencies}

    ic_scores = {}
    for phenotype, frequency in phenotype_frequencies.items():
        probability = frequency / total_observations
        ic_scores[phenotype] = -math.log(probability) if probability > 0 else 0.0

    return ic_scores

def extract_base_phenotype(phenotype: str) -> str:
    """Return the phenotype name without its trailing ``(...)`` annotation.

    Surrounding whitespace is removed.  A string that does not end in a
    parenthesised group is returned stripped but otherwise unchanged.
    """
    text = phenotype.strip()
    # Lazy first group: only the final parenthesised group is dropped.
    annotated = re.match(r"^(.+?)\s*\([^)]+\)$", text)
    return annotated.group(1).strip() if annotated else text

# Compute the Information Content table
ic_scores = calculate_information_content(phenotype_frequencies, mpo_parser)

print(f"Calculated IC scores for {len(ic_scores)} phenotypes")

# Phenotypes taken from the actual data
sample_phenotypes = [
    "preweaning lethality, complete penetrance (Homo, Early)",
    "abnormal skin morphology (Homo, Early)",
    "abnormal kidney morphology (Homo, Early)"
]

print("\nSample IC scores:")
for phenotype in sample_phenotypes:
    # Report missing phenotypes first, then move on
    if phenotype not in ic_scores:
        print(f"  {phenotype} - NOT FOUND in data")
        continue
    freq = phenotype_frequencies[phenotype]
    ic = ic_scores[phenotype]
    print(f"  {phenotype}")
    print(f"    Frequency: {freq}, IC: {ic:.3f}")

print("\nTop 5 phenotypes with highest IC (rarest):")
sorted_ic = sorted(ic_scores.items(), key=lambda entry: entry[1], reverse=True)[:5]
for phenotype, ic in sorted_ic:
    freq = phenotype_frequencies[phenotype]
    print(f"  IC: {ic:.3f}, Freq: {freq} - {phenotype}")

## 4. 拡張Resnik Similarity実装

In [ ]:
class ExtendedResnikSimilarity:
    """Annotation-aware Resnik-style similarity between phenotype strings.

    A phenotype string is a base term optionally followed by a parenthesised
    annotation list, e.g. ``"abnormal skin morphology (Homo, Early)"``.  The
    score combines a base-term similarity (IC of the ontology LCA, with a
    string-based fallback) and a similarity of the annotations.

    Note:
        The base-term -> maximum-IC index is built once in ``__init__``;
        changes made to ``ic_scores`` afterwards are not reflected.
    """

    # Annotation value -> annotation type.
    _ANNOTATION_TYPE = {
        "Homo": "genotype",
        "Hetero": "genotype",
        "Hemi": "genotype",
        "Male": "sex",
        "Female": "sex",
        "Early": "life_stage",
        "Late": "life_stage",
        "Embryo": "life_stage",
        "Interval": "life_stage",
    }

    # "<base term> (<annotations>)"
    _ANNOTATED_PHENOTYPE = re.compile(r"^(.+?)\s*\(([^)]+)\)$")
    # "abnormal XXX morphology"
    _ABNORMAL_MORPHOLOGY = re.compile(r"^abnormal\s+\w+\s+morphology$")
    # "increased/decreased XXX level"
    _LEVEL_CHANGE = re.compile(r"^(increased|decreased)\s+.*\s+level$")

    # Words whose presence in both terms earns a bonus in the string score.
    _IMPORTANT_WORDS = frozenset(
        {"abnormal", "morphology", "increased", "decreased", "phenotype"}
    )

    # Similarity of two *different* annotation values.  Keys are unordered
    # pairs (frozenset) so the lookup cannot depend on argument order or on
    # how the pair happens to sort; pairs not listed score 0.0.
    _PARTIAL_SIMILARITY = {
        "genotype": {
            frozenset({"Hetero", "Homo"}): 0.1,
            frozenset({"Hemi", "Homo"}): 0.05,
            frozenset({"Hemi", "Hetero"}): 0.05,
        },
        "life_stage": {
            frozenset({"Early", "Interval"}): 0.1,
            frozenset({"Early", "Late"}): 0.02,
            frozenset({"Early", "Embryo"}): 0.02,
            frozenset({"Interval", "Late"}): 0.2,
            frozenset({"Interval", "Embryo"}): 0.02,
            frozenset({"Late", "Embryo"}): 0.02,
        },
    }

    def __init__(
        self,
        mpo_parser: "MPOParser",
        ic_scores: Dict[str, float],
        phenotype_frequencies: Dict[str, int],
    ):
        """Store the inputs and precompute the per-base-term IC index.

        Args:
            mpo_parser: Object providing ``find_lowest_common_ancestor`` and
                a ``terms`` mapping of id -> term data with a ``name``.
            ic_scores: Phenotype string -> Information Content.
            phenotype_frequencies: Phenotype string -> observation count.
        """
        self.mpo_parser = mpo_parser
        self.ic_scores = ic_scores
        self.phenotype_frequencies = phenotype_frequencies

        # Relative weight of each annotation type within the annotation score.
        self.annotation_weights = {"genotype": 0.5, "sex": 0.3, "life_stage": 0.2}

        # Weights used when combining base and annotation similarity.
        self.base_weight = 0.98
        self.annotation_weight = 0.02

        # base term -> highest IC among its annotated variants (floor 0.0).
        self._max_ic_by_base = self._build_max_ic_index()

    @classmethod
    def _base_term(cls, phenotype: str) -> str:
        """Return ``phenotype`` without its trailing ``(...)`` annotation."""
        text = phenotype.strip()
        match = cls._ANNOTATED_PHENOTYPE.match(text)
        return match.group(1).strip() if match else text

    def _build_max_ic_index(self) -> Dict[str, float]:
        """Map each base term to the largest IC of any of its variants."""
        index = {}
        for phenotype, ic in self.ic_scores.items():
            base = self._base_term(phenotype)
            if ic > index.get(base, 0.0):
                index[base] = ic
        return index

    def _max_ic(self, base_term: str) -> float:
        """Highest IC recorded for ``base_term``; 0.0 when there is none."""
        return self._max_ic_by_base.get(base_term, 0.0)

    def calculate_similarity(self, phenotype_a: str, phenotype_b: str) -> float:
        """Compute the extended similarity of two annotated phenotypes.

        Identical strings return their own IC (0.0 if unknown).  Otherwise
        the result is a weighted sum of base-term and annotation similarity,
        where the annotation weight shrinks as the base similarity drops.
        """
        # Exact match: the phenotype's own IC.
        if phenotype_a == phenotype_b:
            return self.ic_scores.get(phenotype_a, 0.0)

        # 1. Separate base term and annotations.
        base_a, annotations_a = self.parse_phenotype_with_annotations(phenotype_a)
        base_b, annotations_b = self.parse_phenotype_with_annotations(phenotype_b)

        # 2. Similarity of the base terms.
        base_similarity = self.calculate_base_resnik_similarity(base_a, base_b)

        # 3. Similarity of the annotations.
        annotation_similarity = self.calculate_annotation_similarity(
            annotations_a, annotations_b
        )

        # 4. Combine: the less similar the base terms, the less the
        #    annotations are allowed to contribute.
        if base_similarity < 0.01:
            return 0.999 * base_similarity + 0.001 * annotation_similarity
        if base_similarity < 0.1:
            return 0.99 * base_similarity + 0.01 * annotation_similarity
        return (
            self.base_weight * base_similarity
            + self.annotation_weight * annotation_similarity
        )

    def parse_phenotype_with_annotations(self, phenotype: str) -> Tuple[str, Dict]:
        """Split a phenotype string into base term and annotation dict.

        Returns:
            ``(base_term, annotations)`` where ``annotations`` may contain
            the keys ``genotype``, ``sex`` and ``life_stage``.  Unrecognised
            annotation values are ignored; a string without a trailing
            ``(...)`` yields an empty dict.
        """
        match = self._ANNOTATED_PHENOTYPE.match(phenotype.strip())
        if not match:
            return phenotype.strip(), {}

        base_term = match.group(1).strip()

        # A single annotation is just a one-element comma-separated list.
        annotations = {}
        for part in match.group(2).strip().split(","):
            value = part.strip()
            annotation_type = self._ANNOTATION_TYPE.get(value)
            if annotation_type is not None:
                annotations[annotation_type] = value

        return base_term, annotations

    def calculate_base_resnik_similarity(self, term_a: str, term_b: str) -> float:
        """Similarity of two base terms.

        Identical terms score the highest IC recorded for that term.
        Otherwise the score is the larger of the ontology similarity (highest
        IC recorded for the name of the lowest common ancestor) and the
        string similarity; the string similarity alone is used when the
        ontology gives no positive score.
        """
        if term_a == term_b:
            return self._max_ic(term_a)

        # Fallback score, used when the ontology cannot help.
        string_similarity = self.calculate_string_similarity(term_a, term_b)

        # IC of the lowest common ancestor in the ontology, if there is one.
        lca_id = self.mpo_parser.find_lowest_common_ancestor(term_a, term_b)
        ontology_similarity = 0.0
        if lca_id is not None:
            lca_name = self.mpo_parser.terms[lca_id]["name"]
            ontology_similarity = self._max_ic(lca_name)

        if ontology_similarity > 0:
            return max(ontology_similarity, string_similarity)
        return string_similarity

    def calculate_string_similarity(self, term_a: str, term_b: str) -> float:
        """String-based fallback similarity of two base terms.

        Returns 1.0 when both terms follow the same known naming pattern.
        Otherwise: word-level Jaccard index, plus 0.3 per shared important
        word, scaled by the mean of the two terms' maximum IC divided by 10
        (factor capped at 1.0, and 0 unless both ICs are positive); the
        result is capped at 3.0.
        """
        # Same naming pattern, e.g. "abnormal XXX morphology": fixed score.
        if self.has_common_morphology_pattern(term_a, term_b):
            return 1.0

        words_a = set(term_a.lower().split())
        words_b = set(term_b.lower().split())
        if not words_a or not words_b:
            return 0.0

        common_words = words_a & words_b
        jaccard_sim = len(common_words) / len(words_a | words_b)

        # Bonus for shared important words (abnormal, morphology, ...).
        jaccard_sim += len(common_words & self._IMPORTANT_WORDS) * 0.3

        # Scale by the terms' IC so that uninformative terms score low.
        max_ic_a = self._max_ic(term_a)
        max_ic_b = self._max_ic(term_b)
        avg_ic = (max_ic_a + max_ic_b) / 2 if (max_ic_a > 0 and max_ic_b > 0) else 0
        ic_factor = min(avg_ic / 10.0, 1.0)

        return min(jaccard_sim * ic_factor, 3.0)

    def has_common_morphology_pattern(self, term_a: str, term_b: str) -> bool:
        """Tell whether both terms follow the same known naming pattern.

        Recognised patterns: ``abnormal <word> morphology`` and
        ``increased|decreased ... level``.
        """
        for pattern in (self._ABNORMAL_MORPHOLOGY, self._LEVEL_CHANGE):
            if pattern.match(term_a) and pattern.match(term_b):
                return True
        return False

    def calculate_annotation_similarity(
        self, annotations_a: Dict, annotations_b: Dict
    ) -> float:
        """Weighted similarity of two annotation dicts, in the range 0..1.

        Only annotation types present on at least one side count.  Equal
        values score 1.0, different values score their partial similarity,
        and a type present on one side only scores 0.01.  When neither side
        has any annotation the result is 1.0.
        """
        total_weight = 0
        similarity_sum = 0

        for annotation_type in ("genotype", "sex", "life_stage"):
            in_a = annotation_type in annotations_a
            in_b = annotation_type in annotations_b
            if not (in_a or in_b):
                continue

            weight = self.annotation_weights[annotation_type]
            total_weight += weight

            if in_a and in_b:
                value_a = annotations_a[annotation_type]
                value_b = annotations_b[annotation_type]
                if value_a == value_b:
                    similarity_sum += weight * 1.0  # exact match
                else:
                    similarity_sum += weight * self.get_annotation_partial_similarity(
                        annotation_type, value_a, value_b
                    )
            else:
                # Annotated on one side only: strong penalty.
                similarity_sum += weight * 0.01

        if total_weight == 0:
            return 1.0  # neither side carries annotations

        return similarity_sum / total_weight

    def get_annotation_partial_similarity(
        self, annotation_type: str, value_a: str, value_b: str
    ) -> float:
        """Similarity of two different values of one annotation type.

        The lookup is symmetric in ``value_a``/``value_b``.  Sexes, unknown
        annotation types and unlisted pairs score 0.0.
        """
        table = self._PARTIAL_SIMILARITY.get(annotation_type, {})
        return table.get(frozenset((value_a, value_b)), 0.0)

## 5. テスト実行

In [ ]:
# Re-create the calculator from the improved ExtendedResnikSimilarity class
extended_resnik = ExtendedResnikSimilarity(mpo_parser, ic_scores, phenotype_frequencies)

# Pairs that used to produce questionable scores
problem_pairs = [
    ("preweaning lethality, complete penetrance (Homo, Early)", "abnormal heart morphology (Homo, Early)"),
    ("abnormal skin morphology (Homo, Early)", "abnormal kidney morphology (Homo, Early)"),
    ("abnormal skin morphology (Homo, Early)", "abnormal spleen morphology (Homo, Early)"),
]

print("改良版 Extended Resnik Similarity テスト:")
print("=" * 70)

for phenotype_a, phenotype_b in problem_pairs:
    # Split both phenotypes into base term and annotations
    base_a, ann_a = extended_resnik.parse_phenotype_with_annotations(phenotype_a)
    base_b, ann_b = extended_resnik.parse_phenotype_with_annotations(phenotype_b)

    # Individual components of the score
    base_sim = extended_resnik.calculate_base_resnik_similarity(base_a, base_b)
    string_sim = extended_resnik.calculate_string_similarity(base_a, base_b)
    ann_sim = extended_resnik.calculate_annotation_similarity(ann_a, ann_b)

    # Final combined score
    similarity = extended_resnik.calculate_similarity(phenotype_a, phenotype_b)

    print(f"\n📍 {phenotype_a}")
    print("   vs")
    print(f"   {phenotype_b}")
    print("   📊 結果:")
    print(f"      基本類似度: {base_sim:.6f}")
    print(f"      文字列類似度: {string_sim:.6f}")
    print(f"      アノテーション類似度: {ann_sim:.6f}")
    print(f"      最終類似度: {similarity:.6f}")

    # Shared naming pattern check
    pattern_label = 'Yes' if extended_resnik.has_common_morphology_pattern(base_a, base_b) else 'No'
    print(f"      共通パターン: {pattern_label}")

    # Spell out which weighting regime applies to this base similarity
    if base_sim < 0.01:
        expected_sim = 0.999 * base_sim + 0.001 * ann_sim
        print(f"      計算式: 0.999 × {base_sim:.6f} + 0.001 × {ann_sim:.6f} = {expected_sim:.6f} (極限削減)")
    elif base_sim < 0.1:
        expected_sim = 0.99 * base_sim + 0.01 * ann_sim
        print(f"      計算式: 0.99 × {base_sim:.6f} + 0.01 × {ann_sim:.6f} = {expected_sim:.6f} (強削減)")
    else:
        expected_sim = 0.98 * base_sim + 0.02 * ann_sim
        print(f"      計算式: 0.98 × {base_sim:.6f} + 0.02 × {ann_sim:.6f} = {expected_sim:.6f} (通常)")

    # Observation counts of the two annotated phenotypes
    freq_a = phenotype_frequencies.get(phenotype_a, 0)
    freq_b = phenotype_frequencies.get(phenotype_b, 0)
    print(f"      頻度: {freq_a} vs {freq_b}")

    # Exact-match Jaccard score, for reference
    jaccard_sim = 1.0 if phenotype_a == phenotype_b else 0.0
    print(f"      Jaccard参考: {jaccard_sim:.6f}")

    # Short interpretation of the pair
    if phenotype_a != phenotype_b:
        both_morphology = all(
            'abnormal' in base and 'morphology' in base for base in (base_a, base_b)
        )
        if both_morphology:
            print("      💡 morphology系の表現型ペア -> 類似度向上期待")
        elif 'preweaning lethality' in base_a or 'preweaning lethality' in base_b:
            print("      💡 preweaning lethality vs 他 -> 極低類似度期待")

# Closing notes: what changed and what the scores should look like
for note in (
    "\n⚙️ 改良点:",
    "   - 文字列類似度による補完機能を追加",
    "   - 共通パターン検出機能を追加",
    "   - abnormal XXX morphology パターンを特別扱い",
    "   - オントロジーが利用できない場合の代替手段を提供",
    "\n🎯 期待される結果:",
    "   1. preweaning lethality vs abnormal heart morphology: ~0.001 (極低)",
    "   2. abnormal skin morphology vs abnormal kidney morphology: >0.1 (中程度)",
    "   3. 共通パターンを持つ表現型間での適切な類似度向上",
):
    print(note)

## 6. 性能比較：Jaccard vs Resnik

In [None]:
# Phenotype pairs used to compare Jaccard with Extended Resnik (revised set)
comparison_pairs = [
    ("preweaning lethality, complete penetrance (Homo, Early)", "abnormal heart morphology (Homo, Early)"),
    ("abnormal skin morphology (Homo, Early)", "abnormal kidney morphology (Homo, Early)"),
    ("abnormal spleen morphology (Homo, Early)", "abnormal kidney morphology (Homo, Early)"),
]

def jaccard_similarity(phenotype_a: str, phenotype_b: str) -> float:
    """Reference score: 1.0 for identical phenotype strings, otherwise 0.0."""
    return 1.0 if phenotype_a == phenotype_b else 0.0

print("\n修正後の比較: Jaccard vs Extended Resnik Similarity")
print("=" * 70)

for phenotype_a, phenotype_b in comparison_pairs:
    jaccard_sim = jaccard_similarity(phenotype_a, phenotype_b)
    resnik_sim = extended_resnik.calculate_similarity(phenotype_a, phenotype_b)

    print(f"\n📍 {phenotype_a}")
    print("   vs")
    print(f"   {phenotype_b}")
    print(f"   📊 Jaccard: {jaccard_sim:.4f}  |  Extended Resnik: {resnik_sim:.4f}")

    # Break the score down into base-term and annotation parts
    base_a, ann_a = extended_resnik.parse_phenotype_with_annotations(phenotype_a)
    base_b, ann_b = extended_resnik.parse_phenotype_with_annotations(phenotype_b)
    base_sim = extended_resnik.calculate_base_resnik_similarity(base_a, base_b)
    ann_sim = extended_resnik.calculate_annotation_similarity(ann_a, ann_b)

    print("   🔍 詳細:")
    print(f"      基本表現型: '{base_a}' vs '{base_b}' (類似度: {base_sim:.4f})")
    print(f"      アノテーション: {ann_a} vs {ann_b} (類似度: {ann_sim:.4f})")

    # Observation counts and IC of both annotated phenotypes
    freq_a, freq_b = (phenotype_frequencies.get(p, 0) for p in (phenotype_a, phenotype_b))
    ic_a, ic_b = (ic_scores.get(p, 0) for p in (phenotype_a, phenotype_b))
    print(f"      頻度: {freq_a} vs {freq_b}")
    print(f"      IC scores: {ic_a:.3f} vs {ic_b:.3f}")

    # Verdict: pairs with Jaccard 0 should also score low here
    if jaccard_sim == 0.0:
        if resnik_sim < 0.1:
            print(f"      ✅ 改善成功: 非関連表現型の類似度が適切に低い ({resnik_sim:.4f})")
        elif resnik_sim >= 0.1:
            print(f"      ⚠️  要改善: 非関連表現型の類似度がまだ高い ({resnik_sim:.4f})")

# Summary of the advantages
for advantage in (
    "\n🎯 Extended Resnik Similarity の利点:",
    "   1. 頻出表現型の偏りを軽減",
    "   2. オントロジー階層を考慮した意味的類似度",
    "   3. アノテーション情報を適切に考慮",
    "   4. 非関連表現型間の類似度を適切に低く抑制",
):
    print(advantage)

## 7. データ保存

In [None]:
# Persist the computed results
output_dir = Path("data/resnik_similarity")
# parents=True: also create missing parent directories instead of raising.
output_dir.mkdir(parents=True, exist_ok=True)

# NOTE: pickle stores instances by reference to their class, so the classes
# defined in this notebook must be available again when these files are
# loaded.  Only unpickle files from a trusted source.

# 1. MPO parser
with open(output_dir / "mpo_parser.pkl", "wb") as f:
    pickle.dump(mpo_parser, f)

# 2. IC scores
with open(output_dir / "ic_scores.json", "w", encoding="utf-8") as f:
    json.dump(ic_scores, f, indent=2)

# 3. Phenotype frequencies
with open(output_dir / "phenotype_frequencies.json", "w", encoding="utf-8") as f:
    json.dump(phenotype_frequencies, f, indent=2)

# 4. ExtendedResnikSimilarity instance
with open(output_dir / "extended_resnik_similarity.pkl", "wb") as f:
    pickle.dump(extended_resnik, f)

print(f"Resnik similarity data saved to {output_dir}/")
print("Files saved:")
# glob() yields entries in arbitrary order; sort for a stable listing.
for file_path in sorted(output_dir.glob("*")):
    print(f"  - {file_path.name}")

In [None]:
# Final summary of what this notebook produced
summary_lines = (
    "✅ Extended Resnik Similarity implementation completed!",
    "\nSummary:",
    f"- Parsed {len(mpo_parser.terms)} MPO terms",
    f"- Calculated IC scores for {len(ic_scores)} phenotypes",
    "- Implemented annotation-aware similarity calculation",
    "- Results saved to data/resnik_similarity/",
)
for summary_line in summary_lines:
    print(summary_line)