# 情報量計算において、親 < 子 のスコアとなっているか確認

- Issue: #114

# セットアップ

In [None]:
# Move up to the repository root (the nearest ancestor directory that contains LICENSE)
import os
from pathlib import Path

print(os.getcwd())

# Locate the root before changing directory, so a failed search leaves the
# working directory untouched.
_start_dir = Path.cwd()
_repo_root = next(
    (d for d in (_start_dir, *_start_dir.parents) if (d / "LICENSE").exists()),
    None,
)
if _repo_root is None:
    # Repeatedly changing to "../" would spin forever at the filesystem root,
    # because the root is its own parent.
    raise FileNotFoundError(
        f"No LICENSE file found in {_start_dir} or any of its parent directories"
    )
os.chdir(_repo_root)

print(os.getcwd())

In [None]:
from pathlib import Path
from pprint import pprint
from collections import Counter

import pandas as pd

# Single-letter shorthands for print, pprint and Counter.
P, PP, C = print, pprint, Counter

# 実験

In [None]:
import obonet
import json
# Read the MP ontology with obonet.
graph = obonet.read_obo("data/ontology/mp.obo")

# Load the annotation mapping; its values are iterated as lists of MP term
# names in a later cell (keys are presumably gene symbols — verify).
# The context manager closes the file handle, and the encoding is pinned so
# the result does not depend on the platform's locale.
with open("data/annotation/symbol_mptermname.json", encoding="utf-8") as f:
    symbol_mptermname = json.load(f)

In [None]:
def parse_obo_file(file_path: str) -> dict[str, dict]:
    """Parse an OBO file and collect its non-obsolete ``[Term]`` stanzas.

    A stanza is considered finished when a blank line follows its ``id``,
    when the next stanza header (``[Term]``, ``[Typedef]``, ...) starts, or
    when the file ends. Handling the last two cases keeps the final term of
    a file without a trailing blank line, and terms that are directly
    followed by another stanza header.

    Parameters:
        file_path (str): Path to the OBO file (read as UTF-8).

    Returns:
        dict: term id -> term info. Each term info dict holds ``id`` and,
        when present in the file, ``name``, ``is_a`` (list of parent ids
        with the trailing ``! comment`` removed) and ``is_obsolete``.
        Terms flagged ``is_obsolete: true`` and stanzas without an ``id``
        are left out.
    """
    terms = {}
    current_term = None

    def _store(term):
        """Record a finished stanza unless it has no id or is obsolete."""
        if term and "id" in term and not term.get("is_obsolete", False):
            terms[term["id"]] = term

    with open(file_path, "r", encoding="utf-8") as f:
        for raw_line in f:
            line = raw_line.strip()

            # Any stanza header closes the stanza that was being read.
            if line.startswith("[") and line.endswith("]"):
                _store(current_term)
                current_term = {} if line == "[Term]" else None
                continue

            # Outside a [Term] stanza (file header, [Typedef], ...): skip.
            if current_term is None:
                continue

            if not line:
                # A blank line ends the stanza only once its id has been read.
                if "id" in current_term:
                    _store(current_term)
                    current_term = None
                continue

            key, sep, value = line.partition(":")
            if not sep:
                continue
            key = key.strip()
            value = value.strip()

            if key == "id":
                current_term["id"] = value
            elif key == "name":
                current_term["name"] = value
            elif key == "is_a":
                # "MP:0000001 ! mammalian phenotype" -> "MP:0000001"
                parent_id = value.split("!")[0].strip()
                current_term.setdefault("is_a", []).append(parent_id)
            elif key == "is_obsolete":
                current_term["is_obsolete"] = value.lower() == "true"

    # The file may end without a trailing blank line.
    _store(current_term)

    return terms

In [None]:
# Parse the MP ontology with parse_obo_file and preview the first three
# (term id, term info) pairs.
obo = parse_obo_file("data/ontology/mp.obo")
print(list(obo.items())[:3])

In [None]:
from collections import defaultdict

# Tally how often each MP term name occurs across all annotation lists.
direct_count = {}
for annotated_names in symbol_mptermname.values():
    for annotated_name in annotated_names:
        # Keep only the text before the first "(" and trim the whitespace.
        base_name = annotated_name.partition("(")[0].strip()
        direct_count[base_name] = direct_count.get(base_name, 0) + 1

print(list(direct_count.items())[:3])

In [None]:
# Spot-check the direct counts of three terms. The first two fall back to 0
# when the term has no direct annotation; the last lookup raises KeyError if
# "small testis" is missing.
print(direct_count.get("mammalian phenotype", 0))
print(direct_count.get("abnormal testis morphology", 0))
print(direct_count["small testis"])

In [None]:
from collections import defaultdict, deque

def propagate_frequencies(obo: dict, direct_count: dict) -> dict:
    """
    Propagate frequencies from child terms to parent terms using the 'is_a' hierarchy in obo.

    Each direct count is added to the term itself and to every distinct
    ancestor reachable through 'is_a'. A term's cumulative count is therefore
    its own direct count plus the direct counts of all of its descendants,
    each counted once even when a descendant reaches the term through several
    parents. As a consequence a parent's count is never smaller than any of
    its children's counts.

    Parameters:
        obo (dict): Parsed OBO dictionary (term_id → term_info). Parent ids
            listed in 'is_a' that are not keys of obo are ignored.
        direct_count (dict): Direct annotation frequencies (term_name → count).
            Names that do not match any term in obo are ignored.

    Returns:
        dict: propagated_count (term_name → cumulative count). Every named
        term of obo is present; terms without any annotation map to 0.
    """
    # Reverse lookup: term name -> term id (terms without a name are skipped).
    name_to_id = {
        info["name"]: term_id for term_id, info in obo.items() if "name" in info
    }

    # Counting is done per term id so that it follows the 'is_a' links.
    cumulative = {term_id: 0 for term_id in obo}

    for name, count in direct_count.items():
        start_id = name_to_id.get(name)
        if start_id is None:
            continue  # MP name that is not registered in the ontology

        # Walk upwards from the annotated term. The visited set guarantees
        # that each ancestor receives this count exactly once, and it also
        # protects against accidental cycles.
        visited = {start_id}
        pending = [start_id]
        while pending:
            term_id = pending.pop()
            cumulative[term_id] += count
            for parent_id in obo[term_id].get("is_a", []):
                if parent_id in obo and parent_id not in visited:
                    visited.add(parent_id)
                    pending.append(parent_id)

    # Convert term ids back to names for the caller.
    return {
        info["name"]: cumulative[term_id]
        for term_id, info in obo.items()
        if "name" in info
    }


In [None]:
# Roll the direct per-term counts up the is_a hierarchy; the result is keyed by term name.
propagated_count = propagate_frequencies(obo, direct_count)

In [None]:
# Spot-check the propagated counts for four terms (presumably an
# ancestor-to-descendant chain — verify against mp.obo). The first three
# fall back to 0 when absent; the last lookup raises KeyError if
# "small testis" is missing.
print(propagated_count.get("mammalian phenotype", 0))
print(propagated_count.get("abnormal testis morphology", 0))
print(propagated_count.get("abnormal testis size", 0))
print(propagated_count["small testis"])

In [None]:
def check_propagation_monotonicity(obo: dict, propagated_count: dict) -> list[tuple[str, str, int, int]]:
    """
    Find parent-child pairs whose propagated counts are not monotonic.

    For every 'is_a' link in obo, the parent's propagated count is expected
    to be at least the child's. Links where either end has no entry in
    propagated_count (or the parent id is unknown to obo) are skipped.

    Parameters:
        obo (dict): Parsed OBO dictionary (term_id → term_info).
        propagated_count (dict): Term name → propagated count.

    Returns:
        List of violations, one tuple per offending link:
        [(parent_name, child_name, parent_count, child_count), ...]
    """
    names_by_id = {term["id"]: term["name"] for term in obo.values()}
    offending = []

    for term_id, term in obo.items():
        child = names_by_id.get(term_id)
        if child in propagated_count:
            n_child = propagated_count[child]

            for pid in term.get("is_a", []):
                parent = names_by_id.get(pid)
                # Only links whose parent is known and counted can be compared.
                if parent is not None and parent in propagated_count:
                    n_parent = propagated_count[parent]
                    if n_parent < n_child:
                        offending.append((parent, child, n_parent, n_child))

    return offending


# Example usage: collect every is_a link whose parent count is below the child count
violations = check_propagation_monotonicity(obo, propagated_count)

# Display the violations as a table with pandas
import pandas as pd
violations_df = pd.DataFrame(violations, columns=["Parent", "Child", "Parent_Count", "Child_Count"])
print(violations_df)

In [None]:
# Drop rows whose |effect_size| is exactly 1 (original note: excludes results
# that were evaluated as binary).
# NOTE(review): df_impc is not defined in any cell visible here — confirm where it is loaded.
df_impc_filtered = df_impc[df_impc["effect_size"].abs() != 1]

In [None]:
# Plot the distribution of absolute effect size values (NaNs dropped, 1000 bins)
df_impc_filtered["effect_size"].abs().dropna().plot.hist(bins=1000)

In [None]:
# Same histogram restricted to rows with |effect_size| < 20, using 100 bins
df_impc_filtered[df_impc_filtered["effect_size"].abs() < 20]["effect_size"].abs().dropna().plot.hist(bins=100)


In [None]:
# Offset effect_size by a small epsilon (1e-6) ahead of the log transform in the next cell.
# NOTE(review): unlike the histograms above, no .abs() is applied here, so
# negative effect sizes stay negative — confirm this is intended.
df_impc_filtered_eps = df_impc_filtered["effect_size"] + 1e-6


In [None]:
# Plot the histogram after a log10 transform
import numpy as np

# np.log10 is applied to the whole Series at once instead of element by
# element. Negative inputs yield NaN, which dropna() removes before plotting.
np.log10(df_impc_filtered_eps[df_impc_filtered_eps.abs() < 20]).dropna().plot.hist(bins=100)

- 効果量は対数変換したほうが良さそう

## 表現型類似度

In [None]:
# Load the TSUMUGI v1.0.0 raw data from Parquet; the cells below read its "Phenodigm Score" column.
df_similarity = pd.read_parquet("data/TSUMUGI_RawData/TSUMUGI_v1.0.0_raw_data.parquet")

In [None]:
# Histogram of all Phenodigm scores (100 bins)
df_similarity["Phenodigm Score"].plot.hist(bins=100)

In [None]:
# Histogram of the strictly positive Phenodigm scores only (100 bins)
df_similarity[df_similarity["Phenodigm Score"] > 0]["Phenodigm Score"].plot.hist(bins=100)

In [None]:
# Histogram of log10 of the strictly positive Phenodigm scores; np.log10 is
# applied to the whole Series at once instead of element by element.
np.log10(df_similarity[df_similarity["Phenodigm Score"] > 0]["Phenodigm Score"]).dropna().plot.hist(bins=100)

In [None]:
# Transform the Phenodigm score with log1p, i.e. log(1 + x)
pheno_log = np.log1p(df_similarity["Phenodigm Score"])
# Plot only the strictly positive transformed values (100 bins)
pheno_log[pheno_log > 0].plot.hist(bins=100)

* 表現型類似度も対数変換すると良さそう