# Run allですべてのデータを準備する

* URL: https://ftp.ebi.ac.uk/pub/databases/impc/all-data-releases/

In [None]:
# Version tag embedded in the names of the exported raw-data files.
TSUMUGI_VERSION = "1.0.0"
# IMPC data release; interpolated into the download URL and local file names.
IMPC_RELEASE = 23.0

# Columns kept from the IMPC statistical-results table.
columns = [
    "marker_symbol",
    "marker_accession_id",
    "mp_term_name",
    "mp_term_id",
    "p_value",
    "effect_size",
    "female_ko_effect_p_value",
    "male_ko_effect_p_value",
    "female_ko_parameter_estimate",
    "sex_effect_p_value",
    "male_ko_parameter_estimate",  # sex differences
    "genotype_effect_p_value",
    "genotype_effect_parameter_estimate",
    "zygosity",  # zygosity
    "pipeline_name",
    "procedure_name",  # life-stage
    "allele_symbol",  # map to Phenodigm
]

In [None]:
P = print
from pathlib import Path
from itertools import combinations
import os
import numpy as np
import pandas as pd
import shutil
import pickle
import json
import gzip
import urllib.request
from tqdm import tqdm
import hashlib

In [None]:
# Move up to the repository root (the directory that contains LICENSE).

print(os.getcwd())

# Walk from the current directory towards the filesystem root. The candidate
# list is finite, so a missing LICENSE raises instead of looping forever once
# "../" stops changing the working directory at the filesystem root.
_start_dir = Path.cwd()
for _candidate in (_start_dir, *_start_dir.parents):
    if (_candidate / "LICENSE").exists():
        os.chdir(_candidate)
        break
else:
    raise FileNotFoundError(
        f"No LICENSE file found in {_start_dir} or any of its parent directories."
    )

print(os.getcwd())

## 1. Download IMPC dataset

In [None]:
# Stop early with download instructions when the Phenodigm export is missing.

phenodigm_csv = Path("data") / "phenodigm" / "impc_phenodigm.csv"
if not phenodigm_csv.exists():
    message = "Please download impc phenodigm data from https://diseasemodels.research.its.qmul.ac.uk/."
    raise FileNotFoundError(message)

In [None]:
# Paths
data_dir = Path("data/impc")
data_dir.mkdir(parents=True, exist_ok=True)
csv_path = data_dir / f"statistical-results-ALL-{IMPC_RELEASE}.csv"

# Download and decompress only when the CSV is not present yet
if not csv_path.exists():
    # Download URL
    url = f"https://ftp.ebi.ac.uk/pub/databases/impc/all-data-releases/release-{IMPC_RELEASE}/results/statistical-results-ALL.csv.gz"

    print(f"Downloading and extracting: {url}")

    # Stream into a temporary sibling file and rename it only after the whole
    # archive has been decompressed. An interrupted run then never leaves a
    # truncated CSV that the exists() check above would accept as complete.
    tmp_path = csv_path.with_name(csv_path.name + ".part")
    try:
        with urllib.request.urlopen(url) as response:
            # Content-Length drives the progress bar; None means "unknown".
            content_length = response.info().get("Content-Length")
            total_size = int(content_length) if content_length is not None else None
            with tqdm.wrapattr(
                response,
                "read",
                total=total_size,
                desc="Downloading",
                unit="B",
                unit_scale=True,
            ) as r:
                with gzip.GzipFile(fileobj=r) as uncompressed:
                    with open(tmp_path, "wb") as out_file:
                        shutil.copyfileobj(uncompressed, out_file)
        tmp_path.replace(csv_path)
    finally:
        # No-op after a successful rename; removes the partial file otherwise.
        tmp_path.unlink(missing_ok=True)

In [None]:
%%bash

# wc -l data/impc/statistical-results*.csv
# Release 22.1: 3165335
# Release 23.0: 2159931

# 1 min

## 2. Filter dataset by P value < 0.0001 (10^-4)


In [None]:
if not Path("data", f"statistical_filtered-{IMPC_RELEASE}.csv").exists():
    # NOTE: despite its name, this is the path of the *unfiltered* release CSV.
    path_df_statistical_filtered = Path(
        "data", "impc", f"statistical-results-ALL-{IMPC_RELEASE}.csv"
    )
    # Parse only the columns that are kept instead of the whole table.
    # usecols follows the file's column order, so the [columns] selection
    # restores the order of the ``columns`` list.
    df_statistical_all = pd.read_csv(path_df_statistical_filtered, usecols=columns)
    df_statistical_all = df_statistical_all[columns]

    # Keep rows whose overall, female-KO or male-KO p-value is below 0.0001
    threshold = 0.0001
    filter_pvalue = df_statistical_all["p_value"] < threshold
    filter_female_ko_pvalue = df_statistical_all["female_ko_effect_p_value"] < threshold
    filter_male_ko_pvalue = df_statistical_all["male_ko_effect_p_value"] < threshold

    df_statistical_filtered = df_statistical_all[
        filter_pvalue | filter_female_ko_pvalue | filter_male_ko_pvalue
    ]

    # Drop rows lacking an MP term ID, an MP term name or an effect size
    df_statistical_filtered = df_statistical_filtered.dropna(
        subset=["mp_term_id", "mp_term_name", "effect_size"]
    )
    df_statistical_filtered.to_csv(
        f"data/statistical_filtered-{IMPC_RELEASE}.csv", index=False
    )  # 2 sec

# 30 seconds

In [None]:
df_statistical_filtered = pd.read_csv(f"data/statistical_filtered-{IMPC_RELEASE}.csv")

In [None]:
print(len(df_statistical_filtered))
# Release 22.0: 54059 rows
# Release 22.1: 54059 rows
# Release 23.0: 49299 rows

## Split data by mp_term_name

In [None]:
df_statistical_filtered = pd.read_csv(f"data/statistical_filtered-{IMPC_RELEASE}.csv")

In [None]:
# Recreate data/mp_term_name from scratch: any existing directory is removed
# first, so files left over from a previous run do not survive.

output_path = Path("data", "mp_term_name")
if output_path.exists():
    shutil.rmtree(output_path)
output_path.mkdir(parents=True, exist_ok=True)

In [None]:
# Helper that turns an MP term name into a file-name friendly string
def clean_name(name):
    """Return *name* with every "/" and every " " replaced by "_"."""
    return name.translate(str.maketrans({"/": "_", " ": "_"}))


# mp_term_nameをクリーニングし、ユニークな値を取得
unique_mp_term_names = df_statistical_filtered["mp_term_name"].unique()

In [None]:
# Write one CSV per MP term name.
# groupby splits the frame in a single pass instead of re-scanning every row
# once per unique term; sort=False keeps the first-appearance order of terms.
# Two terms that clean to the same file name still overwrite each other.
for mp_term_name, df_mp_term in df_statistical_filtered.groupby(
    "mp_term_name", sort=False
):
    clean_mp_term_name = clean_name(mp_term_name)
    df_mp_term.to_csv(f"data/mp_term_name/{clean_mp_term_name}.csv", index=False)

## 3. TSUMUGIに必要なアノテーション情報を整理する

In [None]:
df_statistical_filtered = pd.read_csv(f"data/statistical_filtered-{IMPC_RELEASE}.csv")

### Annotate life stages

In [None]:
# Initial life-stage assignment derived from the pipeline name
def assign_life_stage(pipeline_name):
    """Map a pipeline name to "Interval", "Late" or "Early".

    Missing names, and names containing neither keyword, yield "Early".
    "Interval"/"interval" is checked before "Late"/"late"; only these two
    spellings of each keyword are recognised.
    """
    if pd.isna(pipeline_name):
        return "Early"
    for stage in ("Interval", "Late"):
        if stage in pipeline_name or stage.lower() in pipeline_name:
            return stage
    return "Early"


# Derive the initial life stage of every row from its pipeline name.
df_statistical_filtered["life_stage"] = df_statistical_filtered["pipeline_name"].apply(
    assign_life_stage
)

# procedure_name values that correspond to embryo phenotyping
embryo_phenotyping = [
    "Gross Morphology Embryo E9.5",
    "Viability E9.5 Secondary Screen",
    "OPT E9.5",
    "MicroCT E9.5",
    "Gross Morphology Placenta E9.5",
    "Gross Morphology Embryo E12.5",
    "Embryo LacZ",
    "Gross Morphology Placenta E12.5",
    "Viability E12.5 Secondary Screen",
    "Viability E14.5-E15.5 Secondary Screen",
    "Gross Morphology Placenta E14.5-E15.5",
    "MicroCT E14.5-E15.5",
    "Gross Morphology Embryo E14.5-E15.5",
    "Viability E18.5 Secondary Screen",
    "MicroCT E18.5",
    "Gross Morphology Embryo E18.5",
    "Gross Morphology Placenta E18.5",
]

# Overwrite life_stage with "Embryo" for rows measured by one of those
# procedures, whatever the pipeline name suggested.
df_statistical_filtered.loc[
    df_statistical_filtered["procedure_name"].isin(embryo_phenotyping), "life_stage"
] = "Embryo"
df_annotated = df_statistical_filtered.reset_index(drop=True)

In [None]:
print(len(df_annotated))
print(df_annotated["life_stage"].value_counts())
# 54059
# life_stage
# Early       45724
# Embryo       4253
# Late         4024
# Interval       58
# Name: count, dtype: int64

### Annotate Sex differences

In [None]:
threshold = 0.0001

# Conditions, in the same order as ``choices`` below:
#   1. sex effect significant, female KO effect significant, male KO effect not
#   2. sex effect significant, male KO effect significant, female KO effect not
# NOTE(review): "not significant" is tested with a strict ">" comparison, so a
# row whose opposite-sex p-value is NaN or exactly equal to the threshold
# matches neither condition and stays None - confirm this is intended.
conditions = [
    (df_annotated["sex_effect_p_value"] < threshold)
    & (df_annotated["female_ko_effect_p_value"] < threshold)
    & (df_annotated["male_ko_effect_p_value"] > threshold),
    (df_annotated["sex_effect_p_value"] < threshold)
    & (df_annotated["male_ko_effect_p_value"] < threshold)
    & (df_annotated["female_ko_effect_p_value"] > threshold),
]

# Value assigned for each condition
choices = ["female", "male"]

# Fill the column with np.select; rows matching no condition get None
df_annotated["sexdual_dimorphism"] = np.select(conditions, choices, default=None)
df_annotated = df_annotated.reset_index(drop=True)

# Check the result
print(IMPC_RELEASE)
print(df_annotated["sexdual_dimorphism"].value_counts())

# RELEASE 22.1
# male      4915
# female    4146

# RELEASE 23.0
# male      5026
# female    4344

In [None]:
# 確認
df_annotated.dropna(subset=["sexdual_dimorphism"])[
    ["p_value", "sexdual_dimorphism", "effect_size"]
].head(10)

### 遺伝型、性差、ライフステージのアノテーションを統合する

In [None]:
print(df_annotated["zygosity"].value_counts())
# RELEASE 22.1
# zygosity
# homozygote      41444
# heterozygote    11921
# hemizygote        694

# RELEASE 23.0
# homozygote      37820
# heterozygote    10896
# hemizygote        583

In [None]:
# Build the annotation label for one row
def make_annotation(row) -> list[str]:
    """Return a one-element list holding "<mp_term_name> (<tags>)".

    The comma-separated tags are, in order:
    - zygosity: "Homo" for homozygote, "Hetero" for heterozygote, otherwise "Hemi";
    - sex: "Female" or "Male" when ``sexdual_dimorphism`` is "female"/"male";
    - life stage: included only when it is Embryo, Early, Interval or Late.
    """
    zygosity_labels = {"homozygote": "Homo", "heterozygote": "Hetero"}
    sex_labels = {"female": "Female", "male": "Male"}

    # Genotype: anything that is not homo-/heterozygote is labelled "Hemi"
    tags = [zygosity_labels.get(row["zygosity"], "Hemi")]

    # Sex
    sex_label = sex_labels.get(row["sexdual_dimorphism"])
    if sex_label is not None:
        tags.append(sex_label)

    # Life stage
    life_stage = row["life_stage"]
    if life_stage in {"Embryo", "Early", "Interval", "Late"}:
        tags.append(f"{life_stage}")

    return [f"{row['mp_term_name']} ({', '.join(tags)})"]


# One annotation list per row (make_annotation returns a list) ...
df_annotated["annotation"] = df_annotated.apply(make_annotation, axis=1)

# ... flattened so that every row carries a single annotation string.
df_exploded = df_annotated.explode("annotation").reset_index(drop=True)

# Per marker_symbol: de-duplicate the annotations and sort them into a list
marker_annotation_map = df_exploded.groupby("marker_symbol")["annotation"].apply(
    lambda x: sorted(set(x))
)

In [None]:
# 例：Rhdの注釈を表示
print(marker_annotation_map["Rhd"])
# 例：Amtの注釈を表示 (Embryo)
print(marker_annotation_map["Amt"])
# 例：Spag4の注釈を表示 (重複が削除されているか)
print(marker_annotation_map["Spag4"])

In [None]:
Path("data/annotation").mkdir(exist_ok=True, parents=True)
file_path = "data/annotation/symbol_mptermname.json"
marker_annotation_map.to_json(file_path, indent=4)

# json.dump(marker_annotation_map, open(file_path, "w"), indent=4, sort_keys=True)

In [None]:
%%bash

# Count the lines of the annotation JSON that mention each tag
# (grep -c reports the number of matching lines).
grep -c "Male" data/annotation/symbol_mptermname.json | sed "s|^|Male: |"
grep -c "Female" data/annotation/symbol_mptermname.json | sed "s|^|Female: |"

grep -c "Homo" data/annotation/symbol_mptermname.json | sed "s|^|Homo: |"
grep -c "Hetero" data/annotation/symbol_mptermname.json | sed "s|^|Hetero: |"
grep -c "Hemi" data/annotation/symbol_mptermname.json | sed "s|^|Hemi: |"

grep -c "Embryo" data/annotation/symbol_mptermname.json | sed "s|^|Embryo: |"
grep -c "Early" data/annotation/symbol_mptermname.json | sed "s|^|Early: |"
grep -c "Interval" data/annotation/symbol_mptermname.json | sed "s|^|Interval: |"
grep -c "Late" data/annotation/symbol_mptermname.json | sed "s|^|Late: |"

# RELEASE 22.1
# Male: 4915
# Female: 4146
# Homo: 41444
# Hetero: 11921
# Hemi: 694
# Embryo: 4253
# Early: 45724
# Interval: 58
# Late: 4024

# RELEASE 23.0
# Male: 4480
# Female: 3557
# Homo: 30977
# Hetero: 9625
# Hemi: 492
# Embryo: 4207
# Early: 34324
# Interval: 54
# Late: 2509

###  Phenodigmを用いたヒト疾患情報を取得する

In [None]:
df_phenodigm = pd.read_csv(Path("data", "phenodigm", "impc_phenodigm.csv"))
P(len(df_phenodigm))
# 3405

In [None]:
# Count the spaces in each "Mouse model description"
space_counts = df_phenodigm["Mouse model description"].str.count(" ")

# Rows without exactly two spaces, i.e. rows that would not split into three parts
invalid_rows = df_phenodigm[space_counts != 2]

# Show the result
print(f"全体の件数: {len(df_phenodigm)}")
print(f"空白がちょうど2つでない行数: {len(invalid_rows)}")
print(invalid_rows.head())
# -> Only two such rows were found, both `Phex<not yet available>`, so they are ignored

In [None]:
# Keep only the descriptions that split into exactly three parts. The explicit
# copy makes df_phenodigm an independent frame, so adding columns to it later
# does not operate on a slice of the original (SettingWithCopyWarning).
df_phenodigm = df_phenodigm[space_counts == 2].copy()
P(len(df_phenodigm))
# 3403

In [None]:
# Split "Mouse model description" on its first two spaces into three columns
# (allele_symbol, zygosity, life_stage), then drop the source column.
df_phenodigm[["allele_symbol", "zygosity", "life_stage"]] = df_phenodigm[
    "Mouse model description"
].str.split(" ", n=2, expand=True)
df_phenodigm = df_phenodigm.drop(columns=["Mouse model description"])

In [None]:
P(df_phenodigm.columns)
P(df_phenodigm["allele_symbol"].head(3))
P(df_phenodigm["zygosity"].head(3))
P(df_phenodigm["life_stage"].head(3))

In [None]:
# Align the Phenodigm notation with the notation used in the IMPC data:
# expand the zygosity abbreviations and capitalize the life stage.

df_phenodigm = df_phenodigm.replace(
    {"zygosity": {"hom": "homozygote", "het": "heterozygote", "hem": "hemizygote"}}
)
df_phenodigm["life_stage"] = df_phenodigm["life_stage"].str.capitalize()
print(df_phenodigm["zygosity"].value_counts())
print(df_phenodigm["life_stage"].value_counts())

In [None]:
# Left-join the Phenodigm rows onto the annotated IMPC rows using
# (allele_symbol, life_stage, zygosity) as the key. Every IMPC row is kept;
# a key that matches several Phenodigm rows yields several output rows, so the
# result can be longer than df_annotated.
df_annotated_phenodigm = (
    df_annotated.set_index(["allele_symbol", "life_stage", "zygosity"])
    .join(
        df_phenodigm.set_index(["allele_symbol", "life_stage", "zygosity"]),
        how="left",
        rsuffix="_phenodigm",
    )
    .reset_index()
)
print(len(df_annotated_phenodigm))
# 63645

In [None]:
# Keep only the columns needed for the disease annotation and drop the rows
# that have no "Disorder name" after the left join.
columns_to_keep = ["marker_symbol", "Disorder name", "life_stage", "zygosity"]
df_annotated_phenodigm = (
    df_annotated_phenodigm[columns_to_keep]
    .dropna(subset=["Disorder name"])
    .reset_index(drop=True)
)
df_annotated_phenodigm

In [None]:
# Build the disease annotation label for one row
def make_annotation(row) -> list[str]:
    """Return a one-element list holding "<Disorder name> (<tags>)".

    The comma-separated tags are, in order:
    - zygosity: "Homo" for homozygote, "Hetero" for heterozygote, otherwise "Hemi";
    - life stage: included only when it is Embryo, Early, Interval or Late.
    """
    zygosity_labels = {"homozygote": "Homo", "heterozygote": "Hetero"}

    # Genotype: anything that is not homo-/heterozygote is labelled "Hemi"
    tags = [zygosity_labels.get(row["zygosity"], "Hemi")]

    # Life stage
    life_stage = row["life_stage"]
    if life_stage in {"Embryo", "Early", "Interval", "Late"}:
        tags.append(f"{life_stage}")

    return [f"{row['Disorder name']} ({', '.join(tags)})"]


# One annotation list per row (make_annotation returns a list) ...
df_annotated_phenodigm["annotation"] = df_annotated_phenodigm.apply(
    make_annotation, axis=1
)

# ... flattened so that every row carries a single annotation string.
df_exploded = df_annotated_phenodigm.explode("annotation").reset_index(drop=True)

# Per marker_symbol: de-duplicate the annotations and sort them into a list
marker_annotation_map = df_exploded.groupby("marker_symbol")["annotation"].apply(
    lambda x: sorted(set(x))
)

In [None]:
# 例：Phenodigmの注釈を表示 (Embryo)
print(marker_annotation_map["Arhgap31"])

In [None]:
Path("data/annotation").mkdir(exist_ok=True, parents=True)
file_path = "data/annotation/symbol_disordername.json"
marker_annotation_map.to_json(file_path, indent=4)

### mp term nameとIMPCのPhenotype URLを紐付ける

In [None]:
df_select = df_statistical_filtered[["mp_term_id", "mp_term_name"]].drop_duplicates()
# df_select = data[['marker_symbol', 'marker_accession_id', 'mp_term_name', 'mp_term_id']].drop_duplicates()
df_select

In [None]:
# Map every MP term name to its IMPC phenotype page. When a name occurs with
# several MP term IDs, the last one encountered wins.
phenotype_url_prefix = "https://www.mousephenotype.org/data/phenotypes/"
dict_phenotype_url = {
    term_name: f"{phenotype_url_prefix}{term_id}"
    for term_id, term_name in zip(df_select["mp_term_id"], df_select["mp_term_name"])
}

print(dict_phenotype_url["small lymph nodes"])

In [None]:
# Write the mapping as a header-less TSV: "<mp term name>\t<IMPC URL>" per line.
with open("data/annotation/mptermname_phenotypeurl.tsv", "w") as f:
    for term, url in dict_phenotype_url.items():
        f.write(f"{term}\t{url}\n")

In [None]:
%%bash

head -n 3 data/annotation/mptermname_phenotypeurl.tsv
wc -l data/annotation/mptermname_phenotypeurl.tsv
# Release 22.0: 664
# Release 23.0: 659

### marker symbolとMGI accession idを紐付ける

In [None]:
df_select = df_statistical_filtered[
    ["marker_symbol", "marker_accession_id"]
].drop_duplicates()
# df_select = data[['marker_symbol', 'marker_accession_id', 'mp_term_name', 'mp_term_id']].drop_duplicates()
df_select
# Release 22.1: 7746 rows
# Release 23.0: 7934 rows

In [None]:
# marker_symbol -> MGI accession ID; a later row overrides an earlier one
# when a symbol occurs more than once.
dict_symbol_id = dict(
    zip(df_select["marker_symbol"], df_select["marker_accession_id"])
)
print(dict_symbol_id["Ncam1"])

In [None]:
# Write the symbol -> MGI accession ID mapping as JSON and as TSV.
# The context manager closes (and therefore flushes) the JSON file
# deterministically instead of leaving that to garbage collection.
with open("data/annotation/symbol_mgiid.json", "w") as f:
    json.dump(dict_symbol_id, f, indent=4, sort_keys=True)
Path("data/annotation/symbol_mgiid.tsv").write_text(
    "\n".join([f"{k}\t{v}" for k, v in dict_symbol_id.items()])
)

In [None]:
%%bash
head -n 3 data/annotation/symbol_mgiid.tsv

## 4. 表現型の類似度を求める

In [None]:
file_path = Path("data/overlap/gene_pair_mp_similarity_phenodigm.pkl")

if not file_path.exists():
    raise FileNotFoundError(
        f"Please run 038-resnik-simple.ipynb to generate {file_path}."
    )

# NOTE: unpickling can execute arbitrary code - only load files generated by
# this project. The context manager closes the file handle after loading.
with open(file_path, "rb") as f:
    gene_pair_mp_similarity = pickle.load(f)

# Keep the gene pairs whose fifth field is at least 2; the column names given
# to these records elsewhere in the notebook identify that field as the number
# of shared phenotypes.
gene_pair_mp_similarity_filtered = [
    item for item in gene_pair_mp_similarity if item[4] >= 2
]

In [None]:
print(gene_pair_mp_similarity_filtered[-3:])
print(len(gene_pair_mp_similarity_filtered))

### 生データをCSV形式で出力 （ダウンロード用）

In [None]:
# Tabulate every gene pair (unfiltered) for the downloadable raw-data export.
df_similarity = pd.DataFrame(gene_pair_mp_similarity)
df_similarity.columns = [
    "Gene1",
    "Gene2",
    "Phenodigm Score",
    "Jaccard Similarity",
    "Number of shared phenotype",
    "List of shared phenotypes",
]
# reindex returns a new frame and leaves the original untouched, so the result
# has to be assigned for the export to use this column order.
df_similarity = df_similarity.reindex(
    columns=[
        "Gene1",
        "Gene2",
        "Phenodigm Score",
        "Number of shared phenotype",
        "Jaccard Similarity",
        "List of shared phenotypes",
    ]
)
# df_similarity["List of shared phenotypes"] = df_similarity["List of shared phenotypes"].apply(json.dumps)
# 30 sec

In [None]:
# Destination of the raw-data export; the file names carry TSUMUGI_VERSION.
output_dir = Path("data", "TSUMUGI_RawData")
output_dir.mkdir(parents=True, exist_ok=True)
path_csv = output_dir / Path(f"TSUMUGI_v{TSUMUGI_VERSION}_raw_data.csv.gz")
path_parquet = output_dir / Path(f"TSUMUGI_v{TSUMUGI_VERSION}_raw_data.parquet")


def get_head1000_hash(df: pd.DataFrame) -> str:
    """Return the MD5 hex digest of the first 1000 rows of *df* rendered as CSV.

    The CSV text has no index column, uses LF line endings and is encoded as
    UTF-8 before hashing.
    """
    preview_csv = df.iloc[:1000].to_csv(index=False, lineterminator="\n")
    digest = hashlib.md5()
    digest.update(preview_csv.encode("utf-8"))
    return digest.hexdigest()


def file_head1000_hash(path: Path) -> str | None:
    if not path.exists():
        return None
    with gzip.open(path, "rt", encoding="utf-8") as f:
        lines = [next(f) for _ in range(1001)]  # 1行目がヘッダー
        csv_content = "".join(lines).encode("utf-8")
        return hashlib.md5(csv_content).hexdigest()


# Compare the hash of the in-memory table's first 1000 rows with the hash of
# the existing gzip CSV (None when that file is missing); rewrite both exports
# only when the two differ.
# NOTE(review): only the CSV is inspected - a missing or stale parquet file is
# not detected when the CSV hash matches.
new_hash = get_head1000_hash(df_similarity)
existing_hash = file_head1000_hash(path_csv)

if new_hash != existing_hash:
    df_similarity.to_csv(path_csv, index=False, compression="gzip", lineterminator="\n")
    df_similarity.to_parquet(path_parquet, index=False)
    print("🔄 ファイルを更新しました")
    # 3 min
else:
    print("✅ 内容に変更がないためスキップしました")

## 表現型ごとのネットワークを出力

In [None]:
# gene_pair_mp_similarity_filtered = pickle.load(
#     open("data/overlap/gene_pair_mp_similarity_filtered_phenodigm.pkl", "rb")
# )

In [None]:
df_similarity = pd.DataFrame(
    gene_pair_mp_similarity_filtered,
    columns=[
        "marker1",
        "marker2",
        "phenodigm_score",
        "phenotype_similarity",
        "shared_mp_number",
        "shared_mp",
    ],
)
print(len(df_similarity))
# version 0.2.2: 133281  rows × 5 columns
# version 0.3.0: 261216  rows × 5 columns

In [None]:
# Log-transform the Phenodigm score with log1p, i.e. log(1 + score)
df_similarity["phenodigm_score"] = np.log1p(df_similarity["phenodigm_score"])

In [None]:
# Load the marker -> phenotype annotation map. The context manager closes the
# file handle instead of leaving it open until garbage collection.
with open("data/annotation/symbol_mptermname.json") as f:
    df_marker_phenotype = json.load(f)
df_marker_phenotype = pd.DataFrame(
    df_marker_phenotype.items(), columns=["marker_symbol", "mp_term_name"]
)
print(len(df_marker_phenotype))
# TSUMUGI v0.2.2: 7626 rows
# TSUMUGI v0.3.0: 7746 rows
# TSUMUGI v0.3.1: 7746 rows
# TSUMUGI v0.3.2: 7954 rows

In [None]:
dict_marker_phenotype = dict(
    zip(df_marker_phenotype.marker_symbol, df_marker_phenotype.mp_term_name)
)

In [None]:
# Load the marker -> disease annotation map and close the file right away.
with open("data/annotation/symbol_disordername.json") as f:
    dict_marker_disease = json.load(f)

In [None]:
output_dir = Path("data/network/mp_term_name")
# remove network directory
if output_dir.exists():
    shutil.rmtree(output_dir)

output_dir.mkdir(exist_ok=True, parents=True)

In [None]:
path_target_phenotypes = list(Path("data", "mp_term_name").glob("*.csv"))

# Too many nodes make the web page impossible to render. For every MP term,
# look for a phenodigm_score threshold that brings the node count down to at
# most upper_limit.
number_of_nodes = 125
tolerance = 25
upper_limit = number_of_nodes + tolerance
lower_limit = number_of_nodes - tolerance  # not used by the search below

for path_target_phenotype in tqdm(path_target_phenotypes, desc="Processing MP terms"):
    # Loop-local name on purpose: rebinding the notebook-level ``columns`` list
    # here would break the statistical-results filtering cell on a re-run.
    effect_columns = ["marker_symbol", "effect_size"]
    df_marker_effect = pd.read_csv(
        path_target_phenotype, usecols=effect_columns
    ).dropna(subset=["effect_size"])
    df_marker_effect["effect_size"] = df_marker_effect["effect_size"].abs()

    # Log-transform the effect size unless the phenotype is binary (only 0/1)
    is_binary = df_marker_effect["effect_size"].isin([0, 1]).all()
    if not is_binary:
        df_marker_effect["effect_size"] = np.log1p(df_marker_effect["effect_size"])

    # Per marker, keep the row with the largest absolute effect size. When
    # Homo/Hetero rows disagree the maximum is used for now (open question).
    idx = df_marker_effect.groupby("marker_symbol")["effect_size"].idxmax()
    df_max = df_marker_effect.loc[idx]

    dict_marker_effect = dict(zip(df_max["marker_symbol"], df_max["effect_size"]))

    target_phenotype = path_target_phenotype.stem
    # NOTE(review): the file stems were produced by replacing both "/" and " "
    # with "_", so a term name that contained "/" is not restored exactly here.
    target_phenotype_space = target_phenotype.replace("_", " ")
    gene_symbols = df_marker_effect["marker_symbol"]

    # --- 1. Keep edges whose two genes both show the phenotype and whose
    #        shared phenotype list mentions it (substring match) -------------
    df_filtered = df_similarity[
        df_similarity["marker1"].isin(gene_symbols)
        & df_similarity["marker2"].isin(gene_symbols)
        & df_similarity["shared_mp"].apply(
            lambda lst: any(target_phenotype_space in term for term in lst)
        )
    ]

    # --- 2. Node count before any score threshold ---------------------------
    nodes = set(
        pd.concat([df_filtered["marker1"], df_filtered["marker2"]], ignore_index=True)
    )
    num_nodes = len(nodes)

    if num_nodes > upper_limit:
        # --- 3. Binary search over the distinct score values ----------------
        # Scores are sorted in descending order: a larger index means a lower
        # threshold, hence more edges and at least as many nodes.
        discrete_scores = df_filtered.loc[:, "phenodigm_score"].unique()
        discrete_scores = np.sort(discrete_scores)[::-1]

        best_thr = None
        best_diff = float("inf")

        lo, hi = 0, len(discrete_scores) - 1
        while lo <= hi:
            mid_idx = (lo + hi) // 2
            thr = discrete_scores[mid_idx]

            df_mid = df_filtered[df_filtered["phenodigm_score"] >= thr]
            nodes = set(
                pd.concat([df_mid["marker1"], df_mid["marker2"]], ignore_index=True)
            )
            num_nodes = len(nodes)

            if num_nodes <= upper_limit:
                # Within the limit: remember the threshold whose node count is
                # closest to number_of_nodes ...
                diff = abs(num_nodes - number_of_nodes)
                if diff < best_diff:
                    best_diff = diff
                    best_thr = thr
                # ... then try a lower threshold (more nodes).
                lo = mid_idx + 1
            else:
                # Too many nodes: try a higher threshold (fewer nodes).
                hi = mid_idx - 1

        # ------------ Final threshold ---------------------------------------
        if best_thr is None:
            # No threshold satisfied the limit. lo never moved in that case,
            # so hi ended at -1 and hi + 1 selects index 0, the highest score.
            best_thr = discrete_scores[hi + 1]

        df_filtered = df_filtered[df_filtered["phenodigm_score"] >= best_thr]

    # --- 4. Nodes of the remaining edges ------------------------------------
    nodes = set(
        pd.concat([df_filtered["marker1"], df_filtered["marker2"]], ignore_index=True)
    )

    # ----------------------------------------------------
    # Build the node and edge records of the network
    # ----------------------------------------------------

    # Nodes: node_color carries the (transformed) effect size, 0.0 if unknown
    node_json = []
    for node in nodes:
        phenotype = dict_marker_phenotype.get(node, "")
        disease = dict_marker_disease.get(node, "")
        node_color = dict_marker_effect.get(node, 0.0)
        node_json.append(
            {
                "data": {
                    "id": node,
                    "label": node,
                    "phenotype": phenotype,
                    "disease": disease,
                    "node_color": node_color,
                }
            }
        )

    # Edges: edge_size carries the phenodigm score
    df_edge = df_filtered[["marker1", "marker2", "phenodigm_score", "shared_mp"]]
    rows = df_edge.to_dict(orient="records")
    edge_json = [
        {
            "data": {
                "source": r["marker1"],
                "target": r["marker2"],
                "phenotype": r["shared_mp"],
                "edge_size": r["phenodigm_score"],
            }
        }
        for r in rows
    ]

    # ----------------------------------------------------
    # Merge nodes and edges and write them out
    # ----------------------------------------------------
    network_json = node_json + edge_json

    # Output as gzip-compressed JSON; empty networks produce no file
    if network_json:
        output_json = output_dir / f"{target_phenotype}.json.gz"
        with gzip.open(output_json, "wt", encoding="utf-8") as f:
            json.dump(network_json, f, indent=4)

# 3m

In [None]:
%%bash

ls -lhS data/network/mp_term_name/ | head -n 5
echo "----------------------"
ls -lhS data/network/mp_term_name/ | tail -n 5

# TSUMUGI v0.2.2: total 5.3M
# TSUMUGI v0.3.0: total 5.5M
# TSUMUGI v0.3.1: total 5.1M <- 該当の表現型を含むネットワークのみを表示 （Issue: #54）
# TSUMUGI v0.3.2: total 3.0M

In [None]:
%%bash
# ファイルサイズが最大、最小のgene symbolのnode数を確認
zcat data/network/mp_term_name/edema.json.gz | grep -c "node_color"
zcat data/network/mp_term_name/prenatal_lethality_prior_to_heart_atrial_septation.json.gz | grep -c "node_color"
zcat data/network/mp_term_name/preweaning_lethality,_complete_penetrance.json.gz | grep -c "node_color"
zcat data/network/mp_term_name/convulsive_seizures.json.gz | grep -c "node_color"

# 137
# 119
# 264
# 2

## 遺伝子ごとのネットワークを出力

In [None]:
# Every gene that appears on either side of an edge, de-duplicated.
# Sorted so that progress through the per-gene loop can be estimated.
gene_symbols = sorted(
    set(df_similarity.marker1.unique()) | set(df_similarity.marker2.unique())
)
P(gene_symbols[:3])
P(len(gene_symbols))
# version 0.2.2: 4139
# version 0.3.0: 6812 (life stage taken into account + similarity added)
# version 0.3.1: 6812
# version 0.3.2: 5583

In [None]:
Path("data/overlap/available_gene_symbols.txt").write_text(
    "\n".join(gene_symbols) + "\n"
)

In [None]:
output_dir = Path("data", "network", "gene_symbol")
# remove network directory
if output_dir.exists():
    shutil.rmtree(output_dir)

output_dir.mkdir(exist_ok=True, parents=True)
# 10 sec

In [None]:
number_of_nodes = 125
tolerance = 25  # tolerance for the number of nodes
upper_limit = number_of_nodes + tolerance
lower_limit = number_of_nodes - tolerance  # not used by the search below

for gene_symbol in tqdm(gene_symbols, desc="Processing Gene Symbols"):
    # Too many nodes make the web page impossible to render, so look for a
    # phenodigm_score threshold that keeps the node count at or below
    # upper_limit.

    # --- 1. Keep only the edges that touch gene_symbol ----------------------
    df_filtered = df_similarity[
        (df_similarity["marker1"] == gene_symbol)
        | (df_similarity["marker2"] == gene_symbol)
    ]

    # --- 2. Node count before any score threshold ---------------------------
    nodes = set(
        pd.concat([df_filtered["marker1"], df_filtered["marker2"]], ignore_index=True)
    )
    num_nodes = len(nodes)

    if num_nodes > upper_limit:
        # --- 3. Binary search over the distinct score values ----------------
        # Distinct scores of the edges touching gene_symbol, descending: a
        # larger index means a lower threshold, hence more edges and nodes.
        discrete_scores = df_filtered.loc[:, "phenodigm_score"].unique()
        discrete_scores = np.sort(discrete_scores)[::-1]

        best_thr = None
        best_diff = float("inf")

        lo, hi = 0, len(discrete_scores) - 1
        while lo <= hi:
            mid_idx = (lo + hi) // 2
            thr = discrete_scores[mid_idx]

            df_mid = df_filtered[df_filtered["phenodigm_score"] >= thr]
            # Keep only the edges that touch gene_symbol
            df_mid = df_mid[
                (df_mid["marker1"] == gene_symbol) | (df_mid["marker2"] == gene_symbol)
            ]
            nodes = set(
                pd.concat([df_mid["marker1"], df_mid["marker2"]], ignore_index=True)
            )

            if gene_symbol not in nodes:
                # No edge survived this threshold, so gene_symbol dropped out:
                # the threshold is too high, try a lower one.
                lo = mid_idx + 1
                continue

            num_nodes = len(nodes)
            if num_nodes <= upper_limit:
                # Within the limit: remember the threshold whose node count is
                # closest to number_of_nodes ...
                diff = abs(num_nodes - number_of_nodes)
                if diff < best_diff:
                    best_diff = diff
                    best_thr = thr
                # ... then try a lower threshold (more nodes).
                lo = mid_idx + 1
            else:
                # Too many nodes: try a higher threshold (fewer nodes).
                hi = mid_idx - 1

        # ------------ Final threshold ---------------------------------------
        if best_thr is None:
            # No threshold satisfied the limit; fall back to the score at
            # hi + 1 (equal to lo once the search has ended).
            best_thr = discrete_scores[hi + 1]

        df_filtered = df_filtered[df_filtered["phenodigm_score"] >= best_thr]
        # Keep only the edges that touch gene_symbol
        df_filtered = df_filtered[
            (df_filtered["marker1"] == gene_symbol)
            | (df_filtered["marker2"] == gene_symbol)
        ]

    nodes = set(
        pd.concat([df_filtered["marker1"], df_filtered["marker2"]], ignore_index=True)
    )

    # ------------
    # Build the node and edge records of the network
    # ------------

    # Nodes
    node_json = []
    for node in nodes:
        # Missing annotations default to "" (same convention as the
        # per-phenotype export) instead of aborting the run with a KeyError.
        phenotype = dict_marker_phenotype.get(node, "")
        disease = dict_marker_disease.get(node, "")
        # Node colour: 1 for the queried gene_symbol, 0 for every other node
        node_color = 1.0 if node == gene_symbol else 0.0

        node_json.append(
            {
                "data": {
                    "id": node,
                    "label": node,
                    "node_color": node_color,
                    "phenotype": phenotype,
                    "disease": disease,
                }
            }
        )

    # Edges: every edge of df_similarity whose two ends are both selected
    # nodes, i.e. including edges between neighbours of gene_symbol.
    rows = df_similarity[
        (df_similarity["marker1"].isin(nodes)) & (df_similarity["marker2"].isin(nodes))
    ].to_dict(orient="records")

    # Convert the edges to JSON records
    edge_json = [
        {
            "data": {
                "source": r["marker1"],
                "target": r["marker2"],
                "phenotype": r["shared_mp"],
                "edge_size": r["phenodigm_score"],
            }
        }
        for r in rows
    ]

    network_json = node_json + edge_json

    # Output as gzip-compressed JSON; empty networks produce no file
    if network_json:
        output_json = output_dir / f"{gene_symbol}.json.gz"
        with gzip.open(output_json, "wt", encoding="utf-8") as f:
            json.dump(network_json, f, indent=4)

# 10m

In [None]:
# Build available_mp_terms: the MP terms for which a network file was written.
# Maps the space-separated term name to the underscore-separated file stem.

mp_terms = {}
for path_mp_term in Path("data", "mp_term_name").glob("*.csv"):
    mp_term = path_mp_term.stem
    if not Path("data", "network", "mp_term_name", f"{mp_term}.json.gz").exists():
        continue
    mp_term_name_space = mp_term.replace("_", " ")
    mp_terms[mp_term_name_space] = mp_term

# The context manager closes (and flushes) the JSON file deterministically.
with open("data/overlap/available_mp_terms.json", "w") as f:
    json.dump(mp_terms, f, indent=2)
pd.DataFrame(mp_terms.keys()).to_csv(
    "data/overlap/available_mp_terms.txt", index=False, header=False, sep="\t"
)

print(len(mp_terms))

# TSUMUGI v0.3.2: 440

In [None]:
%%bash
ls -lhS data/network/gene_symbol/ | head -n 5
echo "----------------------"
ls -lhS data/network/gene_symbol/ | tail -n 5
# 30 sec
# version 0.3.0: total 170M
# version 0.3.1: total 168M
# version 0.3.2: total 50M

In [None]:
%%bash
# ファイルサイズが最大、最小のgene symbolのnode数を確認
zcat data/network/gene_symbol/Dstn.json.gz | grep -c "node_color"
zcat data/network/gene_symbol/Rab10.json.gz | grep -c "node_color"
zcat data/network/gene_symbol/Plekha8.json.gz | grep -c "node_color"
# 144
# 95
# 2

zcat data/network/gene_symbol/Tcerg1.json.gz | grep -c "node_color"
zcat data/network/gene_symbol/Jmjd7.json.gz | grep -c "node_color"


## Finish call

In [None]:
%%bash

uname -a # OS name
date +"%Y/%m/%d %H:%M:%S" # Last update