# Run allですべてのデータを準備する

* URL: https://ftp.ebi.ac.uk/pub/databases/impc/all-data-releases/

In [1]:
TSUMUGI_VERSION = "0.3.2"
IMPC_RELEASE = 23.0

columns = ["marker_symbol", "marker_accession_id", "mp_term_name", "mp_term_id", "p_value", "effect_size",
           "female_ko_effect_p_value", "male_ko_effect_p_value", "female_ko_parameter_estimate","sex_effect_p_value", "male_ko_parameter_estimate", # sex differences
           "genotype_effect_p_value", "genotype_effect_parameter_estimate",
           "zygosity", # zygosity
           "pipeline_name", "procedure_name", # life-stage
           "allele_symbol", # map to Phendigm
           ]


In [2]:
P = print
from pprint import pprint as PP
from collections import Counter as C
from pathlib import Path
from collections import defaultdict
from itertools import combinations
import os
import csv
import numpy as np
import pandas as pd
import polars as pl
import shutil
import pickle
import json
import gzip
import networkx as nx
import urllib.request
from tqdm import tqdm
import hashlib


In [3]:
# Move up to top directory

print(os.getcwd())

while not Path("LICENSE").exists():
    os.chdir('../')

print(os.getcwd())

/mnt/c/Users/akihi/Documents/GitHub/TSUMUGI-dev/notebooks/notebools-web
/mnt/c/Users/akihi/Documents/GitHub/TSUMUGI-dev


## 1. Download IMPC dataset

In [4]:
# Phenodigm dataが存在していない場合には、ダウンロードを促す

if not Path("data", "phenodigm", "impc_phenodigm.csv").exists():
    raise FileNotFoundError("Please download impc phenodigm data from https://diseasemodels.research.its.qmul.ac.uk/.")

In [5]:
# パスの設定
data_dir = Path("data/impc")
data_dir.mkdir(parents=True, exist_ok=True)
csv_path = data_dir / f"statistical-results-ALL-{IMPC_RELEASE}.csv"

# ファイルが存在しない場合にダウンロードして解凍
if not csv_path.exists():
    # ダウンロード URL
    url = f"https://ftp.ebi.ac.uk/pub/databases/impc/all-data-releases/release-{IMPC_RELEASE}/results/statistical-results-ALL.csv.gz"

    print(f"Downloading and extracting: {url}")

    # URL からファイルサイズ取得（tqdmのため）
    with urllib.request.urlopen(url) as response:
        total_size = int(response.info().get("Content-Length", -1))
        with tqdm.wrapattr(response, "read", total=total_size, desc="Downloading", unit="B", unit_scale=True) as r:
            with gzip.GzipFile(fileobj=r) as uncompressed:
                with open(csv_path, 'wb') as out_file:
                    shutil.copyfileobj(uncompressed, out_file)

In [6]:
%%bash

# wc -l data/impc/statistical-results*.csv
# Release 22.1: 3165335
# Release 23.0: 2159931

# 1 min

## 2. Filter dataset by P value < 0.0001 (10^-4)


In [7]:
if not Path("data", f"statistical_filtered-{IMPC_RELEASE}.csv").exists():
    path_df_statistical_filtered = Path("data", "impc", f"statistical-results-ALL-{IMPC_RELEASE}.csv")
    df_statistical_all = pd.read_csv(path_df_statistical_filtered)
    df_statistical_all = df_statistical_all[columns]
    
    # Filter by p_value < 0.0001
    threshold = 0.0001
    filter_pvalue = df_statistical_all["p_value"] < threshold
    filter_female_ko_pvalue = df_statistical_all["female_ko_effect_p_value"] < threshold
    filter_male_ko_pvalue = df_statistical_all["male_ko_effect_p_value"] < threshold

    df_statistical_filtered = df_statistical_all[filter_pvalue | filter_female_ko_pvalue | filter_male_ko_pvalue]

    # Filter by mp_term_id and mp_term_name are not NaN
    df_statistical_filtered = df_statistical_filtered.dropna(subset=["mp_term_id"])
    df_statistical_filtered = df_statistical_filtered.dropna(subset=["mp_term_name"])

    # Filter by effect_size is not NaN
    df_statistical_filtered = df_statistical_filtered.dropna(subset=["effect_size"])
    df_statistical_filtered.to_csv(f"data/statistical_filtered-{IMPC_RELEASE}.csv", index=False) # 2 sec

# 30 seconds

In [8]:
df_statistical_filtered = pd.read_csv(f"data/statistical_filtered-{IMPC_RELEASE}.csv")

In [9]:
print(len(df_statistical_filtered))
# Release 22.0: 54059 rows
# Release 22.1: 54059 rows
# Release 23.0: 49299 rows

49299


## Split data by mp_term_name

In [10]:
df_statistical_filtered = pd.read_csv(f"data/statistical_filtered-{IMPC_RELEASE}.csv")

In [11]:
# data/mp_term_nameを作成

output_path = Path("data", "mp_term_name")
if output_path.exists():
    shutil.rmtree(output_path)
output_path.mkdir(parents=True, exist_ok=True)

In [12]:
# 名前をクリーンにする関数を定義
def clean_name(name):
    return name.replace("/", "_").replace(" ", "_")

# mp_term_nameをクリーニングし、ユニークな値を取得
unique_mp_term_names = df_statistical_filtered['mp_term_name'].unique()

In [13]:
# ユニークなmp_term_nameごとにフィルタリングしてCSVに保存: 5 sec
for mp_term_name in unique_mp_term_names:
    df_mp_term = df_statistical_filtered[df_statistical_filtered['mp_term_name'] == mp_term_name]
    clean_mp_term_name = clean_name(mp_term_name)
    df_mp_term.to_csv(f"data/mp_term_name/{clean_mp_term_name}.csv", index=False)
# 5 sec

## 3. TSUMUGIに必要なアノテーション情報を整理する

In [14]:
df_statistical_filtered = pd.read_csv(f"data/statistical_filtered-{IMPC_RELEASE}.csv")

### Annotate life stages

In [15]:
# life_stageの初期割り当て
def assign_life_stage(pipeline_name):
    if pd.isna(pipeline_name):
        return "Early"
    if "Interval" in pipeline_name or "interval" in pipeline_name:
        return "Interval"
    elif "Late" in pipeline_name or "late" in pipeline_name:
        return "Late"
    else:
        return "Early"

df_statistical_filtered["life_stage"] = df_statistical_filtered["pipeline_name"].apply(assign_life_stage)

# Embryo 表現型に該当する procedure_name の一覧
embryo_phenotyping = [
    "Gross Morphology Embryo E9.5",
    "Viability E9.5 Secondary Screen",
    "OPT E9.5",
    "MicroCT E9.5",
    "Gross Morphology Placenta E9.5",
    "Gross Morphology Embryo E12.5",
    "Embryo LacZ",
    "Gross Morphology Placenta E12.5",
    "Viability E12.5 Secondary Screen",
    "Viability E14.5-E15.5 Secondary Screen",
    "Gross Morphology Placenta E14.5-E15.5",
    "MicroCT E14.5-E15.5",
    "Gross Morphology Embryo E14.5-E15.5",
    "Viability E18.5 Secondary Screen",
    "MicroCT E18.5",
    "Gross Morphology Embryo E18.5",
    "Gross Morphology Placenta E18.5"
]

# life_stageをEmbryoに上書き
df_statistical_filtered.loc[df_statistical_filtered["procedure_name"].isin(embryo_phenotyping), "life_stage"] = "Embryo"
df_annotated = df_statistical_filtered.reset_index(drop=True)

In [16]:
print(len(df_annotated))
print(df_annotated["life_stage"].value_counts())
# 54059
# life_stage
# Early       45724
# Embryo       4253
# Late         4024
# Interval       58
# Name: count, dtype: int64

49299
life_stage
Early       41961
Embryo       4381
Late         2897
Interval       60
Name: count, dtype: int64


### Annotate Sex differences

In [17]:
threshold = 0.0001

# 条件リスト
conditions = [
    (df_annotated["sex_effect_p_value"] < threshold) & (df_annotated["female_ko_effect_p_value"] < threshold) & (df_annotated["male_ko_effect_p_value"] > threshold),
    (df_annotated["sex_effect_p_value"] < threshold) & (df_annotated["male_ko_effect_p_value"] < threshold) & (df_annotated["female_ko_effect_p_value"] > threshold)
]

# 条件に対応する値
choices = ["female", "male"]

# np.selectで列を設定
df_annotated["sexdual_dimorphism"] = np.select(conditions, choices, default=None)
df_annotated = df_annotated.reset_index(drop=True)

# 結果を確認
print(IMPC_RELEASE)
print(df_annotated["sexdual_dimorphism"].value_counts())

# RELEASE 22.1
# male      4915
# female    4146

# RELEASE 23.0
# male      5026
# female    4344

23.0
sexdual_dimorphism
male      5026
female    4344
Name: count, dtype: int64


In [18]:
# 確認
df_annotated.dropna(subset=["sexdual_dimorphism"])[["p_value", "sexdual_dimorphism", "effect_size"]].head(10)

Unnamed: 0,p_value,sexdual_dimorphism,effect_size
10,0.0001260278,male,2.230004
11,5.111543e-06,female,0.435955
12,0.0163733,male,-1.42984
13,0.5153139,male,1.088084
15,8.911086e-08,female,1.811095
16,3.849604e-08,female,1.602556
20,0.02362372,male,2.379854
26,9.895246e-06,female,1.095692
27,2.294615e-07,female,1.443321
30,2.508208e-08,male,-0.704889


### 遺伝型、性差、ライフステージのアノテーションを統合する

In [19]:
print(df_annotated["zygosity"].value_counts())
# RELEASE 22.1
# zygosity
# homozygote      41444
# heterozygote    11921
# hemizygote        694

# RELEASE 23.0
# homozygote      48037
# heterozygote    14706
# hemizygote        902


zygosity
homozygote      37820
heterozygote    10896
hemizygote        583
Name: count, dtype: int64


In [20]:
# colmuns = ["marker_symbol", "marker_accession_id", "mp_term_name", "mp_term_id", "p_value", "female_ko_effect_p_value", "male_ko_effect_p_value", "sexdual_dimorphism", "zygosity", "life_stage", "effect_size",]

# df_annotated = df_annotated[colmuns]

In [21]:
# mp_term = "increased fasting circulating glucose level".strip()
# df_annotated[
#     (df_annotated["marker_symbol"] == "Dnase1l2") &
#     (df_annotated["mp_term_name"] == mp_term)
# ]

In [22]:
# アノテーション列を追加（inplace）
def make_annotation(row) -> list[str]:
    # 遺伝型
    if row['zygosity'] == 'homozygote':
        annotate = "Homo"
    elif row['zygosity'] == 'heterozygote':
        annotate = "Hetero"
    else:
        annotate = "Hemi"

    # 性別
    if row['sexdual_dimorphism'] == "female":
        annotate += ", Female"
    elif row['sexdual_dimorphism'] == "male":
        annotate += ", Male"

    # life stage
    if row['life_stage'] in {"Embryo", "Early", "Interval", "Late"}:
        annotate += f", {row['life_stage']}"

    annotations = []

    annotations.append(f"{row['mp_term_name']} ({annotate})")

    return annotations

df_annotated["annotation"] = df_annotated.apply(make_annotation, axis=1)

df_exploded = df_annotated.explode("annotation").reset_index(drop=True)

# marker_symbol ごとに annotation をリスト化＆ソート
marker_annotation_map = (
    df_exploded
    .groupby("marker_symbol")["annotation"]
    .apply(lambda x: sorted(set(x)))
)

In [23]:
# 例：Rhdの注釈を表示
print(marker_annotation_map["Rhd"])
# 例：Amtの注釈を表示 (Embryo)
print(marker_annotation_map["Amt"])
# 例：Spag4の注釈を表示 (重複が削除されているか)
print(marker_annotation_map["Spag4"])


['abnormal skin condition (Homo, Early)', 'decreased circulating HDL cholesterol level (Homo, Male, Early)', 'decreased circulating alkaline phosphatase level (Homo, Female, Early)', 'decreased circulating cholesterol level (Homo, Male, Early)', 'decreased circulating free fatty acids level (Homo, Early)', 'decreased hemoglobin content (Homo, Male, Early)', 'decreased mean corpuscular hemoglobin (Homo, Early)', 'decreased mean corpuscular hemoglobin concentration (Homo, Early)', 'decreased mean corpuscular volume (Homo, Early)', 'increased exploration in new environment (Homo, Early)']
['abnormal abdominal wall morphology (Homo, Embryo)', 'abnormal embryo size (Homo, Embryo)', 'abnormal facial morphology (Homo, Embryo)', 'abnormal head shape (Homo, Embryo)', 'abnormal head size (Homo, Embryo)', 'abnormal limb morphology (Homo, Embryo)', 'abnormal placenta size (Homo, Embryo)', 'abnormal retina blood vessel morphology (Hetero, Early)', 'abnormal retina vasculature morphology (Hetero, Ea

In [24]:
Path("data/annotation").mkdir(exist_ok=True, parents=True)
file_path = "data/annotation/symbol_mptermname.json"
marker_annotation_map.to_json(file_path, indent=4)

# json.dump(marker_annotation_map, open(file_path, "w"), indent=4, sort_keys=True)


In [25]:
%%bash

grep -c "Male" data/annotation/symbol_mptermname.json | sed "s|^|Male: |"
grep -c "Female" data/annotation/symbol_mptermname.json | sed "s|^|Feale: |"

grep -c "Homo" data/annotation/symbol_mptermname.json | sed "s|^|Homo: |"
grep -c "Hetero" data/annotation/symbol_mptermname.json | sed "s|^|Hetero: |"
grep -c "Hemi" data/annotation/symbol_mptermname.json | sed "s|^|Hemi: |"

grep -c "Embryo" data/annotation/symbol_mptermname.json | sed "s|^|Embryo: |"
grep -c "Early" data/annotation/symbol_mptermname.json | sed "s|^|Early: |"
grep -c "Interval" data/annotation/symbol_mptermname.json | sed "s|^|Interval: |"
grep -c "Late" data/annotation/symbol_mptermname.json | sed "s|^|Late: |"

# RELEASE 22.1
# Male: 4915
# Feale: 4146
# Homo: 41444
# Hetero: 11921
# Hemi: 694
# Embryo: 4253
# Early: 45724
# Interval: 58
# Late: 4024

# RELEASE 23.0
# Male: 4480
# Feale: 3557
# Homo: 30977
# Hetero: 9625
# Hemi: 492
# Embryo: 4207
# Early: 34324
# Interval: 54
# Late: 2509

Male: 4480
Feale: 3557
Homo: 30977
Hetero: 9625
Hemi: 492
Embryo: 4207
Early: 34324
Interval: 54
Late: 2509


###  Phenodigmを用いたヒト疾患情報を取得する

In [26]:
df_phenodigm = pd.read_csv(Path("data", "phenodigm", "impc_phenodigm.csv"))
P(len(df_phenodigm))
# 3405

3405


In [27]:
# 各行について空白の数をカウント
space_counts = df_phenodigm['Mouse model description'].str.count(' ')

# 空白の数が2でない行を抽出（== split して3つにならない行）
invalid_rows = df_phenodigm[space_counts != 2]

# 結果表示
print(f"全体の件数: {len(df_phenodigm)}")
print(f"空白がちょうど2つでない行数: {len(invalid_rows)}")
print(invalid_rows.head())
# -> たった2つしかなく、`Phex<not yet available>`なので、この2つは無視する

全体の件数: 3405
空白がちょうど2つでない行数: 2
      .rownames  Disorder id                                Disorder name  \
586         587  OMIM:307800  Hypophosphatemic Rickets, X-Linked Dominant   
3210       3211  ORPHA:89936                    X-Linked Hypophosphatemia   

     Human gene symbol            Mouse model description  PhenoDigm % score  \
586               PHEX  Phex<not yet available> hom early             68.845   
3210              PHEX  Phex<not yet available> hom early             60.095   

                              Matching human phenotypes  \
586   HP:0002148,HP:0002749,HP:0002982,HP:0008144,HP...   
3210  HP:0100686,HP:0008144,HP:0001363,HP:0004349,HP...   

                              Matching mouse phenotypes  
586   MP:0002896,MP:0005627,MP:0000198,MP:0000195,MP...  
3210  MP:0002896,MP:0005627,MP:0000198,MP:0000195,MP...  


In [28]:
df_phenodigm = df_phenodigm[space_counts == 2]
P(len(df_phenodigm))
# 3403

3403


In [29]:
df_phenodigm[['allele_symbol', 'zygosity', 'life_stage']] = df_phenodigm['Mouse model description'].str.split(' ', n=2, expand=True)
df_phenodigm = df_phenodigm.drop(columns=['Mouse model description'])

In [30]:
P(df_phenodigm.columns)
P(df_phenodigm["allele_symbol"].head(3))
P(df_phenodigm["zygosity"].head(3))
P(df_phenodigm["life_stage"].head(3))

Index(['.rownames', 'Disorder id', 'Disorder name', 'Human gene symbol',
       'PhenoDigm % score', 'Matching human phenotypes',
       'Matching mouse phenotypes', 'allele_symbol', 'zygosity', 'life_stage'],
      dtype='object')
0    Arhgap31<em1(IMPC)Bay>
1    Arhgap31<em1(IMPC)Bay>
2     Twist1<em1(IMPC)Rbrc>
Name: allele_symbol, dtype: object
0    het
1    hom
2    het
Name: zygosity, dtype: object
0    embryo
1    embryo
2     early
Name: life_stage, dtype: object


In [31]:
# phenodigmの表記とimpcデータの表記を揃える

df_phenodigm = df_phenodigm.replace({'zygosity': {'hom': 'homozygote', 'het': 'heterozygote','hem': 'hemizygote'}})
df_phenodigm['life_stage'] = df_phenodigm['life_stage'].str.capitalize()
print(df_phenodigm["zygosity"].value_counts())
print(df_phenodigm["life_stage"].value_counts())

zygosity
homozygote      1928
heterozygote    1383
hemizygote        92
Name: count, dtype: int64
life_stage
Early     2723
Embryo     477
Late       192
Middle      11
Name: count, dtype: int64


In [32]:
df_annotated_phenodigm = \
    df_annotated.set_index(['allele_symbol','life_stage','zygosity']) \
    .join(df_phenodigm.set_index(['allele_symbol','life_stage','zygosity']), how='left', rsuffix='_phenodigm') \
    .reset_index()
print(len(df_annotated_phenodigm))

63645


In [33]:
columns_to_keep = [
    "marker_symbol", "Disorder name", "life_stage", "zygosity"
    ]
df_annotated_phenodigm = df_annotated_phenodigm[columns_to_keep].dropna(subset=["Disorder name"]).reset_index(drop=True)
df_annotated_phenodigm

Unnamed: 0,marker_symbol,Disorder name,life_stage,zygosity
0,Baz1b,Williams Syndrome,Early,heterozygote
1,Trp53,Osteogenic Sarcoma,Early,homozygote
2,Trp53,Papilloma Of Choroid Plexus,Early,homozygote
3,Trp53,Cushing Disease,Early,homozygote
4,Got2,Developmental And Epileptic Encephalopathy 82,Embryo,homozygote
...,...,...,...,...
27091,Gdf5,Angel-Shaped Phalango-Epiphyseal Dysplasia,Early,homozygote
27092,Gdf5,Brachydactyly Type C,Early,homozygote
27093,Gdf5,Brachydactyly Type A1,Early,homozygote
27094,Gdf5,Brachydactyly Type A2,Early,homozygote


In [34]:
# アノテーション列を追加（inplace）
def make_annotation(row) -> list[str]:
    # 遺伝型
    if row['zygosity'] == 'homozygote':
        annotate = "Homo"
    elif row['zygosity'] == 'heterozygote':
        annotate = "Hetero"
    else:
        annotate = "Hemi"

    # life stage
    if row['life_stage'] in {"Embryo", "Early", "Interval", "Late"}:
        annotate += f", {row['life_stage']}"

    annotations = []

    annotations.append(f"{row['Disorder name']} ({annotate})")

    return annotations

df_annotated_phenodigm["annotation"] = df_annotated_phenodigm.apply(make_annotation, axis=1)

df_exploded = df_annotated_phenodigm.explode("annotation").reset_index(drop=True)

# marker_symbol ごとに annotation をリスト化＆ソート
marker_annotation_map = (
    df_exploded
    .groupby("marker_symbol")["annotation"]
    .apply(lambda x: sorted(set(x)))
)

In [35]:
# 例：Phenodigmの注釈を表示 (Embryo)
print(marker_annotation_map["Arhgap31"])

['Adams-Oliver Syndrome (Hetero, Embryo)', 'Adams-Oliver Syndrome (Homo, Embryo)', 'Adams-Oliver Syndrome 1 (Hetero, Embryo)', 'Adams-Oliver Syndrome 1 (Homo, Embryo)']


In [36]:
Path("data/annotation").mkdir(exist_ok=True, parents=True)
file_path = "data/annotation/symbol_disordername.json"
marker_annotation_map.to_json(file_path, indent=4)

### mp term nameとIMPCのPhenotype URLを紐付ける

In [37]:
df_select = df_statistical_filtered[['mp_term_id', 'mp_term_name']].drop_duplicates()
# df_select = data[['marker_symbol', 'marker_accession_id', 'mp_term_name', 'mp_term_id']].drop_duplicates()
df_select

Unnamed: 0,mp_term_id,mp_term_name
0,MP:0001510,abnormal coat appearance
1,MP:0011100,"preweaning lethality, complete penetrance"
2,MP:0004738,abnormal auditory brainstem response
3,MP:0001303,abnormal lens morphology
4,MP:0001147,small testis
...,...,...
47215,MP:0011503,distended jejunum
47367,MP:0002862,abnormal righting response
48121,MP:0004727,absent epididymis
48347,MP:0000023,abnormal ear position


In [38]:
dict_phenotype_url = dict()
for index, row in df_select.iterrows():
    mp_tern_id = row['mp_term_id']
    impc_url = f"https://www.mousephenotype.org/data/phenotypes/{mp_tern_id}"
    mp_term_name = row['mp_term_name']
    dict_phenotype_url[mp_term_name] = impc_url

print(dict_phenotype_url["small lymph nodes"])

https://www.mousephenotype.org/data/phenotypes/MP:0002217


In [39]:
with open('data/annotation/mptermname_phenotypeurl.tsv', 'w') as f:
    for term, url in dict_phenotype_url.items():
        f.write(f"{term}\t{url}\n")

In [40]:
%%bash

head -n 3 data/annotation/mptermname_phenotypeurl.tsv
wc -l data/annotation/mptermname_phenotypeurl.tsv
# Release 22.0: 664
# Release 23.0: 659

abnormal coat appearance	https://www.mousephenotype.org/data/phenotypes/MP:0001510
preweaning lethality, complete penetrance	https://www.mousephenotype.org/data/phenotypes/MP:0011100
abnormal auditory brainstem response	https://www.mousephenotype.org/data/phenotypes/MP:0004738
659 data/annotation/mptermname_phenotypeurl.tsv


### marker symbolとMGI accession idを紐付ける

In [41]:
df_select = df_statistical_filtered[['marker_symbol', 'marker_accession_id']].drop_duplicates()
# df_select = data[['marker_symbol', 'marker_accession_id', 'mp_term_name', 'mp_term_id']].drop_duplicates()
df_select
# Release 22.1: 7746 rows
# Release 23.0: 7934 rows

Unnamed: 0,marker_symbol,marker_accession_id
0,Ap1ar,MGI:2384822
1,Kif21a,MGI:109188
2,Baiap2l2,MGI:2652819
3,Wars1,MGI:104630
4,Prss47,MGI:2685120
...,...,...
49018,Cacng3,MGI:1859165
49023,Dkk4,MGI:2385299
49229,Slc9a3,MGI:105064
49248,Srek1,MGI:2145245


In [42]:
dict_symbol_id = dict()
for index, row in df_select.iterrows():
    dict_symbol_id[row['marker_symbol']] = row['marker_accession_id']
print(dict_symbol_id["Ncam1"])

MGI:97281


In [43]:
json.dump(dict_symbol_id, open("data/annotation/symbol_mgiid.json", "w"), indent=4, sort_keys=True)
Path("data/annotation/symbol_mgiid.tsv").write_text("\n".join([f"{k}\t{v}" for k, v in dict_symbol_id.items()]))

143882

In [44]:
%%bash
head -n 3 data/annotation/symbol_mgiid.tsv

Ap1ar	MGI:2384822
Kif21a	MGI:109188
Baiap2l2	MGI:2652819


## 4. 表現型の類似度を求める

In [45]:
file_path = Path("data", "annotation", "symbol_mptermname.json")

symbol_mptermname = json.load(open(file_path))
print(symbol_mptermname["Dpf2"])

['abnormal craniofacial morphology (Homo, Embryo)', 'abnormal embryo size (Homo, Embryo)', 'abnormal heart morphology (Homo, Embryo)', 'abnormal kidney morphology (Hetero, Early)', 'abnormal limb morphology (Homo, Embryo)', 'abnormal seminal vesicle morphology (Hetero, Early)', 'abnormal tail morphology (Homo, Embryo)', 'edema (Homo, Embryo)', 'pallor (Homo, Embryo)', 'preweaning lethality, complete penetrance (Homo, Early)']


In [46]:
# 空リストを持つ要素を除外しつつ、値を list → set に変換して重複を削除する
symbol_mptermname = {k: set(v) for k, v in symbol_mptermname.items() if v}
print(symbol_mptermname["Dpf2"])

{'abnormal embryo size (Homo, Embryo)', 'edema (Homo, Embryo)', 'abnormal tail morphology (Homo, Embryo)', 'abnormal limb morphology (Homo, Embryo)', 'abnormal kidney morphology (Hetero, Early)', 'pallor (Homo, Embryo)', 'preweaning lethality, complete penetrance (Homo, Early)', 'abnormal heart morphology (Homo, Embryo)', 'abnormal craniofacial morphology (Homo, Embryo)', 'abnormal seminal vesicle morphology (Hetero, Early)'}


### Jaccard係数で集合の類似度を計算

In [47]:

gene_pair_mp_similarity = []

for a, b in tqdm(combinations(symbol_mptermname, 2), desc="Calculating gene pair MP term similarity"):
    shared_mp = sorted(symbol_mptermname[a] & symbol_mptermname[b])
    shared_mp_number = len(shared_mp)
    union_mp_number = len(symbol_mptermname[a] | symbol_mptermname[b])
    overlap_ratio = shared_mp_number / union_mp_number

    gene_pair_mp_similarity.append([a, b, round(overlap_ratio, 3), shared_mp_number, shared_mp])

## 3 min

Calculating gene pair MP term similarity: 31629081it [01:47, 293081.83it/s]


In [48]:
print(gene_pair_mp_similarity[:3])
print(len(gene_pair_mp_similarity))
# Release 22.0: 29996385
# Release 22.1: 29996385
# Release 23.0: 31470211

[['1110059G10Rik', '1500009L16Rik', 0.0, 0, []], ['1110059G10Rik', '1600014C10Rik', 0.0, 0, []], ['1110059G10Rik', '1600029I14Rik', 0.0, 0, []]]
31629081


### 重複する表現型が閾値以上のものを抽出

In [49]:
similarity_threshold = 0.2
num_shared_mp = 2

gene_pair_mp_similarity_filtered = []
for record in tqdm(gene_pair_mp_similarity, desc="Filtering gene pair MP term similarity"):
    if record[2] >= similarity_threshold and record[3] >= num_shared_mp:
        gene_pair_mp_similarity_filtered.append(record)

# 5 sec

Filtering gene pair MP term similarity: 100%|██████████| 31629081/31629081 [00:04<00:00, 6362329.75it/s]


In [50]:
print(gene_pair_mp_similarity_filtered[:3])
print(len(gene_pair_mp_similarity_filtered))
# Release 21.1: 134880
# Release 22.0: 133281 <- Homo/Hetero/Hemiおよび♂・♀の完全一致を考慮するようになったため、減少
# Release 22.1: 133281
# v0.3.0: 261,216 <- Similarity_threshodのor条件をつけたため、増加
# Release 23.0 TSUMUGI v0.3.2: 205,460 (similarity >= 0.2 and num phenotype >= 2)

[['1500009L16Rik', 'Cldnd2', 0.25, 2, ['increased circulating calcium level (Homo, Early)', 'increased circulating serum albumin level (Homo, Early)']], ['1500009L16Rik', 'Eif2ak2', 0.333, 2, ['increased circulating calcium level (Homo, Early)', 'increased circulating serum albumin level (Homo, Early)']], ['1500009L16Rik', 'Lrrtm3', 0.222, 2, ['increased circulating aspartate transaminase level (Homo, Early)', 'increased circulating serum albumin level (Homo, Early)']]]
205460


In [51]:
Path("data", "overlap").mkdir(exist_ok=True, parents=True)
pickle.dump(gene_pair_mp_similarity, open("data/overlap/gene_pair_mp_similarity.pkl", "wb"))
pickle.dump(gene_pair_mp_similarity_filtered, open("data/overlap/gene_pair_mp_similarity_filtered.pkl", "wb"))

# 1 min

### 生データをCSV形式で出力 （ダウンロード用）

In [None]:
df_similarity = pd.DataFrame(gene_pair_mp_similarity)
df_similarity.columns = ["Gene1", "Gene2", "Jaccard Similarity", "Number of shared phenotype", "List of shared phenotypes"]
df_similarity.reindex(
    columns=["Gene1", "Gene2", "Number of shared phenotype", "Jaccard Similarity", "List of shared phenotypes"]
)
# df_similarity["List of shared phenotypes"] = df_similarity["List of shared phenotypes"].apply(json.dumps)
# 30 sec

Unnamed: 0,Gene1,Gene2,Jaccard Similarity,Number of shared phenotype,List of shared phenotypes
0,1110059G10Rik,1500009L16Rik,0.00,0,[]
1,1110059G10Rik,1600014C10Rik,0.00,0,[]
2,1110059G10Rik,1600029I14Rik,0.00,0,[]
3,1110059G10Rik,1700003F12Rik,0.00,0,[]
4,1110059G10Rik,1700008O03Rik,0.00,0,[]
...,...,...,...,...,...
31629076,Zyg11b,Zzz3,0.00,0,[]
31629077,Zyg11b,a,0.00,0,[]
31629078,Zzef1,Zzz3,0.00,0,[]
31629079,Zzef1,a,0.05,1,"[abnormal kidney morphology (Homo, Early)]"


In [53]:
output_dir = Path("data", "TSUMUGI_RawData")
output_dir.mkdir(parents=True, exist_ok=True)
path_csv = output_dir / Path(f"TSUMUGI_v{TSUMUGI_VERSION}_raw_data.csv.gz")
path_parquet = output_dir / Path(f"TSUMUGI_v{TSUMUGI_VERSION}_raw_data.parquet")

def get_head1000_hash(df: pd.DataFrame) -> str:
    # head(1000)だけを対象にハッシュ化
    csv_bytes = df.head(1000).to_csv(index=False, lineterminator='\n').encode('utf-8')
    return hashlib.md5(csv_bytes).hexdigest()

def file_head1000_hash(path: Path) -> str | None:
    if not path.exists():
        return None
    with gzip.open(path, "rt", encoding="utf-8") as f:
        lines = [next(f) for _ in range(1001)]  # 1行目がヘッダー
        csv_content = ''.join(lines).encode('utf-8')
        return hashlib.md5(csv_content).hexdigest()

# 比較
new_hash = get_head1000_hash(df_similarity)
existing_hash = file_head1000_hash(path_csv)

if new_hash != existing_hash:
    df_similarity.to_csv(path_csv, index=False, compression="gzip", lineterminator='\n')
    df_similarity.to_parquet(path_parquet, index=False)
    print("🔄 ファイルを更新しました")
    # 3 min
else:
    print("✅ 内容に変更がないためスキップしました")

✅ 内容に変更がないためスキップしました


In [54]:
# df_pandas = pd.read_parquet(path_parquet)
# print(df_pandas)
# # 15 sec

In [55]:
# df_polars = pl.read_parquet(path_parquet)
# print(df_polars.head())
# print(df_polars.shape)
# # print(df_polars.schema)

In [56]:
# df_overlap_filtered = pd.DataFrame(shared_ratios_filtered)
# df_overlap_filtered.columns = ["Gene1", "Gene2", "Jaccard Similarity", "Number of shared phenotype", "List of shared phenotypes"]
# df_overlap_filtered.reindex(
#     columns=["Gene1", "Gene2", "Number of shared phenotype", "Jaccard Similarity", "List of shared phenotypes"]
# )
# df_overlap_filtered["List of shared phenotypes"] = df_overlap_filtered["List of shared phenotypes"].apply(json.dumps)

# df_overlap_filtered.to_csv("data/TSUMUGI_filtered_data.csv.gz", index=False, compression="gzip", lineterminator='\n')

## 表現型ごとのネットワークを出力

In [84]:
gene_pair_mp_similarity_filtered = pickle.load(open("data/overlap/gene_pair_mp_similarity_filtered.pkl", "rb"))

In [85]:
df_similarity = pd.DataFrame(
    gene_pair_mp_similarity_filtered, columns=["marker1", "marker2", "phenotype_similarity", "shared_mp_number", "shared_mp"]
)
print(len(df_similarity))
# version 0.2.2: 133281  rows × 5 columns
# version 0.3.0: 261216  rows × 5 columns

205460


In [90]:
df_marker_phenotype = json.load(open("data/annotation/symbol_mptermname.json"))
df_marker_phenotype = pd.DataFrame(df_marker_phenotype.items(), columns=["marker_symbol", "mp_term_name"])
print(len(df_marker_phenotype))
# TSUMUGI v0.2.2: 7626 rows
# TSUMUGI v0.3.0: 7746 rows
# TSUMUGI v0.3.1: 7746 rows
# TSUMUGI v0.3.2: 7954 rows

7954


In [91]:
dict_marker_phenotype = dict(zip(df_marker_phenotype.marker_symbol, df_marker_phenotype.mp_term_name))

In [92]:
dict_marker_disease = json.load(open("data/annotation/symbol_disordername.json"))

In [93]:
output_dir = Path("data/network/mp_term_name")
# remove network directory
if output_dir.exists():
    shutil.rmtree(output_dir)

output_dir.mkdir(exist_ok=True, parents=True)


In [None]:
path_target_phenotypes = list(Path("data", "mp_term_name").glob("*.csv"))

"""
ノードが多すぎるとWebページが描画できない問題を回避するため、
ノード数を閾値（upper_limit）以下にするために最適なphenotype_similarityを求める
"""
number_of_nodes = 125
tolerance = 25
upper_limit = number_of_nodes + tolerance
lower_limit = number_of_nodes - tolerance

for path_target_phenotype in tqdm(path_target_phenotypes, desc="Processing MP terms"):

    columns = ["marker_symbol", "effect_size"]
    df_marker_effect = pd.read_csv(path_target_phenotype, usecols=columns).dropna(subset=["effect_size"])
    df_marker_effect["effect_size"] = df_marker_effect["effect_size"].abs()
    # * effect sizeの絶対値が最大の行を取得 (Homo/Heteroで異なる効果量がある場合に、ひとまず最大値を採用する← 今後の考慮事項)
    idx = df_marker_effect.groupby("marker_symbol")["effect_size"].idxmax()
    df_max = df_marker_effect.loc[idx]

    dict_marker_effect = dict(zip(df_max["marker_symbol"], df_max["effect_size"]))

    target_phenotype = path_target_phenotype.stem
    target_phenotype_space = target_phenotype.replace("_", " ")
    gene_symbols = df_marker_effect['marker_symbol']

    # --- 1. phenotypeを生じるgene_symbolsを含むエッジのみ抽出 ---------------------------
    df_filtered = df_similarity[
        df_similarity['marker1'].isin(gene_symbols) &
        df_similarity['marker2'].isin(gene_symbols) &
        df_similarity['shared_mp'].apply(lambda lst: any(target_phenotype_space in term for term in lst))
    ]

    # --- 2. 初期状態のノードの数を確認 -------------------------------
    nodes = set(pd.concat([df_filtered["marker1"], df_filtered["marker2"]], ignore_index=True))
    num_nodes = len(nodes)

    if num_nodes > upper_limit:
            # --- 3. 離散スコア値で探索 ----------------------------------------
            discrete_scores = (
                df_filtered
                .loc[:, "phenotype_similarity"]
                .unique()
            )
            discrete_scores = np.sort(discrete_scores)[::-1]      # 降順

            best_thr  = None
            best_diff = float("inf")

            lo, hi = 0, len(discrete_scores) - 1
            while lo <= hi:
                mid_idx = (lo + hi) // 2
                thr = discrete_scores[mid_idx]

                df_mid = df_filtered[df_filtered["phenotype_similarity"] >= thr]
                nodes = set(pd.concat([df_mid["marker1"], df_mid["marker2"]], ignore_index=True))
                num_nodes = len(nodes)

                # ▼ ノード数が許容範囲なら候補にする
                if num_nodes <= upper_limit:
                    diff = abs(num_nodes - number_of_nodes)
                    if diff < best_diff:
                        best_diff = diff
                        best_thr  = thr
                    # さらにノードを減らせるか？ → 閾値を **上げる**（スコアを大きく）
                    lo = mid_idx + 1
                else:
                    # ノードが多すぎ → 閾値を **上げる**（スコアを大きく）
                    hi = mid_idx - 1

            # ------------ 最終の閾値 ---------------------------------------
            if best_thr is None:
                # Upper limit以下がどうしても存在しないケース → 最小ノードになる閾値
                best_thr = discrete_scores[hi + 1]  # hi は最後に -1 されているので +1

            df_filtered = df_filtered[df_filtered["phenotype_similarity"] >= best_thr]

    # --- 4. フィルタリングされたエッジからノードを入手 -------------------------------
    nodes = set(pd.concat([df_filtered["marker1"], df_filtered["marker2"]], ignore_index=True))

    # ----------------------------------------------------
    # ネットワーク図のためのノードとエッジを作成
    # ----------------------------------------------------

    # ----------------------------------------------------
    # NodeをJSON形式に変換
    # ----------------------------------------------------
    node_json = []
    for node in nodes:
        phenotype = dict_marker_phenotype.get(node, "")
        disease = dict_marker_disease.get(node, "")
        node_color = dict_marker_effect[node] if node in dict_marker_effect else 0.0
        node_json.append({
            "data": {
                "id": node,
                "label": node,
                "phenotype": phenotype,
                "disease": disease,
                "node_color": node_color,
            }
        })

    # ----------------------------------------------------
    # edgesを用意
    # ----------------------------------------------------
    df_edge = df_filtered[["marker1", "marker2", "phenotype_similarity", "shared_mp"]]
    rows = df_edge.to_dict(orient="records")
    # EdgeをJSON形式に変換
    edge_json = [
        {
            "data": {
                "source":   r["marker1"],
                "target":   r["marker2"],
                "phenotype": r["shared_mp"],
                "edge_size": r["phenotype_similarity"],
            }
        }
        for r in rows
    ]
    # ----------------------------------------------------
    # EdgeとNodeを統合して、出力
    # ----------------------------------------------------

    network_json = node_json + edge_json

    # Output as JSON
    if network_json:
        output_json = output_dir / f"{target_phenotype}.json.gz"
        with gzip.open(output_json, "wt", encoding="utf-8") as f:
            json.dump(network_json, f, indent=4)


# 3m

Processing MP terms:   0%|          | 0/659 [00:00<?, ?it/s]

Processing MP terms: 100%|██████████| 659/659 [02:45<00:00,  3.98it/s]


In [99]:
%%bash

ls -lhS data/network/mp_term_name/ | head -n 5
echo "----------------------"
ls -lhS data/network/mp_term_name/ | tail -n 5

# TSUMUGI v0.2.2: total 5.3M
# TSUMUGI v0.3.0: total 5.5M
# TSUMUGI v0.3.1: total 5.1M <- 該当の表現型を含むネットワークのみを表示 （Issue: #54）
# TSUMUGI v0.3.2: total 3.0M

total 3.0M
-rwxrwxrwx 1 kuno kuno  32K Jun 22 05:02 prenatal_lethality_prior_to_heart_atrial_septation.json.gz
-rwxrwxrwx 1 kuno kuno  27K Jun 22 05:00 abnormal_placenta_morphology.json.gz
-rwxrwxrwx 1 kuno kuno  26K Jun 22 05:00 abnormal_neural_tube_closure.json.gz
-rwxrwxrwx 1 kuno kuno  24K Jun 22 05:00 abnormal_embryo_turning.json.gz
----------------------
-rwxrwxrwx 1 kuno kuno  359 Jun 22 05:00 abnormal_glucose_homeostasis.json.gz
-rwxrwxrwx 1 kuno kuno  357 Jun 22 05:02 increased_tidal_volume.json.gz
-rwxrwxrwx 1 kuno kuno  336 Jun 22 05:02 prolonged_PQ_interval.json.gz
-rwxrwxrwx 1 kuno kuno  328 Jun 22 05:02 single_kidney.json.gz
-rwxrwxrwx 1 kuno kuno  310 Jun 22 05:00 convulsive_seizures.json.gz


In [101]:
%%bash
# ファイルサイズが最大、最小のgene symbolのnode数を確認
zcat data/network/mp_term_name/edema.json.gz | grep -c "node_color"
zcat data/network/mp_term_name/prenatal_lethality_prior_to_heart_atrial_septation.json.gz | grep -c "node_color"
zcat data/network/mp_term_name/preweaning_lethality,_complete_penetrance.json.gz | grep -c "node_color"
zcat data/network/mp_term_name/convulsive_seizures.json.gz | grep -c "node_color"

137
119
264
2


## 遺伝子ごとのネットワークを出力

In [102]:
gene_symbols = df_similarity.marker1.unique().tolist()
gene_symbols += df_similarity.marker2.unique().tolist()
gene_symbols = list(set(gene_symbols))
gene_symbols.sort()  # 以下のfor文で、どこまで遺伝子が処理されたのか途中経過を見積もるためのソート
P(gene_symbols[:3])
P(len(gene_symbols))
# version 0.2.2: 4139
# version 0.3.0: 6812 (Life stageを考慮 + 類似度を追加)
# version 0.3.1: 6812
# version 0.3.2: 5583

['1500009L16Rik', '1600014C10Rik', '1700003F12Rik']
5583


In [103]:
Path("data/overlap/available_gene_symbols.txt").write_text("\n".join(gene_symbols) + "\n")

35523

In [113]:
output_dir = Path("data", "network", "gene_symbol")
# remove network directory
if output_dir.exists():
    shutil.rmtree(output_dir)

output_dir.mkdir(exist_ok=True, parents=True)
# 10 sec

In [147]:
number_of_nodes = 125
tolerance = 25  # tolerance for the number of nodes
upper_limit = number_of_nodes + tolerance
lower_limit = number_of_nodes - tolerance

for gene_symbol in tqdm(gene_symbols, desc="Processing Gene Symbols"):
    """
    ノードが多すぎるとWebページが描画できない問題を回避するため、
    ノード数を200以下にするために最適なphenotype_similarityを求める
    """
    # --- 1. gene_symbol を含むエッジのみ抽出 ---------------------------
    df_filtered = df_similarity[
        (df_similarity["marker1"] == gene_symbol) |
        (df_similarity["marker2"] == gene_symbol)
    ]

    # --- 2. 初期状態のノードの数を確認 -------------------------------
    nodes = set(pd.concat([df_filtered["marker1"], df_filtered["marker2"]], ignore_index=True))
    num_nodes = len(nodes)

    if num_nodes > upper_limit:
        # --- 3. 離散スコア値で探索 ----------------------------------------
        # gene_symbol と結ばれたエッジのスコア一覧（重複なし）を降順で取得
        discrete_scores = (
            df_filtered
            .loc[:, "phenotype_similarity"]
            .unique()
        )
        discrete_scores = np.sort(discrete_scores)[::-1]      # 降順

        best_thr  = None
        best_diff = float("inf")

        lo, hi = 0, len(discrete_scores) - 1
        while lo <= hi:
            mid_idx = (lo + hi) // 2
            thr = discrete_scores[mid_idx]

            df_mid = df_filtered[df_filtered["phenotype_similarity"] >= thr]
            # gene_symbol を含むエッジのみ抽出
            df_mid = df_mid[
                (df_mid["marker1"] == gene_symbol) |
                (df_mid["marker2"] == gene_symbol)
            ]
            nodes = set(pd.concat([df_mid["marker1"], df_mid["marker2"]], ignore_index=True))

            if gene_symbol not in nodes:
                # gene_symbol が落ちた → 閾値が高すぎる（スコアを下げる）
                lo = mid_idx + 1
                continue

            num_nodes = len(nodes)
            # ▼ ノード数が許容範囲なら候補にする
            if num_nodes <= upper_limit:
                diff = abs(num_nodes - number_of_nodes)
                if diff < best_diff:
                    best_diff = diff
                    best_thr  = thr
                # さらにノードを減らせるか？ → 閾値を **上げる**（スコアを大きく）
                lo = mid_idx + 1
            else:
                # ノードが多すぎ → 閾値を **上げる**（スコアを大きく）
                hi = mid_idx - 1

        # ------------ 最終の閾値 ---------------------------------------
        if best_thr is None:
            # Upper limit 以下がどうしても存在しない極端ケース → 最小ノードになる閾値
            best_thr = discrete_scores[hi + 1]  # hi は最後に -1 されているので +1

        df_filtered = df_filtered[df_filtered["phenotype_similarity"] >= best_thr]
        # gene_symbol を含むエッジのみ抽出
        df_filtered = df_filtered[
            (df_filtered["marker1"] == gene_symbol) |
            (df_filtered["marker2"] == gene_symbol)
        ]

    nodes = set(pd.concat([df_filtered["marker1"], df_filtered["marker2"]], ignore_index=True))

    # ------------
    # ネットワーク図のためのノードとエッジを作成
    # ------------

    # nodesを用意
    node_json = []
    for node in nodes:
        phenotype = dict_marker_phenotype[node]
        disease = dict_marker_disease.get(node, "")
        # ノードの色を決定（gene_symbol の場合は 1、それ以外は 0）
        node_color = 1.0 if node == gene_symbol else 0.0

        node_json.append({
            "data": {
                "id": node,
                "label": node,
                "node_color": node_color,
                "phenotype": phenotype,
                "disease": disease,
                }
            })

    # edgesを用意
    rows = df_similarity[(df_similarity["marker1"].isin(nodes)) & (df_similarity["marker2"].isin(nodes))].to_dict(orient="records")

    # EdgeをJSON形式に変換
    edge_json = [
        {
            "data": {
                "source":   r["marker1"],
                "target":   r["marker2"],
                "phenotype": r["shared_mp"],
                "edge_size": r["phenotype_similarity"],
            }
        }
        for r in rows
    ]

    network_json = node_json + edge_json

    # Output as JSON
    if network_json:
        output_json = output_dir / f"{gene_symbol}.json.gz"
        with gzip.open(output_json, "wt", encoding="utf-8") as f:
            json.dump(network_json, f, indent=4)

# 10m

Processing Gene Symbols:   0%|          | 0/5583 [00:00<?, ?it/s]

In [149]:
%%bash
ls -lhS data/network/gene_symbol/ | head -n 5
echo "----------------------"
ls -lhS data/network/gene_symbol/ | tail -n 5
# 30 sec
# version 0.3.0: total 170M
# version 0.3.1: total 168M
# version 0.3.2: total 50M

total 50M
-rwxrwxrwx 1 kuno kuno  78K Jun 22 06:36 Dstn.json.gz
-rwxrwxrwx 1 kuno kuno  77K Jun 22 06:39 Pdss2.json.gz
-rwxrwxrwx 1 kuno kuno  76K Jun 22 06:38 Mettl16.json.gz
-rwxrwxrwx 1 kuno kuno  70K Jun 22 06:36 Chd1.json.gz
----------------------
-rwxrwxrwx 1 kuno kuno  248 Jun 22 06:37 Kbtbd12.json.gz
-rwxrwxrwx 1 kuno kuno  247 Jun 22 06:35 Ankhd1.json.gz
-rwxrwxrwx 1 kuno kuno  247 Jun 22 06:37 Jmjd7.json.gz
-rwxrwxrwx 1 kuno kuno  244 Jun 22 06:39 Plekha8.json.gz
-rwxrwxrwx 1 kuno kuno  244 Jun 22 06:40 Tmem174.json.gz


In [148]:
%%bash
# ファイルサイズが最大、最小のgene symbolのnode数を確認
zcat data/network/gene_symbol/Dstn.json.gz | grep -c "node_color"
zcat data/network/gene_symbol/Rab10.json.gz | grep -c "node_color"
zcat data/network/gene_symbol/Plekha8.json.gz | grep -c "node_color"

144


95
2


## test

In [109]:
# gene_symbol = "Coa6"  # 例としてCdk9を指定
# df_filtered = df_similarity[(df_similarity["marker1"] == gene_symbol) | (df_similarity["marker2"] == gene_symbol)]

# nodes = set(df_filtered["marker1"].to_list() + df_filtered["marker2"].to_list())
# num_nodes = len(nodes)

# G = nx.from_pandas_edgelist(df_filtered, "marker1", "marker2")

# # ノードAと直接つながっているノードのみを取得
# neighbors = list(G.neighbors(gene_symbol))
# subgraph_nodes = [gene_symbol] + neighbors
# subgraph = G.subgraph(subgraph_nodes)

# print(num_nodes)
# print(len(subgraph.nodes))

In [110]:
# number_of_nodes = 200
# tolerance = 25  # tolerance for the number of nodes
# gene_symbol = "Coa6"  # 例としてCdk9を指定
# df_filtered = df_similarity[(df_similarity["marker1"] == gene_symbol) | (df_similarity["marker2"] == gene_symbol)]

# G = nx.from_pandas_edgelist(df_filtered, "marker1", "marker2")

# # ノードAと直接つながっているノードのみを取得
# neighbors = list(G.neighbors(gene_symbol))
# subgraph_nodes = [gene_symbol] + neighbors
# subgraph = G.subgraph(subgraph_nodes)

# # 二分探索の範囲
# low, high = df_filtered["phenotype_similarity"].min(), df_filtered["phenotype_similarity"].max()
# best_phenotype_similarity = None

# while low <= high:
#     mid = (low + high) / 2

#     # phenotype_similarity >= mid のデータをフィルタリング
#     df_mid = df_filtered[df_filtered["phenotype_similarity"] >= mid]

#     G = nx.from_pandas_edgelist(df_mid, "marker1", "marker2")
#     # ノードAと直接つながっているノードのみを取得
#     if gene_symbol in G:
#         neighbors = list(G.neighbors(gene_symbol))
#     else:
#     # mid が大きすぎてノードが除外されすぎた（＝subgraph が小さくなりすぎた）と考えて、探索範囲の上限 (high) を下げる
#         high = mid - 1e-6
#         continue

#     subgraph_nodes = [gene_symbol] + neighbors
#     subgraph = G.subgraph(subgraph_nodes)

#     node_count = len(subgraph.nodes)
#     # ターゲットノード数に近い場合、結果を保存
#     if number_of_nodes - tolerance < node_count < number_of_nodes + tolerance:
#         best_phenotype_similarity = mid
#         break
#     elif node_count > number_of_nodes:
#         # ノード数が多い場合、範囲を上げる
#         best_phenotype_similarity = mid
#         low = mid + 1e-6
#     else:
#         # ノード数が少ない場合、範囲を下げる
#         best_phenotype_similarity = mid
#         high = mid - 1e-6

# df_nodes = df_filtered[df_filtered["phenotype_similarity"] >= best_phenotype_similarity]
# G = nx.from_pandas_edgelist(df_nodes, "marker1", "marker2")
# # ノードAと直接つながっているノードのみを取得
# neighbors = list(G.neighbors(gene_symbol))
# subgraph_nodes = [gene_symbol] + neighbors
# subgraph = G.subgraph(subgraph_nodes)

# P(len(subgraph.nodes))  # 200

In [144]:
# gene_symbol = "Coa6"  # 例としてCdk9を指定
gene_symbol = "Rab10"

"""
ノードが多すぎるとWebページが描画できない問題を回避するため、
ノード数を200以下にするために最適なphenotype_similarityを求める
"""
print(gene_symbol)
# --- 1. gene_symbol を含むエッジのみ抽出 ---------------------------
df_filtered = df_similarity[
    (df_similarity["marker1"] == gene_symbol) |
    (df_similarity["marker2"] == gene_symbol)
]

# --- 2. 初期状態のノードの数を確認 -------------------------------
nodes = set(pd.concat([df_filtered["marker1"], df_filtered["marker2"]], ignore_index=True))
num_nodes = len(nodes)

if num_nodes > upper_limit:
    # --- 3. 離散スコア値で探索 ----------------------------------------
    # gene_symbol と結ばれたエッジのスコア一覧（重複なし）を降順で取得
    discrete_scores = (
        df_filtered
        .loc[:, "phenotype_similarity"]
        .unique()
    )
    discrete_scores = np.sort(discrete_scores)[::-1]      # 降順

    best_thr  = None
    best_diff = float("inf")

    lo, hi = 0, len(discrete_scores) - 1
    while lo <= hi:
        mid_idx = (lo + hi) // 2
        thr = discrete_scores[mid_idx]

        df_mid = df_filtered[df_filtered["phenotype_similarity"] >= thr]
        # gene_symbol を含むエッジのみ抽出
        df_mid = df_mid[
            (df_mid["marker1"] == gene_symbol) |
            (df_mid["marker2"] == gene_symbol)
        ]
        nodes = set(pd.concat([df_mid["marker1"], df_mid["marker2"]], ignore_index=True))

        if gene_symbol not in nodes:
            # gene_symbol が落ちた → 閾値が高すぎる（スコアを下げる）
            lo = mid_idx + 1
            continue

        num_nodes = len(nodes)
        # ▼ ノード数が許容範囲なら候補にする
        if num_nodes <= upper_limit:
            diff = abs(num_nodes - number_of_nodes)
            if diff < best_diff:
                best_diff = diff
                best_thr  = thr
            # さらにノードを減らせるか？ → 閾値を **上げる**（スコアを大きく）
            lo = mid_idx + 1
        else:
            # ノードが多すぎ → 閾値を **上げる**（スコアを大きく）
            hi = mid_idx - 1

    # ------------ 最終の閾値 ---------------------------------------
    if best_thr is None:
        # Upper limit 以下がどうしても存在しない極端ケース → 最小ノードになる閾値
        best_thr = discrete_scores[hi + 1]  # hi は最後に -1 されているので +1

    df_filtered = df_filtered[df_filtered["phenotype_similarity"] >= best_thr]

nodes = set(pd.concat([df_filtered["marker1"], df_filtered["marker2"]], ignore_index=True))

# ------------
# ネットワーク図のためのノードとエッジを作成
# ------------

# nodesを用意
node_json = []
for node in nodes:
    phenotype = dict_marker_phenotype[node]
    disease = dict_marker_disease.get(node, "")
    # ノードの色を決定（gene_symbol の場合は 1、それ以外は 0）
    node_color = 1.0 if node == gene_symbol else 0.0

    node_json.append({
        "data": {
            "id": node,
            "label": node,
            "node_color": node_color,
            "phenotype": phenotype,
            "disease": disease,
            }
        })

# edgesを用意
rows = df_similarity[(df_similarity["marker1"].isin(nodes)) & (df_similarity["marker2"].isin(nodes))].to_dict(orient="records")

# rows = df_filtered.to_dict(orient="records")
# EdgeをJSON形式に変換
edge_json = [
    {
        "data": {
            "source":   r["marker1"],
            "target":   r["marker2"],
            "phenotype": r["shared_mp"],
            "edge_size": r["phenotype_similarity"],
        }
    }
    for r in rows
]

network_json = node_json + edge_json


Rab10


In [145]:
x = df_similarity[(df_similarity["marker1"].isin(nodes)) & (df_similarity["marker2"].isin(nodes))]
node_x = set(pd.concat([x["marker1"], x["marker2"]], ignore_index=True))
print(len(node_x))

95


In [137]:
PP(edge_json[:3])

[{'data': {'edge_size': 0.25,
           'phenotype': ['abnormal retina blood vessel morphology (Hetero, '
                         'Early)',
                         'preweaning lethality, complete penetrance (Homo, '
                         'Early)'],
           'source': '4930444P10Rik',
           'target': 'Rpain'}},
 {'data': {'edge_size': 0.4,
           'phenotype': ['embryonic lethality prior to organogenesis (Homo, '
                         'Embryo)',
                         'preweaning lethality, complete penetrance (Homo, '
                         'Early)'],
           'source': '4933427D14Rik',
           'target': 'Aamp'}},
 {'data': {'edge_size': 0.222,
           'phenotype': ['embryonic lethality prior to organogenesis (Homo, '
                         'Embryo)',
                         'preweaning lethality, complete penetrance (Homo, '
                         'Early)'],
           'source': '4933427D14Rik',
           'target': 'Actl6a'}}]


In [146]:
nodes = set(pd.concat([df_filtered["marker1"], df_filtered["marker2"]], ignore_index=True))
print(len(nodes))

95


In [112]:
%%bash

uname -a # OS name
date +"%Y/%m/%d %H:%M:%S" # Last update

Linux think-x12-2024 6.6.87.2-microsoft-standard-WSL2 #1 SMP PREEMPT_DYNAMIC Thu Jun  5 18:30:46 UTC 2025 x86_64 x86_64 x86_64 GNU/Linux
2025/06/22 05:17:56
