# Run allですべてのデータを準備する

In [1]:
RELEASE = 22.1

## 1. Download IMPC dataset

In [2]:
# Move up to top directory
import os
from pathlib import Path

print(os.getcwd())

while not Path("LICENSE").exists():
    os.chdir('../')

print(os.getcwd())

/mnt/c/Users/akihi/Documents/GitHub/TSUMUGI-dev/notebooks/notebools-web
/mnt/c/Users/akihi/Documents/GitHub/TSUMUGI-dev


In [4]:
P = print
from pprint import pprint as PP
from collections import Counter as C
from pathlib import Path
from collections import defaultdict
from itertools import combinations
import csv
import numpy as np
import pandas as pd
import shutil
import pickle
import json
import gzip
import networkx as nx
import datetime
import urllib.request
from tqdm import tqdm


In [5]:
%%bash
pwd

/mnt/c/Users/akihi/Documents/GitHub/TSUMUGI-dev


In [6]:
date_str = datetime.date.today().strftime("%Y-%m-%d")

# パスの設定
data_dir = Path("data/impc")
data_dir.mkdir(parents=True, exist_ok=True)
csv_path = data_dir / f"statistical-results-ALL-{RELEASE}.csv"
readme_path = data_dir / "README.md"

# ファイルが存在しない場合にダウンロードして解凍
if not csv_path.exists():
    # ダウンロード URL
    url = f"https://ftp.ebi.ac.uk/pub/databases/impc/all-data-releases/release-{RELEASE}/results/statistical-results-ALL.csv.gz"

    print(f"Downloading and extracting: {url}")

    # URL からファイルサイズ取得（tqdmのため）
    with urllib.request.urlopen(url) as response:
        total_size = int(response.info().get("Content-Length", -1))
        with tqdm.wrapattr(response, "read", total=total_size, desc="Downloading", unit="B", unit_scale=True) as r:
            with gzip.GzipFile(fileobj=r) as uncompressed:
                with open(csv_path, 'wb') as out_file:
                    shutil.copyfileobj(uncompressed, out_file)

In [None]:
%%bash

wc -l data/impc/statistical-results*.csv
# Release 22.1: 3165335
# 1 min

1662972 data/impc/statistical-results-ALL-22.1.csv


## 2. Filter dataset by P value < 0.0001 (10^-4)


In [8]:
path_data = Path("data", "impc", f"statistical-results-ALL-{RELEASE}.csv")
data = pd.read_csv(path_data)
# 30 seconds

  data = pd.read_csv(path_data)


In [9]:
print(len(data))
# Release 21.1: 2062772
# Release 22.0: 3165334

1662972


In [10]:
# Filter by p_value < 0.0001
threshold = 0.0001
filter_pvalue = data["p_value"] < threshold
filter_female_ko_pvalue = data["female_ko_effect_p_value"] < threshold
filter_male_ko_pvalue = data["male_ko_effect_p_value"] < threshold

data_filtered = data[filter_pvalue | filter_male_ko_pvalue | filter_male_ko_pvalue]

# Filter by mp_term_id and mp_term_name are not NaN
data_filtered = data_filtered.dropna(subset=["mp_term_id"])
data_filtered = data_filtered.dropna(subset=["mp_term_name"])

# Filter by effect_size is not NaN
data_filtered = data_filtered.dropna(subset=["effect_size"])

In [11]:
print(len(data_filtered))
# Release 22.0: 54059 rows
# Release 22.1: 54059 rows

39169


In [12]:
data_filtered.to_csv(f"data/statistical_filtered-{RELEASE}.csv", index=False) # 2 sec

## Split data by mp_term_name

In [13]:
data = data_filtered

In [14]:
# data/mp_term_nameを作成

output_path = Path("data", "mp_term_name")
if output_path.exists():
    shutil.rmtree(output_path)
output_path.mkdir(parents=True, exist_ok=True)

In [15]:
# 名前をクリーンにする関数を定義
def clean_name(name):
    return name.replace("/", "_").replace(" ", "_")

# mp_term_nameをクリーニングし、ユニークな値を取得
unique_mp_term_names = data['mp_term_name'].unique()

In [16]:
# ユニークなmp_term_nameごとにフィルタリングしてCSVに保存: 5 sec
for mp_term_name in unique_mp_term_names:
    filtered_df = data[data['mp_term_name'] == mp_term_name]
    clean_mp_term_name = clean_name(mp_term_name)
    filtered_df.to_csv(f"data/mp_term_name/{clean_mp_term_name}.csv", index=False)
# 5 sec

In [17]:
print(len(data))

39169


## 3. TSUMUGIに必要なアノテーション情報を整理する

In [18]:
data_annotated = data.copy()

### Annotate life stages

In [19]:
# # life_stageの初期割り当て
# def assign_life_stage(pipeline_name):
#     if pd.isna(pipeline_name):
#         return "Early"
#     if "Interval" in pipeline_name or "interval" in pipeline_name:
#         return "Interval"
#     elif "Late" in pipeline_name or "late" in pipeline_name:
#         return "Late"
#     else:
#         return "Early"

# data["life_stage"] = data["pipeline_name"].apply(assign_life_stage)

# # Embryo 表現型に該当する procedure_name の一覧
# embryo_phenotyping = [
#     "Gross Morphology Embryo E9.5",
#     "Viability E9.5 Secondary Screen",
#     "OPT E9.5",
#     "MicroCT E9.5",
#     "Gross Morphology Placenta E9.5",
#     "Gross Morphology Embryo E12.5",
#     "Embryo LacZ",
#     "Gross Morphology Placenta E12.5",
#     "Viability E12.5 Secondary Screen",
#     "Viability E14.5-E15.5 Secondary Screen",
#     "Gross Morphology Placenta E14.5-E15.5",
#     "MicroCT E14.5-E15.5",
#     "Gross Morphology Embryo E14.5-E15.5",
#     "Viability E18.5 Secondary Screen",
#     "MicroCT E18.5",
#     "Gross Morphology Embryo E18.5",
#     "Gross Morphology Placenta E18.5"
# ]

# # life_stageをEmbryoに上書き
# data.loc[data["procedure_name"].isin(embryo_phenotyping), "life_stage"] = "Embryo"
# data_annotated = data.reset_index(drop=True)

In [20]:
# print(len(data_annotated))
# print(data_annotated["life_stage"].value_counts())

### Annotate Sex differences

In [21]:
threshold = 0.0001

# 条件リスト
conditions = [
    (data_annotated["sex_effect_p_value"] < threshold) & (data_annotated["female_ko_effect_p_value"] < threshold) & (data_annotated["male_ko_effect_p_value"] > threshold),
    (data_annotated["sex_effect_p_value"] < threshold) & (data_annotated["male_ko_effect_p_value"] < threshold) & (data_annotated["female_ko_effect_p_value"] > threshold)
]

# 条件に対応する値
choices = ["female", "male"]

# np.selectで列を設定
data_annotated["sexdual_dimorphism"] = np.select(conditions, choices, default=None)
data_annotated = data_annotated.reset_index(drop=True)

# 結果を確認
print(RELEASE)
print(data_annotated["sexdual_dimorphism"].value_counts())

# RELEASE 22.1
# sexdual_dimorphism
# male      4915
# female    4146

print(len(data_annotated))

22.1
sexdual_dimorphism
male      4056
female    3504
Name: count, dtype: int64
39169


In [22]:
# 確認
data_annotated.dropna(subset=["sexdual_dimorphism"])[["p_value", "sexdual_dimorphism", "effect_size", "genotype_effect_parameter_estimate", "female_ko_parameter_estimate", "male_ko_parameter_estimate"]].head(10)

Unnamed: 0,p_value,sexdual_dimorphism,effect_size,genotype_effect_parameter_estimate,female_ko_parameter_estimate,male_ko_parameter_estimate
0,0.405457,male,-0.898679,-0.151456,-0.151456,-0.888437
3,1e-06,female,0.514058,23.474848,23.474848,-10.137384
6,0.004613,male,1.693153,0.045169,0.045169,0.086361
8,0.418544,male,1.00792,0.103463,0.102222,0.450179
12,0.025244,male,-1.28475,-0.916716,-0.916716,-2.023327
26,0.000102,male,-0.759035,-0.340968,-0.2373,-0.432685
34,0.729042,male,1.153897,-0.314216,-0.314216,2.384736
39,2.2e-05,female,0.813224,0.660234,0.660234,0.325016
43,0.322559,male,0.341517,-0.43272,-0.43272,1.930874
47,1.5e-05,female,-2.357432,-0.007335,-0.007335,-0.003611


### 遺伝型、性差、ライフステージのアノテーションを統合する

In [23]:
print(data_annotated["zygosity"].value_counts())

zygosity
homozygote      30010
heterozygote     8689
hemizygote        470
Name: count, dtype: int64


In [24]:
# アノテーション列を追加（inplace）
def make_annotation(row):
    # 遺伝型
    if row['zygosity'] == 'homozygote':
        annotate = "Homo"
    elif row['zygosity'] == 'heterozygote':
        annotate = "Hetero"
    else:
        annotate = "Hemi"

    # 性別
    if row['sexdual_dimorphism'] == "female":
        annotate += ", Female"
    elif row['sexdual_dimorphism'] == "male":
        annotate += ", Male"

    # # life_stage
    # if row['life_stage'] in {"Embryo", "Early", "Interval", "Late"}:
    #     annotate += f", {row['life_stage']}"

    return f"{row['mp_term_name']} ({annotate})"

data_annotated["annotation"] = data_annotated.apply(make_annotation, axis=1)

# marker_symbol ごとに annotation をリスト化＆ソート
marker_annotation_map = (
    data_annotated
    .groupby("marker_symbol")["annotation"]
    .apply(lambda x: sorted(x.tolist()))
)


In [25]:
# 例：Rhdの注釈を表示
print(marker_annotation_map["Rhd"])
# 例：Amtの注釈を表示 (Embryo)
print(marker_annotation_map["Amt"])

['abnormal skin condition (Homo)', 'decreased circulating HDL cholesterol level (Homo, Male)', 'decreased circulating cholesterol level (Homo, Male)', 'decreased circulating free fatty acids level (Homo)', 'decreased mean corpuscular hemoglobin (Homo)', 'decreased mean corpuscular hemoglobin concentration (Homo)', 'increased exploration in new environment (Homo)']
['abnormal abdominal wall morphology (Homo)', 'abnormal embryo size (Homo)', 'abnormal facial morphology (Homo)', 'abnormal head shape (Homo)', 'abnormal placenta size (Homo)', 'abnormal retina blood vessel morphology (Hetero)', 'abnormal retina vasculature morphology (Hetero)', 'anophthalmia (Homo)', 'cleft palate (Homo)', 'exencephaly (Homo)', 'preweaning lethality, complete penetrance (Homo)', 'preweaning lethality, complete penetrance (Homo)', 'preweaning lethality, complete penetrance (Homo)', 'short tibia (Hetero, Male)', 'spina bifida (Homo)']


In [26]:

Path("data/annotation").mkdir(exist_ok=True, parents=True)
file_path = "data/annotation/symbol_mptermname.json"
marker_annotation_map.to_json(file_path, indent=4)

# json.dump(marker_annotation_map, open(file_path, "w"), indent=4, sort_keys=True)


In [27]:
%%bash

grep -c "Male" data/annotation/symbol_mptermname.json | sed "s|^|Male: |"
grep -c "Female" data/annotation/symbol_mptermname.json | sed "s|^|Feale: |"

grep -c "Homo" data/annotation/symbol_mptermname.json | sed "s|^|Homo: |"
grep -c "Hetero" data/annotation/symbol_mptermname.json | sed "s|^|Hetero: |"
grep -c "Hemi" data/annotation/symbol_mptermname.json | sed "s|^|Hemi: |"

grep -c "Embryo" data/annotation/symbol_mptermname.json | sed "s|^|Embryo: |"
grep -c "Early" data/annotation/symbol_mptermname.json | sed "s|^|Early: |"
grep -c "Interval" data/annotation/symbol_mptermname.json | sed "s|^|Interval: |"
grep -c "Late" data/annotation/symbol_mptermname.json | sed "s|^|Late: |"

# Male: 4915
# Feale: 4146
# Homo: 41444
# Hetero: 11921
# Hemi: 694
# Embryo: 4253
# Early: 45724
# Interval: 58
# Late: 4024


Male: 4056
Feale: 3504


Homo: 30010
Hetero: 8689
Hemi: 470
Embryo: 0
Early: 0
Interval: 0
Late: 0


### mp term nameとIMPCのPhenotype URLを紐付ける

In [28]:
data_select = data[['mp_term_id', 'mp_term_name']].drop_duplicates()
# data_select = data[['marker_symbol', 'marker_accession_id', 'mp_term_name', 'mp_term_id']].drop_duplicates()
data_select

Unnamed: 0,mp_term_id,mp_term_name
1,MP:0000186,decreased circulating HDL cholesterol level
78,MP:0011100,"preweaning lethality, complete penetrance"
173,MP:0013279,increased fasting circulating glucose level
189,MP:0000194,increased circulating calcium level
214,MP:0005011,increased eosinophil cell number
...,...,...
1569576,MP:0000480,increased rib number
1571656,MP:0004955,increased thymus weight
1590789,MP:0006415,absent testes
1619941,MP:0009477,small cecum


In [29]:
data_dict_url = dict()
for index, row in data_select.iterrows():
    mp_tern_id = row['mp_term_id']
    impc_url = f"https://www.mousephenotype.org/data/phenotypes/{mp_tern_id}"
    mp_term_name = row['mp_term_name']
    data_dict_url[mp_term_name] = impc_url

print(data_dict_url["small lymph nodes"])

https://www.mousephenotype.org/data/phenotypes/MP:0002217


In [30]:
with open('data/annotation/mptermname_phenotypeurl.tsv', 'w') as f:
    for term, url in data_dict_url.items():
        f.write(f"{term}\t{url}\n")

In [31]:
%%bash

head -n 3 data/annotation/mptermname_phenotypeurl.tsv
wc -l data/annotation/mptermname_phenotypeurl.tsv # Release 22.0: 664 

decreased circulating HDL cholesterol level	https://www.mousephenotype.org/data/phenotypes/MP:0000186
preweaning lethality, complete penetrance	https://www.mousephenotype.org/data/phenotypes/MP:0011100
increased fasting circulating glucose level	https://www.mousephenotype.org/data/phenotypes/MP:0013279
659 data/annotation/mptermname_phenotypeurl.tsv


### marker symbolとMGI accession idを紐付ける

In [32]:
data_select = data[['marker_symbol', 'marker_accession_id']].drop_duplicates()
# data_select = data[['marker_symbol', 'marker_accession_id', 'mp_term_name', 'mp_term_id']].drop_duplicates()
data_select

Unnamed: 0,marker_symbol,marker_accession_id
1,Rhd,MGI:1202882
78,Dpf2,MGI:109529
107,Gna13,MGI:95768
173,Dynlrb2,MGI:1922715
189,Slc16a2,MGI:1203732
...,...,...
1658318,Abca4,MGI:109424
1659362,Polq,MGI:2155399
1659481,Pdyn,MGI:97535
1661314,Tecrl,MGI:2444966


In [33]:
data_dict = dict()
for index, row in data_select.iterrows():
    data_dict[row['marker_symbol']] = row['marker_accession_id']
print(data_dict["Ncam1"])

MGI:97281


In [34]:
json.dump(data_dict, open("data/annotation/symbol_mgiid.json", "w"), indent=4, sort_keys=True)
Path("data/annotation/symbol_mgiid.tsv").write_text("\n".join([f"{k}\t{v}" for k, v in data_dict.items()]))

133560

In [35]:
%%bash
head -n 3 data/annotation/symbol_mgiid.json
head -n 3 data/annotation/symbol_mgiid.tsv

{
    "0610010K14Rik": "MGI:1915609",
    "0610040J01Rik": "MGI:1923511",
Rhd	MGI:1202882
Dpf2	MGI:109529
Gna13	MGI:95768


## 4. 表現型の類似度を求める

In [36]:
file_path = Path("data", "annotation", "symbol_mptermname.json")

symbol_mptermname = json.load(open(file_path))
print(symbol_mptermname["Dpf2"])

['abnormal craniofacial morphology (Homo)', 'abnormal embryo size (Homo)', 'abnormal heart morphology (Homo)', 'abnormal limb morphology (Homo)', 'abnormal seminal vesicle morphology (Hetero)', 'edema (Homo)', 'pallor (Homo)', 'preweaning lethality, complete penetrance (Homo)', 'preweaning lethality, complete penetrance (Homo)']


In [37]:
symbol_mptermname = {k: set(v) for k, v in symbol_mptermname.items() if v}
print(symbol_mptermname["Dpf2"])

{'abnormal craniofacial morphology (Homo)', 'abnormal limb morphology (Homo)', 'abnormal seminal vesicle morphology (Hetero)', 'abnormal heart morphology (Homo)', 'edema (Homo)', 'abnormal embryo size (Homo)', 'preweaning lethality, complete penetrance (Homo)', 'pallor (Homo)'}


### Jaccard係数で集合の類似度を計算

In [38]:

overlapped_ratios_all = []

for a, b in combinations(symbol_mptermname, 2):
    overlapped_mp = sorted(symbol_mptermname[a] & symbol_mptermname[b])
    overlapped_mp_number = len(overlapped_mp)
    union_mp_number = len(symbol_mptermname[a] | symbol_mptermname[b])
    overlap_ratio = overlapped_mp_number / union_mp_number

    overlapped_ratios_all.append([a, b, round(overlap_ratio, 3), overlapped_mp_number, overlapped_mp])

## 46s

In [39]:
print(overlapped_ratios_all[:3])
print(len(overlapped_ratios_all))
# Release 22.0: 29996385
# Release 22.1: 29996385

[['0610010K14Rik', '0610040J01Rik', 0.0, 0, []], ['0610010K14Rik', '1110059G10Rik', 0.0, 0, []], ['0610010K14Rik', '1500009L16Rik', 0.0, 0, []]]
27258036


### 重複する表現型が閾値以上のものを抽出

In [40]:

num_overlapped_mp = 3

overlapped_ratios_filtered = []
for record in overlapped_ratios_all:
    if record[3] >= num_overlapped_mp:
        overlapped_ratios_filtered.append(record)

In [41]:
print(overlapped_ratios_filtered[:3])
print(len(overlapped_ratios_filtered))
# Release 21.1: 134880
# Release 22.0: 133281 <- Homo/Hetero/Hemiおよび♂・♀の完全一致を考慮するようになったため、減少
# Release 22.1: 133281

[['0610010K14Rik', 'Acvr1', 0.6, 3, ['embryonic lethality prior to organogenesis (Homo)', 'embryonic lethality prior to tooth bud stage (Homo)', 'preweaning lethality, complete penetrance (Homo)']], ['0610010K14Rik', 'Adss2', 0.375, 3, ['embryonic lethality prior to organogenesis (Homo)', 'embryonic lethality prior to tooth bud stage (Homo)', 'preweaning lethality, complete penetrance (Homo)']], ['0610010K14Rik', 'Ahcy', 0.5, 3, ['embryonic lethality prior to organogenesis (Homo)', 'embryonic lethality prior to tooth bud stage (Homo)', 'preweaning lethality, complete penetrance (Homo)']]]
49899


In [42]:
Path("data", "overlap").mkdir(exist_ok=True, parents=True)
pickle.dump(overlapped_ratios_all, open("data/overlap/overlapped_ratios_all.pkl", "wb"))
pickle.dump(overlapped_ratios_filtered, open("data/overlap/overlapped_ratios_filtered.pkl", "wb"))

# 18 sec

### 生データをCSV形式で出力 （ダウンロード用）

In [43]:
df_overlap = pd.DataFrame(overlapped_ratios_all)
df_overlap.columns = ["Gene1", "Gene2", "Jaccard Similarity", "Number of shared phenotype", "List of shared phenotypes"]
df_overlap.reindex(
    columns=["Gene1", "Gene2", "Number of shared phenotype", "Jaccard Similarity", "List of shared phenotypes"]
)
df_overlap



Unnamed: 0,Gene1,Gene2,Jaccard Similarity,Number of shared phenotype,List of shared phenotypes
0,0610010K14Rik,0610040J01Rik,0.000,0,[]
1,0610010K14Rik,1110059G10Rik,0.000,0,[]
2,0610010K14Rik,1500009L16Rik,0.000,0,[]
3,0610010K14Rik,1600014C10Rik,0.000,0,[]
4,0610010K14Rik,1600029I14Rik,0.000,0,[]
...,...,...,...,...,...
27258031,Zyg11b,Zzz3,0.000,0,[]
27258032,Zyg11b,a,0.000,0,[]
27258033,Zzef1,Zzz3,0.000,0,[]
27258034,Zzef1,a,0.062,1,[abnormal kidney morphology (Homo)]


In [44]:
df_overlap.to_csv("data/TSUMUGI_raw_data.csv.gz", index=False, compression="gzip", lineterminator='\n')
# 3 min

## 表現型ごとのネットワークを出力

In [45]:
df_overlap = pd.DataFrame(
    overlapped_ratios_filtered, columns=["marker1", "marker2", "overlap_ratio", "overlapped_mp_number", "overlapped_mp"]
)
df_overlap  # 133281  rows × 5 columns

Unnamed: 0,marker1,marker2,overlap_ratio,overlapped_mp_number,overlapped_mp
0,0610010K14Rik,Acvr1,0.600,3,[embryonic lethality prior to organogenesis (H...
1,0610010K14Rik,Adss2,0.375,3,[embryonic lethality prior to organogenesis (H...
2,0610010K14Rik,Ahcy,0.500,3,[embryonic lethality prior to organogenesis (H...
3,0610010K14Rik,Anapc4,0.750,3,[embryonic lethality prior to organogenesis (H...
4,0610010K14Rik,Ankfy1,0.500,3,[embryonic lethality prior to organogenesis (H...
...,...,...,...,...,...
49894,Zfp462,Zfp91,0.300,3,"[abnormal embryo size (Homo), edema (Homo), pr..."
49895,Zfp503,Zfp91,0.167,3,"[abnormal embryo size (Homo), edema (Homo), pr..."
49896,Zfp593,Zmynd12,0.231,3,"[decreased lean body mass (Homo, Male), increa..."
49897,Zfp612,Zfp641,0.500,3,"[decreased mean corpuscular volume (Homo), hyp..."


In [46]:
marker_mp = json.load(open("data/annotation/symbol_mptermname.json"))
marker_mp = pd.DataFrame(marker_mp.items(), columns=["marker_symbol", "mp_term_name"])
marker_mp # 7626 rows × 2 columns

Unnamed: 0,marker_symbol,mp_term_name
0,0610010K14Rik,[embryonic lethality prior to organogenesis (H...
1,0610040J01Rik,"[abnormal heart morphology (Homo), abnormal sp..."
2,1110059G10Rik,[vertebral transformation (Homo)]
3,1500009L16Rik,"[decreased bone mineral density (Homo), increa..."
4,1600014C10Rik,"[abnormal coat/hair pigmentation (Homo), abnor..."
...,...,...
7379,Zwint,[embryonic lethality prior to organogenesis (H...
7380,Zyg11b,[decreased locomotor activity (Hetero)]
7381,Zzef1,"[abnormal coat/hair pigmentation (Homo), abnor..."
7382,Zzz3,"[abnormal cornea morphology (Hetero), corneal ..."


In [47]:
output_dir = Path("data/network/mp_term_name")
# remove network directory
if output_dir.exists():
    shutil.rmtree(output_dir)

output_dir.mkdir(exist_ok=True, parents=True)


In [48]:
path_mp_terms = list(Path("data", "mp_term_name").glob("*.csv"))
# print(path_mp_terms[:3])
# print(len(path_mp_terms))
# path_mp_term = Path("data", "mp_term_name", "decreased_circulating_glucose_level.csv")

"""
ノードが多すぎるとWebページが描画できない問題を回避するため、
ノード数を200以下にするために最適なoverlap_ratioを求める
"""
number_of_nodes = 200

for path_mp_term in path_mp_terms:
    mp_term = path_mp_term.stem
    # print(mp_term)

    df_marker_effect = pd.read_csv(path_mp_term)
    df_marker_effect = df_marker_effect[["marker_symbol", "effect_size"]].drop_duplicates()
    df_marker_effect = df_marker_effect.dropna(subset=["effect_size"])

    # Absolute value of effect size
    df_marker_effect.loc[:, "effect_size"] = df_marker_effect["effect_size"].abs()

    df_filtered = df_overlap[
        (df_overlap['marker1'].isin(df_marker_effect['marker_symbol'])) &
        (df_overlap['marker2'].isin(df_marker_effect['marker_symbol']))
    ]
    # 二分探索の範囲
    low, high = df_filtered["overlap_ratio"].min(), df_filtered["overlap_ratio"].max()
    best_overlap_ratio = None

    while low <= high:
        mid = (low + high) / 2

        # overlap_ratio >= mid のデータをフィルタリング
        df_mid = df_filtered[df_filtered["overlap_ratio"] >= mid]

        ## 出力
        ### Nodeを作成する
        df_marker1 = df_mid[["marker1"]]
        df_marker2 = df_mid[["marker2"]]
        df_node_marker1 = pd.merge(df_marker1, df_marker_effect, left_on='marker1', right_on='marker_symbol', how='inner')[["marker_symbol"]]
        df_node_marker2 = pd.merge(df_marker2, df_marker_effect, left_on='marker2', right_on='marker_symbol', how='inner')[["marker_symbol"]]
        df_node = pd.concat([df_node_marker1, df_node_marker2], axis=0).drop_duplicates()
        df_node = pd.merge(df_node, marker_mp, how='inner', on='marker_symbol')
        df_node = pd.merge(df_node, df_marker_effect, how='inner', on='marker_symbol')

        node_count = len(df_node)
        # ターゲット列数に近い場合、結果を保存
        if number_of_nodes - 25 < node_count < number_of_nodes + 25:
            best_overlap_ratio = mid
            break
        elif node_count > number_of_nodes:
            # 列数が多い場合、範囲を上げる
            best_overlap_ratio = mid
            low = mid + 1e-6
        else:
            # 列数が少ない場合、範囲を下げる
            best_overlap_ratio = mid
            high = mid - 1e-6

    df_filtered = df_filtered[df_filtered["overlap_ratio"] >= best_overlap_ratio]

    ## 出力
    ### Nodeを作成する
    df_marker1 = df_filtered[["marker1"]]
    df_marker2 = df_filtered[["marker2"]]
    df_node_marker1 = pd.merge(df_marker1, df_marker_effect, left_on='marker1', right_on='marker_symbol', how='inner')[["marker_symbol"]]
    df_node_marker2 = pd.merge(df_marker2, df_marker_effect, left_on='marker2', right_on='marker_symbol', how='inner')[["marker_symbol"]]

    df_node = pd.concat([df_node_marker1, df_node_marker2], axis=0).drop_duplicates()
    df_node = pd.merge(df_node, marker_mp, how='inner', on='marker_symbol')
    df_node = pd.merge(df_node, df_marker_effect, how='inner', on='marker_symbol')

    if len(df_node) == 0:
        continue

    # print(mp_term, len(df_node))

    # NodeをJSON形式に変換
    node_json = []
    for _, row in df_node.iterrows():
        node_json.append({
            "data": {
                "id": row['marker_symbol'],
                "label": row['marker_symbol'],
                "annotation": row['mp_term_name'],
                "node_color": row['effect_size']
            }
        })
    ### Edgeを作成する
    df_edge = df_filtered[["marker1", "marker2", "overlap_ratio", "overlapped_mp"]]
    # EdgeをJSON形式に変換
    edge_json = []
    for _, row in df_edge.iterrows():
        edge_json.append({
            "data": {
                "source": row['marker1'],
                "target": row['marker2'],
                "annotation": row['overlapped_mp'],
                "edge_size": row['overlap_ratio']
            }
        })

    ### EdgeとNodeを統合して、出力
    # Combine node and edge

    network_json = node_json + edge_json

    # Output as JSON
    if network_json:
        output_json = output_dir / f"{mp_term}.json.gz"
        with gzip.open(output_json, "wt", encoding="utf-8") as f:
            json.dump(network_json, f, indent=4)

# 1m30s

In [49]:
%%bash

ls -lhS data/network/mp_term_name/ | head -n 5

# total 5.3M
# -rwxrwxrwx 1 aki aki  73K Feb 22 12:02 edema.json.gz
# -rwxrwxrwx 1 aki aki  63K Feb 22 12:02 enlarged_kidney.json.gz
# -rwxrwxrwx 1 aki aki  63K Feb 22 12:02 abnormal_lymph_node_morphology.json.gz
# -rwxrwxrwx 1 aki aki  58K Feb 22 12:02 small_kidney.json.gz

total 3.7M
-rwxrwxrwx 1 kuno kuno  53K Apr  9 06:36 embryonic_growth_retardation.json.gz
-rwxrwxrwx 1 kuno kuno  38K Apr  9 06:35 abnormal_bone_structure.json.gz
-rwxrwxrwx 1 kuno kuno  35K Apr  9 06:36 edema.json.gz
-rwxrwxrwx 1 kuno kuno  35K Apr  9 06:35 decreased_body_length.json.gz


## 遺伝子ごとのネットワークを出力

In [50]:
marker_mp_dict = dict(zip(marker_mp.marker_symbol, marker_mp.mp_term_name))

In [51]:
gene_symbols = df_overlap.marker1.unique().tolist()
gene_symbols += df_overlap.marker2.unique().tolist()
gene_symbols = list(set(gene_symbols))
gene_symbols.sort()  # 以下のfor文で、どこまで遺伝子が処理されたのか途中経過を見積もるためのソート
P(gene_symbols[:3])
P(len(gene_symbols))  # 6003

['0610010K14Rik', '0610040J01Rik', '1500009L16Rik']
3242


In [52]:
output_dir = Path("data", "network", "gene_symbol")
# remove network directory
if output_dir.exists():
    shutil.rmtree(output_dir)

output_dir.mkdir(exist_ok=True, parents=True)


In [53]:
for gene_symbol in gene_symbols:
    """
    ノードが多すぎるとWebページが描画できない問題を回避するため、
    ノード数を200以下にするために最適なoverlap_ratioを求める
    """
    # 今の処理
    df_filtered = df_overlap[(df_overlap["marker1"] == gene_symbol) | (df_overlap["marker2"] == gene_symbol)]

    G = nx.from_pandas_edgelist(df_filtered, "marker1", "marker2")

    # ノードAと直接つながっているノードのみを取得
    neighbors = list(G.neighbors(gene_symbol))
    subgraph_nodes = [gene_symbol] + neighbors
    subgraph = G.subgraph(subgraph_nodes)

    target_number_of_nodes = 200
    if len(subgraph.nodes) > target_number_of_nodes:
        # 二分探索の範囲
        low, high = df_filtered["overlap_ratio"].min(), df_filtered["overlap_ratio"].max()
        best_overlap_ratio = None

        while low <= high:
            mid = (low + high) / 2

            # overlap_ratio >= mid のデータをフィルタリング
            df_mid = df_filtered[df_filtered["overlap_ratio"] >= mid]

            G = nx.from_pandas_edgelist(df_mid, "marker1", "marker2")
            # ノードAと直接つながっているノードのみを取得
            try:
                neighbors = list(G.neighbors(gene_symbol))
            except:
                high = mid - 1e-6
                continue
            subgraph_nodes = [gene_symbol] + neighbors
            subgraph = G.subgraph(subgraph_nodes)

            row_count = len(subgraph.nodes)
            # ターゲット列数に近い場合、結果を保存
            if target_number_of_nodes - 25 < row_count < target_number_of_nodes + 25:
                best_overlap_ratio = mid
                break
            elif row_count > target_number_of_nodes:
                # 列数が多い場合、範囲を上げる
                best_overlap_ratio = mid
                low = mid + 1e-6
            else:
                # 列数が少ない場合、範囲を下げる
                best_overlap_ratio = mid
                high = mid - 1e-6

        df_nodes = df_filtered[df_filtered["overlap_ratio"] >= best_overlap_ratio]
        G = nx.from_pandas_edgelist(df_nodes, "marker1", "marker2")
        # ノードAと直接つながっているノードのみを取得
        neighbors = list(G.neighbors(gene_symbol))
        subgraph_nodes = [gene_symbol] + neighbors
        subgraph = G.subgraph(subgraph_nodes)

    # nodesを用意
    node_json = []
    for node in subgraph.nodes():
        annotation = marker_mp_dict[node]
        node_color = 1 if node == gene_symbol else 0
        node_json.append({"data": {"id": node, "label": node, "node_color": node_color, "annotation": annotation}})

    # edgesを用意
    df_edge = df_overlap[
        (df_overlap["marker1"].isin(subgraph.nodes())) & (df_overlap["marker2"].isin(subgraph.nodes()))
    ]

    edge_json = []
    for edge in df_edge.itertuples():
        edge_json.append(
            {
                "data": {
                    "source": edge.marker1,
                    "target": edge.marker2,
                    "edge_size": edge.overlap_ratio,
                    "annotation": edge.overlapped_mp,
                }
            }
        )
    network_json = node_json + edge_json

    # Output as JSON
    if network_json:
        output_json = output_dir / f"{gene_symbol}.json.gz"
        with gzip.open(output_json, "wt", encoding="utf-8") as f:
            json.dump(network_json, f, indent=4)

# 4m

In [54]:
%%bash
ls -lhS data/network/gene_symbol/ | head -n 5 # total 4.6G -> 133M → 975M

total 34M
-rwxrwxrwx 1 kuno kuno 104K Apr  9 06:47 Uggt1.json.gz
-rwxrwxrwx 1 kuno kuno  95K Apr  9 06:47 Traf3ip1.json.gz
-rwxrwxrwx 1 kuno kuno  95K Apr  9 06:39 Sptan1.json.gz
-rwxrwxrwx 1 kuno kuno  92K Apr  9 06:39 Pigq.json.gz


In [55]:
Path("data/overlap/available_gene_symbols.txt").write_text("\n".join(gene_symbols) + "\n")
print(len(gene_symbols))  # 4416 -> 4244 → 6003 → 4139

3242


In [56]:
%%bash

uname -a # OS name
date +"%Y/%m/%d %H:%M:%S" # Last update

Linux think-x12-2024 5.15.167.4-microsoft-standard-WSL2 #1 SMP Tue Nov 5 00:21:55 UTC 2024 x86_64 x86_64 x86_64 GNU/Linux
2025/04/09 06:47:54
