# Run allですべてのデータを準備する

In [65]:
RELEASE = 23.0

columns = ["marker_symbol", "marker_accession_id", "mp_term_name", "mp_term_id", "p_value", "sex_effect_p_value", "female_ko_effect_p_value", "male_ko_effect_p_value", "zygosity", "effect_size",]


## 1. Download IMPC dataset

In [2]:
# Move up to top directory
import os
from pathlib import Path

print(os.getcwd())

while not Path("LICENSE").exists():
    os.chdir('../')

print(os.getcwd())

/mnt/e/Research/TSUMUGI-dev-main/notebooks/notebools-web
/mnt/e/Research/TSUMUGI-dev-main


In [3]:
P = print
from pprint import pprint as PP
from collections import Counter as C
from pathlib import Path
from collections import defaultdict
from itertools import combinations
import csv
import numpy as np
import pandas as pd
import polars as pl
import shutil
import pickle
import json
import gzip
import networkx as nx
import datetime
import urllib.request
from tqdm import tqdm
import hashlib


In [4]:
%%bash
pwd

/mnt/e/Research/TSUMUGI-dev-main


In [5]:
date_str = datetime.date.today().strftime("%Y-%m-%d")

# パスの設定
data_dir = Path("data/impc")
data_dir.mkdir(parents=True, exist_ok=True)
csv_path = data_dir / f"statistical-results-ALL-{RELEASE}.csv"
readme_path = data_dir / "README.md"

# ファイルが存在しない場合にダウンロードして解凍
if not csv_path.exists():
    # ダウンロード URL
    url = f"https://ftp.ebi.ac.uk/pub/databases/impc/all-data-releases/release-{RELEASE}/results/statistical-results-ALL.csv.gz"

    print(f"Downloading and extracting: {url}")

    # URL からファイルサイズ取得（tqdmのため）
    with urllib.request.urlopen(url) as response:
        total_size = int(response.info().get("Content-Length", -1))
        with tqdm.wrapattr(response, "read", total=total_size, desc="Downloading", unit="B", unit_scale=True) as r:
            with gzip.GzipFile(fileobj=r) as uncompressed:
                with open(csv_path, 'wb') as out_file:
                    shutil.copyfileobj(uncompressed, out_file)

In [6]:
%%bash

# wc -l data/impc/statistical-results*.csv
# Release 22.1: 3165335
# Release 23.0: 2159931
# 1 min

## 2. Filter dataset by P value < 0.0001 (10^-4)


In [None]:
path_data = Path("data", "impc", f"statistical-results-ALL-{RELEASE}.csv")
data = pd.read_csv(path_data)
data = data[columns]
# 30 seconds

  data = pd.read_csv(path_data)


In [8]:
# print(len(data))
# Release 21.1: 2062772
# Release 22.0: 3165334
# Release 23.0: 2159930

In [9]:
# Filter by p_value < 0.0001
threshold = 0.0001
filter_pvalue = data["p_value"] < threshold
filter_female_ko_pvalue = data["female_ko_effect_p_value"] < threshold
filter_male_ko_pvalue = data["male_ko_effect_p_value"] < threshold

data_filtered = data[filter_pvalue | filter_male_ko_pvalue | filter_male_ko_pvalue]

# Filter by mp_term_id and mp_term_name are not NaN
data_filtered = data_filtered.dropna(subset=["mp_term_id"])
data_filtered = data_filtered.dropna(subset=["mp_term_name"])

# Filter by effect_size is not NaN
data_filtered = data_filtered.dropna(subset=["effect_size"])

In [10]:
print(len(data_filtered))
# Release 22.0: 54059 rows
# Release 22.1: 54059 rows
# Release 23.0: 48963 rows

48963


In [11]:
data_filtered.to_csv(f"data/statistical_filtered-{RELEASE}.csv", index=False) # 2 sec

## Split data by mp_term_name

In [12]:
data = data_filtered

In [13]:
# data/mp_term_nameを作成

output_path = Path("data", "mp_term_name")
if output_path.exists():
    shutil.rmtree(output_path)
output_path.mkdir(parents=True, exist_ok=True)

In [14]:
# 名前をクリーンにする関数を定義
def clean_name(name):
    return name.replace("/", "_").replace(" ", "_")

# mp_term_nameをクリーニングし、ユニークな値を取得
unique_mp_term_names = data['mp_term_name'].unique()

In [15]:
# ユニークなmp_term_nameごとにフィルタリングしてCSVに保存: 5 sec
for mp_term_name in unique_mp_term_names:
    filtered_df = data[data['mp_term_name'] == mp_term_name]
    clean_mp_term_name = clean_name(mp_term_name)
    filtered_df.to_csv(f"data/mp_term_name/{clean_mp_term_name}.csv", index=False)
# 5 sec

In [16]:
print(len(data))

48963


## 3. TSUMUGIに必要なアノテーション情報を整理する

In [17]:
data = pd.read_csv(f"data/statistical_filtered-{RELEASE}.csv")

### Annotate life stages

In [18]:
# life_stageの初期割り当て
def assign_life_stage(pipeline_name):
    if pd.isna(pipeline_name):
        return "Early"
    if "Interval" in pipeline_name or "interval" in pipeline_name:
        return "Interval"
    elif "Late" in pipeline_name or "late" in pipeline_name:
        return "Late"
    else:
        return "Early"

data["life_stage"] = data["pipeline_name"].apply(assign_life_stage)

# Embryo 表現型に該当する procedure_name の一覧
embryo_phenotyping = [
    "Gross Morphology Embryo E9.5",
    "Viability E9.5 Secondary Screen",
    "OPT E9.5",
    "MicroCT E9.5",
    "Gross Morphology Placenta E9.5",
    "Gross Morphology Embryo E12.5",
    "Embryo LacZ",
    "Gross Morphology Placenta E12.5",
    "Viability E12.5 Secondary Screen",
    "Viability E14.5-E15.5 Secondary Screen",
    "Gross Morphology Placenta E14.5-E15.5",
    "MicroCT E14.5-E15.5",
    "Gross Morphology Embryo E14.5-E15.5",
    "Viability E18.5 Secondary Screen",
    "MicroCT E18.5",
    "Gross Morphology Embryo E18.5",
    "Gross Morphology Placenta E18.5"
]

# life_stageをEmbryoに上書き
data.loc[data["procedure_name"].isin(embryo_phenotyping), "life_stage"] = "Embryo"
data_annotated = data.reset_index(drop=True)

In [19]:
print(len(data_annotated))
print(data_annotated["life_stage"].value_counts())
# 54059
# life_stage
# Early       45724
# Embryo       4253
# Late         4024
# Interval       58
# Name: count, dtype: int64

48963
life_stage
Early       41655
Embryo       4381
Late         2869
Interval       58
Name: count, dtype: int64


### Annotate Sex differences

In [20]:
threshold = 0.0001

# 条件リスト
conditions = [
    (data_annotated["sex_effect_p_value"] < threshold) & (data_annotated["female_ko_effect_p_value"] < threshold) & (data_annotated["male_ko_effect_p_value"] > threshold),
    (data_annotated["sex_effect_p_value"] < threshold) & (data_annotated["male_ko_effect_p_value"] < threshold) & (data_annotated["female_ko_effect_p_value"] > threshold)
]

# 条件に対応する値
choices = ["female", "male"]

# np.selectで列を設定
data_annotated["sexdual_dimorphism"] = np.select(conditions, choices, default=None)
data_annotated = data_annotated.reset_index(drop=True)

# 結果を確認
print(RELEASE)
print(data_annotated["sexdual_dimorphism"].value_counts())

# RELEASE 22.1
# sexdual_dimorphism
# male      4915
# female    4146

print(len(data_annotated))

23.0
sexdual_dimorphism
male      5026
female    4307
Name: count, dtype: int64
48963


In [21]:
# 確認
data_annotated.dropna(subset=["sexdual_dimorphism"])[["p_value", "sexdual_dimorphism", "effect_size", "genotype_effect_parameter_estimate", "female_ko_parameter_estimate", "male_ko_parameter_estimate"]].head(10)

Unnamed: 0,p_value,sexdual_dimorphism,effect_size,genotype_effect_parameter_estimate,female_ko_parameter_estimate,male_ko_parameter_estimate
10,0.0001260278,male,2.230004,24.066403,24.066403,41.292084
11,5.111543e-06,female,0.435955,0.169696,0.169696,-0.063424
12,0.0163733,male,-1.42984,-1.50034,-1.553489,-3.789276
13,0.5153139,male,1.088084,1.399613,1.399613,8.361055
15,8.911086e-08,female,1.811095,0.004294,0.004294,0.002278
16,3.849604e-08,female,1.602556,22.176017,22.010837,12.868002
20,0.02362372,male,2.379854,2.03359,2.03359,5.18959
26,9.895246e-06,female,1.095692,443.545464,443.833507,227.451186
27,2.294615e-07,female,1.443321,2.103954,2.484874,1.71928
30,2.508208e-08,male,-0.704889,-3.655154,-3.314047,-3.999483


### 遺伝型、性差、ライフステージのアノテーションを統合する

In [22]:
print(data_annotated["zygosity"].value_counts())
# zygosity
# homozygote      41444
# heterozygote    11921
# hemizygote        694
# Name: count, dtype: int64

zygosity
homozygote      37599
heterozygote    10781
hemizygote        583
Name: count, dtype: int64


In [23]:
colmuns = ["marker_symbol", "marker_accession_id", "mp_term_name", "mp_term_id", "p_value", "female_ko_effect_p_value", "male_ko_effect_p_value", "sexdual_dimorphism", "zygosity", "life_stage", "effect_size",]

data_annotated = data_annotated[colmuns]

In [24]:
# mp_term = "increased fasting circulating glucose level".strip()
# data_annotated[
#     (data_annotated["marker_symbol"] == "Dnase1l2") &
#     (data_annotated["mp_term_name"] == mp_term)
# ]

In [25]:
# アノテーション列を追加（inplace）
def make_annotation(row):
    # 遺伝型
    if row['zygosity'] == 'homozygote':
        annotate = "Homo"
    elif row['zygosity'] == 'heterozygote':
        annotate = "Hetero"
    else:
        annotate = "Hemi"

    # 性別
    if row['sexdual_dimorphism'] == "female":
        annotate += ", Female"
    elif row['sexdual_dimorphism'] == "male":
        annotate += ", Male"

    # life_stage
    if row['life_stage'] in {"Embryo", "Early", "Interval", "Late"}:
        annotate += f", {row['life_stage']}"

    return f"{row['mp_term_name']} ({annotate})"

data_annotated["annotation"] = data_annotated.apply(make_annotation, axis=1)

# marker_symbol ごとに annotation をリスト化＆ソート
marker_annotation_map = (
    data_annotated
    .groupby("marker_symbol")["annotation"]
    .apply(lambda x: sorted(set(x)))
)


In [26]:
# 例：Rhdの注釈を表示
print(marker_annotation_map["Rhd"])
# 例：Amtの注釈を表示 (Embryo)
print(marker_annotation_map["Amt"])
# 例：Spag4の注釈を表示 (重複が削除されているか)
print(marker_annotation_map["Spag4"])

['abnormal skin condition (Homo, Early)', 'decreased circulating HDL cholesterol level (Homo, Male, Early)', 'decreased circulating alkaline phosphatase level (Homo, Female, Early)', 'decreased circulating cholesterol level (Homo, Male, Early)', 'decreased circulating free fatty acids level (Homo, Early)', 'decreased hemoglobin content (Homo, Male, Early)', 'decreased mean corpuscular hemoglobin (Homo, Early)', 'decreased mean corpuscular hemoglobin concentration (Homo, Early)', 'decreased mean corpuscular volume (Homo, Early)', 'increased exploration in new environment (Homo, Early)']
['abnormal abdominal wall morphology (Homo, Embryo)', 'abnormal embryo size (Homo, Embryo)', 'abnormal facial morphology (Homo, Embryo)', 'abnormal head shape (Homo, Embryo)', 'abnormal head size (Homo, Embryo)', 'abnormal limb morphology (Homo, Embryo)', 'abnormal placenta size (Homo, Embryo)', 'abnormal retina blood vessel morphology (Hetero, Early)', 'abnormal retina vasculature morphology (Hetero, Ea

In [27]:

Path("data/annotation").mkdir(exist_ok=True, parents=True)
file_path = "data/annotation/symbol_mptermname.json"
marker_annotation_map.to_json(file_path, indent=4)

# json.dump(marker_annotation_map, open(file_path, "w"), indent=4, sort_keys=True)


In [28]:
%%bash

grep -c "Male" data/annotation/symbol_mptermname.json | sed "s|^|Male: |"
grep -c "Female" data/annotation/symbol_mptermname.json | sed "s|^|Feale: |"

grep -c "Homo" data/annotation/symbol_mptermname.json | sed "s|^|Homo: |"
grep -c "Hetero" data/annotation/symbol_mptermname.json | sed "s|^|Hetero: |"
grep -c "Hemi" data/annotation/symbol_mptermname.json | sed "s|^|Hemi: |"

grep -c "Embryo" data/annotation/symbol_mptermname.json | sed "s|^|Embryo: |"
grep -c "Early" data/annotation/symbol_mptermname.json | sed "s|^|Early: |"
grep -c "Interval" data/annotation/symbol_mptermname.json | sed "s|^|Interval: |"
grep -c "Late" data/annotation/symbol_mptermname.json | sed "s|^|Late: |"

# Male: 4915
# Feale: 4146
# Homo: 41444
# Hetero: 11921
# Hemi: 694
# Embryo: 4253
# Early: 45724
# Interval: 58
# Late: 4024


Male: 4480
Feale: 3524
Homo: 30786
Hetero: 9516


Hemi: 492
Embryo: 4207
Early: 34048
Interval: 52
Late: 2487


### mp term nameとIMPCのPhenotype URLを紐付ける

In [29]:
data_select = data[['mp_term_id', 'mp_term_name']].drop_duplicates()
# data_select = data[['marker_symbol', 'marker_accession_id', 'mp_term_name', 'mp_term_id']].drop_duplicates()
data_select

Unnamed: 0,mp_term_id,mp_term_name
0,MP:0001510,abnormal coat appearance
1,MP:0011100,"preweaning lethality, complete penetrance"
2,MP:0004738,abnormal auditory brainstem response
3,MP:0001303,abnormal lens morphology
4,MP:0001147,small testis
...,...,...
46893,MP:0011503,distended jejunum
47045,MP:0002862,abnormal righting response
47794,MP:0004727,absent epididymis
48018,MP:0000023,abnormal ear position


In [30]:
data_dict_url = dict()
for index, row in data_select.iterrows():
    mp_tern_id = row['mp_term_id']
    impc_url = f"https://www.mousephenotype.org/data/phenotypes/{mp_tern_id}"
    mp_term_name = row['mp_term_name']
    data_dict_url[mp_term_name] = impc_url

print(data_dict_url["small lymph nodes"])

https://www.mousephenotype.org/data/phenotypes/MP:0002217


In [31]:
with open('data/annotation/mptermname_phenotypeurl.tsv', 'w') as f:
    for term, url in data_dict_url.items():
        f.write(f"{term}\t{url}\n")

In [32]:
%%bash

head -n 3 data/annotation/mptermname_phenotypeurl.tsv
wc -l data/annotation/mptermname_phenotypeurl.tsv
# Release 22.0: 664
# Release 23.0: 659

abnormal coat appearance	https://www.mousephenotype.org/data/phenotypes/MP:0001510
preweaning lethality, complete penetrance	https://www.mousephenotype.org/data/phenotypes/MP:0011100
abnormal auditory brainstem response	https://www.mousephenotype.org/data/phenotypes/MP:0004738
659 data/annotation/mptermname_phenotypeurl.tsv


### marker symbolとMGI accession idを紐付ける

In [33]:
data_select = data[['marker_symbol', 'marker_accession_id']].drop_duplicates()
# data_select = data[['marker_symbol', 'marker_accession_id', 'mp_term_name', 'mp_term_id']].drop_duplicates()
data_select
# Release 22.1: 7746 rows
# Release 23.0: 7934 rows

Unnamed: 0,marker_symbol,marker_accession_id
0,Ap1ar,MGI:2384822
1,Kif21a,MGI:109188
2,Baiap2l2,MGI:2652819
3,Wars1,MGI:104630
4,Prss47,MGI:2685120
...,...,...
48686,Cacng3,MGI:1859165
48691,Dkk4,MGI:2385299
48894,Slc9a3,MGI:105064
48913,Srek1,MGI:2145245


In [34]:
data_dict = dict()
for index, row in data_select.iterrows():
    data_dict[row['marker_symbol']] = row['marker_accession_id']
print(data_dict["Ncam1"])

MGI:97281


In [35]:
json.dump(data_dict, open("data/annotation/symbol_mgiid.json", "w"), indent=4, sort_keys=True)
Path("data/annotation/symbol_mgiid.tsv").write_text("\n".join([f"{k}\t{v}" for k, v in data_dict.items()]))

143526

In [36]:
%%bash
head -n 3 data/annotation/symbol_mgiid.json
head -n 3 data/annotation/symbol_mgiid.tsv

{
    "1110059G10Rik": "MGI:1913452",
    "1500009L16Rik": "MGI:1917034",
Ap1ar	MGI:2384822
Kif21a	MGI:109188
Baiap2l2	MGI:2652819


## 4. 表現型の類似度を求める

In [37]:
file_path = Path("data", "annotation", "symbol_mptermname.json")

symbol_mptermname = json.load(open(file_path))
print(symbol_mptermname["Dpf2"])

['abnormal craniofacial morphology (Homo, Embryo)', 'abnormal embryo size (Homo, Embryo)', 'abnormal heart morphology (Homo, Embryo)', 'abnormal kidney morphology (Hetero, Early)', 'abnormal limb morphology (Homo, Embryo)', 'abnormal seminal vesicle morphology (Hetero, Early)', 'abnormal tail morphology (Homo, Embryo)', 'edema (Homo, Embryo)', 'pallor (Homo, Embryo)', 'preweaning lethality, complete penetrance (Homo, Early)']


In [38]:
symbol_mptermname = {k: set(v) for k, v in symbol_mptermname.items() if v}
print(symbol_mptermname["Dpf2"])

{'abnormal tail morphology (Homo, Embryo)', 'abnormal embryo size (Homo, Embryo)', 'abnormal heart morphology (Homo, Embryo)', 'preweaning lethality, complete penetrance (Homo, Early)', 'abnormal seminal vesicle morphology (Hetero, Early)', 'edema (Homo, Embryo)', 'abnormal limb morphology (Homo, Embryo)', 'abnormal kidney morphology (Hetero, Early)', 'pallor (Homo, Embryo)', 'abnormal craniofacial morphology (Homo, Embryo)'}


### Jaccard係数で集合の類似度を計算

In [39]:

overlapped_ratios_all = []

for a, b in combinations(symbol_mptermname, 2):
    overlapped_mp = sorted(symbol_mptermname[a] & symbol_mptermname[b])
    overlapped_mp_number = len(overlapped_mp)
    union_mp_number = len(symbol_mptermname[a] | symbol_mptermname[b])
    overlap_ratio = overlapped_mp_number / union_mp_number

    overlapped_ratios_all.append([a, b, round(overlap_ratio, 3), overlapped_mp_number, overlapped_mp])

## 46s

In [40]:
print(overlapped_ratios_all[:3])
print(len(overlapped_ratios_all))
# Release 22.0: 29996385
# Release 22.1: 29996385
# Release 23.0: 31470211

[['1110059G10Rik', '1500009L16Rik', 0.0, 0, []], ['1110059G10Rik', '1600014C10Rik', 0.0, 0, []], ['1110059G10Rik', '1600029I14Rik', 0.0, 0, []]]
31470211


### 重複する表現型が閾値以上のものを抽出

In [41]:

similarity_threshold = 0.5
num_overlapped_mp = 3

overlapped_ratios_filtered = []
for record in overlapped_ratios_all:
    if record[2] >= similarity_threshold or record[3] >= num_overlapped_mp:
        overlapped_ratios_filtered.append(record)

In [42]:
print(overlapped_ratios_filtered[:3])
print(len(overlapped_ratios_filtered))
# Release 21.1: 134880
# Release 22.0: 133281 <- Homo/Hetero/Hemiおよび♂・♀の完全一致を考慮するようになったため、減少
# Release 22.1: 133281
# v0.3.0: 261216 <- Similarity_threshodのor条件をつけたため、増加
# Release 23.0 TSUMUGI v0.3.2: 241645

[['1110059G10Rik', 'Kdm7a', 1.0, 1, ['vertebral transformation (Homo, Early)']], ['1500009L16Rik', 'Bscl2', 0.094, 3, ['increased circulating aspartate transaminase level (Homo, Early)', 'increased circulating calcium level (Homo, Early)', 'increased circulating serum albumin level (Homo, Early)']], ['1500009L16Rik', 'Lepr', 0.091, 4, ['decreased bone mineral density (Homo, Early)', 'increased circulating aspartate transaminase level (Homo, Early)', 'increased circulating calcium level (Homo, Early)', 'increased circulating serum albumin level (Homo, Early)']]]
241645


In [43]:
Path("data", "overlap").mkdir(exist_ok=True, parents=True)
pickle.dump(overlapped_ratios_all, open("data/overlap/overlapped_ratios_all.pkl", "wb"))
pickle.dump(overlapped_ratios_filtered, open("data/overlap/overlapped_ratios_filtered.pkl", "wb"))

# 18 sec

### 生データをCSV形式で出力 （ダウンロード用）

In [44]:
df_overlap = pd.DataFrame(overlapped_ratios_all)
df_overlap.columns = ["Gene1", "Gene2", "Jaccard Similarity", "Number of shared phenotype", "List of shared phenotypes"]
df_overlap.reindex(
    columns=["Gene1", "Gene2", "Number of shared phenotype", "Jaccard Similarity", "List of shared phenotypes"]
)
df_overlap["List of shared phenotypes"] = df_overlap["List of shared phenotypes"].apply(json.dumps)
df_overlap
# 30 sec

Unnamed: 0,Gene1,Gene2,Jaccard Similarity,Number of shared phenotype,List of shared phenotypes
0,1110059G10Rik,1500009L16Rik,0.000,0,[]
1,1110059G10Rik,1600014C10Rik,0.000,0,[]
2,1110059G10Rik,1600029I14Rik,0.000,0,[]
3,1110059G10Rik,1700003F12Rik,0.000,0,[]
4,1110059G10Rik,1700008O03Rik,0.000,0,[]
...,...,...,...,...,...
31470206,Zyg11b,Zzz3,0.000,0,[]
31470207,Zyg11b,a,0.000,0,[]
31470208,Zzef1,Zzz3,0.000,0,[]
31470209,Zzef1,a,0.053,1,"[""abnormal kidney morphology (Homo, Early)""]"


In [45]:
output_path = Path("data/TSUMUGI_raw_data.csv.gz")

def get_head1000_hash(df: pd.DataFrame) -> str:
    # head(1000)だけを対象にハッシュ化
    csv_bytes = df.head(1000).to_csv(index=False, lineterminator='\n').encode('utf-8')
    return hashlib.md5(csv_bytes).hexdigest()

def file_head1000_hash(path: Path) -> str | None:
    if not path.exists():
        return None
    with gzip.open(path, "rt", encoding="utf-8") as f:
        lines = [next(f) for _ in range(1001)]  # 1行目がヘッダー
        csv_content = ''.join(lines).encode('utf-8')
        return hashlib.md5(csv_content).hexdigest()

# 比較
new_hash = get_head1000_hash(df_overlap)
existing_hash = file_head1000_hash(output_path)

if new_hash != existing_hash:
    df_overlap.to_csv(output_path, index=False, compression="gzip", lineterminator='\n')
    print("🔄 ファイルを更新しました") # 3 min
else:
    print("✅ 内容に変更がないためスキップしました")

🔄 ファイルを更新しました


In [46]:
df_overlap

Unnamed: 0,Gene1,Gene2,Jaccard Similarity,Number of shared phenotype,List of shared phenotypes
0,1110059G10Rik,1500009L16Rik,0.000,0,[]
1,1110059G10Rik,1600014C10Rik,0.000,0,[]
2,1110059G10Rik,1600029I14Rik,0.000,0,[]
3,1110059G10Rik,1700003F12Rik,0.000,0,[]
4,1110059G10Rik,1700008O03Rik,0.000,0,[]
...,...,...,...,...,...
31470206,Zyg11b,Zzz3,0.000,0,[]
31470207,Zyg11b,a,0.000,0,[]
31470208,Zzef1,Zzz3,0.000,0,[]
31470209,Zzef1,a,0.053,1,"[""abnormal kidney morphology (Homo, Early)""]"


In [47]:
df_overlap_filtered = pd.DataFrame(overlapped_ratios_filtered)
df_overlap_filtered.columns = ["Gene1", "Gene2", "Jaccard Similarity", "Number of shared phenotype", "List of shared phenotypes"]
df_overlap_filtered.reindex(
    columns=["Gene1", "Gene2", "Number of shared phenotype", "Jaccard Similarity", "List of shared phenotypes"]
)
df_overlap_filtered["List of shared phenotypes"] = df_overlap_filtered["List of shared phenotypes"].apply(json.dumps)

df_overlap_filtered.to_csv("data/TSUMUGI_filtered_data.csv.gz", index=False, compression="gzip", lineterminator='\n')

## 表現型ごとのネットワークを出力

In [48]:
overlapped_ratios_filtered = pickle.load(open("data/overlap/overlapped_ratios_filtered.pkl", "rb"))

In [49]:
df_overlap = pd.DataFrame(
    overlapped_ratios_filtered, columns=["marker1", "marker2", "overlap_ratio", "overlapped_mp_number", "overlapped_mp"]
)
df_overlap
# version 0.2.2: 133281  rows × 5 columns
# version 0.3.0: 261216  rows × 5 columns

Unnamed: 0,marker1,marker2,overlap_ratio,overlapped_mp_number,overlapped_mp
0,1110059G10Rik,Kdm7a,1.000,1,"[vertebral transformation (Homo, Early)]"
1,1500009L16Rik,Bscl2,0.094,3,[increased circulating aspartate transaminase ...
2,1500009L16Rik,Lepr,0.091,4,"[decreased bone mineral density (Homo, Early),..."
3,1600014C10Rik,Acap2,0.500,1,"[abnormal vocalization (Homo, Early)]"
4,1600014C10Rik,Adh5,0.500,1,"[abnormal vocalization (Homo, Early)]"
...,...,...,...,...,...
241640,Zfp654,Zfp750,0.500,1,"[preweaning lethality, complete penetrance (Ho..."
241641,Zfp719,Zmat4,0.500,1,"[abnormal auditory brainstem response (Homo, E..."
241642,Zfp74,Zmat4,0.500,1,"[abnormal auditory brainstem response (Homo, E..."
241643,Zfp871,Zwilch,0.231,3,"[abnormal embryo size (Homo, Embryo), embryoni..."


In [50]:
marker_mp = json.load(open("data/annotation/symbol_mptermname.json"))
marker_mp = pd.DataFrame(marker_mp.items(), columns=["marker_symbol", "mp_term_name"])
marker_mp
# version 0.2.2: 7626 rows × 2 columns
# version 0.3.0: 7746 rows × 2 columns
# version 0.3.1: 7746 rows × 2 columns

Unnamed: 0,marker_symbol,mp_term_name
0,1110059G10Rik,"[vertebral transformation (Homo, Early)]"
1,1500009L16Rik,"[decreased bone mineral density (Homo, Early),..."
2,1600014C10Rik,"[abnormal coat/hair pigmentation (Homo, Early)..."
3,1600029I14Rik,"[abnormal kidney morphology (Homo, Early), sma..."
4,1700003F12Rik,"[abnormal embryo size (Homo, Embryo), abnormal..."
...,...,...
7929,Zwint,[embryonic lethality prior to organogenesis (H...
7930,Zyg11b,[decreased exploration in new environment (Het...
7931,Zzef1,"[abnormal coat/hair pigmentation (Homo, Early)..."
7932,Zzz3,"[abnormal cornea morphology (Hetero, Early), c..."


In [51]:
# path_mp_terms = list(Path("data", "mp_term_name").glob("*.csv"))
# for path_mp_term in path_mp_terms:
#     mp_term = path_mp_term.stem
#     if mp_term == "increased_fasting_circulating_glucose_level":
#         break
# mp_term

In [52]:
# df_marker_effect = pd.read_csv(path_mp_term)

# # effect sizeの絶対値が最大の行を取得
# df_marker_effect = df_marker_effect[["marker_symbol", "effect_size"]].loc[
#     df_marker_effect.groupby("marker_symbol")["effect_size"].apply(lambda x: x.abs().idxmax())
# ].reset_index(drop=True)

# df_marker_effect
# # df_marker_effect[df_marker_effect["marker_symbol"] == "Abcd3"]

In [53]:
# # Absolute value of effect size
# df_marker_effect.loc[:, "effect_size"] = df_marker_effect["effect_size"].abs()

# df_filtered = df_overlap[
#     (df_overlap['marker1'].isin(df_marker_effect['marker_symbol'])) &
#     (df_overlap['marker2'].isin(df_marker_effect['marker_symbol']))
# ]

# number_of_nodes = 200
# # 二分探索の範囲
# low, high = df_filtered["overlap_ratio"].min(), df_filtered["overlap_ratio"].max()
# best_overlap_ratio = None

# while low <= high:
#     mid = (low + high) / 2

#     # overlap_ratio >= mid のデータをフィルタリング
#     df_mid = df_filtered[df_filtered["overlap_ratio"] >= mid]

#     ## 出力
#     ### Nodeを作成する
#     df_marker1 = df_mid[["marker1"]]
#     df_marker2 = df_mid[["marker2"]]
#     df_node_marker1 = pd.merge(df_marker1, df_marker_effect, left_on='marker1', right_on='marker_symbol', how='inner')[["marker_symbol"]]
#     df_node_marker2 = pd.merge(df_marker2, df_marker_effect, left_on='marker2', right_on='marker_symbol', how='inner')[["marker_symbol"]]
#     df_node = pd.concat([df_node_marker1, df_node_marker2], axis=0).drop_duplicates()
#     df_node = pd.merge(df_node, marker_mp, how='inner', on='marker_symbol')
#     df_node = pd.merge(df_node, df_marker_effect, how='inner', on='marker_symbol')

#     node_count = len(df_node)
#     # ターゲット列数に近い場合、結果を保存
#     if number_of_nodes - 25 < node_count < number_of_nodes + 25:
#         best_overlap_ratio = mid
#         break
#     elif node_count > number_of_nodes:
#         # 列数が多い場合、範囲を上げる
#         best_overlap_ratio = mid
#         low = mid + 1e-6
#     else:
#         # 列数が少ない場合、範囲を下げる
#         best_overlap_ratio = mid
#         high = mid - 1e-6

# df_filtered = df_filtered[df_filtered["overlap_ratio"] >= best_overlap_ratio]

# ## 出力
# ### Nodeを作成する
# df_marker1 = df_filtered[["marker1"]]
# df_marker2 = df_filtered[["marker2"]]
# df_node_marker1 = pd.merge(df_marker1, df_marker_effect, left_on='marker1', right_on='marker_symbol', how='inner')[["marker_symbol"]]
# df_node_marker2 = pd.merge(df_marker2, df_marker_effect, left_on='marker2', right_on='marker_symbol', how='inner')[["marker_symbol"]]

# df_node = pd.concat([df_node_marker1, df_node_marker2], axis=0).drop_duplicates()
# df_node = pd.merge(df_node, marker_mp, how='inner', on='marker_symbol')
# df_node = pd.merge(df_node, df_marker_effect, how='inner', on='marker_symbol')

# df_node

In [54]:
# df_node[df_node["marker_symbol"] == "Abcd3"]

In [55]:
output_dir = Path("data/network/mp_term_name")
# remove network directory
if output_dir.exists():
    shutil.rmtree(output_dir)

output_dir.mkdir(exist_ok=True, parents=True)


In [56]:
path_mp_terms = list(Path("data", "mp_term_name").glob("*.csv"))
# print(path_mp_terms[:3])
# print(len(path_mp_terms))
# path_mp_term = Path("data", "mp_term_name", "increasing_circulating_glucose_level.csv")

"""
ノードが多すぎるとWebページが描画できない問題を回避するため、
ノード数を200以下にするために最適なoverlap_ratioを求める
"""
number_of_nodes = 200

for path_mp_term in path_mp_terms:
    mp_term = path_mp_term.stem
    # print(mp_term)

    df_marker_effect = pd.read_csv(path_mp_term)
    df_marker_effect = df_marker_effect.dropna(subset=["effect_size"])
    # Absolute value of effect size
    df_marker_effect.loc[:, "effect_size"] = df_marker_effect["effect_size"].abs()

    # * effect sizeの絶対値が最大の行を取得 (Homo/Heteroで異なる効果量がある場合に、ひとまず最大値を採用する← 今後の考慮事項)
    df_marker_effect = df_marker_effect[["marker_symbol", "effect_size"]].loc[
        df_marker_effect.groupby("marker_symbol")["effect_size"].idxmax()
    ].reset_index(drop=True)

    df_filtered = df_overlap[
        (df_overlap['marker1'].isin(df_marker_effect['marker_symbol'])) &
        (df_overlap['marker2'].isin(df_marker_effect['marker_symbol'])) &
        (df_overlap['overlapped_mp'].apply(lambda lst: any(mp_term.replace("_", " ") in term for term in lst)))
    ]
    # 二分探索の範囲
    low, high = df_filtered["overlap_ratio"].min(), df_filtered["overlap_ratio"].max()
    best_overlap_ratio = None

    while low <= high:
        mid = (low + high) / 2

        # overlap_ratio >= mid のデータをフィルタリング
        df_mid = df_filtered[df_filtered["overlap_ratio"] >= mid]

        ## 出力
        ### Nodeを作成する
        df_marker1 = df_mid[["marker1"]]
        df_marker2 = df_mid[["marker2"]]
        df_node_marker1 = pd.merge(df_marker1, df_marker_effect, left_on='marker1', right_on='marker_symbol', how='inner')[["marker_symbol"]]
        df_node_marker2 = pd.merge(df_marker2, df_marker_effect, left_on='marker2', right_on='marker_symbol', how='inner')[["marker_symbol"]]
        df_node = pd.concat([df_node_marker1, df_node_marker2], axis=0).drop_duplicates()
        df_node = pd.merge(df_node, marker_mp, how='inner', on='marker_symbol')
        df_node = pd.merge(df_node, df_marker_effect, how='inner', on='marker_symbol')

        node_count = len(df_node)
        # ターゲット列数に近い場合、結果を保存
        if number_of_nodes - 25 < node_count < number_of_nodes + 25:
            best_overlap_ratio = mid
            break
        elif node_count > number_of_nodes:
            # 列数が多い場合、範囲を上げる
            best_overlap_ratio = mid
            low = mid + 1e-6
        else:
            # 列数が少ない場合、範囲を下げる
            best_overlap_ratio = mid
            high = mid - 1e-6

    df_filtered = df_filtered[df_filtered["overlap_ratio"] >= best_overlap_ratio]

    ## 出力
    ### Nodeを作成する
    df_marker1 = df_filtered[["marker1"]]
    df_marker2 = df_filtered[["marker2"]]
    df_node_marker1 = pd.merge(df_marker1, df_marker_effect, left_on='marker1', right_on='marker_symbol', how='inner')[["marker_symbol"]]
    df_node_marker2 = pd.merge(df_marker2, df_marker_effect, left_on='marker2', right_on='marker_symbol', how='inner')[["marker_symbol"]]

    df_node = pd.concat([df_node_marker1, df_node_marker2], axis=0).drop_duplicates()
    df_node = pd.merge(df_node, marker_mp, how='inner', on='marker_symbol')
    df_node = pd.merge(df_node, df_marker_effect, how='inner', on='marker_symbol')

    if len(df_node) == 0:
        continue

    # print(mp_term, len(df_node))

    # NodeをJSON形式に変換
    node_json = []
    for _, row in df_node.iterrows():
        node_json.append({
            "data": {
                "id": row['marker_symbol'],
                "label": row['marker_symbol'],
                "annotation": row['mp_term_name'],
                "node_color": row['effect_size']
            }
        })
    ### Edgeを作成する
    df_edge = df_filtered[["marker1", "marker2", "overlap_ratio", "overlapped_mp"]]
    # EdgeをJSON形式に変換
    edge_json = []
    for _, row in df_edge.iterrows():
        edge_json.append({
            "data": {
                "source": row['marker1'],
                "target": row['marker2'],
                "annotation": row['overlapped_mp'],
                "edge_size": row['overlap_ratio']
            }
        })

    ### EdgeとNodeを統合して、出力
    # Combine node and edge

    network_json = node_json + edge_json

    # Output as JSON
    if network_json:
        output_json = output_dir / f"{mp_term}.json.gz"
        with gzip.open(output_json, "wt", encoding="utf-8") as f:
            json.dump(network_json, f, indent=4)

# 1m30s

In [57]:
%%bash

ls -lhS data/network/mp_term_name/ | head -n 5

# version 0.2.2: total 5.3M
# version 0.3.0: total 5.5M
# version 0.3.1: total 5.1M <- 該当の表現型を含むネットワークのみを表示 （Issue: #54）

total 4.8M
-rwxrwxrwx 1 kuno kuno 208K May  9 16:19 preweaning_lethality,_complete_penetrance.json.gz
-rwxrwxrwx 1 kuno kuno  72K May  9 16:18 edema.json.gz
-rwxrwxrwx 1 kuno kuno  45K May  9 16:17 abnormal_placenta_morphology.json.gz
-rwxrwxrwx 1 kuno kuno  43K May  9 16:17 abnormal_heart_morphology.json.gz


## 遺伝子ごとのネットワークを出力

In [58]:
marker_mp_dict = dict(zip(marker_mp.marker_symbol, marker_mp.mp_term_name))

In [59]:
gene_symbols = df_overlap.marker1.unique().tolist()
gene_symbols += df_overlap.marker2.unique().tolist()
gene_symbols = list(set(gene_symbols))
gene_symbols.sort()  # 以下のfor文で、どこまで遺伝子が処理されたのか途中経過を見積もるためのソート
P(gene_symbols[:3])
P(len(gene_symbols))  # 6003

['1110059G10Rik', '1500009L16Rik', '1600014C10Rik']
6904


In [60]:
output_dir = Path("data", "network", "gene_symbol")
# remove network directory
if output_dir.exists():
    shutil.rmtree(output_dir)

output_dir.mkdir(exist_ok=True, parents=True)


In [61]:
for gene_symbol in gene_symbols:
    """
    ノードが多すぎるとWebページが描画できない問題を回避するため、
    ノード数を200以下にするために最適なoverlap_ratioを求める
    """
    # 今の処理
    df_filtered = df_overlap[(df_overlap["marker1"] == gene_symbol) | (df_overlap["marker2"] == gene_symbol)]

    G = nx.from_pandas_edgelist(df_filtered, "marker1", "marker2")

    # ノードAと直接つながっているノードのみを取得
    neighbors = list(G.neighbors(gene_symbol))
    subgraph_nodes = [gene_symbol] + neighbors
    subgraph = G.subgraph(subgraph_nodes)

    target_number_of_nodes = 200
    if len(subgraph.nodes) > target_number_of_nodes:
        # 二分探索の範囲
        low, high = df_filtered["overlap_ratio"].min(), df_filtered["overlap_ratio"].max()
        best_overlap_ratio = None

        while low <= high:
            mid = (low + high) / 2

            # overlap_ratio >= mid のデータをフィルタリング
            df_mid = df_filtered[df_filtered["overlap_ratio"] >= mid]

            G = nx.from_pandas_edgelist(df_mid, "marker1", "marker2")
            # ノードAと直接つながっているノードのみを取得
            try:
                neighbors = list(G.neighbors(gene_symbol))
            except:
                high = mid - 1e-6
                continue
            subgraph_nodes = [gene_symbol] + neighbors
            subgraph = G.subgraph(subgraph_nodes)

            row_count = len(subgraph.nodes)
            # ターゲット列数に近い場合、結果を保存
            if target_number_of_nodes - 25 < row_count < target_number_of_nodes + 25:
                best_overlap_ratio = mid
                break
            elif row_count > target_number_of_nodes:
                # 列数が多い場合、範囲を上げる
                best_overlap_ratio = mid
                low = mid + 1e-6
            else:
                # 列数が少ない場合、範囲を下げる
                best_overlap_ratio = mid
                high = mid - 1e-6

        df_nodes = df_filtered[df_filtered["overlap_ratio"] >= best_overlap_ratio]
        G = nx.from_pandas_edgelist(df_nodes, "marker1", "marker2")
        # ノードAと直接つながっているノードのみを取得
        neighbors = list(G.neighbors(gene_symbol))
        subgraph_nodes = [gene_symbol] + neighbors
        subgraph = G.subgraph(subgraph_nodes)

    # nodesを用意
    node_json = []
    for node in subgraph.nodes():
        annotation = marker_mp_dict[node]
        node_color = 1 if node == gene_symbol else 0
        node_json.append({"data": {"id": node, "label": node, "node_color": node_color, "annotation": annotation}})

    # edgesを用意
    df_edge = df_overlap[
        (df_overlap["marker1"].isin(subgraph.nodes())) & (df_overlap["marker2"].isin(subgraph.nodes()))
    ]

    edge_json = []
    for edge in df_edge.itertuples():
        edge_json.append(
            {
                "data": {
                    "source": edge.marker1,
                    "target": edge.marker2,
                    "edge_size": edge.overlap_ratio,
                    "annotation": edge.overlapped_mp,
                }
            }
        )
    network_json = node_json + edge_json

    # Output as JSON
    if network_json:
        output_json = output_dir / f"{gene_symbol}.json.gz"
        with gzip.open(output_json, "wt", encoding="utf-8") as f:
            json.dump(network_json, f, indent=4)

# 10m

In [62]:
%%bash
ls -lhS data/network/gene_symbol/ | head -n 5
# version 0.3.0: total 170M
# version 0.3.1: total 168M
# version 0.3.2: total 145M

total 145M
-rwxrwxrwx 1 kuno kuno 177K May  9 16:22 Irf5.json.gz
-rwxrwxrwx 1 kuno kuno 177K May  9 16:24 Phactr4.json.gz
-rwxrwxrwx 1 kuno kuno 177K May  9 16:26 Zbtb8os.json.gz
-rwxrwxrwx 1 kuno kuno 177K May  9 16:26 Tnni1.json.gz


In [63]:
Path("data/overlap/available_gene_symbols.txt").write_text("\n".join(gene_symbols) + "\n")
print(len(gene_symbols))  # 4416 -> 4244 → 6003 → 4139
# version 0.2.2: 4139
# version 0.3.0: 6812 (Life stageを考慮 + 類似度を追加)
# version 0.3.1: 6812
# version 0.3.2: 6904

6904


In [64]:
%%bash

uname -a # OS name
date +"%Y/%m/%d %H:%M:%S" # Last update

Linux Sycom-2021 5.15.167.4-microsoft-standard-WSL2 #1 SMP Tue Nov 5 00:21:55 UTC 2024 x86_64 x86_64 x86_64 GNU/Linux
2025/05/09 16:26:59
