# 性特異的な遺伝子モジュールを探索する

# セットアップ

In [1]:
# Move up to top directory
import os
from pathlib import Path

print(os.getcwd())

while not Path("LICENSE").exists():
    os.chdir('../')

print(os.getcwd())

/mnt/e/Research/TSUMUGI-dev-main/notebooks/notebooks-experiments
/mnt/e/Research/TSUMUGI-dev-main


In [57]:
from pathlib import Path
from pprint import pprint
from collections import defaultdict, Counter
from itertools import combinations
import csv
import numpy as np
import pandas as pd
import polars as pl
from matplotlib import pyplot as plt
import seaborn as sns
import networkx as nx

P = print
PP = pprint
C = Counter


# 実験

In [None]:
import json
import pandas as pd
df_tsumugi = pd.read_csv("data/TSUMUGI_raw_data.csv.gz",
    converters={"List of shared phenotypes": json.loads})

# 45 seconds

In [45]:
df_tsumugi_filtered = df_tsumugi[
    (df_tsumugi["Jaccard Similarity"] > 0.1) &
    (df_tsumugi["Number of shared phenotype"] > 2)
]

In [46]:
df_tsumugi_filtered

Unnamed: 0,Gene1,Gene2,Jaccard Similarity,Number of shared phenotype,List of shared phenotypes
32086,1700003F12Rik,Alg10b,0.125,3,"[abnormal embryo size (Homo, Embryo), abnormal..."
32144,1700003F12Rik,Ankmy2,0.158,3,"[abnormal embryo size (Homo, Embryo), abnormal..."
32581,1700003F12Rik,Bmp4,0.120,3,"[abnormal embryo size (Homo, Embryo), abnormal..."
32764,1700003F12Rik,Cars2,0.273,3,"[abnormal embryo size (Homo, Embryo), abnormal..."
32979,1700003F12Rik,Cdk8,0.429,3,"[abnormal embryo size (Homo, Embryo), abnormal..."
...,...,...,...,...,...
31464745,Zfp503,Zwilch,0.227,5,"[abnormal embryo size (Homo, Embryo), abnormal..."
31464836,Zfp512,Zpld1,0.429,3,"[abnormal behavior (Homo, Early), decreased th..."
31466730,Zfp612,Zfp641,0.500,3,"[decreased mean corpuscular volume (Homo, Earl..."
31467185,Zfp638,Znhit2,0.600,3,[embryonic lethality prior to organogenesis (H...


In [67]:
import pandas as pd
import re

# explode して行展開
df_long = df_tsumugi_filtered.explode("List of shared phenotypes").dropna(subset=["List of shared phenotypes"])

# phenotype と sex を抽出する関数
def extract_phenotype_and_sex(entry):
    match = re.match(r"^(.*?) \((.*?)\)$", entry)
    if match:
        phenotype_part, annotation_part = match.groups()
        parts = [p.strip() for p in annotation_part.split(",")]
        for p in parts:
            if p in {"Male", "Female"}:
                return {"Phenotype": phenotype_part.strip(), "Sex": p}
    return None

# .apply()して辞書を返し、Noneを除外
extracted = df_long["List of shared phenotypes"].apply(extract_phenotype_and_sex)
df_extracted = df_long[extracted.notnull()].copy()

# 抽出した辞書を DataFrame 化し、元と連結
df_extracted = pd.concat([df_extracted.reset_index(drop=True), pd.DataFrame(extracted.dropna().tolist()).reset_index(drop=True)], axis=1)

# 最終整形
df_result = df_extracted[["Gene1", "Gene2", "Phenotype", "Sex"]]

# 表示（例）
print(df_result.head())


           Gene1   Gene2                                    Phenotype   Sex
0  1700008P02Rik  Fam81b  increased circulating HDL cholesterol level  Male
1  1700008P02Rik  Fam81b              increased total body fat amount  Male
2  1700008P02Rik    Heca  increased circulating HDL cholesterol level  Male
3  1700008P02Rik    Heca              increased total body fat amount  Male
4  1700008P02Rik  Ms4a4d  increased circulating HDL cholesterol level  Male


In [68]:
# def extract_phenotype_and_sex(entry):
#     entry = entry.strip()  # ← 空白や改行を除去
#     match = re.match(r"^(.*?)\s*\((.*?)\)$", entry)
#     if match:
#         phenotype_part, annotation_part = match.groups()
#         parts = [p.strip() for p in annotation_part.split(",")]
#         for p in parts:
#             if p in {"Male", "Female"}:
#                 return {"Phenotype": phenotype_part.strip(), "Sex": p}
#     return None

# entry = "increased total body fat amount (Homo, Female, Early)"
# extract_phenotype_and_sex(entry)

In [69]:
df_result["Sex"].value_counts()

Sex
Male      7662
Female    7036
Name: count, dtype: int64

In [74]:
# モジュールを格納する辞書
modules = defaultdict(list)

# Phenotype × Sex specificity ごとにグラフを構築
for (phenotype, sex), group in df_result.groupby(['Phenotype', 'Sex']):
    G = nx.Graph()
    G.add_edges_from(group[['Gene1', 'Gene2']].values)
    
    # 連結成分（モジュール）を抽出
    for component in nx.connected_components(G):
        if len(component) > 1:  # 単独ノードは除外
            modules[(phenotype, sex)].append(sorted(component))

modules = dict(modules)
# # 結果の表示
# for (phenotype, sex), comps in modules.items():
#     print(f"\n[Phenotype: {phenotype}, Sex: {sex}]")
#     for i, comp in enumerate(comps, 1):
#         print(f"  Module {i}: {comp}")

In [75]:
len(modules)

205

In [76]:
P(modules)

{('abnormal behavior', 'Female'): [['Ago1', 'Arhgef4', 'Arl4d', 'Bpifa2', 'Cacna1b', 'Cadm1', 'Cpap', 'Csnk2a2', 'Cyb561', 'Defb1', 'Fzd1', 'Gal3st1', 'Hoga1', 'Kdm3a', 'Kptn', 'Lrig1', 'M6pr', 'Milr1', 'Mysm1', 'Ndufv3', 'Pacs2', 'Prmt7', 'Rsad2', 'Sbspon', 'Sox14', 'Sqstm1', 'Trak2', 'Usp42', 'Vat1l', 'Vcf1'], ['Ehmt1', 'Fgd5', 'Fgf10', 'Jak3', 'Lemd3', 'Srp72']], ('abnormal behavior', 'Male'): [['Ap3s2', 'Clcn1', 'Cnot7', 'Cnp', 'Epha3', 'Fbxo25', 'Fcho1', 'Il1rl2', 'Il9r', 'Kcnt1', 'Leprotl1', 'Lrrtm1', 'Ncf1', 'Ngfr', 'Nod2', 'Nucb1', 'Sbf1', 'Scg5', 'Slc22a3', 'Slc25a18', 'Vldlr'], ['Dsc3', 'Fubp1', 'Lamtor4', 'Ncoa2', 'Nedd4', 'Robo1', 'Tgds']], ('abnormal bone mineralization', 'Female'): [['Acp2', 'Rnf169']], ('abnormal bone mineralization', 'Male'): [['Angel1', 'Dhx40', 'Fam117b', 'Pbx3', 'Rhbdf2', 'Tnfaip1', 'Xylb']], ('abnormal bone structure', 'Female'): [['Adnp2', 'Ajap1', 'Anapc7', 'Arhgap10', 'Arvcf', 'Brms1l', 'Cers5', 'Chd2', 'Cnih2', 'Cyp17a1', 'Dennd5b', 'Dnase1l2', 

In [72]:
# 結果の表示
for (phenotype, sex), comps in modules.items():
    print(f"\n[Phenotype: {phenotype}, Sex: {sex}]")
    for i, comp in enumerate(comps, 1):
        print(f"  Module {i}: {comp}")


[Phenotype: abnormal behavior, Sex: Female]
  Module 1: ['Ago1', 'Arhgef4', 'Arl4d', 'Bpifa2', 'Cacna1b', 'Cadm1', 'Cpap', 'Csnk2a2', 'Cyb561', 'Defb1', 'Fzd1', 'Gal3st1', 'Hoga1', 'Kdm3a', 'Kptn', 'Lrig1', 'M6pr', 'Milr1', 'Mysm1', 'Ndufv3', 'Pacs2', 'Prmt7', 'Rsad2', 'Sbspon', 'Sox14', 'Sqstm1', 'Trak2', 'Usp42', 'Vat1l', 'Vcf1']
  Module 2: ['Ehmt1', 'Fgd5', 'Fgf10', 'Jak3', 'Lemd3', 'Srp72']

[Phenotype: abnormal behavior, Sex: Male]
  Module 1: ['Ap3s2', 'Clcn1', 'Cnot7', 'Cnp', 'Epha3', 'Fbxo25', 'Fcho1', 'Il1rl2', 'Il9r', 'Kcnt1', 'Leprotl1', 'Lrrtm1', 'Ncf1', 'Ngfr', 'Nod2', 'Nucb1', 'Sbf1', 'Scg5', 'Slc22a3', 'Slc25a18', 'Vldlr']
  Module 2: ['Dsc3', 'Fubp1', 'Lamtor4', 'Ncoa2', 'Nedd4', 'Robo1', 'Tgds']

[Phenotype: abnormal bone mineralization, Sex: Female]
  Module 1: ['Acp2', 'Rnf169']

[Phenotype: abnormal bone mineralization, Sex: Male]
  Module 1: ['Angel1', 'Dhx40', 'Fam117b', 'Pbx3', 'Rhbdf2', 'Tnfaip1', 'Xylb']

[Phenotype: abnormal bone structure, Sex: Female]
  Mo

## ✅ 目的

以下の条件をすべて満たすペアを抽出したい：

* 同じ遺伝子群の一部（≧3遺伝子） を含む2つのモジュール間で、
* 片方がMale、もう片方がFemale
* 表現型（Phenotype）が異なる

つまり：

“同じ遺伝子群が、性別によって異なる表現型に関与している” ことを示唆する遺伝子モジュールペアを抽出したい。

In [87]:
from itertools import combinations
from collections import defaultdict

def find_sex_diff_phenotype_modules(modules, min_shared_genes=3):
    """
    modules: dict of (phenotype, sex) → list of list of genes
    """
    result = []

    # フラット化: (phenotype, sex, gene_set)
    flat_modules = []
    for (phenotype, sex), mod_lists in modules.items():
        for genes in mod_lists:
            flat_modules.append((phenotype, sex, set(genes)))

    # すべてのペアを比較
    for (ph1, sex1, genes1), (ph2, sex2, genes2) in combinations(flat_modules, 2):
        if sex1 == sex2:
            continue  # 両方ともMaleやFemaleならスキップ
        if ph1 == ph2:
            continue  # 表現型が同じならスキップ

        shared = genes1 & genes2
        if len(shared) >= min_shared_genes:
            result.append({
                'shared_genes': sorted(shared),
                'module1': {'phenotype': ph1, 'sex': sex1, 'genes': sorted(genes1)},
                'module2': {'phenotype': ph2, 'sex': sex2, 'genes': sorted(genes2)},
            })

    return result

# 使用例
test_modules = {("hoge", "Male"): [["A", "B", "C", "D"]], ("fuga", "Female"): [["A", "B", "C"]],}

interesting_pairs = find_sex_diff_phenotype_modules(test_modules)

# 表示例
for i, pair in enumerate(interesting_pairs[:5], 1):  # 最初の5件だけ表示
    print(f"\n=== Match {i} ===")
    print("Shared genes:", pair['shared_genes'])
    print("→ Module 1:", pair['module1']['phenotype'], "/", pair['module1']['sex'])
    print("   Genes:", pair['module1']['genes'])
    print("→ Module 2:", pair['module2']['phenotype'], "/", pair['module2']['sex'])
    print("   Genes:", pair['module2']['genes'])



=== Match 1 ===
Shared genes: ['A', 'B', 'C']
→ Module 1: hoge / Male
   Genes: ['A', 'B', 'C', 'D']
→ Module 2: fuga / Female
   Genes: ['A', 'B', 'C']


In [88]:
interesting_pairs = find_sex_diff_phenotype_modules(modules)

# 表示例
for i, pair in enumerate(interesting_pairs, 1):  # 最初の5件だけ表示
    if pair['module1']['phenotype'] == "abnormal behavior":
        continue
    if pair['module2']['phenotype'] == "abnormal behavior":
        continue
    print(f"\n=== Match {i} ===")
    print("Shared genes:", pair['shared_genes'])
    print("→ Module 1:", pair['module1']['phenotype'], "/", pair['module1']['sex'])
    print("   Genes:", pair['module1']['genes'])
    print("→ Module 2:", pair['module2']['phenotype'], "/", pair['module2']['sex'])
    print("   Genes:", pair['module2']['genes'])


=== Match 6 ===
Shared genes: ['Ncald', 'Postn', 'Ppp1r9b']
→ Module 1: abnormal bone structure / Female
   Genes: ['Adnp2', 'Ajap1', 'Anapc7', 'Arhgap10', 'Arvcf', 'Brms1l', 'Cers5', 'Chd2', 'Cnih2', 'Cyp17a1', 'Dennd5b', 'Dnase1l2', 'Elovl5', 'Fancl', 'Fbf1', 'Foxk2', 'G2e3', 'Gdf5', 'Hectd3', 'Hif1an', 'Hspb6', 'Ier5l', 'Kash5', 'Kbtbd7', 'Klhl9', 'Myh1', 'Ncald', 'Nrxn1', 'Pcif1', 'Pdxp', 'Pepd', 'Pkp4', 'Postn', 'Ppp1r9b', 'Prdm14', 'Ptp4a1', 'Ptp4a2', 'Rab3ip', 'Rbpms2', 'Rwdd1', 'Selenot', 'Smoc2', 'Sra1', 'Stag3', 'Steap2', 'Strbp', 'Tmco6', 'Tmod2', 'Tnfrsf11b', 'Tram1', 'Trim37', 'Vkorc1l1', 'Xkr4']
→ Module 2: abnormal sleep behavior / Male
   Genes: ['Ap4e1', 'Ncald', 'Pitx3', 'Postn', 'Ppp1r9b', 'Tmem151b', 'Tppp', 'Zzef1']

=== Match 7 ===
Shared genes: ['Fbf1', 'Hspb6', 'Nrxn1']
→ Module 1: abnormal bone structure / Female
   Genes: ['Adnp2', 'Ajap1', 'Anapc7', 'Arhgap10', 'Arvcf', 'Brms1l', 'Cers5', 'Chd2', 'Cnih2', 'Cyp17a1', 'Dennd5b', 'Dnase1l2', 'Elovl5', 'Fancl', 

In [79]:
len(interesting_pairs)

414

In [84]:
interesting_pairs

[{'shared_genes': ['Cacna1b',
   'Csnk2a2',
   'Defb1',
   'Fzd1',
   'Hoga1',
   'Kdm3a',
   'M6pr',
   'Milr1',
   'Trak2'],
  'module1': {'phenotype': 'abnormal behavior',
   'sex': 'Female',
   'genes': ['Ago1',
    'Arhgef4',
    'Arl4d',
    'Bpifa2',
    'Cacna1b',
    'Cadm1',
    'Cpap',
    'Csnk2a2',
    'Cyb561',
    'Defb1',
    'Fzd1',
    'Gal3st1',
    'Hoga1',
    'Kdm3a',
    'Kptn',
    'Lrig1',
    'M6pr',
    'Milr1',
    'Mysm1',
    'Ndufv3',
    'Pacs2',
    'Prmt7',
    'Rsad2',
    'Sbspon',
    'Sox14',
    'Sqstm1',
    'Trak2',
    'Usp42',
    'Vat1l',
    'Vcf1']},
  'module2': {'phenotype': 'decreased anxiety-related response',
   'sex': 'Female',
   'genes': ['Ano2',
    'Cacna1b',
    'Comt',
    'Csnk2a2',
    'Defb1',
    'Fzd1',
    'Grik3',
    'Hoga1',
    'Kcnf1',
    'Kdm3a',
    'M6pr',
    'Milr1',
    'Trak2']}},
 {'shared_genes': ['Arl4d', 'Lrig1', 'Usp42'],
  'module1': {'phenotype': 'abnormal behavior',
   'sex': 'Female',
   'genes': ['Ag

In [39]:
df = pl.read_csv("data/TSUMUGI_raw_data.csv.gz")

df = df.with_columns([
    pl.col("List of shared phenotypes").str.json_decode().alias("List of shared phenotypes")
  ])

ComputeError: error deserializing JSON: error deserializing value "String("vertebral transformation (Homo, Early)")" as null. \
            Try increasing `infer_schema_length` or specifying a schema.
            

In [None]:
df

In [14]:
RELEASE = 22.1

In [27]:
path_data = Path("data", "impc", f"statistical-results-ALL-{RELEASE}.csv")
data = pd.read_csv(path_data)
# 30 seconds

  data = pd.read_csv(path_data)


In [28]:
columns = ["marker_symbol", "mp_term_name", "p_value", "sex_effect_p_value", "female_ko_effect_p_value", "male_ko_effect_p_value", "zygosity", "effect_size",]


data = data[columns]

# Filter by p_value < 0.0001
threshold = 0.0001
filter_pvalue = data["p_value"] < threshold
filter_female_ko_pvalue = data["female_ko_effect_p_value"] < threshold
filter_male_ko_pvalue = data["male_ko_effect_p_value"] < threshold

data_filtered = data[filter_pvalue | filter_male_ko_pvalue | filter_male_ko_pvalue]

# Filter by mp_term_id and mp_term_name are not NaN
data_filtered = data_filtered.dropna(subset=["mp_term_name"])

# Filter by effect_size is not NaN
data_filtered = data_filtered.dropna(subset=["effect_size"])

data_filtered

Unnamed: 0,marker_symbol,mp_term_name,p_value,sex_effect_p_value,female_ko_effect_p_value,male_ko_effect_p_value,zygosity,effect_size
1,Uap1,abnormal lens morphology,6.967638e-10,,0.000016,1.779191e-05,heterozygote,0.697502
85,6430548M08Rik,enlarged kidney,0.000000e+00,,,,homozygote,1.000000
98,Stac,abnormal locomotor behavior,5.883610e-08,,0.000048,4.532563e-04,homozygote,0.364640
262,Slc24a4,enlarged thymus,0.000000e+00,,,,heterozygote,1.000000
295,Uhrf2,female infertility,0.000000e+00,,,,homozygote,1.000000
...,...,...,...,...,...,...,...,...
3165250,Rab6b,decreased thigmotaxis,1.571041e-01,0.0,0.163047,4.835990e-09,homozygote,-1.246848
3165251,Alg3,"preweaning lethality, complete penetrance",0.000000e+00,,,,homozygote,1.000000
3165261,Acsl4,decreased body weight,7.441061e-06,,,,hemizygote,-2.794646
3165306,Thoc3,embryonic lethality prior to organogenesis,0.000000e+00,,,,homozygote,1.000000


In [32]:
data_annotated = data_filtered.copy()

threshold = 0.0001

# 条件リスト
conditions = [
    (data_annotated["sex_effect_p_value"] < threshold) & (data_annotated["female_ko_effect_p_value"] < threshold) & (data_annotated["male_ko_effect_p_value"] > threshold),
    (data_annotated["sex_effect_p_value"] < threshold) & (data_annotated["male_ko_effect_p_value"] < threshold) & (data_annotated["female_ko_effect_p_value"] > threshold)
]

# 条件に対応する値
choices = ["female", "male"]

# np.selectで列を設定
data_annotated["sex"] = np.select(conditions, choices, default=None)
data_annotated = data_annotated.reset_index(drop=True)

# 結果を確認
print(RELEASE)
print(data_annotated["sex"].value_counts())

22.1
sex
male      4915
female    4146
Name: count, dtype: int64


In [33]:
data_annotated

Unnamed: 0,marker_symbol,mp_term_name,p_value,sex_effect_p_value,female_ko_effect_p_value,male_ko_effect_p_value,zygosity,effect_size,sex
0,Uap1,abnormal lens morphology,6.967638e-10,,0.000016,1.779191e-05,heterozygote,0.697502,
1,6430548M08Rik,enlarged kidney,0.000000e+00,,,,homozygote,1.000000,
2,Stac,abnormal locomotor behavior,5.883610e-08,,0.000048,4.532563e-04,homozygote,0.364640,
3,Slc24a4,enlarged thymus,0.000000e+00,,,,heterozygote,1.000000,
4,Uhrf2,female infertility,0.000000e+00,,,,homozygote,1.000000,
...,...,...,...,...,...,...,...,...,...
54054,Rab6b,decreased thigmotaxis,1.571041e-01,0.0,0.163047,4.835990e-09,homozygote,-1.246848,male
54055,Alg3,"preweaning lethality, complete penetrance",0.000000e+00,,,,homozygote,1.000000,
54056,Acsl4,decreased body weight,7.441061e-06,,,,hemizygote,-2.794646,
54057,Thoc3,embryonic lethality prior to organogenesis,0.000000e+00,,,,homozygote,1.000000,


In [35]:
columns_sex = ["marker_symbol", "mp_term_name", "sex"]
data_sex = data_annotated[columns_sex]
P(len(data_sex))

54059


In [36]:
data_sex_filtered = data_sex[data_sex["sex"].notna()]
P(len(data_sex_filtered))

9061


In [37]:
data_sex_filtered

Unnamed: 0,marker_symbol,mp_term_name,sex
7,C1qtnf1,decreased circulating cholesterol level,male
9,Prdm8,hyperactivity,male
20,Adcy2,increased circulating HDL cholesterol level,female
24,Eif4g2,hyperactivity,female
27,Ptpru,hyperactivity,female
...,...,...,...
54019,Serac1,decreased exploration in new environment,female
54021,Trim55,prolonged RR interval,male
54033,Ccl26,abnormal sleep behavior,female
54052,Adcy3,increased total body fat amount,female


In [None]:
from itertools import combinations
import networkx as nx

df = data_sex_filtered.copy()

# モジュールを記録
modules = defaultdict(list)

# phenotypes per sex
for (phenotype, sex), group in df.groupby(['mp_term_name', 'sex']):
    genes = group['marker_symbol'].unique()

    if len(genes) < 2:
        continue  # モジュールにならない単一遺伝子はスキップ

    G = nx.Graph()
    G.add_nodes_from(genes)
    G.add_edges_from(combinations(genes, 2))  # 完全グラフ：同じ表現型に属する遺伝子を接続

    for comp in nx.connected_components(G):
        modules[(phenotype, sex)].append(sorted(comp))

# 結果表示
for (phenotype, sex), comps in modules.items():
    print(f"\n[Phenotype: {phenotype}, Sex: {sex}]")
    for i, comp in enumerate(comps, 1):
        print(f"  Module {i}: {comp}")