# TSUMUGIに必要なアノテーション情報を整理する

In [1]:
# Move up to the top directory of the repository, i.e. the first ancestor of
# the current working directory that contains a LICENSE file.
import os
from pathlib import Path

print(os.getcwd())

while not Path("LICENSE").exists():
    # At the filesystem root the parent of the cwd is the cwd itself, so
    # os.chdir('../') would no longer move anywhere; stop instead of looping forever.
    if Path.cwd() == Path.cwd().parent:
        raise FileNotFoundError(
            "LICENSE not found in the starting directory or any of its parents"
        )
    os.chdir('../')

print(os.getcwd())

/mnt/e/Research/TSUMUGI-dev/notebooks/notebools-web
/mnt/e/Research/TSUMUGI-dev


In [2]:
from pathlib import Path
from collections import defaultdict
from itertools import combinations
import csv
import pandas as pd
from matplotlib import pyplot as plt
import pickle
import json

In [3]:
# Input table, located relative to the current working directory
path_data = Path("data") / "statistical_filtered.csv"

In [4]:
# Load the input table.
# low_memory=False makes pandas infer each column's dtype from the whole column
# instead of chunk by chunk, which avoids the mixed-type DtypeWarning and gives
# consistent dtypes within a column.
data = pd.read_csv(path_data, low_memory=False)
print(len(data))
# Release 22.0: 54059

54059


  data = pd.read_csv(path_data)


## 性特異的表現型をアノテーションする

In [5]:
import numpy as np

threshold = 0.0001

# Building blocks: each mask compares one p-value column against the threshold
sex_effect_below = data["sex_effect_p_value"] < threshold
female_ko_below = data["female_ko_effect_p_value"] < threshold
female_ko_above = data["female_ko_effect_p_value"] > threshold
male_ko_below = data["male_ko_effect_p_value"] < threshold
male_ko_above = data["male_ko_effect_p_value"] > threshold

# A row is labelled with a sex when the sex-effect p-value and that sex's KO
# p-value are below the threshold while the other sex's KO p-value is above it
conditions = [
    sex_effect_below & female_ko_below & male_ko_above,
    sex_effect_below & male_ko_below & female_ko_above,
]

# Labels matching the conditions above, in the same order
choices = ["female", "male"]

# Rows matching neither condition get None
data["sexdual_dimorphism"] = np.select(conditions, choices, default=None)

# Check the result
print(data["sexdual_dimorphism"].value_counts())


sexdual_dimorphism
male      4915
female    4146
Name: count, dtype: int64


In [6]:
# Keep only the rows that received a sex label and preview the relevant columns
has_sex_label = data["sexdual_dimorphism"].notna()
filtered_data = data[has_sex_label]

preview_columns = [
    "p_value",
    "sexdual_dimorphism",
    "effect_size",
    "genotype_effect_parameter_estimate",
    "female_ko_parameter_estimate",
    "male_ko_parameter_estimate",
]
filtered_data[preview_columns].head(10)

Unnamed: 0,p_value,sexdual_dimorphism,effect_size,genotype_effect_parameter_estimate,female_ko_parameter_estimate,male_ko_parameter_estimate
7,0.2473221,male,-0.775659,-9.880265,-9.880265,-52.709552
9,0.09323751,male,0.637815,-407.079557,-434.647221,1362.539526
20,5.407602e-05,female,0.246786,12.886045,12.886045,2.852505
24,3.072703e-06,female,-1.242673,-74.143434,-91.529079,-61.885708
27,2.315925e-13,female,2.053221,1.424075,1.421448,0.715908
28,0.2623063,male,1.258237,0.286702,0.286702,1.119338
38,0.3010097,male,0.942703,0.364327,0.364327,1.333887
50,0.764477,male,1.428011,0.047221,0.047221,0.976487
51,0.03243832,male,1.024187,450.500694,450.500694,1026.044584
54,4.8525e-06,male,0.063404,1.479369,1.109724,1.709209


## 遺伝型をアノテーションする

In [7]:
# Unique combinations of gene symbol, phenotype, zygosity and sex label
genotype_columns = ['marker_symbol', 'mp_term_name', 'zygosity', "sexdual_dimorphism"]
data_select = data[genotype_columns].drop_duplicates()
data_select

Unnamed: 0,marker_symbol,mp_term_name,zygosity,sexdual_dimorphism
0,Uap1,abnormal lens morphology,heterozygote,
1,6430548M08Rik,enlarged kidney,homozygote,
2,Stac,abnormal locomotor behavior,homozygote,
3,Slc24a4,enlarged thymus,heterozygote,
4,Uhrf2,female infertility,homozygote,
...,...,...,...,...
54048,Lrrk1,abnormal joint morphology,homozygote,
54051,Rtl5,abnormal kidney morphology,hemizygote,
54052,Adcy3,increased total body fat amount,heterozygote,female
54055,Alg3,"preweaning lethality, complete penetrance",homozygote,


In [8]:
# Number of de-duplicated annotation rows per zygosity
zygosity_counts = data_select["zygosity"].value_counts()
zygosity_counts

zygosity
homozygote      31206
heterozygote     9725
hemizygote        519
Name: count, dtype: int64

In [9]:
# Short labels used in the annotation suffix; any zygosity other than
# homozygote / heterozygote falls back to "Hemi"
zygosity_labels = {"homozygote": "Homo", "heterozygote": "Hetero"}
sex_labels = {"female": "Female", "male": "Male"}

# marker_symbol -> list of "<mp_term_name> (<zygosity>[, <sex>])" strings
data_dict = defaultdict(list)
rows = zip(
    data_select["marker_symbol"],
    data_select["mp_term_name"],
    data_select["zygosity"],
    data_select["sexdual_dimorphism"],
)
for symbol, term_name, zygosity, sex in rows:
    parts = [zygosity_labels.get(zygosity, "Hemi")]
    if sex in sex_labels:
        parts.append(sex_labels[sex])
    annotate = ", ".join(parts)

    data_dict[symbol].append(f"{term_name} ({annotate})")

# Sort each gene's annotations in place
for annotations in data_dict.values():
    annotations.sort()

print(data_dict["Rhd"])

['abnormal skin condition (Homo)', 'decreased circulating HDL cholesterol level (Homo, Male)', 'decreased circulating alkaline phosphatase level (Homo, Female)', 'decreased circulating cholesterol level (Homo, Male)', 'decreased circulating free fatty acids level (Homo)', 'decreased hemoglobin content (Homo, Male)', 'decreased mean corpuscular hemoglobin (Homo)', 'decreased mean corpuscular hemoglobin concentration (Homo)', 'decreased mean corpuscular volume (Homo)', 'increased exploration in new environment (Homo)']


In [10]:
# Write the symbol -> annotated phenotype list mapping as JSON.
Path("data/annotation").mkdir(exist_ok=True, parents=True)
file_path = "data/annotation/symbol_mptermname.json"
# Use a context manager so the file is flushed and closed before later cells
# read it from the shell.
with open(file_path, "w") as f:
    json.dump(data_dict, f, indent=4, sort_keys=True)


In [11]:
%%bash

# Count the lines of the JSON file that contain each annotation label.
# grep is case-sensitive, so the "Male" pattern does not match "Female".
grep -c "Male" data/annotation/symbol_mptermname.json | sed "s|^|Male: |"
grep -c "Female" data/annotation/symbol_mptermname.json | sed "s|^|Female: |"
grep -c "Homo" data/annotation/symbol_mptermname.json | sed "s|^|Homo: |"
grep -c "Hetero" data/annotation/symbol_mptermname.json | sed "s|^|Hetero: |"
grep -c "Hemi" data/annotation/symbol_mptermname.json | sed "s|^|Hemi: |"

# Male: 4374
# Female: 3392
# Homo: 31206
# Hetero: 9725
# Hemi: 519


Male: 4374
Feale: 3392
Homo: 31206
Hetero: 9725
Hemi: 519


## Connect mp_term_name to IMPC Phenotype URL

In [12]:
# Unique (MP term ID, MP term name) pairs
mp_term_columns = ['mp_term_id', 'mp_term_name']
data_select = data[mp_term_columns].drop_duplicates()
data_select

Unnamed: 0,mp_term_id,mp_term_name
0,MP:0001303,abnormal lens morphology
1,MP:0003068,enlarged kidney
2,MP:0001392,abnormal locomotor behavior
3,MP:0000709,enlarged thymus
4,MP:0001926,female infertility
...,...,...
46204,MP:0004818,increased skeletal muscle mass
47247,MP:0005407,hyperalgesia
48663,MP:0003956,abnormal body size
51033,MP:0009477,small cecum


In [13]:
# mp_term_name -> IMPC phenotype page URL built from the MP term ID.
# If a term name occurs more than once, the last row wins.
data_dict_url = {
    term_name: f"https://www.mousephenotype.org/data/phenotypes/{term_id}"
    for term_id, term_name in zip(data_select['mp_term_id'], data_select['mp_term_name'])
}

print(data_dict_url["small lymph nodes"])

https://www.mousephenotype.org/data/phenotypes/MP:0002217


In [14]:
# One "<term name><TAB><url>" line per entry, each terminated by a newline
with open('data/annotation/mptermname_phenotypeurl.tsv', 'w') as tsv_file:
    tsv_file.writelines(f"{term}\t{url}\n" for term, url in data_dict_url.items())

In [15]:
%%bash

tsv_path="data/annotation/mptermname_phenotypeurl.tsv"

# Show the first three rows, then the total number of lines
head -n 3 "$tsv_path"
wc -l "$tsv_path" # Release 22.0: 664

abnormal lens morphology	https://www.mousephenotype.org/data/phenotypes/MP:0001303
enlarged kidney	https://www.mousephenotype.org/data/phenotypes/MP:0003068
abnormal locomotor behavior	https://www.mousephenotype.org/data/phenotypes/MP:0001392
664 data/annotation/mptermname_phenotypeurl.tsv


## Connect marker_symbol to accession_id

In [22]:
# Unique (gene symbol, accession ID) pairs
symbol_id_columns = ['marker_symbol', 'marker_accession_id']
data_select = data[symbol_id_columns].drop_duplicates()
data_select

Unnamed: 0,marker_symbol,marker_accession_id
0,Uap1,MGI:1334459
1,6430548M08Rik,MGI:2443793
2,Stac,MGI:1201400
3,Slc24a4,MGI:2447362
4,Uhrf2,MGI:1923718
...,...,...
53904,Tm9sf1,MGI:1921390
53963,Eci3,MGI:1916373
53987,Stambp,MGI:1917777
54006,Wdr45b,MGI:1914090


In [23]:
# marker_symbol -> marker_accession_id; if a symbol occurs more than once,
# the last row wins
data_dict = dict(zip(data_select['marker_symbol'], data_select['marker_accession_id']))

In [24]:
# Spot-check: look up the accession ID stored for one gene symbol
data_dict["Ncam1"]

'MGI:97281'

In [25]:
# Write the symbol -> accession ID mapping as JSON (keys sorted) and as TSV
# (dict insertion order, no trailing newline).
# Use a context manager so the JSON file handle is flushed and closed
# instead of being left open.
with open("data/annotation/symbol_mgiid.json", "w") as f:
    json.dump(data_dict, f, indent=4, sort_keys=True)
# Last expression: write_text returns the number of characters written
Path("data/annotation/symbol_mgiid.tsv").write_text("\n".join([f"{k}\t{v}" for k, v in data_dict.items()]))

140147

In [26]:
%%bash
# Show the first three lines of each output file, JSON first
for annotation_file in data/annotation/symbol_mgiid.json data/annotation/symbol_mgiid.tsv; do
    head -n 3 "$annotation_file"
done

{
    "0610010K14Rik": "MGI:1915609",
    "0610040J01Rik": "MGI:1923511",
Uap1	MGI:1334459
6430548M08Rik	MGI:2443793
Stac	MGI:1201400


In [27]:
%%bash

# Last update: print the current date and time
date '+%Y/%m/%d %H:%M:%S'

2025/03/19 15:20:29
