# TSUMUGIに必要なアノテーション情報を整理する

In [None]:
# Move up to the top directory of the repository (the one containing LICENSE)
import os
from pathlib import Path

print(os.getcwd())

# Climb one directory at a time until a LICENSE file is found.
while not Path("LICENSE").exists():
    # At the filesystem root, "../" resolves to the root itself, so without
    # this guard the loop would spin forever when no LICENSE exists above us.
    if Path.cwd() == Path.cwd().parent:
        raise FileNotFoundError(
            "Reached the filesystem root without finding a LICENSE file"
        )
    os.chdir("../")

print(os.getcwd())

In [None]:
from pathlib import Path
from collections import defaultdict
import pandas as pd
import json

In [None]:
# Location of the statistically filtered input table, relative to the repo root.
path_data = Path("data") / "statistical_filtered.csv"

In [None]:
# Load the input table and report its row count as a sanity check.
data = pd.read_csv(path_data)
print(len(data.index))
# Release 22.0: 54059

## 性特異的表現型をアノテーションする

In [None]:
import numpy as np

threshold = 0.0001

# Shared pieces of the two masks below.
sex_effect_significant = data["sex_effect_p_value"] < threshold
female_p_value = data["female_ko_effect_p_value"]
male_p_value = data["male_ko_effect_p_value"]

# Condition list: the sex effect is below the threshold and the knockout
# effect is below the threshold in exactly one sex (and above it in the other).
conditions = [
    sex_effect_significant & (female_p_value < threshold) & (male_p_value > threshold),
    sex_effect_significant & (male_p_value < threshold) & (female_p_value > threshold),
]

# Label assigned for each condition, in the same order.
choices = ["female", "male"]

# Rows matching neither condition are left as None.
data["sexdual_dimorphism"] = np.select(conditions, choices, default=None)

# Check the result.
print(data["sexdual_dimorphism"].value_counts())

In [None]:
# Keep only the rows that received a sex-specific label.
has_sex_label = data["sexdual_dimorphism"].notna()
filtered_data = data.loc[has_sex_label]

# Columns shown in the preview of the first ten labelled rows.
preview_columns = [
    "p_value",
    "sexdual_dimorphism",
    "effect_size",
    "genotype_effect_parameter_estimate",
    "female_ko_parameter_estimate",
    "male_ko_parameter_estimate",
]
filtered_data[preview_columns].head(10)

## 遺伝型をアノテーションする

In [None]:
# Unique (gene, phenotype, zygosity, sex label) combinations to annotate.
annotation_columns = [
    "marker_symbol",
    "mp_term_name",
    "zygosity",
    "sexdual_dimorphism",
]
data_select = data.loc[:, annotation_columns].drop_duplicates()
data_select

In [None]:
# Show how many of the selected rows fall under each zygosity value.
data_select["zygosity"].value_counts()

In [None]:
# Build {marker_symbol: ["<mp_term_name> (<zygosity label>[, <sex label>])", ...]}.
zygosity_labels = {"homozygote": "Homo", "heterozygote": "Hetero"}
sex_labels = {"female": "Female", "male": "Male"}

data_dict = defaultdict(list)
# Iterate over the columns directly; this avoids building a Series per row
# as iterrows() does, while visiting the rows in the same order.
for symbol, term_name, zygosity, sex in zip(
    data_select["marker_symbol"],
    data_select["mp_term_name"],
    data_select["zygosity"],
    data_select["sexdual_dimorphism"],
):
    # Any zygosity other than homozygote/heterozygote is labelled "Hemi".
    labels = [zygosity_labels.get(zygosity, "Hemi")]
    # Rows without a sex-specific label get no sex suffix.
    if sex in sex_labels:
        labels.append(sex_labels[sex])

    data_dict[symbol].append(f"{term_name} ({', '.join(labels)})")

# Sort each gene's annotations in place for stable output.
for annotations in data_dict.values():
    annotations.sort()

# Use .get() for the spot check: indexing a defaultdict with a missing key
# would insert an empty "Rhd" entry that then ends up in the exported JSON.
print(data_dict.get("Rhd", []))

In [None]:
# Write the gene -> phenotype annotations as pretty-printed, key-sorted JSON.
Path("data/annotation").mkdir(exist_ok=True, parents=True)
file_path = "data/annotation/symbol_mptermname.json"
# The context manager closes (and flushes) the file deterministically, so the
# shell commands that read it afterwards always see the complete content.
with open(file_path, "w") as f:
    json.dump(data_dict, f, indent=4, sort_keys=True)

In [None]:
%%bash

# Count the lines of the exported JSON that contain each annotation label.
grep -c "Male" data/annotation/symbol_mptermname.json | sed "s|^|Male: |"
grep -c "Female" data/annotation/symbol_mptermname.json | sed "s|^|Female: |"
grep -c "Homo" data/annotation/symbol_mptermname.json | sed "s|^|Homo: |"
grep -c "Hetero" data/annotation/symbol_mptermname.json | sed "s|^|Hetero: |"
grep -c "Hemi" data/annotation/symbol_mptermname.json | sed "s|^|Hemi: |"

# Counts noted from a previous run:
# Male: 4374
# Female: 3392
# Homo: 31206
# Hetero: 9725
# Hemi: 519


## Connect mp_term_name to IMPC Phenotype URL

In [None]:
# Unique (MP term ID, MP term name) pairs.
mp_term_columns = ["mp_term_id", "mp_term_name"]
data_select = data.loc[:, mp_term_columns].drop_duplicates()
data_select

In [None]:
# Map each MP term name to its IMPC phenotype page URL, built from the term ID.
# If a name occurs more than once, the last row's URL is the one kept.
data_dict_url = {
    term_name: f"https://www.mousephenotype.org/data/phenotypes/{term_id}"
    for term_id, term_name in zip(
        data_select["mp_term_id"], data_select["mp_term_name"]
    )
}

# Spot-check one known term.
print(data_dict_url["small lymph nodes"])

In [None]:
# Write one "<term name><TAB><url>" line per entry, each ending with a newline.
tsv_lines = (f"{term}\t{url}\n" for term, url in data_dict_url.items())
with open("data/annotation/mptermname_phenotypeurl.tsv", "w") as f:
    f.writelines(tsv_lines)

In [None]:
%%bash

# Preview the first three rows of the exported TSV, then count its lines.
head -n 3 data/annotation/mptermname_phenotypeurl.tsv
wc -l data/annotation/mptermname_phenotypeurl.tsv # Release 22.0: 664

## Connect marker_symbol to accession_id

In [None]:
# Unique (gene symbol, accession ID) pairs.
symbol_id_columns = ["marker_symbol", "marker_accession_id"]
data_select = data.loc[:, symbol_id_columns].drop_duplicates()
data_select

In [None]:
# Map each gene symbol to its accession ID. If a symbol occurs more than once,
# the last row's ID is the one kept.
data_dict = dict(
    zip(data_select["marker_symbol"], data_select["marker_accession_id"])
)

In [None]:
# Spot-check the accession ID stored for one gene symbol (KeyError if absent).
data_dict["Ncam1"]

In [None]:
# Export the symbol -> accession ID mapping as pretty-printed, key-sorted JSON.
# The context manager closes (and flushes) the file deterministically, so the
# shell commands that read it afterwards always see the complete content.
with open("data/annotation/symbol_mgiid.json", "w") as f:
    json.dump(data_dict, f, indent=4, sort_keys=True)

# Export the same mapping as "<symbol><TAB><id>" rows.
# NOTE: rows are joined with "\n", so the file has no trailing newline.
Path("data/annotation/symbol_mgiid.tsv").write_text(
    "\n".join(f"{k}\t{v}" for k, v in data_dict.items())
)

In [None]:
%%bash
# Preview the first three lines of both exported symbol -> ID files.
head -n 3 data/annotation/symbol_mgiid.json
head -n 3 data/annotation/symbol_mgiid.tsv

In [None]:
%%bash

# Print the current date and time as a record of when this notebook last ran.
date +"%Y/%m/%d %H:%M:%S" # Last update