# 各表現型について、表現型に類似性があり、性差が異なる遺伝子リストを取得する

# セットアップ

In [1]:
# Move up to top directory
import os
from pathlib import Path

print(os.getcwd())

while not Path("LICENSE").exists():
    os.chdir('../')

print(os.getcwd())

/mnt/e/Research/TSUMUGI-dev-main/notebooks/notebooks-experiments
/mnt/e/Research/TSUMUGI-dev-main


In [2]:
from pathlib import Path
from pprint import pprint
from collections import defaultdict, Counter
from itertools import combinations
import csv
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

P = print
PP = pprint
C = Counter


# 実験

In [3]:
df_similarity = pd.read_csv('data/TSUMUGI_raw_data.csv.gz')
# 15 sec

In [4]:
conditions = (
    (df_similarity['Number of overlapped phenotype'] > 0) &
    (df_similarity['Jaccard Similarity'] > 0.1)
)
df_similarity_with_phenotype = df_similarity[conditions]
df_similarity_with_phenotype # 1779355 row

Unnamed: 0,Gene1,Gene2,Jaccard Similarity,Number of overlapped phenotype,Overlapped phenotype
5,0610010K14Rik,1700003F12Rik,0.167,1,"['preweaning lethality, complete penetrance (H..."
18,0610010K14Rik,1700067K01Rik,0.250,1,"['preweaning lethality, complete penetrance (H..."
26,0610010K14Rik,2310011J03Rik,0.200,1,"['preweaning lethality, complete penetrance (H..."
42,0610010K14Rik,4930444P10Rik,0.125,1,"['preweaning lethality, complete penetrance (H..."
56,0610010K14Rik,4933427D14Rik,0.500,3,['embryonic lethality prior to organogenesis (...
...,...,...,...,...,...
29996154,Znhit2,Znhit6,0.154,2,['embryonic lethality prior to organogenesis (...
29996158,Znhit2,Zranb2,0.125,1,"['preweaning lethality, complete penetrance (H..."
29996170,Znhit2,Zwint,0.250,2,['embryonic lethality prior to organogenesis (...
29996190,Znhit6,Zwint,0.182,2,['embryonic lethality prior to organogenesis (...


In [None]:
# Overlapped phenotypeのなかに、"male"または"female"が含まれているものを抽出する
df_similarity_with_sexual_dimorphism = df_similarity_with_phenotype[
    df_similarity_with_phenotype['Overlapped phenotype'].str.contains('male|female')
]
df_similarity_with_sexual_dimorphism # 63494 row

Unnamed: 0,Gene1,Gene2,Jaccard Similarity,Number of overlapped phenotype,Overlapped phenotype
131537,1700030K09Rik,2210408I21Rik,0.200,2,"['decreased bone mineral content (Homo, Female..."
131619,1700030K09Rik,Abcc10,0.250,1,"['decreased bone mineral content (Homo, Female)']"
131764,1700030K09Rik,Adgrd1,0.250,1,"['decreased bone mineral content (Homo, Female)']"
131767,1700030K09Rik,Adgrf5,0.167,2,"['decreased bone mineral content (Homo, Female..."
131815,1700030K09Rik,Agpat4,0.143,1,"['decreased bone mineral content (Homo, Female)']"
...,...,...,...,...,...
29993285,Zfp629,Zpld1,0.222,2,"['hyperactivity (Homo)', 'hyperactivity (Homo,..."
29993481,Zfp641,Zfp804a,0.200,2,"['hyperactivity (Homo)', 'hyperactivity (Homo,..."
29993516,Zfp641,Zpld1,0.286,2,"['hyperactivity (Homo)', 'hyperactivity (Homo,..."
29994988,Zfp804a,Zpld1,0.222,2,"['hyperactivity (Homo)', 'hyperactivity (Homo,..."


In [17]:
# Overlapeed phenotypeの中で、maleとfemaleを含むものを抽出する
overlapped_phenotype = df_similarity_with_sexual_dimorphism['Overlapped phenotype'].tolist()

import ast

# 各文字列をパースしてリストに変換
parsed_lists = [ast.literal_eval(item) for item in overlapped_phenotype]

# 全てのリストを結合
flattened_list = []
for sublist in parsed_lists:
    flattened_list.extend(sublist)

P(flattened_list[:5])

['decreased bone mineral content (Homo, Female)', 'decreased bone mineral density (Homo)', 'decreased bone mineral content (Homo, Female)', 'decreased bone mineral content (Homo, Female)', 'decreased bone mineral content (Homo, Female)']


In [18]:

phenotype_with_sexual_dimorphism = [p for p in flattened_list if "Male" in p or "Female" in p]

P(phenotype_with_sexual_dimorphism[:5])

['decreased bone mineral content (Homo, Female)', 'decreased bone mineral content (Homo, Female)', 'decreased bone mineral content (Homo, Female)', 'decreased bone mineral content (Homo, Female)', 'decreased bone mineral content (Homo, Female)']


In [20]:
Counter(phenotype_with_sexual_dimorphism).most_common(100)

[('hyperactivity (Homo, Female)', 13781),
 ('increased circulating alkaline phosphatase level (Homo, Female)', 3981),
 ('decreased bone mineral content (Homo, Female)', 3132),
 ('decreased grip strength (Homo, Female)', 2839),
 ('increased grip strength (Homo, Female)', 1835),
 ('decreased locomotor activity (Homo, Female)', 1821),
 ('hyperactivity (Hetero, Female)', 1246),
 ('decreased bone mineral density (Homo, Female)', 1208),
 ('increased total body fat amount (Homo, Female)', 917),
 ('abnormal bone structure (Homo, Female)', 809),
 ('increased fasting circulating glucose level (Homo, Female)', 682),
 ('increased circulating alkaline phosphatase level (Hetero, Female)', 612),
 ('increased bone mineral content (Homo, Female)', 572),
 ('decreased lean body mass (Homo, Female)', 525),
 ('increased grip strength (Hetero, Female)', 509),
 ('decreased grip strength (Hetero, Female)', 434),
 ('decreased exploration in new environment (Homo, Female)', 292),
 ('decreased body length (Homo,