# 創薬対象となりえる遺伝子リストを取得する

# セットアップ

In [None]:
# Move up to the project's top directory (the one that contains LICENSE).
import os
from pathlib import Path

print(os.getcwd())

# Search the current directory and each of its ancestors for LICENSE.
# A plain `while ...: os.chdir("../")` loop never terminates when no ancestor
# contains LICENSE, because chdir("../") at the filesystem root is a no-op.
# The working directory is only changed once a match has been found.
start_dir = Path.cwd()
for candidate in (start_dir, *start_dir.parents):
    if (candidate / "LICENSE").exists():
        os.chdir(candidate)
        break
else:
    raise FileNotFoundError(
        f"LICENSE not found in {start_dir} or any of its parent directories"
    )

print(os.getcwd())

In [None]:
from pathlib import Path
from pprint import pprint
from collections import defaultdict, Counter
from itertools import combinations
import csv
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

# Short aliases for interactive use: P -> print, PP -> pprint, C -> Counter.
P, PP, C = print, pprint, Counter

# 実験

- DGIdb 5.0

https://academic.oup.com/nar/article/52/D1/D1227/7416371

In [None]:
# Load the DGIdb interactions table (tab-separated) from the 2024-Dec data
# directory; '#' is treated as the comment character while parsing.
url = "https://dgidb.org/data/2024-Dec/interactions.tsv"
df_interactions = pd.read_table(url, comment="#")
df_interactions  # 98920 rows (count recorded by the author)

In [None]:
# Print summary statistics for interaction_score, then plot its histogram
# using 1000 bins.
print(df_interactions.loc[:, ["interaction_score"]].describe())
df_interactions.loc[:, ["interaction_score"]].plot.hist(bins=1000)

In [None]:
# Print summary statistics for evidence_score, then plot its histogram
# using 1000 bins.
# print() is called exactly once: print() returns None, so nesting it inside
# a second print() would emit a stray "None" line after the summary.
print(df_interactions[["evidence_score"]].describe())
df_interactions[["evidence_score"]].plot.hist(bins=1000)

In [None]:
# Keep only the interactions whose drug_is_approved value equals True.
# The explicit comparison (rather than using the column directly as a mask)
# means rows with a missing value are treated as not approved.
df_interactions_approved = df_interactions.loc[
    df_interactions["drug_is_approved"].eq(True)
]
df_interactions_approved  # 38483 rows (count recorded by the author)

In [None]:
# From the approved-drug interactions, keep the rows with evidence_score >= 2.
# NOTE(review): the variable name is misspelt ("eveidence"); it is kept as-is
# because other cells refer to it by this name.
df_interactions_approved_high_eveidence = df_interactions_approved.loc[
    df_interactions_approved["evidence_score"].ge(2)
]
df_interactions_approved_high_eveidence  # 20452 rows (count recorded by the author)

In [None]:
# Collect the distinct gene names among the approved, high-evidence
# interactions and report how many there are.
druggable_genes = df_interactions_approved_high_eveidence.loc[
    :, "gene_name"
].unique()
print(len(druggable_genes))  # 2265 (count recorded by the author)

In [None]:
# Sort the gene names and write them out, one per line.

druggable_genes = sorted(druggable_genes)
Path("data/experiments/").mkdir(parents=True, exist_ok=True)
with open("data/experiments/druggable_genes.txt", "w") as out_file:
    out_file.writelines(symbol + "\n" for symbol in druggable_genes)

## ヒトからマウスの遺伝子シンボルに変換

In [None]:
# Load the MGI mouse/human homology report (tab-separated); '#' is treated as
# the comment character while parsing.
url = "https://www.informatics.jax.org/downloads/reports/HOM_MouseHumanSequence.rpt"

df_mouse_human = pd.read_table(url, comment="#")
df_mouse_human  # 1081 rows (count recorded by the author; re-check against the current report)

In [None]:
# Extract 'mouse' or 'human' from the common organism name into a new
# "Organism" column, then keep only the three columns used downstream.
df_mouse_human["Organism"] = (
    df_mouse_human["Common Organism Name"]
    .str.extract(r"(mouse|human)", expand=False)
)
df = df_mouse_human.loc[:, ["DB Class Key", "Organism", "Symbol"]]
df

In [None]:
# Split the symbols by organism: one frame per species, each keeping the
# DB Class Key and exposing its Symbol column under the species name.
human_df = df.loc[df["Organism"].eq("human"), ["DB Class Key", "Symbol"]].rename(
    columns={"Symbol": "human"}
)
mouse_df = df.loc[df["Organism"].eq("mouse"), ["DB Class Key", "Symbol"]].rename(
    columns={"Symbol": "mouse"}
)

In [None]:
# Pair human and mouse symbols that share a DB Class Key. The left join keeps
# every human row, so a human symbol without a mouse partner has a missing
# value in the "mouse" column.
df_merged = human_df.merge(mouse_df, how="left", on="DB Class Key")
df_merged = df_merged.reset_index(drop=True)

# Export the distinct (human, mouse) pairs as a tab-separated table.
(
    df_merged[["human", "mouse"]]
    .drop_duplicates()
    .to_csv("data/experiments/human_mouse_genes.txt", sep="\t", index=False)
)

In [None]:
# Spot-check the merged rows for a single DB Class Key.
df_merged.loc[df_merged["DB Class Key"].eq(48820152)]

## マウスにおけるDruggable genesのリストを作成

In [None]:
# Peek at the first ten entries of druggable_genes.
druggable_genes[:10]

In [None]:
# Build a human -> mouse gene symbol lookup from the merged table.
# - Rows lacking either symbol are dropped first: the left merge leaves a
#   missing value in "mouse" for human genes without a mouse partner, and a
#   NaN stored in the dict would later break sorting and string concatenation.
# - The dict is built straight from the two columns instead of calling
#   `iloc[i]` twice per row, which is much slower on large tables.
# - When a human symbol appears in several rows, the last row wins.
human_mouse = dict(
    df_merged[["human", "mouse"]].dropna().itertuples(index=False, name=None)
)
human_mouse["TP53"]  # Trp53

In [None]:
# Translate each druggable human gene to its mouse symbol, skipping genes
# that have no entry in human_mouse, and sort the result.
# NOTE(review): if several human genes map to the same mouse symbol, the list
# (and the output file) will contain that symbol more than once — confirm
# whether duplicates are intended.
druggable_genes_mouse = sorted(
    human_mouse[symbol] for symbol in druggable_genes if symbol in human_mouse
)

# Write the mouse symbols one per line, then report the human and mouse counts.
with open("data/experiments/druggable_genes_mouse.txt", "w") as out_file:
    out_file.writelines(symbol + "\n" for symbol in druggable_genes_mouse)
print(len(druggable_genes))
print(len(druggable_genes_mouse))  # 2189 (count recorded by the author)