# 表現型の類似度を求める

In [None]:
# Move up to the top directory of the repository (the one containing LICENSE)
import os
from pathlib import Path

print(os.getcwd())

while not Path("LICENSE").exists():
    # At the filesystem root the parent of the current directory is the
    # directory itself, so chdir("../") would no longer make progress.
    # Fail loudly instead of looping forever when LICENSE is never found.
    if Path.cwd().parent == Path.cwd():
        raise FileNotFoundError(
            "LICENSE was not found in the starting directory or any parent directory"
        )
    os.chdir("../")

print(os.getcwd())

In [None]:
from pathlib import Path
from collections import defaultdict
from itertools import combinations
import csv
import pandas as pd
from matplotlib import pyplot as plt
import pickle
import json

In [None]:
# Location of the input table, relative to the current working directory
path_data = Path("data") / "statistical_filtered.csv"

In [None]:
# Load the CSV at path_data into a DataFrame and report its row count
data = pd.read_csv(path_data)

print(len(data))  # 54059

In [None]:
file_path = Path("data", "annotation", "symbol_mptermname.json")

# Read the JSON inside a context manager so the file handle is closed
# as soon as parsing finishes (the bare open() left it dangling).
with open(file_path) as f:
    symbol_mptermname = json.load(f)

In [None]:
# Show the entry loaded from the JSON file for the key "Dpf2"
print(symbol_mptermname["Dpf2"])

In [None]:
# Turn every non-empty value into a set; keys whose value is empty are dropped
symbol_mptermname = {
    symbol: set(terms) for symbol, terms in symbol_mptermname.items() if terms
}
print(symbol_mptermname["Dpf2"])

In [None]:
# with open("data/overlap_ratios.csv", "w") as f_all, open("data/overlap_ratios_filtered.csv", "w") as f_filtered:
#     writer_all = csv.writer(f_all)
#     writer_filtered = csv.writer(f_filtered)
#     writer_all.writerow(["marker1", "marker2", "overlap_ratio", "overlapped_mp_number", "overlapped_mp"])
#     writer_filtered.writerow(["marker1", "marker2", "overlap_ratio", "overlapped_mp_number", "overlapped_mp"])
#     for a, b in combinations(symbol_mptermname, 2):
#         overlapped_mp = symbol_mptermname[a] & symbol_mptermname[b]
#         overlapped_mp_number = len(overlapped_mp)
#         union_mp_number = len(symbol_mptermname[a] | symbol_mptermname[b])
#         overlap_ratio = overlapped_mp_number / union_mp_number

#         overlapped_mp = ",".join(map(str, sorted(overlapped_mp)))

#         if overlapped_mp_number > 0:
#             writer_all.writerow([a, b, round(overlap_ratio, 3), overlapped_mp_number, overlapped_mp])
#         if overlapped_mp_number > 2:
#             writer_filtered.writerow([a, b, round(overlap_ratio, 3), overlapped_mp_number, overlapped_mp])

# # 30s


In [None]:
# Compute the similarity of two sets with the Jaccard index: |A ∩ B| / |A ∪ B|

overlapped_ratios_all = []

for a, b in combinations(symbol_mptermname, 2):
    terms_a = symbol_mptermname[a]
    terms_b = symbol_mptermname[b]

    overlapped_mp = sorted(terms_a & terms_b)
    overlapped_mp_number = len(overlapped_mp)
    # |A ∪ B| = |A| + |B| - |A ∩ B|
    union_mp_number = len(terms_a) + len(terms_b) - overlapped_mp_number
    overlap_ratio = overlapped_mp_number / union_mp_number

    # Record layout: [gene a, gene b, rounded ratio, number shared, sorted shared terms]
    record = [a, b, round(overlap_ratio, 3), overlapped_mp_number, overlapped_mp]
    overlapped_ratios_all.append(record)

## 46s

In [None]:
# Report how many pair records were produced and preview the first three
print(len(overlapped_ratios_all))  # 29996385
print(overlapped_ratios_all[:3])

In [None]:
num_overlapped_mp = 3

# Keep only the records whose fourth field (index 3, the number of overlapped
# phenotypes) is at least num_overlapped_mp.
overlapped_ratios_filtered = [
    pair for pair in overlapped_ratios_all if pair[3] >= num_overlapped_mp
]

In [None]:
# Report how many records passed the filter, then preview the first three
print(len(overlapped_ratios_filtered))
# Release 21.1: 134880
# Release 22.0: 133281 <- decreased because exact matches of Homo/Hetero/Hemi and of ♂/♀ are now taken into account
# After changing the phenotype filter threshold from ">= 3" to ">= 2": 580458
print(overlapped_ratios_filtered[:3])

In [None]:
# Make sure the output directory exists before writing into it
Path("data", "overlap").mkdir(exist_ok=True, parents=True)

# Write each pickle inside a context manager so the handle is flushed and
# closed deterministically (the bare open() calls were never closed).
with open("data/overlap/overlapped_ratios_all.pkl", "wb") as f:
    pickle.dump(overlapped_ratios_all, f)

with open("data/overlap/overlapped_ratios_filtered.pkl", "wb") as f:
    pickle.dump(overlapped_ratios_filtered, f)

# 18 sec

In [None]:
# Build a DataFrame with one row per record of overlapped_ratios_all
df_overlap = pd.DataFrame(overlapped_ratios_all)

In [None]:
# Name the columns in the order the records were built
df_overlap.columns = [
    "Gene1",
    "Gene2",
    "Jaccard Similarity",
    "Number of overlapped phenotype",
    "Overlapped phenotype",
]
# DataFrame.reindex returns a new object instead of modifying in place, so the
# result must be assigned back; otherwise the column reordering is silently lost.
df_overlap = df_overlap.reindex(
    columns=[
        "Gene1",
        "Gene2",
        "Number of overlapped phenotype",
        "Jaccard Similarity",
        "Overlapped phenotype",
    ]
)
df_overlap

In [None]:
# Write df_overlap as a gzip-compressed CSV without the index column,
# using "\n" as the line terminator.
df_overlap.to_csv(
    "data/TSUMUGI_raw_data.csv.gz", index=False, compression="gzip", lineterminator="\n"
)
# 3 min

In [None]:
num_overlapped_mp = 1

# Redo the filtering with the lower threshold: keep the records whose fourth
# field (index 3) is at least num_overlapped_mp.
overlapped_ratios_filtered = [
    pair for pair in overlapped_ratios_all if pair[3] >= num_overlapped_mp
]

In [None]:
# Name the columns at construction time (in the order the records were built);
# unlike assigning .columns afterwards, this also works for an empty record list.
df_overlap_filtered = pd.DataFrame(
    overlapped_ratios_filtered,
    columns=[
        "Gene1",
        "Gene2",
        "Jaccard Similarity",
        "Number of overlapped phenotype",
        "Overlapped phenotype",
    ],
)
# DataFrame.reindex returns a new object instead of modifying in place, so the
# result must be assigned back; otherwise the column reordering is silently lost.
df_overlap_filtered = df_overlap_filtered.reindex(
    columns=[
        "Gene1",
        "Gene2",
        "Number of overlapped phenotype",
        "Jaccard Similarity",
        "Overlapped phenotype",
    ]
)
df_overlap_filtered

In [None]:
# Write the filtered pairs as a gzip-compressed CSV without the index column,
# using "\n" as the line terminator.
df_overlap_filtered.to_csv(
    "data/TSUMUGI_filtered_data.csv.gz",
    index=False,
    compression="gzip",
    lineterminator="\n",
)
# Write the same table as gzip-compressed, indented JSON.
# NOTE(review): index=False is passed without an explicit `orient`; pandas
# restricts which orient values accept index=False and the behaviour differs
# between versions — confirm the intended JSON layout and set `orient` explicitly.
df_overlap_filtered.to_json(
    "data/TSUMUGI_filtered_data.json.gz", index=False, compression="gzip", indent=2
)
# 1 min

In [None]:
%%bash

ls -lh data/TSUMUGI_raw_data.csv.gz # 100M
# Dump the first lines as characters and look for the line terminator.
# -F makes grep search for the literal two characters "\n" printed by od -c;
# without it, "\n" is an undefined escape in a grep pattern.
zcat data/TSUMUGI_raw_data.csv.gz | head | od -c | head | grep -F '\n'

In [None]:
%%bash

ls -lh data/TSUMUGI_filtered_data.csv.gz # 21M
# Dump the first lines as characters and look for the line terminator.
# -F makes grep search for the literal two characters "\n" printed by od -c;
# without it, "\n" is an undefined escape in a grep pattern.
zcat data/TSUMUGI_filtered_data.csv.gz | head | od -c | head | grep -F '\n'

In [None]:
%%bash

ls -lh data/TSUMUGI_filtered_data.json.gz # 
# Preview the first lines of the JSON, then dump them as characters.
zcat data/TSUMUGI_filtered_data.json.gz | head
# -F makes grep search for the literal two characters "\n" printed by od -c;
# without it, "\n" is an undefined escape in a grep pattern.
zcat data/TSUMUGI_filtered_data.json.gz | head | od -c | head | grep -F '\n'

In [None]:
# with open("data/overlap_ratios.csv", "a") as f:
#     writer = csv.writer(f)
#     for a in symbol_mptermname:
#         intersect_count = 0
#         union_count = 0
#         overlap = 0
#         writer.writerow([a, a, intersect_count, union_count, overlap])



In [None]:
%%bash

# Print the current date and time in YYYY/MM/DD HH:MM:SS form
date +"%Y/%m/%d %H:%M:%S"  # Last update

## Experiment🧑‍🔬: Visualize the number of intersects

- [ ] ひとつの遺伝子において何個くらいの表現型が共通しているのか

In [None]:
# Read the pickle from the path it is written to earlier in this notebook
# ("data/overlap/..."); the previous "data/overlapped_ratios_all.pkl" did not
# match that location. The context manager closes the file after loading.
with open("data/overlap/overlapped_ratios_all.pkl", "rb") as f:
    overlapped_ratios_all = pickle.load(f)  # 24s

In [None]:
# Preview the first three loaded records
print(overlapped_ratios_all[:3])

In [None]:
# For every gene, count the pairs it takes part in whose fourth field
# (index 3, the number of overlapped phenotypes) is positive. Genes that never
# share anything still get an entry with a count of 0.
counts = defaultdict(int)
for gene1, gene2, _, shared_number, _ in overlapped_ratios_all:
    step = 1 if shared_number > 0 else 0
    counts[gene1] += step
    counts[gene2] += step

In [None]:
# Reorder the per-gene counts from the largest count to the smallest
sort_counts = dict(sorted(counts.items(), key=lambda x: x[1], reverse=True))

In [None]:
# Show the count stored for the key "Rhd"
print(sort_counts["Rhd"])

In [None]:
# Number of distinct genes that have an entry in sort_counts
print(len(sort_counts))  # 7626

In [None]:
import seaborn as sns

# Draw a bar chart with seaborn: one bar per gene symbol in sort_counts
plt.figure(figsize=(10, 6))
sns.barplot(x=list(sort_counts.keys()), y=list(sort_counts.values()), color="orange")

# Axis titles; the gene total is taken from the data rather than hard-coded
# so the label cannot go stale when the input changes.
plt.xlabel(f"Gene Symbols ({len(sort_counts)})")
plt.ylabel("Number of Genes Sharing Phenotypes")
plt.xticks([])  # hide the per-gene tick labels on the x axis
plt.grid(axis="y")

# Show the plot
plt.show()

In [None]:
# # Sort the data by keys to ensure the plot is ordered numerically
# sorted_data = dict(sorted(count_intersect.items(), key=lambda item: int(item[0])))

# # Create a bar plot
# plt.figure(figsize=(10, 6))
# plt.bar(sorted_data.keys(), sorted_data.values(), color="skyblue")
# plt.xlabel("Number of Shared Significant Phenotypes")
# plt.ylabel("Number of Gene Symbol Pairs")
# plt.xticks(rotation=45)
# plt.grid(axis="y")

# # Show the plot
# plt.show()


- A single interaction accounts for 85%.
  - If the network you are visualizing is messy, it might be a good idea to exclude this one interaction.


## Experiment🧑‍🔬: Visualize the overlaps

In [None]:
path_results = Path("data", "overlap_ratios_py.csv")
# newline="" is what the csv module asks for on files handed to csv.reader,
# so that newlines inside quoted fields are interpreted correctly.
with open(path_results, "r", newline="") as f:
    reader = csv.reader(f)
    # Skip the header row; the default avoids StopIteration on an empty file.
    header = next(reader, None)
    # Tally how many rows carry each overlap-ratio value.
    count_overlap = defaultdict(int)
    for row in reader:
        # Each data row is expected to hold exactly these five fields.
        marker1, marker2, intersect_count, union_count, overlap_ratio = row
        count_overlap[float(overlap_ratio)] += 1

In [None]:
# Rebuild the mapping with its keys in ascending order so the bars are
# laid out numerically.
sorted_data = {
    ratio: frequency
    for ratio, frequency in sorted(count_overlap.items(), key=lambda pair: pair[0])
}

# Bar chart: one bar per key, bar height is the stored frequency
plt.figure(figsize=(10, 6))
plt.bar(list(sorted_data), list(sorted_data.values()), color="skyblue")
plt.title("Bar Plot of Frequency Data")
plt.xlabel("Category")
plt.ylabel("Frequency")
plt.xticks(rotation=45)
plt.grid(axis="y")

# Render the figure
plt.show()

In [None]:
# Display the key-sorted mapping as the cell output
sorted_data

In [None]:
# Display the unsorted tally as the cell output
count_overlap

In [None]:
# Split the tally into two parallel lists: keys in x, their counts in y
x = [*count_overlap]
y = [*count_overlap.values()]

# Scatter plot with the counts on the horizontal axis and the keys on the vertical axis
plt.figure(figsize=(10, 8))
plt.scatter(y, x)
plt.title("Scatter plot of defaultdict values")
plt.xlabel("Values")
plt.ylabel("Keys")
plt.show()