# Overlap と mp_term_name の統合

-   例：abnormal_circulating_glucose_level（連続値）
-   例：male_infertility（カテゴリ値）


In [None]:
# Move up to the repository root: the first directory, starting from the
# current one and walking towards the filesystem root, that contains LICENSE.
import os
from pathlib import Path

start_dir = Path.cwd()
print(os.getcwd())

# Search the ancestors explicitly instead of looping on os.chdir("../"):
# at the filesystem root chdir("..") is a no-op, so the old loop never
# terminated when no LICENSE file existed anywhere above the start directory.
repo_root = next(
    (
        candidate
        for candidate in (start_dir, *start_dir.parents)
        if (candidate / "LICENSE").exists()
    ),
    None,
)
if repo_root is None:
    raise FileNotFoundError(
        f"No LICENSE file found in {start_dir} or any of its parent directories"
    )
os.chdir(repo_root)

print(os.getcwd())

In [None]:
P = print  # short alias for print(); used by the debug cells further down
from pathlib import Path
import json
import pickle
import gzip
import shutil

import pandas as pd

# 一括処理


In [None]:
# Load the pre-computed marker-pair overlap records and wrap them in a DataFrame.
# The context manager closes the file handle, which the previous
# `pickle.load(open(...))` form left open.
# NOTE(review): pickle can execute arbitrary code while loading; only read
# files produced by this project's own pipeline.
with open("data/overlap/overlapped_ratios_filtered.pkl", "rb") as f:
    overlap = pickle.load(f)

df_overlap = pd.DataFrame(
    overlap,
    columns=[
        "marker1",
        "marker2",
        "overlap_ratio",
        "overlapped_mp_number",
        "overlapped_mp",
    ],
)
df_overlap  # 133281  rows × 5 columns

In [None]:
# Load the marker-symbol -> MP-term-name mapping and turn it into a two-column table.
# JSON is read as UTF-8 explicitly (the platform default encoding may differ),
# and the context manager closes the handle that `json.load(open(...))` leaked.
with open("data/annotation/symbol_mptermname.json", encoding="utf-8") as f:
    marker_mp = json.load(f)
marker_mp = pd.DataFrame(marker_mp.items(), columns=["marker_symbol", "mp_term_name"])
marker_mp  # 7626 rows × 2 columns

In [None]:
# Row counts of the overlap table and the marker annotation table, one per line.
# Counts noted by the author on earlier runs: 184250 → 134880 and 7626.
print(len(df_overlap), len(marker_mp), sep="\n")

In [None]:
# Start from an empty output directory so files from earlier runs never linger.
output_dir = Path("data/network/mp_term_name")

stale_output_present = output_dir.exists()
if stale_output_present:
    shutil.rmtree(output_dir)

output_dir.mkdir(parents=True, exist_ok=True)

In [None]:
path_mp_terms = list(Path("data", "mp_term_name").glob("*.csv"))

# Networks with too many nodes cannot be rendered by the web page, so for every
# MP term the overlap_ratio threshold is raised until the network has fewer
# than `number_of_nodes + node_tolerance` node rows.
number_of_nodes = 200
node_tolerance = 25  # upper slack of the acceptance band around the target


def build_node_table(df_edges, df_marker_effect, marker_mp):
    """Return the node table for the endpoints of ``df_edges``.

    Args:
        df_edges: Edge table with ``marker1`` and ``marker2`` columns.
        df_marker_effect: Table with ``marker_symbol`` and ``effect_size`` columns.
        marker_mp: Annotation table keyed by a ``marker_symbol`` column.

    Returns:
        The unique endpoint symbols inner-joined with ``marker_mp`` and then
        with ``df_marker_effect``; symbols missing from either table are dropped.
    """
    endpoints = pd.concat(
        [df_edges["marker1"], df_edges["marker2"]], ignore_index=True
    )
    df_node = endpoints.drop_duplicates().to_frame(name="marker_symbol")
    df_node = df_node.merge(marker_mp, how="inner", on="marker_symbol")
    return df_node.merge(df_marker_effect, how="inner", on="marker_symbol")


def find_overlap_threshold(df_edges, df_marker_effect, marker_mp, node_limit):
    """Return the lowest overlap_ratio whose network has fewer than ``node_limit`` node rows.

    Only ratios that occur in ``df_edges`` are tried, so a network that is
    already small enough keeps all of its edges (threshold == minimum ratio).

    Args:
        df_edges: Edge table with ``marker1``, ``marker2`` and ``overlap_ratio``.
        df_marker_effect: Passed through to :func:`build_node_table`.
        marker_mp: Passed through to :func:`build_node_table`.
        node_limit: Exclusive upper bound on the number of node rows.

    Returns:
        The selected threshold, the highest observed ratio when no threshold
        gets below ``node_limit``, or ``None`` when there is no ratio at all.
    """
    thresholds = sorted(df_edges["overlap_ratio"].dropna().unique())
    if not thresholds:
        return None

    # Raising the threshold can only remove edges (and therefore nodes), so
    # "node rows < node_limit" flips from False to True exactly once along the
    # sorted thresholds; bisect for the first index where it holds.
    lo, hi = 0, len(thresholds) - 1
    while lo < hi:
        mid = (lo + hi) // 2
        df_mid = df_edges[df_edges["overlap_ratio"] >= thresholds[mid]]
        if len(build_node_table(df_mid, df_marker_effect, marker_mp)) < node_limit:
            hi = mid
        else:
            lo = mid + 1
    return thresholds[lo]


for path_mp_term in path_mp_terms:
    mp_term = path_mp_term.stem

    df_marker_effect = (
        pd.read_csv(path_mp_term)[["marker_symbol", "effect_size"]]
        .drop_duplicates()
        .dropna(subset=["effect_size"])
    )
    # Use the absolute value of the effect size.
    # NOTE(review): a marker with several distinct effect sizes yields several
    # node rows sharing one id -- confirm the viewer tolerates that.
    df_marker_effect = df_marker_effect.assign(
        effect_size=df_marker_effect["effect_size"].abs()
    )

    # Keep only edges whose two endpoints both have an effect size for this term.
    known_symbols = df_marker_effect["marker_symbol"]
    df_filtered = df_overlap[
        df_overlap["marker1"].isin(known_symbols)
        & df_overlap["marker2"].isin(known_symbols)
    ]

    best_overlap_ratio = find_overlap_threshold(
        df_filtered,
        df_marker_effect,
        marker_mp,
        node_limit=number_of_nodes + node_tolerance,
    )
    if best_overlap_ratio is None:
        # No edge connects two markers of this MP term: nothing to export.
        continue

    df_filtered = df_filtered[df_filtered["overlap_ratio"] >= best_overlap_ratio]

    # --- Nodes ---
    df_node = build_node_table(df_filtered, df_marker_effect, marker_mp)
    if df_node.empty:
        continue

    node_json = [
        {
            "data": {
                "id": symbol,
                "label": symbol,
                "annotation": annotation,
                "node_color": effect_size,
            }
        }
        for symbol, annotation, effect_size in zip(
            df_node["marker_symbol"], df_node["mp_term_name"], df_node["effect_size"]
        )
    ]

    # --- Edges ---
    df_edge = df_filtered[["marker1", "marker2", "overlap_ratio", "overlapped_mp"]]
    edge_json = [
        {
            "data": {
                "source": source,
                "target": target,
                "annotation": shared_terms,
                "edge_size": ratio,
            }
        }
        for source, target, ratio, shared_terms in zip(
            df_edge["marker1"],
            df_edge["marker2"],
            df_edge["overlap_ratio"],
            df_edge["overlapped_mp"],
        )
    ]

    # --- Combine nodes and edges and write one gzip-compressed JSON per MP term ---
    network_json = node_json + edge_json

    output_json = output_dir / f"{mp_term}.json.gz"
    with gzip.open(output_json, "wt", encoding="utf-8") as f:
        json.dump(network_json, f, indent=4)

# 1m30s

In [None]:
%%bash

# List the generated network files sorted by size (-S, largest first);
# head keeps the "total" line plus the first four entries.
ls -lhS data/network/mp_term_name/ | head -n 5

# Output recorded by the author:
# total 5.3M
# -rwxrwxrwx 1 aki aki  73K Feb 22 12:02 edema.json.gz
# -rwxrwxrwx 1 aki aki  63K Feb 22 12:02 enlarged_kidney.json.gz
# -rwxrwxrwx 1 aki aki  63K Feb 22 12:02 abnormal_lymph_node_morphology.json.gz
# -rwxrwxrwx 1 aki aki  58K Feb 22 12:02 small_kidney.json.gz

In [None]:
%%bash

date +"%Y/%m/%d %H:%M:%S" # Last update: prints the current date and time

# Debug


## 個別のMP term に対する処理

In [None]:
# MP-term files that have been used for debugging; the last entry is the one
# currently selected (reorder the list to inspect a different term).
debug_candidates = [
    "preweaning_lethality,_complete_penetrance.csv",
    "male_infertility.csv",
    "abnormal_anxiety-related_response.csv",
]
path_mp_term = Path("data", "mp_term_name", debug_candidates[-1])

In [None]:
# Debug: load the effect sizes of the selected MP term and keep the overlap
# edges whose two markers both appear in that table.
mp_term = path_mp_term.stem

df_marker_effect = (
    pd.read_csv(path_mp_term)[["marker_symbol", "effect_size"]]
    .drop_duplicates()
    .dropna(subset=["effect_size"])
)
P(len(df_marker_effect))

# Use the absolute value of the effect size.
df_marker_effect = df_marker_effect.assign(
    effect_size=df_marker_effect["effect_size"].abs()
)

known_symbols = df_marker_effect["marker_symbol"]
both_endpoints_known = df_overlap["marker1"].isin(known_symbols) & df_overlap[
    "marker2"
].isin(known_symbols)
df_filtered = df_overlap[both_endpoints_known]

df_filtered

In [None]:
# Debug: build the network of a single MP term step by step.
mp_term = path_mp_term.stem

df_marker_effect = (
    pd.read_csv(path_mp_term)[["marker_symbol", "effect_size"]]
    .drop_duplicates()
    .dropna(subset=["effect_size"])
)
# Use the absolute value of the effect size.
df_marker_effect = df_marker_effect.assign(
    effect_size=df_marker_effect["effect_size"].abs()
)

known_symbols = df_marker_effect["marker_symbol"]
df_filtered = df_overlap[
    df_overlap["marker1"].isin(known_symbols)
    & df_overlap["marker2"].isin(known_symbols)
]

# Too many nodes make the web page impossible to render, so look for the
# overlap_ratio threshold that brings the node count to 500 or fewer.
target_columns = 500


def collect_nodes(edges):
    """Return the node table (symbol, MP annotation, effect size) for ``edges``."""
    endpoint_frames = [
        pd.merge(
            edges[[column]],
            df_marker_effect,
            left_on=column,
            right_on="marker_symbol",
            how="inner",
        )[["marker_symbol"]]
        for column in ("marker1", "marker2")
    ]
    nodes = pd.concat(endpoint_frames, axis=0).drop_duplicates()
    nodes = nodes.merge(marker_mp, how="inner", on="marker_symbol")
    return nodes.merge(df_marker_effect, how="inner", on="marker_symbol")


# Bisection range: the smallest and largest overlap ratio among the candidate edges.
low = df_filtered["overlap_ratio"].min()
high = df_filtered["overlap_ratio"].max()
best_overlap_ratio = None

while low <= high:
    mid = (low + high) / 2
    # The midpoint is recorded whichever way the comparison below turns out.
    best_overlap_ratio = mid

    # Node rows that remain when only edges with overlap_ratio >= mid are kept.
    row_count = len(collect_nodes(df_filtered[df_filtered["overlap_ratio"] >= mid]))

    if row_count == target_columns:
        break
    if row_count > target_columns:
        # Too many rows: move the lower bound up.
        low = mid + 1e-6
    else:
        # Too few rows: move the upper bound down.
        high = mid - 1e-6

df_filtered = df_filtered[df_filtered["overlap_ratio"] >= best_overlap_ratio]

# --- Nodes ---
df_node = collect_nodes(df_filtered)

node_json = [
    {
        "data": {
            "id": row["marker_symbol"],
            "label": row["marker_symbol"],
            "annotation": row["mp_term_name"],
            "node_color": row["effect_size"],
        }
    }
    for _, row in df_node.iterrows()
]

# --- Edges ---
df_edge = df_filtered[["marker1", "marker2", "overlap_ratio", "overlapped_mp"]]
edge_json = [
    {
        "data": {
            "source": row["marker1"],
            "target": row["marker2"],
            "annotation": row["overlapped_mp"],
            "edge_size": row["overlap_ratio"],
        }
    }
    for _, row in df_edge.iterrows()
]

# --- Combine nodes and edges ---
network_json = node_json + edge_json

In [None]:
# Print the node-row count, the chosen threshold and the final bisection bounds,
# then display the node table as the cell output.
P(len(df_node), best_overlap_ratio, low, high)
df_node

In [None]:
# path_overlap = Path("data", "overlap_ratios_filtered.csv")
# df_overlap = pd.read_csv(path_overlap)

# Reload the overlap records; the context manager closes the file handle that
# `pickle.load(open(...))` used to leave open.
# NOTE(review): this path differs from "data/overlap/overlapped_ratios_filtered.pkl"
# used by the batch-processing cell -- confirm which location is current.
with open("data/overlapped_ratios_filtered.pkl", "rb") as f:
    overlap = pickle.load(f)

df_overlap = pd.DataFrame(
    overlap,
    columns=[
        "marker1",
        "marker2",
        "overlap_ratio",
        "overlapped_mp_number",
        "overlapped_mp",
    ],
)
P(len(df_overlap))  # 184250
df_overlap.head(3)

In [None]:
# Collect every per-MP-term CSV, then show the first three paths and the total
# count (670 on the author's run).
mp_term_dir = Path("data", "mp_term_name")
path_mp_terms = list(mp_term_dir.glob("*.csv"))
print(path_mp_terms[:3], len(path_mp_terms), sep="\n")

In [None]:
# MP term inspected in the cells below; swap in the commented file name to
# look at the opposite phenotype.
# path_mp_term = Path("data", "mp_term_name", "decreased_circulating_glucose_level.csv")
path_mp_term = Path("data") / "mp_term_name" / "increased_circulating_glucose_level.csv"
mp_term = path_mp_term.stem
print(mp_term)

In [None]:
# Read the (marker, effect size) pairs of the selected MP term, drop repeated
# pairs and discard rows whose effect_size is NaN.
df_marker_effect = (
    pd.read_csv(path_mp_term)[["marker_symbol", "effect_size"]]
    .drop_duplicates()
    .dropna(subset=["effect_size"])
)

print(len(df_marker_effect))

df_marker_effect.head(3)

In [None]:
# print(df_mp["effect_size"].describe())

In [None]:
# Replace every effect size by its absolute value.
df_marker_effect = df_marker_effect.assign(
    effect_size=df_marker_effect["effect_size"].abs()
)

In [None]:
# print(df_overlap[df_overlap["intersect_count"] > 5 & df_overlap["overlap_ratio"] > 0.5])

In [None]:
# Optional pre-filters that were tried earlier (kept for reference):
# df_overlap_filterd = df_overlap[df_overlap["overlapped_mp_number"] > 0]
# df_overlap_filterd = df_overlap[(df_overlap["overlap_ratio"] > 0.25)]
# No pre-filter is active: the name below refers to the same DataFrame as df_overlap.
df_overlap_filterd = df_overlap

In [None]:
# Keep the edges whose two endpoints both have an effect size for this MP term.
known_symbols = df_marker_effect["marker_symbol"]
marker1_known = df_overlap_filterd["marker1"].isin(known_symbols)
marker2_known = df_overlap_filterd["marker2"].isin(known_symbols)
df_filtered = df_overlap_filterd[marker1_known & marker2_known]

In [None]:
# Number of retained edges, followed by a three-row preview as the cell output.
print(df_filtered.shape[0])

df_filtered.head(3)

## 出力


### Node を作成する


In [None]:
# Build the node table: every marker that is an endpoint of a retained edge,
# joined with its MP-term annotation and its effect size.
df_marker1 = df_filtered[["marker1"]]
df_marker2 = df_filtered[["marker2"]]
df_node_marker1 = pd.merge(
    df_marker1,
    df_marker_effect,
    left_on="marker1",
    right_on="marker_symbol",
    how="inner",
)[["marker_symbol"]]
df_node_marker2 = pd.merge(
    df_marker2,
    df_marker_effect,
    left_on="marker2",
    right_on="marker_symbol",
    how="inner",
)[["marker_symbol"]]

# A marker can take part in many edges; keep each symbol once.
df_node = pd.concat([df_node_marker1, df_node_marker2], axis=0).drop_duplicates()

import pickle

# The context manager closes the file handle that `pickle.load(open(...))`
# used to leave open.
# NOTE(review): pickle can execute arbitrary code while loading; only read
# files produced by this project's own pipeline.
with open("data/marker_mp.pkl", "rb") as f:
    marker_mp = pickle.load(f)
# This cell names the annotation column "mp_term"; the node-JSON cell below
# reads it under that name.
marker_mp = pd.DataFrame(marker_mp.items(), columns=["marker_symbol", "mp_term"])

df_node = pd.merge(df_node, marker_mp, how="inner", on="marker_symbol")
df_node = pd.merge(df_node, df_marker_effect, how="inner", on="marker_symbol")

df_node[:3]

In [None]:
# Convert every node row into the {"data": {...}} element format.
node_json = [
    {
        "data": {
            "id": node["marker_symbol"],
            "label": node["marker_symbol"],
            "annotation": node["mp_term"],
            "node_color": node["effect_size"],
        }
    }
    for _, node in df_node.iterrows()
]

print(node_json[:3])

### Edge を作成する


In [None]:
# Keep only the columns that end up in the edge elements.
edge_columns = ["marker1", "marker2", "overlap_ratio", "overlapped_mp"]
df_edge = df_filtered[edge_columns]
print(len(df_edge))
df_edge.head(3)

In [None]:
# Convert every edge row into the {"data": {...}} element format.
edge_json = [
    {
        "data": {
            "source": edge["marker1"],
            "target": edge["marker2"],
            "annotation": edge["overlapped_mp"],
            "edge_size": edge["overlap_ratio"],
        }
    }
    for _, edge in df_edge.iterrows()
]

print(edge_json[:3])

### Edge と Node を統合して、出力


In [None]:
# One flat element list: all node elements first, then all edge elements.
network_json = [*node_json, *edge_json]

In [None]:
# Element count, then the first three and the last three elements.
for preview in (len(network_json), network_json[:3], network_json[-3:]):
    print(preview)

In [None]:
# Write the combined network as an indented JSON file named after the MP term.
network_dir = Path("data/network")
network_dir.mkdir(parents=True, exist_ok=True)

serialized_network = json.dumps(network_json, indent=2)
with open(f"data/network/{mp_term}.json", "w") as f:
    f.write(serialized_network)

In [None]:
# Display the marker annotation table as the cell output.
marker_mp

In [None]:
# Print every node element whose id is "Lepr".
lepr_nodes = [element for element in node_json if element["data"]["id"] == "Lepr"]
for element in lepr_nodes:
    print(element)

In [None]:
# Print every edge element that has "Herc1" as source or as target; an edge
# with "Herc1" on both ends is printed once per matching endpoint.
for element in edge_json:
    endpoints = element["data"]
    for role in ("source", "target"):
        if endpoints[role] == "Herc1":
            print(element)

In [None]:
# Print every edge element that has "Stoml2" as source or as target; an edge
# with "Stoml2" on both ends is printed once per matching endpoint.
for element in edge_json:
    endpoints = element["data"]
    for role in ("source", "target"):
        if endpoints[role] == "Stoml2":
            print(element)