# ネットワークを遺伝子単位で出力する


In [None]:
# Move up to the repository root (the directory that contains LICENSE)
import os
from pathlib import Path

print(os.getcwd())

start_dir = Path.cwd()
while not Path("LICENSE").exists():
    # At the filesystem root the parent of the current directory is the
    # directory itself, so `chdir("../")` would no longer make progress and
    # the loop would never end. Restore the starting directory and fail
    # with a clear message instead.
    if Path.cwd().parent == Path.cwd():
        os.chdir(start_dir)
        raise FileNotFoundError(
            f"LICENSE not found in {start_dir} or any of its parent directories"
        )
    os.chdir("../")

print(os.getcwd())

In [None]:
# Short alias for the built-in print, used throughout the notebook
P = print
from pprint import pprint as PP
from pathlib import Path
import json
import pickle
import networkx as nx
import shutil
import gzip

import pandas as pd

In [None]:
# Load the pairwise overlap records and put them into a DataFrame.
# NOTE(review): unpickling can execute arbitrary code; this assumes the file
# was produced by this project's own pipeline - confirm before pointing it at
# data from elsewhere.
# The context manager closes the file handle, which
# `pickle.load(open(...))` left open.
with open("data/overlap/overlapped_ratios_filtered.pkl", "rb") as f:
    overlap = pickle.load(f)
df_overlap = pd.DataFrame(
    overlap,
    columns=[
        "marker1",
        "marker2",
        "overlap_ratio",
        "overlapped_mp_number",
        "overlapped_mp",
    ],
)
df_overlap  # 133281 rows × 5 columns

In [None]:
# Load the symbol -> annotation mapping.
# The context manager closes the file handle, and the explicit encoding
# keeps the read independent of the platform's locale default.
with open("data/annotation/symbol_mptermname.json", encoding="utf-8") as f:
    marker_mp = json.load(f)
marker_mp = pd.DataFrame(marker_mp.items(), columns=["marker_symbol", "mp_term_name"])
marker_mp  # 7746 rows × 2 columns
# Convert the DataFrame back into a dict for per-node lookups
marker_mp_dict = dict(zip(marker_mp.marker_symbol, marker_mp.mp_term_name))

## ひとつの遺伝子をハンドルする


In [None]:
# Gene whose neighbourhood is examined in the following cells
gene_symbol = "Rab10"

In [None]:
# Number of rows in which the gene appears as marker1, then as marker2
for marker_column in ("marker1", "marker2"):
    P(sum(df_overlap[marker_column] == gene_symbol))

In [None]:
# Build a graph whose edges are the (marker1, marker2) pairs of df_overlap
G = nx.from_pandas_edgelist(df_overlap, "marker1", "marker2")

In [None]:
# Keep only the gene itself and the nodes directly connected to it
subgraph_nodes = [gene_symbol, *G.neighbors(gene_symbol)]
neighbors = subgraph_nodes[1:]
subgraph = G.subgraph(subgraph_nodes)

P(subgraph)
P(len(subgraph.nodes))

In [None]:
# Display the overlap table
df_overlap

In [None]:
"""
ノードが多すぎるとWebページが描画できない問題を回避するため、
ノード数を500以下にするために最適なoverlap_ratioを求める
"""
target_columns = 500
# 二分探索の範囲
low, high = df_overlap["overlap_ratio"].min(), df_overlap["overlap_ratio"].max()
best_overlap_ratio = None

while low <= high:
    mid = (low + high) / 2

    # overlap_ratio >= mid のデータをフィルタリング
    df_mid = df_overlap[df_overlap["overlap_ratio"] >= mid]

    G = nx.from_pandas_edgelist(df_mid, "marker1", "marker2")
    # ノードAと直接つながっているノードのみを取得
    try:
        neighbors = list(G.neighbors(gene_symbol))
    except:
        high = mid - 1e-6
        continue
    subgraph_nodes = [gene_symbol] + neighbors
    subgraph = G.subgraph(subgraph_nodes)

    row_count = len(subgraph.nodes)
    # ターゲット列数に近い場合、結果を保存
    if row_count == target_columns:
        best_overlap_ratio = mid
        break
    elif row_count > target_columns:
        # 列数が多い場合、範囲を上げる
        best_overlap_ratio = mid
        low = mid + 1e-6
    else:
        # 列数が少ない場合、範囲を下げる
        best_overlap_ratio = mid
        high = mid - 1e-6
    print(low, high, mid, best_overlap_ratio, row_count)

df_overlap = df_overlap[df_overlap["overlap_ratio"] >= best_overlap_ratio]

In [None]:
# Print the nodes of the current subgraph
P(subgraph.nodes())

In [None]:
# Build the node elements: the gene of interest gets node_color 1, every
# other node 0, and each node carries its annotation from marker_mp_dict.

node_json = [
    {
        "data": {
            "id": name,
            "label": name,
            "node_color": int(name == gene_symbol),
            "annotation": marker_mp_dict[name],
        }
    }
    for name in subgraph.nodes()
]

P(node_json[0])

In [None]:
# Build the edge elements.
# Work on a copy first: df_overlap may be a filtered slice of another frame
# (the threshold cell reassigns it), and adding a column to such a slice is
# a chained assignment that pandas warns about.
df_overlap = df_overlap.copy()
# Order-independent key for every gene pair. A plain comprehension over the
# two columns avoids the much slower row-wise DataFrame.apply.
df_overlap["id"] = [
    "-".join(sorted(pair))
    for pair in zip(df_overlap["marker1"], df_overlap["marker2"])
]

# Look up the overlap record of every edge of the subgraph by that key
edge_info = pd.DataFrame(
    {"id": ["-".join(sorted([a, b])) for a, b in subgraph.edges()]}
).merge(df_overlap, on="id")
edge_json = [
    {
        "data": {
            "source": edge.marker1,
            "target": edge.marker2,
            "edge_size": edge.overlap_ratio,
            "annotation": edge.overlapped_mp,
        }
    }
    for edge in edge_info.itertuples()
]
P(edge_json[:3])

In [None]:
# Cytoscape.js element list: all node elements followed by all edge elements
cytoscape_data = [*node_json, *edge_json]

In [None]:
# Serialise the element list to a JSON string
cytoscape_json = json.dumps(cytoscape_data, indent=2)
# Print the first three characters of the serialised text
print(cytoscape_json[:3])

In [None]:
# Path("Rab10.json").write_text(cytoscape_json)

## 一括処理


In [None]:
# Reload the pairwise overlap records for the batch run.
# NOTE(review): unpickling can execute arbitrary code; this assumes the file
# was produced by this project's own pipeline - confirm before pointing it at
# data from elsewhere.
# The context manager closes the file handle, which
# `pickle.load(open(...))` left open.
with open("data/overlap/overlapped_ratios_filtered.pkl", "rb") as f:
    overlap = pickle.load(f)
df_overlap = pd.DataFrame(
    overlap,
    columns=[
        "marker1",
        "marker2",
        "overlap_ratio",
        "overlapped_mp_number",
        "overlapped_mp",
    ],
)
P(len(df_overlap))  # 133281 rows × 5 columns

In [None]:
# Every gene symbol that occurs on either side of an overlap pair
unique_symbols = set(df_overlap.marker1.unique().tolist())
unique_symbols.update(df_overlap.marker2.unique().tolist())
# Sorted so that progress through the batch loop below can be estimated
gene_symbols = sorted(unique_symbols)
P(gene_symbols[:3])
P(len(gene_symbols))  # 6003

In [None]:
# Start from an empty output directory: delete the results of a previous
# run, then recreate the directory (including missing parents).
output_dir = Path("data") / "network" / "gene_symbol"
if output_dir.exists():
    shutil.rmtree(output_dir)

output_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# Too many nodes make the web page impossible to render, so a gene whose
# neighbourhood has more than `target_number_of_nodes` nodes gets an
# overlap_ratio cut-off found by bisection.
target_number_of_nodes = 200
# A probe whose node count is within this distance of the target ends the
# search early
node_tolerance = 25

for gene_symbol in gene_symbols:
    # Current processing: only the rows that involve this gene
    df_filtered = df_overlap[
        (df_overlap["marker1"] == gene_symbol) | (df_overlap["marker2"] == gene_symbol)
    ]

    G = nx.from_pandas_edgelist(df_filtered, "marker1", "marker2")

    # Only the gene and the nodes directly connected to it
    neighbors = list(G.neighbors(gene_symbol))
    subgraph_nodes = [gene_symbol] + neighbors
    subgraph = G.subgraph(subgraph_nodes)

    if len(subgraph.nodes) > target_number_of_nodes:
        # Bisection range
        ratio_min = df_filtered["overlap_ratio"].min()
        low, high = ratio_min, df_filtered["overlap_ratio"].max()
        best_overlap_ratio = None

        while low <= high:
            mid = (low + high) / 2

            # Keep only the rows with overlap_ratio >= mid
            df_mid = df_filtered[df_filtered["overlap_ratio"] >= mid]

            G = nx.from_pandas_edgelist(df_mid, "marker1", "marker2")
            if gene_symbol not in G:
                # The cut-off removed every edge of the gene: search lower.
                # An explicit membership test replaces the former bare
                # `except:`, which also swallowed KeyboardInterrupt and
                # unrelated errors.
                high = mid - 1e-6
                continue
            neighbors = list(G.neighbors(gene_symbol))
            subgraph_nodes = [gene_symbol] + neighbors
            subgraph = G.subgraph(subgraph_nodes)

            row_count = len(subgraph.nodes)  # number of nodes
            # Every probe that contains the gene becomes the candidate.
            # NOTE(review): when no probe lands inside the tolerance window
            # the last probe may still exceed the target - confirm this is
            # acceptable.
            best_overlap_ratio = mid
            if abs(row_count - target_number_of_nodes) < node_tolerance:
                # Close enough to the target
                break
            if row_count > target_number_of_nodes:
                # Too many nodes: raise the cut-off
                low = mid + 1e-6
            else:
                # Too few nodes: lower the cut-off
                high = mid - 1e-6

        if best_overlap_ratio is None:
            # No probe contained the gene; keep every row instead of
            # comparing the column with None, which raises TypeError.
            best_overlap_ratio = ratio_min

        # Rebuild the neighbourhood with the selected cut-off
        df_nodes = df_filtered[df_filtered["overlap_ratio"] >= best_overlap_ratio]
        G = nx.from_pandas_edgelist(df_nodes, "marker1", "marker2")
        neighbors = list(G.neighbors(gene_symbol))
        subgraph_nodes = [gene_symbol] + neighbors
        subgraph = G.subgraph(subgraph_nodes)

    # Node elements: the gene itself gets node_color 1, the others 0
    node_json = [
        {
            "data": {
                "id": node,
                "label": node,
                "node_color": 1 if node == gene_symbol else 0,
                "annotation": marker_mp_dict[node],
            }
        }
        for node in subgraph.nodes()
    ]

    # Edge elements: every row of the full table whose two genes are both in
    # the subgraph. The node collection is materialised once for both tests.
    node_names = list(subgraph.nodes())
    df_edge = df_overlap[
        df_overlap["marker1"].isin(node_names)
        & df_overlap["marker2"].isin(node_names)
    ]

    edge_json = [
        {
            "data": {
                "source": edge.marker1,
                "target": edge.marker2,
                "edge_size": edge.overlap_ratio,
                "annotation": edge.overlapped_mp,
            }
        }
        for edge in df_edge.itertuples()
    ]
    network_json = node_json + edge_json

    # Output as gzip-compressed JSON
    if network_json:
        output_json = output_dir / f"{gene_symbol}.json.gz"
        with gzip.open(output_json, "wt", encoding="utf-8") as f:
            json.dump(network_json, f, indent=4)

# 4m

In [None]:
%%bash
# Show the first five lines of the output directory listing, largest files first
ls -lhS data/network/gene_symbol/ | head -n 5 # total 4.6G -> 133M → 975M
# Output recorded from an earlier run:
# total 118M
# -rwxrwxrwx 1 kuno kuno 198K Jan 25 10:31 Fam161b.json.gz
# -rwxrwxrwx 1 kuno kuno 198K Jan 25 10:32 Pabir2.json.gz
# -rwxrwxrwx 1 kuno kuno 185K Jan 25 10:31 Lrrc17.json.gz
# -rwxrwxrwx 1 kuno kuno 182K Jan 25 10:30 Cer1.json.gz

In [None]:
# Save the gene symbols, one per line, followed by a final newline
symbols_file = Path("data/overlap/available_gene_symbols.txt")
symbols_text = "\n".join(gene_symbols) + "\n"
symbols_file.write_text(symbols_text)
print(len(gene_symbols))  # 4416 -> 4244 → 6003 → 4139

In [None]:
%%bash

# Print the current date and time
date +"%Y/%m/%d %H:%M:%S" # Last update

# Debug



## 一つの遺伝子のみ取り扱う

In [None]:
# Reload the pairwise overlap records for the debug section.
# NOTE(review): unpickling can execute arbitrary code; this assumes the file
# was produced by this project's own pipeline - confirm before pointing it at
# data from elsewhere.
# The context manager closes the file handle, which
# `pickle.load(open(...))` left open.
with open("data/overlap/overlapped_ratios_filtered.pkl", "rb") as f:
    overlap = pickle.load(f)
df_overlap = pd.DataFrame(
    overlap,
    columns=[
        "marker1",
        "marker2",
        "overlap_ratio",
        "overlapped_mp_number",
        "overlapped_mp",
    ],
)
df_overlap  # 133281 rows × 5 columns

In [None]:
# Reload the symbol -> annotation mapping.
# The context manager closes the file handle, and the explicit encoding
# keeps the read independent of the platform's locale default.
with open("data/annotation/symbol_mptermname.json", encoding="utf-8") as f:
    marker_mp = json.load(f)
marker_mp = pd.DataFrame(marker_mp.items(), columns=["marker_symbol", "mp_term_name"])
marker_mp  # 7746 rows × 2 columns
# Convert the DataFrame back into a dict for per-node lookups
marker_mp_dict = dict(zip(marker_mp.marker_symbol, marker_mp.mp_term_name))

In [None]:
# Gene used for the debug comparison in the following cells
gene_symbol = "Rhd"

In [None]:
# Current processing: restrict the table to the rows that involve the gene
df_filtered = df_overlap[
    (df_overlap["marker1"] == gene_symbol) | (df_overlap["marker2"] == gene_symbol)
]

G = nx.from_pandas_edgelist(df_filtered, "marker1", "marker2")

# Only the gene and the nodes directly connected to it
neighbors = list(G.neighbors(gene_symbol))
subgraph_nodes = [gene_symbol] + neighbors
subgraph = G.subgraph(subgraph_nodes)
target_columns = 500  # target number of nodes
if len(subgraph.nodes) > target_columns:
    # Bisection range
    ratio_min = df_filtered["overlap_ratio"].min()
    low, high = ratio_min, df_filtered["overlap_ratio"].max()
    best_overlap_ratio = None

    while low <= high:
        mid = (low + high) / 2

        # Keep only the rows with overlap_ratio >= mid
        df_mid = df_filtered[df_filtered["overlap_ratio"] >= mid]

        G = nx.from_pandas_edgelist(df_mid, "marker1", "marker2")
        if gene_symbol not in G:
            # The cut-off removed every edge of the gene: search lower.
            # An explicit membership test replaces the former bare
            # `except:`, which also swallowed KeyboardInterrupt and
            # unrelated errors.
            high = mid - 1e-6
            continue
        neighbors = list(G.neighbors(gene_symbol))
        subgraph_nodes = [gene_symbol] + neighbors
        subgraph = G.subgraph(subgraph_nodes)

        row_count = len(subgraph.nodes)  # number of nodes
        # Every probe that contains the gene becomes the current candidate
        best_overlap_ratio = mid
        if row_count == target_columns:
            break
        if row_count > target_columns:
            # Too many nodes: raise the cut-off
            low = mid + 1e-6
        else:
            # Too few nodes: lower the cut-off
            high = mid - 1e-6

    if best_overlap_ratio is None:
        # No probe contained the gene; keep every row instead of comparing
        # the column with None, which raises TypeError.
        best_overlap_ratio = ratio_min

    # Rebuild the neighbourhood with the selected cut-off
    df_results = df_filtered[df_filtered["overlap_ratio"] >= best_overlap_ratio]
    G = nx.from_pandas_edgelist(df_results, "marker1", "marker2")
    neighbors = list(G.neighbors(gene_symbol))
    subgraph_nodes = [gene_symbol] + neighbors
    subgraph = G.subgraph(subgraph_nodes)
else:
    df_results = df_filtered

# Node elements: the gene itself gets node_color 1, the others 0
node_json = [
    {
        "data": {
            "id": node,
            "label": node,
            "node_color": 1 if node == gene_symbol else 0,
            "annotation": marker_mp_dict[node],
        }
    }
    for node in subgraph.nodes()
]
# 0.0s

In [None]:
P(len(subgraph.nodes))  # 91
PP(node_json[0])
# Remember the node set of the current processing. The comparison at the end
# of the notebook reads `new_nodes`, which was never assigned and therefore
# raised NameError.
new_nodes = set(subgraph.nodes())

In [None]:
# Build the edge elements from every row of the full table whose two genes
# both belong to the subgraph
first_inside = df_overlap["marker1"].isin(subgraph.nodes())
second_inside = df_overlap["marker2"].isin(subgraph.nodes())
df_edge = df_overlap[first_inside & second_inside]

edge_json = [
    {
        "data": {
            "source": row.marker1,
            "target": row.marker2,
            "edge_size": row.overlap_ratio,
            "annotation": row.overlapped_mp,
        }
    }
    for row in df_edge.itertuples()
]
P(len(edge_json))  # 91

In [None]:
# Inspect a single edge element
edge_json[10]

In [None]:
# Rows of the full table whose two genes both belong to the subgraph
marker1_inside = df_overlap["marker1"].isin(subgraph.nodes())
marker2_inside = df_overlap["marker2"].isin(subgraph.nodes())
df_edge = df_overlap[marker1_inside & marker2_inside]

In [None]:
# Edge rows in which Anapc7 is the first gene of the pair
df_edge[df_edge["marker1"] == "Anapc7"]

In [None]:
# Edge rows in which Anapc7 is the second gene of the pair
df_edge[df_edge["marker2"] == "Anapc7"]

In [None]:
# Old processing: builds the graph from the whole overlap table rather than
# only from the rows that involve the gene

G = nx.from_pandas_edgelist(df_overlap, "marker1", "marker2")
df_overlap = df_overlap.copy()
# Order-independent key for every gene pair
df_overlap["id"] = df_overlap.apply(
    lambda row: "-".join(sorted([row["marker1"], row["marker2"]])), axis=1
)

# Only the gene and the nodes directly connected to it
neighbors = list(G.neighbors(gene_symbol))
subgraph_nodes = [gene_symbol] + neighbors
subgraph = G.subgraph(subgraph_nodes)
target_columns = 500  # target number of nodes
if len(subgraph.nodes) <= target_columns:
    df_filtered = df_overlap
else:
    # Bisection range
    ratio_min = df_overlap["overlap_ratio"].min()
    low, high = ratio_min, df_overlap["overlap_ratio"].max()
    best_overlap_ratio = None

    while low <= high:
        mid = (low + high) / 2

        # Keep only the rows with overlap_ratio >= mid
        df_mid = df_overlap[df_overlap["overlap_ratio"] >= mid]

        G = nx.from_pandas_edgelist(df_mid, "marker1", "marker2")
        if gene_symbol not in G:
            # The cut-off removed every edge of the gene: search lower.
            # An explicit membership test replaces the former bare
            # `except:`, which also swallowed KeyboardInterrupt and
            # unrelated errors.
            high = mid - 1e-6
            continue
        neighbors = list(G.neighbors(gene_symbol))
        subgraph_nodes = [gene_symbol] + neighbors
        subgraph = G.subgraph(subgraph_nodes)

        row_count = len(subgraph.nodes)  # number of nodes
        # Every probe that contains the gene becomes the current candidate
        best_overlap_ratio = mid
        if row_count == target_columns:
            break
        if row_count > target_columns:
            # Too many nodes: raise the cut-off
            low = mid + 1e-6
        else:
            # Too few nodes: lower the cut-off
            high = mid - 1e-6

    if best_overlap_ratio is None:
        # No probe contained the gene; keep every row instead of comparing
        # the column with None, which raises TypeError.
        best_overlap_ratio = ratio_min

    df_filtered = df_overlap[df_overlap["overlap_ratio"] >= best_overlap_ratio]

# Pre-processing
G = nx.from_pandas_edgelist(df_filtered, "marker1", "marker2")
df_filtered = df_filtered.copy()
df_filtered["id"] = df_filtered.apply(
    lambda row: "-".join(sorted([row["marker1"], row["marker2"]])), axis=1
)

# Only the gene and the nodes directly connected to it
neighbors = list(G.neighbors(gene_symbol))
subgraph_nodes = [gene_symbol] + neighbors
subgraph = G.subgraph(subgraph_nodes)
# Node elements: the gene itself gets node_color 1, the others 0
node_json = [
    {
        "data": {
            "id": node,
            "label": node,
            "node_color": 1 if node == gene_symbol else 0,
            "annotation": marker_mp_dict[node],
        }
    }
    for node in subgraph.nodes()
]

# 5.8s

In [None]:
# Node count of the current subgraph and its first three node elements
P(len(subgraph.nodes))  # 91
PP(node_json[:3])

In [None]:
# Node names of the current subgraph as a set
old_nodes = set(subgraph.nodes())

In [None]:
# Elements of new_nodes that are not in old_nodes
new_nodes - old_nodes