In [32]:
human_go_ids = set()
with open("goa_human.gaf", "r") as f:
    for line in f:
        if line.startswith("!"):
            continue
        parts = line.strip().split("\t")
        if len(parts) > 4:
            go_id = parts[4]
            human_go_ids.add(go_id)

human_go_ids = [f"http://purl.obolibrary.org/obo/{id.split(':')[0]}_{id.split(':')[1]}" for id in human_go_ids]
print(len(human_go_ids), human_go_ids[:4])

18902 ['http://purl.obolibrary.org/obo/GO_0051496', 'http://purl.obolibrary.org/obo/GO_0001530', 'http://purl.obolibrary.org/obo/GO_0042542', 'http://purl.obolibrary.org/obo/GO_0051346']


In [34]:
import json
import networkx as nx
import plotly.graph_objects as go
from collections import deque

# Step 1: Load the JSON data
with open("go-basic.json") as f:
    data = json.load(f)

graph_data = data["graphs"][0]
nodes_data = graph_data["nodes"]
edges_data = graph_data["edges"]

# Step 2: Build the full graph
full_graph = nx.DiGraph()
id_to_label = {}

# Add all nodes
for node in nodes_data:
    if "meta" in node.keys() and node["meta"].get("deprecated", False):
        continue
    node_id = node["id"]
    label = node.get("lbl", node_id)
    id_to_label[node_id] = label
    full_graph.add_node(node_id, label=label)

# Add edges from explicit "edges" field
for edge in edges_data:
    if edge["pred"].endswith("is_a") or edge["pred"] in {
        "is_a",
        "http://purl.obolibrary.org/obo/BFO_0000050",  # part_of
        "http://purl.obolibrary.org/obo/RO_0002211",  # regulates
    }:
        subj = edge["sub"]
        obj = edge["obj"]
        full_graph.add_edge(obj, subj)  # Edge goes from parent to child


human_graph = full_graph.subgraph(human_go_ids).copy()

print(f"Number of total nodes: {full_graph.number_of_nodes()}")
print(f"Number of total edges: {full_graph.number_of_edges()}")
print(f"Number of Human nodes: {human_graph.number_of_nodes()}")
print(f"Number of Human edges: {human_graph.number_of_edges()}")

# Step 3: Find root nodes (no incoming edges)
root_nodes = [n for n in human_graph.nodes() if human_graph.in_degree(n) == 0]

# Step 4: Rank by out-degree and select top 5
top_root_nodes = sorted(
    root_nodes, key=lambda n: human_graph.out_degree(n), reverse=True
)[:1]

print("Top 5 root nodes by out-degree:")
for node in top_root_nodes:
    print(
        f"{id_to_label.get(node, node)} ({node}) -> {human_graph.out_degree(node)} children"
    )

# Step 5: BFS up to 50 nodes from each root
subset_graph = nx.DiGraph()
for root in top_root_nodes:
    visited = set()
    queue = deque([root])

    while queue and len(visited) < 10:
        current = queue.popleft()
        if current in visited:
            continue
        visited.add(current)
        subset_graph.add_node(current, label=id_to_label.get(current, current))

        for neighbor in human_graph.successors(current):
            if len(visited) >= 10:
                break
            subset_graph.add_node(neighbor, label=id_to_label.get(neighbor, neighbor))
            subset_graph.add_edge(current, neighbor)
            queue.append(neighbor)
        break



# Step 6: Layout and visualize
pos = nx.spring_layout(subset_graph, k=0.5, iterations=50)

edge_x = []
edge_y = []
for src, dst in subset_graph.edges():
    x0, y0 = pos[src]
    x1, y1 = pos[dst]
    edge_x += [x0, x1, None]
    edge_y += [y0, y1, None]

edge_trace = go.Scatter(
    x=edge_x,
    y=edge_y,
    line=dict(width=0.5, color="#888"),
    hoverinfo="none",
    mode="lines",
)

node_x = []
node_y = []
node_text = []

for node in subset_graph.nodes():
    x, y = pos[node]
    node_x.append(x)
    node_y.append(y)
    node_text.append(id_to_label.get(node, node))

node_trace = go.Scatter(
    x=node_x,
    y=node_y,
    mode="markers+text",
    text=node_text,
    textposition="top center",
    marker=dict(showscale=False, color="lightblue", size=10, line_width=1),
    hoverinfo="text",
)

fig = go.Figure(
    data=[edge_trace, node_trace],
    layout=go.Layout(
        title=dict(text="GO Ontology Graph Subset", font=dict(size=16)),
        showlegend=False,
        hovermode="closest",
        margin=dict(b=20, l=5, r=5, t=40),
        xaxis=dict(showgrid=False, zeroline=False),
        yaxis=dict(showgrid=False, zeroline=False),
    ),
)


fig.show()

Number of total nodes: 40270
Number of total edges: 72219
Number of Human nodes: 18902
Number of Human edges: 26894
Top 5 root nodes by out-degree:
protein binding (http://purl.obolibrary.org/obo/GO_0005515) -> 94 children


In [35]:
import pickle

# Save full_graph
with open("full_graph.pkl", "wb") as f:
    pickle.dump(full_graph, f)

# Save human_graph
with open("human_graph.pkl", "wb") as f:
    pickle.dump(human_graph, f)

print("Graphs successfully pickled.")


Graphs successfully pickled.


In [None]:
import pickle

with open("full_graph.pkl", "rb") as f:
    full_graph = pickle.load(f)

with open("human_graph.pkl", "rb") as f:
    human_graph = pickle.load(f)



print(f"Number of total nodes: {full_graph.number_of_nodes()}")
print(f"Number of total edges: {full_graph.number_of_edges()}")
print(f"Number of Human nodes: {human_graph.number_of_nodes()}")
print(f"Number of Human edges: {human_graph.number_of_edges()}")
