In [118]:
human_go_ids = set()
with open("goa_human.gaf", "r") as f:
    for line in f:
        if line.startswith("!"):
            continue
        parts = line.strip().split("\t")
        if len(parts) > 4:
            go_id = parts[4]
            human_go_ids.add(go_id)

human_go_ids = [f"http://purl.obolibrary.org/obo/{id.split(':')[0]}_{id.split(':')[1]}" for id in human_go_ids]
print(len(human_go_ids), human_go_ids[:4])

18902 ['http://purl.obolibrary.org/obo/GO_0051496', 'http://purl.obolibrary.org/obo/GO_0001530', 'http://purl.obolibrary.org/obo/GO_0042542', 'http://purl.obolibrary.org/obo/GO_0051346']


In [119]:
import json
import networkx as nx
import plotly.graph_objects as go
from collections import deque

# Step 1: Load the JSON data
with open("go-basic.json") as f:
    data = json.load(f)

graph_data = data["graphs"][0]
nodes_data = graph_data["nodes"]
edges_data = graph_data["edges"]

# Step 2: Build the full graph
full_graph = nx.DiGraph()
id_to_label = {}

# Add all nodes
for node in nodes_data:
    if "meta" in node.keys() and node["meta"].get("deprecated", False):
        continue
    node_id = node["id"]
    label = node.get("lbl", node_id)
    definition = node.get("meta", {}).get("definition", {}).get("val", "")
    id_to_label[node_id] = label
    full_graph.add_node(node_id, label=label, definition=definition)

# Add edges from explicit "edges" field
for edge in edges_data:
    if edge["pred"].endswith("is_a") or edge["pred"] in {
        "is_a",
        "http://purl.obolibrary.org/obo/BFO_0000050",  # part_of
        "http://purl.obolibrary.org/obo/RO_0002211",  # regulates
    }:
        subj = edge["sub"]
        obj = edge["obj"]
        full_graph.add_edge(obj, subj)  # Edge goes from parent to child


human_graph = full_graph.subgraph(human_go_ids).copy()

print(f"Number of total nodes: {full_graph.number_of_nodes()}")
print(f"Number of total edges: {full_graph.number_of_edges()}")
print(f"Number of Human nodes: {human_graph.number_of_nodes()}")
print(f"Number of Human edges: {human_graph.number_of_edges()}")

# Step 3: Find root nodes (no incoming edges)
root_nodes = [n for n in human_graph.nodes() if human_graph.in_degree(n) == 0]

# Step 4: Rank by out-degree and select top 5
top_root_nodes = sorted(
    root_nodes, key=lambda n: human_graph.out_degree(n), reverse=True
)[:1]

print("Top 5 root nodes by out-degree:")
for node in top_root_nodes:
    print(
        f"{id_to_label.get(node, node)} ({node}) -> {human_graph.out_degree(node)} children"
    )

# Step 5: BFS up to 50 nodes from each root
subset_graph = nx.DiGraph()
for root in top_root_nodes:
    visited = set()
    queue = deque([root])

    while queue and len(visited) < 10:
        current = queue.popleft()
        if current in visited:
            continue
        visited.add(current)
        subset_graph.add_node(current, label=id_to_label.get(current, current))

        for neighbor in human_graph.successors(current):
            if len(visited) >= 10:
                break
            subset_graph.add_node(neighbor, label=id_to_label.get(neighbor, neighbor))
            subset_graph.add_edge(current, neighbor)
            queue.append(neighbor)
        break



# Step 6: Layout and visualize
pos = nx.spring_layout(subset_graph, k=0.5, iterations=50)

edge_x = []
edge_y = []
for src, dst in subset_graph.edges():
    x0, y0 = pos[src]
    x1, y1 = pos[dst]
    edge_x += [x0, x1, None]
    edge_y += [y0, y1, None]

edge_trace = go.Scatter(
    x=edge_x,
    y=edge_y,
    line=dict(width=0.5, color="#888"),
    hoverinfo="none",
    mode="lines",
)

node_x = []
node_y = []
node_text = []

for node in subset_graph.nodes():
    x, y = pos[node]
    node_x.append(x)
    node_y.append(y)
    node_text.append(id_to_label.get(node, node))

node_trace = go.Scatter(
    x=node_x,
    y=node_y,
    mode="markers+text",
    text=node_text,
    textposition="top center",
    marker=dict(showscale=False, color="lightblue", size=10, line_width=1),
    hoverinfo="text",
)

fig = go.Figure(
    data=[edge_trace, node_trace],
    layout=go.Layout(
        title=dict(text="GO Ontology Graph Subset", font=dict(size=16)),
        showlegend=False,
        hovermode="closest",
        margin=dict(b=20, l=5, r=5, t=40),
        xaxis=dict(showgrid=False, zeroline=False),
        yaxis=dict(showgrid=False, zeroline=False),
    ),
)


fig.show()

Number of total nodes: 40270
Number of total edges: 72219
Number of Human nodes: 18902
Number of Human edges: 26894
Top 5 root nodes by out-degree:
protein binding (http://purl.obolibrary.org/obo/GO_0005515) -> 94 children


In [None]:
import pickle
from networkx.readwrite import json_graph

# # Save full_graph
# with open("full_graph.pkl", "wb") as f:
#     pickle.dump(full_graph, f)

# # # Save human_graph
# # with open("human_graph.pkl", "wb") as f:
# #     pickle.dump(human_graph, f)
    
# with open("id_to_label.pkl", "wb") as f:
#     pickle.dump(id_to_label, f)
# print("Graphs successfully pickled.")

# human_graph_data = json_graph.node_link_data(human_graph)
# with open("human_graph.json", "w") as f:
#     json.dump(human_graph_data, f, indent=4)



Graphs successfully pickled.


In [135]:
import pickle

with open("full_graph.pkl", "rb") as f:
    full_graph = pickle.load(f)

# with open("human_graph.pkl", "rb") as f:
#     human_graph = pickle.load(f)

with open("id_to_label.pkl", "rb") as f:
    id_to_label = pickle.load(f)

# Load the graph data from the JSON file
with open("annotated_human_graph.json", "r") as f:
    annotated_human_graph_data = json.load(f)

# Convert the JSON data back to a NetworkX graph
human_graph = json_graph.node_link_graph(annotated_human_graph_data)


print(f"Number of total nodes: {full_graph.number_of_nodes()}")
print(f"Number of total edges: {full_graph.number_of_edges()}")
print(f"Number of Human nodes: {human_graph.number_of_nodes()}")
print(f"Number of Human edges: {human_graph.number_of_edges()}")


Number of total nodes: 40270
Number of total edges: 72219
Number of Human nodes: 18902
Number of Human edges: 26894


In [136]:
roots = [
    "http://purl.obolibrary.org/obo/GO_0003674", # Molecular Function
    "http://purl.obolibrary.org/obo/GO_0005575", # Cellular Component
    "http://purl.obolibrary.org/obo/GO_0008150", # Biological Function
]

In [137]:
import ollama

def build_prompt(label, definition, parent_labels):
    return f"""
Imagine a city where every citizen or piece of infrastructure represents a type of protein or protein activity. In this analogy:

- Proteins are citizens, workers, or infrastructure elements.
- DNA is the blueprint or city plan.
- Ribosomes are factories or birthing centers.
- Kinases are managers.
- ATP is food or fuel.
- Microtubules are highways.
- Transporters are delivery trucks or pipes.

Each protein class can be assigned a **job title** (like 'electrician', 'civil engineer', or 'streetlamp') to represent its role in the cellular city.

Below is a subclass of proteins, along with:
- A definition of what it does,
- A list of its parent classes and their assigned roles.

Your task:
Give a metaphorical **human city job or infrastructure** for the following protein subclass, staying within the logic of its parent category (important!). Be creative but **keep it grounded in actual urban roles**. 
And please do make the suggested subclass role a subcategory of the parental roles. 

---
### Protein Subclass: "{label}"

Definition:
{definition}

Parent Roles:
{parent_labels}

Examples of good metaphorical mappings:
- "molecular_function" → "engineer / laborer"
- "cellular_component" → "infrastructure / building"
- "molecular carrier activity" → "delivery person / miner"
- "immune process" → "security guard / police officer"
- "histone demethylase" → "building permit office"

Avoid using abstract terms like "architect" or "designer" unless truly accurate.

Try to think like a **biologist and a city planner** at the same time. Ask: how would this function manifest in a real-world city?

Only respond with a concrete role(s), like "road inspector / foreman" or "electrical relay / control box".

If the fit is questionable, include a "?" at the end.
"""

def summarize_response(response_text):
    short_prompt = (
        f'Shorten the following response to just a list of real-world urban jobs or infrastructure terms, separated by slashes (e.g., "handyman/electrician", "relay/wire"):\n\n"{response_text}"\n\n'
        "Make sure:\n"
        "- These are all real-world, human-usable roles or things in cities.\n"
        "- They're concrete, not abstract (avoid words like 'dreamer' or 'orchestrator').\n"
        "- If unsure, err toward physical or civic jobs.\n\n"
        "Final output: just a slash-separated list."
    )
    summary = ollama.chat(
        model="llama-3.2",
        messages=[{"role": "user", "content": short_prompt}]
    )
    return summary["message"]["content"].split('or')[0]


In [None]:
import time
import ollama
from collections import deque
import pickle

visited = set()
queue = deque(roots)

while queue:
    node_name = queue.popleft()
    if node_name in visited:
        continue
    visited.add(node_name)

    # Add children to the queue
    for child in human_graph.successors(node_name):
        queue.append(child)

    if node_name not in human_graph:
        print(f"Node {node_name} not found in human_graph. Skipping...")
        continue

    node_data = human_graph.nodes[node_name]

    if node_data.get("job_suggestion") is not None and node_data["job_suggestion"].strip().lower() != "none" and node_data["job_suggestion"].strip().lower() != "q":
        print(f"{node_data['label']}: {node_data['job_suggestion']}.")
        continue

    label = node_data.get("label", node_name)
    definition = node_data.get("definition", "No definition available.")
    parents = list(human_graph.predecessors(node_name))
    parent_labels = [f"{human_graph.nodes[p].get("label", p)} is assigned the role of {human_graph.nodes[p].get("job_suggestion", p)}" for p in parents]

    # prompt = (
    #     "In an analogy of a cell as a city, each protein class takes the form of infrastructure or the citizen. \n\n" + 
    #     "IF the overarching protein class " + ", ".join([f'- "{p}"\n' for p in parent_labels]) +
    #     f', THEN what subcategory of the overarching roles does the protein class "{label}" take?\n'
    #     + f"A more detailed definition of the protein subclass: {definition}\n\n"
    #     + " Please feel free to give a creative response while keeping the analogous protein subclass role strictly within the overarching role."
    #     +" If useful, here are some examples of successful roles:"
    #     + ('molecular_function: engineer/craftsman/technician/laborer.' + 
    #         'cellular_component: infrastructure/tools/architecture.' + 
    #         'molecular carrier activity: forager/miner.' +
    #         'immune system process: homeland security/army/police force.')
    #     + "Generally, DNA should be thought of as blue prints, Ribosome as factories or birthing stations, microtubules as highways, ATP as food / fuel, kinases as managerial roles.\n\n" \
    #     + "Be smart about it, think as a biologist, for example demethylation of histones increases transcription, so a demethylase would have the role of someone giving building permits.\n\n" \
    #     + "Try to avoid architect and chemical engineer unless you really think it's a good fit"
    #     + 'Be sure to include a "?" if you think its a bit of an iffy response'
    # )
    # print("\n" + "=" * 80)
    # print(prompt)
    # print("=" * 80)
    # response = ollama.chat(
    #     model="llama3.2", messages=[{"role": "user", "content": prompt}]
    # )

    # verbose_response = response["message"]["content"].strip()


    suggested_title = 'q'
    full_prompt = build_prompt(label, definition, parent_labels)
    print("\n" + "=" * 80)
    print(full_prompt)
    print("=" * 80)

    # Ask for a metaphorical job
    response = ollama.chat(
        model="llama3.2", messages=[{"role": "user", "content": full_prompt}]
    )
    verbose_response = response["message"]["content"].strip()
    print(f"LLaMA's suggestion for '{label}': {verbose_response}")
    # suggested_title = summarize_response(verbose_response)
    # print(f"LLaMA's summarized response ': {suggested_title}")
    short_and_sweet_prompt = (
        f"Shorten the following to just a simple separated by slashes (e.g., 'handyman/architect', 'manager/human resource coordinator/buisness associate','electrician/electrical wire'): \"{verbose_response}\"" 
        # + "Consider carefully, is the response actually a REAL job (e.g., Gene editor or origami folder is not a real job) or infrastructure description? Something that is actually present in human cities. If not try to massage a bit to make it so."
    )
    # Step 2: Ask Llama to summarize the title in one word
    summary_response = ollama.chat(
        model="llama3.2",
        messages=[
            {
                "role": "user", 
                "content": short_and_sweet_prompt
            }
        ]
    )
    suggested_title = summary_response["message"]["content"].strip()


    # print(f"\nLLaMA's suggestion for '{label}': {suggested_title}")

    # # Interactive input
    # user_input = input(
    #     f"Enter your final job title (or press Enter to accept LLaMA's suggestion {suggested_title}, or type 'skip'): "
    # ).strip()
    # if user_input.lower() == "skip" or suggested_title.lower() == "none":
    #     print(f"Skipping {label}...\n")
    #     continue
    # final_job = user_input if user_input else suggested_title

    final_job = '[Llama 3.2 generated] ' + suggested_title

    # Store it in the graph
    human_graph.nodes[node_name]["label"] = label
    human_graph.nodes[node_name]["definition"] = definition
    human_graph.nodes[node_name]["parents"] = parent_labels
    human_graph.nodes[node_name]["job_suggestion"] = final_job

    print(f"✔ Stored '{final_job}' for {label}.\n")

    # Save progress
    if len(visited) % 1 == 0:
        annotated_human_graph_data = json_graph.node_link_data(human_graph)
        with open("annotated_human_graph.json", "w") as f:
            json.dump(annotated_human_graph_data, f, indent=4)


        # with open("annotated_human_graph.pkl", "wb") as f:
        #     pickle.dump(human_graph, f)


molecular_function: engineer/craftsman/technician/laborer.
cellular_component: infrastructure/tools/architecture.
biological_process: manager/human resource coordinator/buisness associate.
cytoskeletal motor activity: construction equipment/freight transport/transportation.
catalytic activity: chemical engineer/craftsman/machinist.
structural molecule activity: handyman/architect.
electron transfer activity: electrician/electrical wire.
antioxidant activity: firefighter.
cargo receptor activity: cargo loader.
protein folding chaperone: daycare/preschool teachers/elementary school teachers.
translation regulator activity: factory foreman/assembly line overseer.
molecular adaptor activity: facilitator.
molecular carrier activity: forager/miner.
general transcription initiation factor activity: product manager/activist/whistleblower/public intellectual.
molecular sequestering activity: custodian/janitor/handler/warden/pest control.
RNA folding chaperone: mentor/coach/literary editor.
prot

KeyboardInterrupt: 

In [None]:
# import time
# import ollama
# from collections import deque
# import pickle  # for saving the graph

# visited = set()
# queue = deque(roots)

# while queue:
#     node_name = queue.popleft()
#     if node_name in visited:
#         continue
#     visited.add(node_name)

#     if node_name not in human_graph:
#         print(f"Node {node_name} not found in human_graph. Skipping...")
#         continue
#     node = human_graph.nodes[node_name]
#     label = node.label
#     definition = node.definition 
#     parents = list(human_graph.predecessors(node))
#     parent_labels = [id_to_label.get(p, p) for p in parents]
#     import pdb; pdb.set_trace()
#     prompt = (
#         "I’m creating an ontology that maps protein functions to human jobs.\n\n"
#         f'This protein is labeled as: "{label}" and defined as {definition}\n\n'
#         "It is related to the following higher-level concepts:\n"
#         + "".join([f'- "{p}"\n' for p in parent_labels])
#         + "\nBased on this, what human profession best corresponds to the role of this protein function?"
#         + " Please feel free to get a little creative! "
#         + " For example, a kinase might be a manager since it 'orders' other proteins to function in specific ways."
#         + " Really looking for a single answer, not even a phrase, just a single job title.\n\n"
#         + "If you really can't decide, or if there are no suitable human jobs, please respond with 'None'.\n\n"
#     )

#     print(prompt)

#     response = ollama.chat(
#         model="llama3",
#         messages=[{"role": "user", "content": prompt}]
#     )
#     print(response)

#     job_title = response["message"]["content"].strip()

#     if job_title.lower() == 'none':
#         print(f"No suitable job found for {label}. Skipping...")
#         continue

#     # Store directly in the graph
#     human_graph.nodes[node]["label"] = label
#     human_graph.nodes[node]["parents"] = parent_labels
#     human_graph.nodes[node]["job_suggestion"] = job_title

#     # Save the graph after every 5 new nodes
#     if len(visited) % 5 == 0:
#         with open("human_graph.pkl", "wb") as f:
#             pickle.dump(human_graph, f)

#     # Enqueue children
#     for child in human_graph.successors(node):
#         queue.append(child)


Node http://purl.obolibrary.org/obo/GO_0005554 not found in human_graph. Skipping...
Node http://purl.obolibrary.org/obo/GO_0008372 not found in human_graph. Skipping...
Node http://purl.obolibrary.org/obo/GO_0007582 not found in human_graph. Skipping...
Node http://purl.obolibrary.org/obo/GO_0044699 not found in human_graph. Skipping...
Node http://purl.obolibrary.org/obo/GO_0000004 not found in human_graph. Skipping...


In [None]:
list(human_graph.nodes())[:5]


['http://purl.obolibrary.org/obo/GO_0050152',
 'http://purl.obolibrary.org/obo/GO_0070169',
 'http://purl.obolibrary.org/obo/GO_0036313',
 'http://purl.obolibrary.org/obo/GO_0034750',
 'http://purl.obolibrary.org/obo/GO_0007638']

In [None]:
human_graph.nodes['http://purl.obolibrary.org/obo/GO_0050152']

{'label': 'omega-amidase activity',
 'definition': 'Catalysis of the reaction: a monoamide of a dicarboxylic acid + H2O = a dicarboxylate + NH3.'}