In [1]:
!pip install requests networkx pyvis tqdm



In [2]:
import requests
import networkx as nx
from pyvis.network import Network
from tqdm import tqdm
import time
from IPython.display import IFrame
from IPython.display import display

In [18]:
nac_proteins = ["NACA", "BTF3", "NACA2"]
natc_proteins = ["NAA30", "NAA35", "NAA40", "NAA50", "NAA60", "NAA70"]
nata_proteins = ["NAA10", "NAA15", "NAA20", "NAA25", "NAA80"]
nat_proteins = nac_proteins + natc_proteins + nata_proteins
tom_proteins = ["TOMM20", "SAM50", "MDM10", "OXA1", "OM14"]
species_id = 9606

In [None]:
def get_string_ids(gene_names, species=9606):
    string_ids = {}
    for gene in gene_names:
        response = requests.post(
            "https://string-db.org/api/json/get_string_ids",
            params={"identifiers": gene, "species": species}
        )
        data = response.json()
        if data:
            string_ids[gene] = data[0]["stringId"]
    return string_ids

def get_interactions(string_id, score_threshold=700):
    url = "https://string-db.org/api/json/actions"
    params = {
        "identifiers": string_id,
        "species": species_id,
        "required_score": score_threshold
    }
    response = requests.get(url, params=params)
    if response.status_code == 200:
        data = response.json()
        # Filter auf nur physikalische Bindung
        return [inter for inter in data if inter.get("mode") == "binding"]
    return []



In [19]:
all_seeds = nat_proteins + tom_proteins
string_ids = get_string_ids(all_seeds)
string_to_gene = {v: k for k, v in string_ids.items()}

nac_ids = [string_ids[p] for p in nat_proteins if p in string_ids]
tom_ids = [string_ids[p] for p in tom_proteins if p in string_ids]

In [6]:
def build_string_graph(start_ids, depth=2, delay=1.0):
    G = nx.Graph()
    visited = set()

    def recurse(ids, d):
        if d == 0:
            return
        next_ids = set()
        for sid in tqdm(ids, desc=f"Depth {depth - d + 1}"):
            if sid in visited:
                continue
            visited.add(sid)
            interactions = get_interactions(sid)
            for inter in interactions:
                a = inter["stringId_A"]
                b = inter["stringId_B"]
                score = inter["score"]
                G.add_edge(a, b, weight=score)
                next_ids.update([a, b])
            time.sleep(delay)
        recurse(next_ids, d-1)

    recurse(start_ids, depth)
    return G

In [20]:
interaction_graph = build_string_graph(nac_ids + tom_ids, depth=3)

Depth 1: 100%|██████████| 16/16 [00:18<00:00,  1.17s/it]
Depth 2: 100%|██████████| 80/80 [01:14<00:00,  1.08it/s]
Depth 3: 100%|██████████| 280/280 [03:54<00:00,  1.20it/s]


In [21]:
def get_protein_names_dict(string_ids, species=9606):
    """
    Given a list of STRING IDs, return a dictionary mapping each to its preferred protein name.
    
    Parameters:
        string_ids (list): List of STRING protein IDs (e.g., ['9606.ENSP00000354587'])
        species (int): NCBI taxonomy ID (default 9606 = human)
    
    Returns:
        dict: Mapping of STRING ID -> preferredName
    """
    url = "https://string-db.org/api/json/get_string_ids"
    
    # Join IDs with carriage return for batch request
    identifiers = "\r".join(string_ids)
    
    params = {
        "identifiers": identifiers,
        "species": species
    }
    
    response = requests.get(url, params=params)
    if not response.ok:
        raise Exception(f"STRING API request failed: {response.status_code}")
    
    results = response.json()
    
    # Build dictionary
    id_to_name = {
        item["stringId"]: item.get("preferredName", "")
        for item in results
    }
    
    return id_to_name

In [22]:
remove_id = "9823.ENSSSCP00000014786"  

# Make a copy so you don't modify the original graph
graph_for_paths = interaction_graph.copy()
if remove_id in graph_for_paths:
    #graph_for_paths.remove_node(remove_id)
    print(f"Removed node {remove_id} from the graph for pathfinding.")

In [25]:
def is_commonly_upregulated(gene):
    """
    Returns True if the gene is described as upregulated in most cancers in the HPA pathology summary.
    """
    url = f"https://www.proteinatlas.org/api/search_download.php?search={gene}&format=json"
    try:
        resp = requests.get(url)
        data = resp.json()
        
        # The pathology_summary field often mentions up/downregulation in cancers.
        pathology = data[0].get('pathology_summary', '').lower()
        
        # Keywords indicating common upregulation
        keywords_up = ['upregulated', 'overexpressed', 'high expression', 'frequently expressed']
        keywords_down = ['downregulated', 'underexpressed', 'low expression', 'reduced expression']

        # Check if any keyword is present
        if any(k in pathology for k in keywords_up):
            return 1
        if any(k in pathology for k in keywords_down):
            return -1
        # If no keywords found, return 0
        else:
            return 0
    except Exception as e:
        print(f"Error for {gene}: {e}")
        return None

In [30]:
bridge_nodes = set()
all_paths = []

for n in nac_ids:
    for t in tom_ids:
        try:
            # Get a generator of simple paths ordered by length
            paths_gen = nx.shortest_simple_paths(graph_for_paths, source=n, target=t)
            # Take up to three shortest paths
            for i, path in enumerate(paths_gen):
                if i >= 50: 
                    break
                bridge_nodes.update(path)
                all_paths.append(path)
        except nx.NetworkXNoPath:
            continue

# Collect preferred names for the nodes in the bridge
get = get_protein_names_dict(list(bridge_nodes), species=species_id)

# Create the subgraph and relabel nodes
bridge_subgraph = graph_for_paths.subgraph(bridge_nodes).copy()
bridge_subgraph_named = nx.relabel_nodes(
    bridge_subgraph, 
    lambda node: get.get(node, node)
)


# Visualize with PyVis
net = Network(notebook=True, height="1000px", width="100%", bgcolor="#222222", font_color="white", cdn_resources='remote')
net.from_nx(bridge_subgraph_named)
for node in net.nodes:
    name = node['label']  # 'label' is the protein name after relabeling
    print(f"Processing node: {name}")
    if is_commonly_upregulated(name) == 1:
        node['color'] = 'red'  # Upregulated
    elif is_commonly_upregulated(name) == -1:
        node['color'] = 'blue'
    else:
        node['color'] = 'grey'  # Not upregulated
net.show("bridging_proteins_network.html")

Processing node: NAA40
Error for NAA40: Expecting value: line 1 column 2 (char 1)
Error for NAA40: Expecting value: line 1 column 2 (char 1)
Processing node: NAA50
Error for NAA50: Expecting value: line 1 column 2 (char 1)
Error for NAA50: Expecting value: line 1 column 2 (char 1)
Processing node: NAA30
Error for NAA30: Expecting value: line 1 column 2 (char 1)
Error for NAA30: Expecting value: line 1 column 2 (char 1)
Processing node: NAA11
Error for NAA11: Expecting value: line 1 column 2 (char 1)
Error for NAA11: Expecting value: line 1 column 2 (char 1)
Processing node: NAA15
Error for NAA15: Expecting value: line 1 column 2 (char 1)
Error for NAA15: Expecting value: line 1 column 2 (char 1)
Processing node: NAA20
Error for NAA20: Expecting value: line 1 column 2 (char 1)
Error for NAA20: Expecting value: line 1 column 2 (char 1)
Processing node: NAA35
Error for NAA35: Expecting value: line 1 column 2 (char 1)
Error for NAA35: Expecting value: line 1 column 2 (char 1)
Processing no

KeyboardInterrupt: 

In [None]:
net = Network(notebook=True, height="700px", width="100%", bgcolor="#222222", font_color="white",
              cdn_resources='remote')

# With this:
for node in interaction_graph.nodes:
    gene_label = string_to_gene.get(node, node)
    color = (
        "blue" if node in nac_ids else
        "red" if node in tom_ids else
        "gray"
    )
    net.add_node(node, label=gene_label, color=color)

for u, v, data in interaction_graph.edges(data=True):
    net.add_edge(u, v, value=data.get("weight", 1))

net.set_options('''
var options = {
  "nodes": {
    "font": {
      "size": 14
    }
  },
  "edges": {
    "color": {
      "inherit": true
    },
    "smooth": false
  },
  "physics": {
    "barnesHut": {
      "gravitationalConstant": -8000,
      "springLength": 250
    },
    "minVelocity": 0.75
  }
}
''')

net.show("nac_tom_string_network.html")

In [None]:
import webbrowser
webbrowser.open("nac_tom_string_network.html")

In [None]:
# Cell 9: Shortest path bridging proteins
bridging_proteins = set()
for n in nac_ids:
    for t in tom_ids:
        try:
            path = nx.shortest_path(interaction_graph, source=n, target=t)
            bridging_proteins.update(path[1:-1])
        except nx.NetworkXNoPath:
            continue
print(f"Bridging proteins (STRING IDs): {bridging_proteins}")


In [16]:
# Cell 10: Betweenness centrality ranking
centrality = nx.betweenness_centrality(interaction_graph)
ranked_bridgers = sorted([(p, centrality[p]) for p in bridge_nodes], key=lambda x: x[1], reverse=True)
print("Top bridging proteins by centrality:")
for pid, score in ranked_bridgers[:10]:
    print(f"{get.get(pid)}: {score:.4f}")


Top bridging proteins by centrality:
TP53: 0.1887
HSP90AA1: 0.1800
MAVS: 0.1326
KAT2B: 0.1042
H4C6: 0.0863
GRB2: 0.0840
MDM2: 0.0748
GLYATL1: 0.0731
NAA40: 0.0705
VDAC1: 0.0645
