In [2]:
import requests
import pandas as pd

# STRING v12.0 protein links for Mus musculus (taxon 10090); raw data from:
# https://string-db.org/cgi/download?sessionId=b6owhyZw1jXy&species_text=Mus+musculus
# The file is space-delimited, with one scored protein pair per row.
mouse_ppi = pd.read_csv(
    "mouse_10090.protein.links.v12.0.txt",
    delimiter=" ",
)
mouse_ppi

Unnamed: 0,protein1,protein2,combined_score
0,10090.ENSMUSP00000000001,10090.ENSMUSP00000027991,889
1,10090.ENSMUSP00000000001,10090.ENSMUSP00000137332,163
2,10090.ENSMUSP00000000001,10090.ENSMUSP00000041756,201
3,10090.ENSMUSP00000000001,10090.ENSMUSP00000075170,969
4,10090.ENSMUSP00000000001,10090.ENSMUSP00000110978,267
...,...,...,...
12684349,10090.ENSMUSP00000159257,10090.ENSMUSP00000047904,197
12684350,10090.ENSMUSP00000159257,10090.ENSMUSP00000062110,367
12684351,10090.ENSMUSP00000159257,10090.ENSMUSP00000099018,169
12684352,10090.ENSMUSP00000159257,10090.ENSMUSP00000023468,252


In [3]:
# Show the loaded STRING links table (protein1, protein2, combined_score).
mouse_ppi

Unnamed: 0,protein1,protein2,combined_score
0,10090.ENSMUSP00000000001,10090.ENSMUSP00000027991,889
1,10090.ENSMUSP00000000001,10090.ENSMUSP00000137332,163
2,10090.ENSMUSP00000000001,10090.ENSMUSP00000041756,201
3,10090.ENSMUSP00000000001,10090.ENSMUSP00000075170,969
4,10090.ENSMUSP00000000001,10090.ENSMUSP00000110978,267
...,...,...,...
12684349,10090.ENSMUSP00000159257,10090.ENSMUSP00000047904,197
12684350,10090.ENSMUSP00000159257,10090.ENSMUSP00000062110,367
12684351,10090.ENSMUSP00000159257,10090.ENSMUSP00000099018,169
12684352,10090.ENSMUSP00000159257,10090.ENSMUSP00000023468,252


In [4]:
import numpy as np

# Pull out the score column and report its range as a (min, max) pair.
ppi_scores = mouse_ppi["combined_score"]
tuple(reducer(ppi_scores) for reducer in (np.min, np.max))

(150, 999)

In [5]:
# Keep only the protein pairs whose combined score is at least 700.
# NOTE(review): 700 is presumably STRING's "high confidence" cutoff on its
# 0-1000 score scale -- confirm against the STRING documentation.
significant_mouse_ppi = mouse_ppi.loc[mouse_ppi["combined_score"].ge(700)]
significant_mouse_ppi

Unnamed: 0,protein1,protein2,combined_score
0,10090.ENSMUSP00000000001,10090.ENSMUSP00000027991,889
3,10090.ENSMUSP00000000001,10090.ENSMUSP00000075170,969
7,10090.ENSMUSP00000000001,10090.ENSMUSP00000121127,916
8,10090.ENSMUSP00000000001,10090.ENSMUSP00000081569,956
13,10090.ENSMUSP00000000001,10090.ENSMUSP00000025541,760
...,...,...,...
12683968,10090.ENSMUSP00000159241,10090.ENSMUSP00000080242,899
12684079,10090.ENSMUSP00000159241,10090.ENSMUSP00000124205,748
12684116,10090.ENSMUSP00000159241,10090.ENSMUSP00000038137,959
12684164,10090.ENSMUSP00000159241,10090.ENSMUSP00000065819,961


In [6]:
# The filter above keeps the row labels of the full table (0, 3, 7, ...).
# Replace them with a fresh 0..n-1 index; drop=True discards the old labels
# instead of keeping them as a column.
significant_mouse_ppi = significant_mouse_ppi.reset_index(drop=True)
significant_mouse_ppi

Unnamed: 0,protein1,protein2,combined_score
0,10090.ENSMUSP00000000001,10090.ENSMUSP00000027991,889
1,10090.ENSMUSP00000000001,10090.ENSMUSP00000075170,969
2,10090.ENSMUSP00000000001,10090.ENSMUSP00000121127,916
3,10090.ENSMUSP00000000001,10090.ENSMUSP00000081569,956
4,10090.ENSMUSP00000000001,10090.ENSMUSP00000025541,760
...,...,...,...
403715,10090.ENSMUSP00000159241,10090.ENSMUSP00000080242,899
403716,10090.ENSMUSP00000159241,10090.ENSMUSP00000124205,748
403717,10090.ENSMUSP00000159241,10090.ENSMUSP00000038137,959
403718,10090.ENSMUSP00000159241,10090.ENSMUSP00000065819,961


In [7]:
# Parallel accumulators filled by the annotation loop below: one entry per
# protein pair whose two lookups both succeeded. "in" holds values for
# protein1 of a pair, "out" those for protein2.
# (Interaction scores are not collected.)
in_prot_ids, out_prot_ids = [], []
in_genes, out_genes = [], []
in_gene_ids, out_genes_ids = [], []

def uniprot_rest_call(prot_id, timeout=30):
    """Look up a protein's gene name and GeneID through the UniProtKB search API.

    ``prot_id`` is sent as a free-text query and the annotation is read from
    the first entry of the result list.

    Parameters
    ----------
    prot_id : str
        Identifier to search for. The caller in this notebook passes the part
        of a STRING protein ID after the taxon prefix, e.g.
        ``ENSMUSP00000000001``.
    timeout : float, optional
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang the notebook. Defaults to 30.

    Returns
    -------
    tuple
        ``(gene_id, gene_name)``. ``gene_name`` is the value of the first
        gene's ``geneName``. ``gene_id`` is the ``id`` of the last
        cross-reference whose database is ``"GeneID"``, or ``None`` if the
        entry has no such cross-reference.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    IndexError
        If the search returned no entries.
    KeyError, IndexError
        If the first entry lacks a gene name or a cross-reference list.
    """
    response = requests.get(
        "https://rest.uniprot.org/uniprotkb/search",
        # Let requests URL-encode the query. Only the first hit is read, so
        # ask for a single entry instead of a full default page.
        params={"query": prot_id, "size": 1},
        # A GET has no body; "Accept" is the header that requests JSON back.
        headers={"Accept": "application/json"},
        timeout=timeout,
    )
    # Fail loudly on HTTP errors instead of falling through to an unbound name.
    response.raise_for_status()

    results = response.json().get("results") or []
    if not results:
        raise IndexError("UniProt search returned no entry for {!r}".format(prot_id))
    entry = results[0]

    gene_name = entry["genes"][0]["geneName"]["value"]

    gene_id = None
    for xref in entry["uniProtKBCrossReferences"]:
        # No early exit: when several GeneID cross-references exist, the last
        # one wins.
        if xref.get("database") == "GeneID":
            gene_id = xref["id"]
    return gene_id, gene_name

# Number of leading rows to annotate. The previous version stopped after the
# row labelled 201, i.e. after 202 rows. Taking the slice up front means the
# limit holds even when the lookup for the last row fails.
max_rows = 202

# Successful lookups keyed by protein ID. The same protein appears in many
# pairs, so this avoids repeating identical network requests.
gene_lookup_cache = {}

# (row position, protein1 ID, protein2 ID, error) for every skipped pair.
failed_rows = []


def _cached_uniprot_lookup(prot_id):
    """Return ``uniprot_rest_call(prot_id)``, querying each protein at most once."""
    if prot_id not in gene_lookup_cache:
        gene_lookup_cache[prot_id] = uniprot_rest_call(prot_id)
    return gene_lookup_cache[prot_id]


rows_to_annotate = significant_mouse_ppi.head(max_rows)
protein_pairs = zip(rows_to_annotate["protein1"], rows_to_annotate["protein2"])
for i, (protein1, protein2) in enumerate(protein_pairs):
    # STRING IDs look like "<taxon>.<protein ID>"; keep the part after the dot.
    in_prot = protein1.split(".")[1]
    out_prot = protein2.split(".")[1]
    try:
        in_gene_id, in_gene = _cached_uniprot_lookup(in_prot)
        out_gene_id, out_gene = _cached_uniprot_lookup(out_prot)
    except Exception as exc:
        # Best effort: skip pairs that cannot be annotated, but keep a record.
        # Catching Exception (not everything) lets Ctrl-C interrupt the cell.
        failed_rows.append((i, in_prot, out_prot, repr(exc)))
    else:
        # Append only when both lookups succeeded so the lists stay aligned.
        in_genes.append(in_gene)
        in_gene_ids.append(in_gene_id)
        out_genes.append(out_gene)
        out_genes_ids.append(out_gene_id)
        in_prot_ids.append(in_prot)
        out_prot_ids.append(out_prot)
    if i % 100 == 0 and i > 0:
        print("{} row processed".format(i+1))

if failed_rows:
    print("{} of {} rows skipped (details in failed_rows)".format(len(failed_rows), len(rows_to_annotate)))

mouse_gene_interactions = pd.DataFrame(
    zip(in_prot_ids, in_gene_ids, in_genes, out_prot_ids, out_genes_ids, out_genes),
    columns=["Prot1ID", "Gene1ID", "Gene1Name", "Prot2ID", "Gene2ID", "Gene2Name"],
)
mouse_gene_interactions

101 row processed
201 row processed


Unnamed: 0,Prot1ID,Gene1ID,Gene1Name,Prot2ID,Gene2ID,Gene2Name
0,ENSMUSP00000000001,14679,Gnai3,ENSMUSP00000027991,19736,Rgs4
1,ENSMUSP00000000001,14679,Gnai3,ENSMUSP00000075170,13489,Drd2
2,ENSMUSP00000000001,14679,Gnai3,ENSMUSP00000121127,14696,Gnb4
3,ENSMUSP00000000001,14679,Gnai3,ENSMUSP00000081569,50780,Rgs3
4,ENSMUSP00000000001,14679,Gnai3,ENSMUSP00000025541,14682,Gnaq
...,...,...,...,...,...,...
196,ENSMUSP00000000028,12544,Cdc45,ENSMUSP00000045344,269582,Clspn
197,ENSMUSP00000000028,12544,Cdc45,ENSMUSP00000126135,17218,Mcm5
198,ENSMUSP00000000028,12544,Cdc45,ENSMUSP00000136972,16881,Lig1
199,ENSMUSP00000000028,12544,Cdc45,ENSMUSP00000023353,17217,Mcm4


In [8]:
# Save the annotated protein pairs as a tab-separated file, without the row index.
mouse_gene_interactions.to_csv(
    "mouse_gene_interactions_STRING_v12.tsv",
    sep="\t",
    index=False,
)