In [1]:
import requests
import pandas as pd

# raw data from: https://string-db.org/cgi/download?sessionId=b6owhyZw1jXy&species_text=Mus+musculus
mouse_ppi = pd.read_csv("mouse_10090.protein.links.v12.0.txt", sep=" ")
mouse_ppi

Unnamed: 0,protein1,protein2,combined_score
0,10090.ENSMUSP00000000001,10090.ENSMUSP00000027991,889
1,10090.ENSMUSP00000000001,10090.ENSMUSP00000137332,163
2,10090.ENSMUSP00000000001,10090.ENSMUSP00000041756,201
3,10090.ENSMUSP00000000001,10090.ENSMUSP00000075170,969
4,10090.ENSMUSP00000000001,10090.ENSMUSP00000110978,267
...,...,...,...
12684349,10090.ENSMUSP00000159257,10090.ENSMUSP00000047904,197
12684350,10090.ENSMUSP00000159257,10090.ENSMUSP00000062110,367
12684351,10090.ENSMUSP00000159257,10090.ENSMUSP00000099018,169
12684352,10090.ENSMUSP00000159257,10090.ENSMUSP00000023468,252


In [2]:
mouse_ppi

Unnamed: 0,protein1,protein2,combined_score
0,10090.ENSMUSP00000000001,10090.ENSMUSP00000027991,889
1,10090.ENSMUSP00000000001,10090.ENSMUSP00000137332,163
2,10090.ENSMUSP00000000001,10090.ENSMUSP00000041756,201
3,10090.ENSMUSP00000000001,10090.ENSMUSP00000075170,969
4,10090.ENSMUSP00000000001,10090.ENSMUSP00000110978,267
...,...,...,...
12684349,10090.ENSMUSP00000159257,10090.ENSMUSP00000047904,197
12684350,10090.ENSMUSP00000159257,10090.ENSMUSP00000062110,367
12684351,10090.ENSMUSP00000159257,10090.ENSMUSP00000099018,169
12684352,10090.ENSMUSP00000159257,10090.ENSMUSP00000023468,252


In [3]:
import numpy as np
ppi_scores = mouse_ppi["combined_score"]
np.min(ppi_scores), np.max(ppi_scores)

(150, 999)

In [4]:
significant_mouse_ppi = mouse_ppi[mouse_ppi["combined_score"] >= 700]
significant_mouse_ppi

Unnamed: 0,protein1,protein2,combined_score
0,10090.ENSMUSP00000000001,10090.ENSMUSP00000027991,889
3,10090.ENSMUSP00000000001,10090.ENSMUSP00000075170,969
7,10090.ENSMUSP00000000001,10090.ENSMUSP00000121127,916
8,10090.ENSMUSP00000000001,10090.ENSMUSP00000081569,956
13,10090.ENSMUSP00000000001,10090.ENSMUSP00000025541,760
...,...,...,...
12683968,10090.ENSMUSP00000159241,10090.ENSMUSP00000080242,899
12684079,10090.ENSMUSP00000159241,10090.ENSMUSP00000124205,748
12684116,10090.ENSMUSP00000159241,10090.ENSMUSP00000038137,959
12684164,10090.ENSMUSP00000159241,10090.ENSMUSP00000065819,961


In [5]:
significant_mouse_ppi = significant_mouse_ppi.reset_index(drop=True)
significant_mouse_ppi

Unnamed: 0,protein1,protein2,combined_score
0,10090.ENSMUSP00000000001,10090.ENSMUSP00000027991,889
1,10090.ENSMUSP00000000001,10090.ENSMUSP00000075170,969
2,10090.ENSMUSP00000000001,10090.ENSMUSP00000121127,916
3,10090.ENSMUSP00000000001,10090.ENSMUSP00000081569,956
4,10090.ENSMUSP00000000001,10090.ENSMUSP00000025541,760
...,...,...,...
403715,10090.ENSMUSP00000159241,10090.ENSMUSP00000080242,899
403716,10090.ENSMUSP00000159241,10090.ENSMUSP00000124205,748
403717,10090.ENSMUSP00000159241,10090.ENSMUSP00000038137,959
403718,10090.ENSMUSP00000159241,10090.ENSMUSP00000065819,961


In [6]:
# collect all ensemble mus
ensemble_ids = list()
for i, row in significant_mouse_ppi.iterrows():
    in_id = row[0].split(".")[1]
    out_id = row[1].split(".")[1]
    
    ensemble_ids.append(in_id)
    ensemble_ids.append(out_id)

len(list(set(ensemble_ids))), ensemble_ids[:5]

(15971,
 ['ENSMUSP00000000001',
  'ENSMUSP00000027991',
  'ENSMUSP00000000001',
  'ENSMUSP00000075170',
  'ENSMUSP00000000001'])

In [7]:
u_ensemble_ids = list(set(ensemble_ids))

In [8]:
in_prot_ids = list()
out_prot_ids = list()
in_genes = list()
out_genes = list()
in_gene_ids = list()
out_genes_ids = list()
prot_name_gene_ids = dict()


def uniprot_rest_call(prot_id):
    # UniProt REST API URL
    if prot_id not in prot_name_gene_ids:
        url = f"https://rest.uniprot.org/uniprotkb/search?query={prot_id}"
        response = requests.get(url, headers={"Content-Type": "application/json"})
        if response.ok:
            data = response.json()
        gene_name = data["results"][0]["genes"][0]["geneName"]["value"]
        gene_ref = data["results"][0]["uniProtKBCrossReferences"]
        gene_id = None
        for item in gene_ref:
            if "database" in item and item["database"] == "GeneID":
                gene_id = item["id"]
        prot_name_gene_ids[prot_id] = (gene_id, gene_name)
        return gene_id, gene_name
    else:
        return prot_name_gene_ids[prot_id][0], prot_name_gene_ids[prot_id][1]


print("Finding ids for {} ensemble ids".format(len(u_ensemble_ids)))
for idx, e_id in enumerate(u_ensemble_ids):
    try:
        uniprot_rest_call(e_id)
    except:
        continue
    if idx % 1000 == 0 and idx > 0:
        print("{} row processed".format(idx))

Finding ids for 15971 ensemble ids
1000 row processed
2000 row processed
3000 row processed
4000 row processed
5000 row processed
6000 row processed
7000 row processed
8000 row processed
9000 row processed
10000 row processed
11000 row processed
12000 row processed
13000 row processed
14000 row processed
15000 row processed


In [9]:
for i, row in significant_mouse_ppi.iterrows():
    in_prot = row["protein1"].split(".")[1]
    out_prot = row["protein2"].split(".")[1]
    try:
        in_gene_id, in_gene = prot_name_gene_ids[in_prot][0], prot_name_gene_ids[in_prot][1]
        #uniprot_rest_call(in_prot)
        out_gene_id, out_gene = prot_name_gene_ids[out_prot][0], prot_name_gene_ids[out_prot][1]
        #prot_name_gene_ids[in_prot] #uniprot_rest_call(out_prot)
        in_genes.append(in_gene)
        in_gene_ids.append(in_gene_id)
        out_genes.append(out_gene)
        out_genes_ids.append(out_gene_id)
        in_prot_ids.append(in_prot)
        out_prot_ids.append(out_prot)
    except:
        continue
    if i % 1000 == 0 and i > 0:
        print("{} row processed".format(i+1))
    
mouse_gene_interactions = pd.DataFrame(zip(in_prot_ids, in_gene_ids, in_genes, out_prot_ids, out_genes_ids, out_genes), columns=["Prot1ID", "Gene1ID", "Gene1Name", "Prot2ID", "Gene2ID", "Gene2Name"])
mouse_gene_interactions

1001 row processed
2001 row processed
3001 row processed
4001 row processed
5001 row processed
6001 row processed
7001 row processed
8001 row processed
9001 row processed
10001 row processed
11001 row processed
12001 row processed
13001 row processed
14001 row processed
15001 row processed
16001 row processed
17001 row processed
18001 row processed
19001 row processed
20001 row processed
21001 row processed
22001 row processed
23001 row processed
24001 row processed
25001 row processed
26001 row processed
27001 row processed
28001 row processed
29001 row processed
30001 row processed
31001 row processed
32001 row processed
33001 row processed
34001 row processed
35001 row processed
36001 row processed
37001 row processed
38001 row processed
39001 row processed
40001 row processed
41001 row processed
42001 row processed
43001 row processed
44001 row processed
45001 row processed
46001 row processed
47001 row processed
48001 row processed
49001 row processed
50001 row processed
51001 row

Unnamed: 0,Prot1ID,Gene1ID,Gene1Name,Prot2ID,Gene2ID,Gene2Name
0,ENSMUSP00000000001,14679,Gnai3,ENSMUSP00000027991,19736,Rgs4
1,ENSMUSP00000000001,14679,Gnai3,ENSMUSP00000075170,13489,Drd2
2,ENSMUSP00000000001,14679,Gnai3,ENSMUSP00000121127,14696,Gnb4
3,ENSMUSP00000000001,14679,Gnai3,ENSMUSP00000081569,50780,Rgs3
4,ENSMUSP00000000001,14679,Gnai3,ENSMUSP00000025541,14682,Gnaq
...,...,...,...,...,...,...
396697,ENSMUSP00000159241,100042165,Thoc2l,ENSMUSP00000080242,56009,Alyref2
396698,ENSMUSP00000159241,100042165,Thoc2l,ENSMUSP00000124205,60532,Wtap
396699,ENSMUSP00000159241,100042165,Thoc2l,ENSMUSP00000038137,386612,Thoc6
396700,ENSMUSP00000159241,100042165,Thoc2l,ENSMUSP00000065819,66231,Thoc7


In [10]:
mouse_gene_interactions.to_csv("mouse_gene_interactions_STRING_v12.tsv", sep="\t", index=None)

In [11]:
import json
with open("ProteinID_Gene_Names.json", "w+") as f:
    f.write(json.dumps(prot_name_gene_ids))