# Protein-Gene Mappings
- Protein = UniProt ID
- Gene = Entrez ID

In [33]:
import json
import pandas as pd
import csv
import os
import json
import urllib.parse
import urllib.request
import numpy as np
from biomed_apis import *
from biomedkg_utils import *
# INSTRUCTIONS: Download a set of as many genes as you can (ideally all human genes)
# NOTE: URL just broke. HGNC website seems to not work correctly right now
os.system('! wget -N -P input/ https://www.genenames.org/cgi-bin/download/custom?col=gd_app_name&col=gd_pub_acc_ids&col=md_prot_id&col=gd_pub_eg_id&status=Approved&hgnc_dbtag=on&order_by=gd_app_sym_sort&format=text&submit=submit')

0

--2023-06-14 13:54:01--  https://www.genenames.org/cgi-bin/download/custom?col=gd_app_name
Resolving www.genenames.org (www.genenames.org)... 193.62.193.83
Connecting to www.genenames.org (www.genenames.org)|193.62.193.83|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/plain]
Saving to: ‘input/custom?col=gd_app_name’

     0K .......... .......... .......... .......... ..........  146K
    50K .......... .......... .......... .......... ..........  146K
   100K .......... .......... .......... .......... ..........  292K
   150K .......... .......... .......... .......... ..........  291K
   200K .......... .......... .......... .......... ..........  235K
   250K .......... .......... .......... .......... ..........  292K
   300K .......... .......... .......... .......... ..........  167K
   350K .......... .......... .......... .......... .......... 48.7M
   400K .......... .......... .......... .......... ..........  146K
   450K .......

## Gene-is-Protein (via HGNC)

In [12]:
os.system('mv input/custom?col=gd_app_name input/custom_entrez_uniprot.txt')

0

### Gene-[encodes]->Protein JSON, Protein-[encoded by]->Gene JSON
Entrez Gene, UniProt Protein

In [3]:
protein_id_2_gene_id, gene_id_2_protein_id = dict(), dict()
for line in open('input/custom_entrez_uniprot.txt','r'):
    line = line.split('\t')
    protein_ids = line[2].strip() # UniProt Protein ID
    gene_id = line[3].strip()     # Entrez Gene ID
    
    if 'UniProt ID(supplied by UniProt)' not in protein_ids:
        protein_ids = protein_ids.replace('_','').split(',')
        for protein_id in protein_ids:
            if protein_id != '' and gene_id != '':
                protein_id_2_gene_id.setdefault(protein_id.strip(), set()).add(gene_id.strip())
                gene_id_2_protein_id.setdefault(gene_id.strip(),set()).add(protein_id.strip())
        
protein_id_2_gene_id = switch_dictset_to_dictlist(protein_id_2_gene_id)
gene_id_2_protein_id = switch_dictset_to_dictlist(gene_id_2_protein_id)

## More Gene-is-Protein (via UniProt API)
This gets all reviewed (I think) human proteins plus some other unreviewed ones taken from non-UniProt databases

In [9]:
# Reviewed human proteome
url = 'https://rest.uniprot.org/uniprotkb/stream?fields=accession%2Creviewed%2Cid%2Cprotein_name%2Cgene_names&format=json&query=%28reviewed%3Atrue%20AND%20proteome%3Aup000005640%29'
res = requests.get(url)
r = res.json()


# Proteins from other sources
proteins_from_go = set(pd.read_csv('output/edges/edges_protein2go.csv')['Protein (UniProt)'])
proteins_from_entrez_mapping = set(protein_id_2_gene_id.keys())
proteins_from_human_proteome = set([entry['primaryAccession'] for entry in r['results']])
proteins_from_reactome_pathways = set(pd.read_csv('output/edges/edges_protein2reactomePathway.csv')['Protein (UniProt)']) 
#proteins_from_reactome_ppi = set(pd.read_csv('output/edges/edges_protein2protein_reactome.csv')['Protein (UniProt)'])
#proteins_from_reactome_ppi = set(pd.read_csv('output/edges/edges_protein2protein_reactome.csv')['Protein (UniProt).1'])
proteins_from_reactome_reactions = set(pd.read_csv('output/edges/edges_protein2reaction.csv')['Protein (UniProt)'])

# Combine all those proteins
protein_set = proteins_from_human_proteome
protein_set = proteins_from_entrez_mapping.union(protein_set) 
protein_set = proteins_from_go.union(protein_set)    
protein_set = proteins_from_reactome_pathways.union(protein_set)    
protein_set = proteins_from_reactome_ppi.union(protein_set)    
protein_set = proteins_from_reactome_reactions.union(protein_set)   

In [10]:
protein_list = list(protein_set)
protein_set = set()
for protein_ids in protein_list:
    
    # Split each "protein" into multiple proteins if there are multiple
    protein_ids = protein_ids.split(', ')
    
    for protein_id in protein_ids:
        
        # Remove "UniProt:" prefix
        if 'UniProt:' in protein_id:
            protein_id = protein_id.split('UniProt:')[1].strip().split('_')[0]

        # Don't add numbers (genes), add UniProt IDs
        if not protein_id[0:].isnumeric():
            protein_set.add(protein_id)

json.dump(list(protein_set), open('output/protein2protein/human_proteins.json','w'))

len(protein_set)

In [49]:
job_id = submit_id_mapping_UniProtAPI(
                  from_db = 'UniProtKB_AC-ID',
                  to_db = 'GeneID', 
                  ids = protein_set)

# This checks on the job until it is finished
if check_id_mapping_results_ready_UniProtAPI(job_id):
    link = get_id_mapping_results_link_UniProtAPI(job_id)
    results = get_id_mapping_results_search_UniProtAPI(link)

Job still running. Retrying in 3s
Job still running. Retrying in 3s
Job still running. Retrying in 3s
Job still running. Retrying in 3s
Fetched: 20034 / 20034

In [50]:
print('Before UniProt API')
print('Entrez-is-UniProt', len(gene_id_2_protein_id), 
      'UniProt-is-Entrez', len(protein_id_2_gene_id))

gene_id_2_protein_id = switch_dictlist_to_dictset(gene_id_2_protein_id)
protein_id_2_gene_id = switch_dictlist_to_dictset(protein_id_2_gene_id)

for protein_to_gene in results['results']:
    protein_id = protein_to_gene['from'].strip()
    gene_id = protein_to_gene['to'].strip()
    if gene_id != '' and protein_id != '':
        gene_id_2_protein_id.setdefault(gene_id, set()).add(protein_id)
        protein_id_2_gene_id.setdefault(protein_id, set()).add(gene_id)
    
print('\nAfter UniProt API')
print('Entrez-is-UniProt', len(gene_id_2_protein_id), 
      'UniProt-is-Entrez', len(protein_id_2_gene_id))

Before UniProt API
Entrez-is-UniProt 20878 UniProt-is-Entrez 20876

After UniProt API
Entrez-is-UniProt 20882 UniProt-is-Entrez 20881


### Export
Gene-is-Protein, Protein-is-Gene

In [51]:
gene_id_2_protein_id = switch_dictset_to_dictlist(gene_id_2_protein_id)
protein_id_2_gene_id = switch_dictset_to_dictlist(protein_id_2_gene_id)

json.dump(gene_id_2_protein_id, open('output/protein2gene/all_entrez2uniprot.json','w'))
json.dump(protein_id_2_gene_id, open('output/protein2gene/all_uniprot2entrez.json','w'))

# Edges
gene2prot_dict = json.load(open('output/protein2gene/all_entrez2uniprot.json'))
with open('output/edges/edges_gene-ENCODES->protein.csv','w') as fout1:
    writerE = csv.writer(fout1)
    writerE.writerow(['Gene (Entrez)','Protein (UniProt)', 'Relationship'])
    
    with open('output/protein2gene/edges_gene-ENCODES->protein.csv','w') as fout:
        writer = csv.writer(fout)
        writer.writerow(['Gene (Entrez)','Protein (UniProt)', 'Relationship'])
        for gene, proteins in gene2prot_dict.items():
            for protein in proteins:
                writer.writerow(['Entrez:'+gene, 'UniProt:'+protein, '-encodes->'])
                writerE.writerow(['Entrez:'+gene, 'UniProt:'+protein, '-encodes->'])
                
os.system('cp "edges/edges_gene-ENCODES->protein.csv" "edges/Gene_(Entrez)_2_Protein_(UniProt).csv"')
os.system('cp "edges/edges_gene-ENCODES->protein.csv" "edges to use/Gene_(Entrez)_2_Protein_(UniProt).csv"')