# Generate mapping between Ensembl IDs of genes and proteins

In [1]:
import json

In [2]:
from pprint import pprint

In [3]:
from collections import defaultdict

In [4]:
import pickle

In [5]:
# Parse the Ensembl JSON file. The context manager closes the file handle as
# soon as parsing finishes instead of leaving it open for the garbage collector.
with open('../data/mus_musculus.json') as ensembl_file:
    ensembl = json.load(ensembl_file)

In [6]:
# Number of gene records in the loaded Ensembl data.
len(ensembl['genes'])

56748

In [7]:
# Build gene <-> protein ID lookups from the Ensembl records.
#   gene_protein_ids: gene ID -> protein (translation) ID. When a gene has
#       several translations, only the last one visited is kept because each
#       assignment overwrites the previous one.
#   protein_gene_ids: protein (translation) ID -> ID of the gene it was found
#       under.
gene_protein_ids = {}
protein_gene_ids = {}
for gene in ensembl['genes']:
    # Genes without a 'transcripts' key and transcripts without a
    # 'translations' key contribute nothing.
    for transcript in gene.get('transcripts', ()):
        for translation in transcript.get('translations', ()):
            gene_protein_ids[gene['id']] = translation['id']
            # Fill the reverse lookup alongside the forward one.
            protein_gene_ids[translation['id']] = gene['id']

In [8]:
# Number of gene IDs that were assigned a protein ID.
len(gene_protein_ids)

22464

# Generate mapping between gene names and Ensembl IDs of genes and proteins

In [23]:
# Build upper-cased name -> set-of-ID lookups from the tab-separated name file.
#   *_primary_names: keyed by the `name` column only.
#   *_all_names:     keyed by both the `name` and the `synonym` columns.
# Protein lookups only receive an entry when the row has a non-empty protein ID.
protein_primary_names = defaultdict(set)
gene_primary_names = defaultdict(set)
protein_all_names = defaultdict(set)
gene_all_names = defaultdict(set)
# The context manager guarantees the input file is closed after parsing.
with open('../data/mouse_gene_names.tsv') as name_file:
    # Skip the first line (presumably the column header -- TODO confirm).
    # The default argument avoids a StopIteration on an empty file.
    next(name_file, None)
    for line in name_file:
        # Every row must have exactly five tab-separated fields.
        gene_id, transcript_id, synonym, name, protein_id = line.split('\t')
        # The last field carries the trailing newline.
        protein_id = protein_id.strip()
        if synonym:
            gene_all_names[synonym.upper()].add(gene_id)
        if synonym and protein_id:
            protein_all_names[synonym.upper()].add(protein_id)
        if name:
            gene_primary_names[name.upper()].add(gene_id)
            gene_all_names[name.upper()].add(gene_id)
        if name and protein_id:
            protein_primary_names[name.upper()].add(protein_id)
            protein_all_names[name.upper()].add(protein_id)
# Freeze the "all names" lookups into plain dicts so a missing key raises
# KeyError instead of silently creating an empty set.
gene_all_names = dict(gene_all_names)
protein_all_names = dict(protein_all_names)

In [24]:
# Number of distinct upper-cased names/synonyms that map to at least one gene ID.
len(gene_all_names)

100654

In [25]:
# Number of distinct upper-cased names/synonyms that map to at least one protein ID.
len(protein_all_names)

63822

In [26]:
# Persist the lookups built above. Each output file is opened in a context
# manager so it is flushed and closed right after the dump.
for mapping, pickle_path in (
    (protein_all_names, '../data/protein_synonyms.pickle'),
    (gene_all_names, '../data/gene_synonyms.pickle'),
    (protein_primary_names, '../data/protein_names.pickle'),
    (gene_primary_names, '../data/gene_names.pickle'),
    (gene_protein_ids, '../data/gene_protein_ids.pickle'),
):
    with open(pickle_path, 'wb') as pickle_file:
        pickle.dump(mapping, pickle_file)

## Generate backward mapping from ID to names

In [27]:
# Invert gene_primary_names (name -> gene IDs) into gene ID -> set of names,
# then report how many gene IDs have at least one name.
from collections import defaultdict
names_by_gene_id = defaultdict(set)
for name, ids in gene_primary_names.items():
    for gene_id in ids:
        names_by_gene_id[gene_id].add(name)
# Store the result as a plain dict so unknown IDs raise KeyError.
gene_id_to_name = dict(names_by_gene_id)
print(len(gene_id_to_name))

54055


In [28]:
# Invert protein_primary_names (name -> protein IDs) into
# protein ID -> set of names, then print the number of protein IDs covered.
from collections import defaultdict
protein_id_to_name = defaultdict(set)
for name, ids in protein_primary_names.items():
    # The values are protein IDs, so the loop variable is named accordingly.
    for protein_id in ids:
        protein_id_to_name[protein_id].add(name)
# Convert to a plain dict so unknown IDs raise KeyError.
protein_id_to_name = dict(protein_id_to_name)
print(len(protein_id_to_name))

67085


In [29]:
# Invert protein_all_names (name or synonym -> protein IDs) into
# protein ID -> set of every name/synonym, then print how many IDs are covered.
from collections import defaultdict
synonyms_by_protein_id = defaultdict(set)
for name, ids in protein_all_names.items():
    for protein_id in ids:
        synonyms_by_protein_id[protein_id].add(name)
# Store the result as a plain dict so unknown IDs raise KeyError.
protein_id_to_synonyms = dict(synonyms_by_protein_id)
print(len(protein_id_to_synonyms))

67085


In [30]:
# Spot-check: primary name(s) recorded for one protein ID.
protein_id_to_name['ENSMUSP00000040089']

{'TWIST1'}

In [31]:
# Spot-check: every name and synonym recorded for one protein ID.
protein_id_to_synonyms['ENSMUSP00000040089']

{'BHLHA38',
 'CHARLIE CHAPLIN',
 'M-TWIST',
 'PDT',
 'PLURIDIGITE',
 'SKA10',
 'SKA<M10JUS>',
 'TWIST1'}

In [17]:
# Save the protein ID -> names lookup; the context manager closes the file.
with open('../data/protein_id_to_name.pickle', 'wb') as pickle_file:
    pickle.dump(protein_id_to_name, pickle_file)

In [21]:
# Save the protein ID -> names/synonyms lookup; the context manager closes the file.
with open('../data/protein_id_to_synonyms.pickle', 'wb') as pickle_file:
    pickle.dump(protein_id_to_synonyms, pickle_file)

In [18]:
# Save the gene ID -> names lookup; the context manager closes the file.
with open('../data/gene_id_to_name.pickle', 'wb') as pickle_file:
    pickle.dump(gene_id_to_name, pickle_file)

## Get names of genes that are in Enrichr's GO_Bio_Process

In [19]:
# Collect the distinct names listed in the GO Biological Process file.
# Each line is split on tabs; the first two fields are ignored and every
# remaining field is added to the set.
go_names = set()
# The context manager closes the input file once all lines are read.
with open('../data/GO_Biological_Process_2021') as f:
    for line in f:
        go_names.update(line.strip().split('\t')[2:])

In [20]:
# Save the set of GO Biological Process names; the context manager closes the file.
with open('../data/go_bio_process_2021_names.pickle', 'wb') as pickle_file:
    pickle.dump(go_names, pickle_file)