In [64]:
import random

In [146]:
from collections import Counter

In [115]:
# Build two lookups keyed by the first two tab-separated columns joined with '|':
#   pathways[protein]              -> list of fields starting with 'PATHWAY'
#   subcellular_locations[protein] -> list of fields starting with 'SUBCELLULAR'
pathways = {}
subcellular_locations = {}
with open('data/uniprot-subcellular_location-pathway.tsv') as f:
    header = next(f)  # consume the first line so it is not parsed as a record
    for row in f:
        columns = row.strip().split('\t')
        # Rows with only the two identifier columns carry no annotation fields.
        if len(columns) <= 2:
            continue
        protein = '|'.join(columns[:2])
        for field in columns[2:]:
            if field.startswith('SUBCELLULAR'):
                subcellular_locations.setdefault(protein, []).append(field)
            elif field.startswith('PATHWAY'):
                pathways.setdefault(protein, []).append(field)

In [116]:
# Number of proteins with at least one field starting with 'SUBCELLULAR'.
len(subcellular_locations)

16589

In [117]:
# Number of proteins with at least one field starting with 'PATHWAY'.
len(pathways)

1163

In [118]:
def flatten(l):
    """Return a new flat list with every nested ``list`` in ``l`` expanded in order.

    Only objects whose type is exactly ``list`` are descended into; tuples,
    strings and any other values are kept as single elements.
    """
    flat = []
    for element in l:
        if type(element) is not list:
            flat.append(element)
            continue
        # Recurse so that arbitrarily deep nesting is expanded.
        flat.extend(flatten(element))
    return flat

In [119]:
# Quick check of flatten() on a list mixing scalars with one- and two-level nesting.
l = [
    'asdf',
    1,
    [123, 'adsf'],
    [123, ['pqr', 'xyz']],
]
flatten(l)

['asdf', 1, 123, 'adsf', 123, 'pqr', 'xyz']

In [131]:
import re

In [136]:
# Worked example on one raw annotation: remove the '{ECO...}' evidence tags and
# the '(PubMed...)' inline citations, leaving the location text behind.
location = 'SUBCELLULAR LOCATION: Recycling endosome {ECO:0000269|PubMed:22595670}. Early endosome membrane {ECO:0000269|PubMed:16962593}; Lipid-anchor {ECO:0000305}; Cytoplasmic side {ECO:0000305}. Golgi apparatus membrane {ECO:0000269|PubMed:16962593}; Lipid-anchor {ECO:0000305}; Cytoplasmic side {ECO:0000305}. Golgi apparatus, trans-Golgi network membrane {ECO:0000269|PubMed:16962593}; Lipid-anchor {ECO:0000305}; Cytoplasmic side {ECO:0000305}. Cytoplasmic vesicle, phagosome {ECO:0000269|PubMed:21255211}. Note=Recruited to recycling endosomes by DENND6A (PubMed:22595670). Recruited to phagosomes containing S.aureus or M.tuberculosis (PubMed:21255211). {ECO:0000269|PubMed:21255211, ECO:0000269|PubMed:22595670}.'
# Raw-string patterns: '\(' in a normal string literal is an invalid escape
# sequence that newer Python versions warn about.
location = re.sub(r'{ECO.*?}', '', location)
location = re.sub(r'\(PubMed.*?\)', '', location)
location

'SUBCELLULAR LOCATION: Recycling endosome . Early endosome membrane ; Lipid-anchor ; Cytoplasmic side . Golgi apparatus membrane ; Lipid-anchor ; Cytoplasmic side . Golgi apparatus, trans-Golgi network membrane ; Lipid-anchor ; Cytoplasmic side . Cytoplasmic vesicle, phagosome . Note=Recruited to recycling endosomes by DENND6A . Recruited to phagosomes containing S.aureus or M.tuberculosis . .'

In [165]:
def _parse_location_field(raw_location):
    """Split one raw 'SUBCELLULAR LOCATION: ...' annotation into lowercase location terms.

    The text is lowercased, the leading label, isoform prefixes, evidence tags
    and inline citations are removed, and the remainder is split on '; ', '. '
    and ', '. Terms that are empty after trimming are dropped.
    """
    text = raw_location.lower().replace('subcellular location: ', '')
    # Drop isoform prefixes (anything from 'isoform' up to the next ':').
    text = re.sub(r'isoform.*?:', '', text)
    # Everything from the first occurrence of 'note' onwards is discarded.
    # NOTE(review): a location term that itself contains 'note' would also be
    # truncated here - confirm this cannot occur in the input data.
    text = text.split('note')[0]
    # Raw-string patterns avoid the invalid '\(' escape of a plain literal.
    text = re.sub(r'{eco.*?}', '', text)
    text = re.sub(r'\(pubmed.*?\)', '', text)
    # Cut off anything after a remaining ' {' in each ';'-separated chunk.
    terms = [chunk.split(' {')[0] for chunk in text.split('; ')]
    terms = flatten([term.split('. ') for term in terms])
    terms = flatten([term.split(', ') for term in terms])
    # Trim first and filter afterwards, so that fields consisting only of
    # dots/spaces do not end up as empty-string "locations".
    trimmed = (term.strip('.').strip(' ') for term in terms)
    return [term for term in trimmed if term]


# protein -> all parsed location terms from all of its annotation fields,
# in order of appearance (duplicates are kept).
parsed_subcellular_locations = {}
for protein, locations in subcellular_locations.items():
    parsed_locations = []
    for location in locations:
        parsed_locations += _parse_location_field(location)
    parsed_subcellular_locations[protein] = parsed_locations

In [166]:
# Spot-check a random handful of parsed entries.
# random.sample() requires a sequence: a dict view raises TypeError on
# Python 3.11+, so the items are materialised into a list first. The sample
# size is capped so that fewer than 10 entries does not raise ValueError.
parsed_items = list(parsed_subcellular_locations.items())
for s in random.sample(parsed_items, min(10, len(parsed_items))):
    for x in s:
        print(x)
        print('----')

P55039|DRG2_HUMAN
----
['cytoplasm']
----
Q8N782|ZN525_HUMAN
----
['nucleus']
----
Q9BV86|NTM1A_HUMAN
----
['nucleus']
----
Q9BW60|ELOV1_HUMAN
----
['endoplasmic reticulum membrane', 'multi-pass membrane protein']
----
Q86YM7|HOME1_HUMAN
----
['cytoplasm', 'cell junction', 'synapse', 'postsynaptic cell membrane', 'postsynaptic density', 'cell junction', 'synapse', 'cell projection', 'dendritic spine']
----
Q5T5C0|STXB5_HUMAN
----
['cytoplasm', 'cell membrane', 'peripheral membrane protein', 'cytoplasmic vesicle membrane', 'peripheral membrane protein', 'cytoplasmic vesicle', 'secretory vesicle', 'synaptic vesicle', 'cell junction', 'synapse']
----
P12318|FCG2A_HUMAN
----
['cell membrane', 'single-pass type i membrane protein']
----
Q9UFH2|DYH17_HUMAN
----
['cytoplasm', 'cytoskeleton', 'cilium axoneme']
----
Q13105|ZBT17_HUMAN
----
['nucleus']
----
Q8IZP1|TBC3A_HUMAN
----
['cell membrane', 'lipid-anchor']
----


In [184]:
# Tally how often each location term occurs across all proteins
# (repeated terms within one protein are each counted).
all_locations = Counter(
    term
    for terms in parsed_subcellular_locations.values()
    for term in terms
)

In [185]:
# Minimum number of occurrences (exclusive) for a location term to be kept.
MIN_LOCATION_COUNT = 20

# Rebuild a Counter from whatever `all_locations` currently is (the Counter
# itself, or the sorted list of (term, count) pairs from a previous run), so
# that re-running this cell does not fail on the rebinding below.
location_counts = Counter(dict(all_locations))
# (term, count) pairs, most frequent first; ties keep their existing order.
all_locations = location_counts.most_common()
top_locations = {location for location, count in all_locations if count > MIN_LOCATION_COUNT}

In [187]:
# Keep only the location terms that are in `top_locations`; proteins left
# without any term are omitted from the result entirely.
filtered_subcellular_locations = {}
for protein, terms in parsed_subcellular_locations.items():
    kept_terms = [term for term in terms if term in top_locations]
    if kept_terms:
        filtered_subcellular_locations[protein] = kept_terms

In [189]:
# random.sample() requires a sequence (a dict view raises TypeError on
# Python 3.11+), so materialise the items; cap the size for small inputs.
filtered_items = list(filtered_subcellular_locations.items())
random.sample(filtered_items, min(10, len(filtered_items)))

[('Q969Z4|TR19L_HUMAN',
  ['cell membrane',
   'single-pass type i membrane protein',
   'cytoplasm',
   'cytoplasm',
   'perinuclear region']),
 ('P0C0L4|CO4A_HUMAN',
  ['secreted',
   'cell junction',
   'synapse',
   'cell projection',
   'axon',
   'cell projection',
   'dendrite']),
 ('P01024|CO3_HUMAN', ['secreted']),
 ('P00742|FA10_HUMAN', ['secreted']),
 ('Q9Y3B8|ORN_HUMAN',
  ['mitochondrion intermembrane space',
   'mitochondrion matrix',
   'nucleus',
   'cytoplasm']),
 ('P07864|LDHC_HUMAN', ['cytoplasm']),
 ('Q96DZ1|ERLEC_HUMAN', ['endoplasmic reticulum lumen']),
 ('Q9BZM5|ULBP2_HUMAN',
  ['cell membrane',
   'lipid-anchor',
   'gpi-anchor',
   'endoplasmic reticulum',
   'secreted']),
 ('Q8NEA6|GLIS3_HUMAN', ['nucleus']),
 ('P07992|ERCC1_HUMAN',
  ['nucleus', 'cytoplasm', 'nucleus', 'nucleus', 'nucleus'])]

In [190]:
# Number of proteins that still have at least one location term after filtering.
len(filtered_subcellular_locations)

16567

In [191]:
import json

In [195]:
# Write the filtered mapping to disk. The context manager guarantees the file
# handle is flushed and closed, which the bare open() call did not.
with open('data/subcellular_locations.json', 'w') as out_file:
    json.dump(filtered_subcellular_locations, out_file)