In [2]:
import pandas as pd
import numpy as np
import sqlite3
from tqdm import tqdm
import pickle
from pandarallel import pandarallel
from time import time
from tokenizers import Tokenizer
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from scipy.stats import linregress
from goatools.obo_parser import GODag
from goatools.mapslim import mapslim
import requests
import os
import re
from vocabulary_functions import get_mutated, get_parents, set_difference, set_intersection, load_tokenizers, calc_agreement, calc_dice_idx_only

In [3]:
pandarallel.initialize(nb_workers=20, progress_bar=True)

INFO: Pandarallel will run on 20 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [4]:
# Download GO DAG (ontology structure)
obo_url = "http://purl.obolibrary.org/obo/go/go-basic.obo"
obo_path = "go-basic.obo"

# Download OBO file if not already downloaded
if not os.path.exists(obo_path):
    with open(obo_path, "w") as f:
        f.write(requests.get(obo_url).text)

# Parse GO DAG
go_dag = GODag(obo_path)

# Download GO Slim generic terms
goslim_url = "http://current.geneontology.org/ontology/subsets/goslim_generic.obo"
goslim_path = "goslim_generic.obo"

if not os.path.exists(goslim_path):
    with open(goslim_path, "w") as f:
        f.write(requests.get(goslim_url).text)

# Parse GO Slim DAG
goslim_dag = GODag(goslim_path)
goslim_terms = set(goslim_dag.keys())

go-basic.obo: fmt(1.2) rel(2025-03-16) 43,544 Terms
goslim_generic.obo: fmt(1.2) rel(go/2025-03-16/subsets/goslim_generic.owl) 206 Terms


In [5]:
# 'dataset': {'uniref50', 'uniref90'}
# 'is_pretokenizer': {True, False}
# 'subs_matrix': {'blosum45', 'blosum62', 'pam70', 'pam250'}
# 'mutation_cutoff': {0.7, 0.8, 0.9}
# 'min_mutation_freq': {0, 0.05,. 0.005}
# 'min_mutation_len': {3}
# 'max_mutation_len': {12}
# 'vocab_size': list=[800, 1600, 3200, 6400, 12800, 25600, 51200]

vocab_sizes = [800, 1600, 3200, 6400, 12800, 25600]
vocab_sizes = [1600, 6400, 25600]
uniref_id = "50"

tokenizer_opts_list = [
    {
        'is_mut': True,
        'dataset': f'uniref{uniref_id}',
        'is_pretokenizer': False,
        'subs_matrix': 'blosum62',
        'mutation_cutoff': 0.7,
        'min_mutation_freq': 0.05,
        'min_mutation_len': 3,
        'max_mutation_len': 12,
        'vocab_size': vocab_sizes
    },
    # {
    #     'is_mut': True,
    #     'dataset': f'uniref{uniref_id}',
    #     'is_pretokenizer': False,
    #     'subs_matrix': 'pam70',
    #     'mutation_cutoff': 0.7,
    #     'min_mutation_freq': 0.05,
    #     'min_mutation_len': 3,
    #     'max_mutation_len': 12,
    #     'vocab_size': vocab_sizes
    # },
    # {
    #     'is_mut': True,
    #     'dataset': f'uniref{uniref_id}',
    #     'is_pretokenizer': True,
    #     'subs_matrix': 'blosum62',
    #     'mutation_cutoff': 0.7,
    #     'min_mutation_freq': 0.05,
    #     'min_mutation_len': 3,
    #     'max_mutation_len': 12,
    #     'vocab_size': vocab_sizes
    # },
    {
        'is_mut': True,
        'dataset': f'uniref{uniref_id}',
        'is_pretokenizer': True,
        'subs_matrix': 'pam70',
        'mutation_cutoff': 0.7,
        'min_mutation_freq': 0.05,
        'min_mutation_len': 3,
        'max_mutation_len': 12,
        'vocab_size': vocab_sizes
    },
]

In [6]:
tokenizer_list = load_tokenizers(tokenizer_opts_list, 'hf')
inner_vocab_list = load_tokenizers(tokenizer_opts_list, 'vocab')

vocab_list = {}
for name, tokenizer in tokenizer_list.items():
    vocab_list[name] = list(set([token for token, idx in tokenizer.get_vocab().items()]))

In [7]:
methods = [method_name[:-len(str(vocab_sizes[0]))-1] for method_name in list(tokenizer_list.keys())[::len(vocab_sizes)]]
methods2names = {mn:mn.replace('mut', 'evo').replace('std', '').replace('blosum', 'BLOSUM').replace('pam', 'PAM').replace('pre', 'Pre') for mn in methods}
methods2names = {k: ' '.join(v.split()[:-2]) if 'evoBPE' in v else v for k, v in methods2names.items()}
methods2names

{'mutBPE blosum62 0.7 0.05': 'evoBPE BLOSUM62',
 'mutBPE pre pam70 0.7 0.05': 'evoBPE Pre PAM70'}

In [8]:
vocab_lineage_list = {}
for k, v in inner_vocab_list.items():
    template_dict = {
        
    }
    vocab_lineage_list[k] = {token:{
                                'frequency': -1,
                                'order': -1,
                                'parent_pair': [],
                                'parent_mutation': "",
                                'parent_mutation_similarity': -1,
                                'partner_pair_self': False,
                                'partner_pair_left': [],
                                'partner_pair_right': [],
                                'child_pair': [],
                                'child_mutation': []
                            } for token in v.keys()}

for method_name, vocab in tqdm(inner_vocab_list.items()):
    for token, inner_vocab_elements in vocab.items():
        vocab_lineage_list[method_name][token]['frequency'] = inner_vocab_elements['frequency']
        vocab_lineage_list[method_name][token]['order'] = inner_vocab_elements['order']
        vocab_lineage_list[method_name][token]['parent_pair'] = inner_vocab_elements['pair'] if 'pair' in inner_vocab_elements else []
        vocab_lineage_list[method_name][token]['parent_mutation'] = inner_vocab_elements['parent'] if 'parent' in inner_vocab_elements else ""
        vocab_lineage_list[method_name][token]['parent_mutation_similarity'] = inner_vocab_elements['similarity'] if 'similarity' in inner_vocab_elements else -1

        if 'pair' in inner_vocab_elements:
            if inner_vocab_elements['pair'][0] == inner_vocab_elements['pair'][1]:
                vocab_lineage_list[method_name][inner_vocab_elements['pair'][0]]['partner_pair_self'] = True
                vocab_lineage_list[method_name][inner_vocab_elements['pair'][0]]['child_pair'].append(token)
            else:
                vocab_lineage_list[method_name][inner_vocab_elements['pair'][0]]['partner_pair_right'].append(inner_vocab_elements['pair'][1])
                vocab_lineage_list[method_name][inner_vocab_elements['pair'][1]]['partner_pair_left'].append(inner_vocab_elements['pair'][0])
                vocab_lineage_list[method_name][inner_vocab_elements['pair'][0]]['child_pair'].append(token)
                vocab_lineage_list[method_name][inner_vocab_elements['pair'][1]]['child_pair'].append(token)
        if 'parent' in inner_vocab_elements:
                vocab_lineage_list[method_name][inner_vocab_elements['parent']]['child_mutation'].append(token)

100%|██████████| 6/6 [00:00<00:00, 58.94it/s]


In [9]:
method_name = 'mutBPE blosum62 0.7 0.05 1600'

In [10]:
vocab_lineage_list[method_name]

{'A': {'frequency': 0,
  'order': 0,
  'parent_pair': [],
  'parent_mutation': '',
  'parent_mutation_similarity': -1,
  'partner_pair_self': True,
  'partner_pair_left': ['S',
   'E',
   'P',
   'T',
   'K',
   'R',
   'Q',
   'M',
   'D',
   'F',
   'V',
   'N',
   'I',
   'G',
   'H',
   'C',
   'Y',
   'W',
   'SS',
   'AA',
   'SD',
   'SQ',
   'PC',
   'PG',
   'PV',
   'PP',
   'SP',
   'SAA',
   'SL',
   'SI',
   'SV',
   'AL',
   'NL',
   'TL',
   'ED',
   'GL',
   'EK',
   'DL',
   'LL',
   'IL',
   'ML',
   'VL',
   'PD',
   'PQ',
   'EE',
   'EQ',
   'EG',
   'AG',
   'RD',
   'RK',
   'EL',
   'EI',
   'EV',
   'QL',
   'AV'],
  'partner_pair_right': ['L',
   'V',
   'G',
   'Q',
   'T',
   'SL',
   'LL',
   'D',
   'I',
   'R',
   'P',
   'PP',
   'GL',
   'SG',
   'EE',
   'PL',
   'RL',
   'F',
   'K',
   'PG',
   'EL',
   'FL',
   'H',
   'C',
   'Y',
   'RR',
   'M',
   'EA',
   'HL',
   'KK',
   'N',
   'CG',
   'GG',
   'TG',
   'W',
   'ET',
   'CL',
   'CI',
   'E

In [14]:
# Connect to DB
db_file = "/cta/share/users/uniprot/human/human.db"
conn = sqlite3.connect(db_file)

df_protein = pd.read_sql(f"""SELECT Entry as uniprot_id, Sequence as sequence, "Gene Ontology (GO)" as go_all,
                          "Gene Ontology (biological process)" as go_bp, "Gene Ontology (cellular component)" as go_cc, "Gene Ontology (molecular function)" as go_mf
                          FROM proteins
                          WHERE Entry IN (SELECT uniprot_accession FROM uniref{uniref_id}_distilled)""", conn)
df_protein = df_protein[df_protein['sequence'].str.len() < 3000].reset_index(drop=True)

df_protein_sliced = pd.read_sql(f"SELECT uniprot_id, sequence FROM uniref{uniref_id}_domain_sliced_plddt70", conn)
df_protein_sliced = df_protein_sliced[df_protein_sliced['uniprot_id'].isin(df_protein['uniprot_id'])].reset_index(drop=True)

conn.close()

In [15]:
df_protein = df_protein.fillna('')
df_protein['go_ids'] = df_protein['go_all'] + df_protein['go_bp'] + df_protein['go_cc'] + df_protein['go_mf']
df_protein['go_ids'] = df_protein['go_ids'].apply(lambda x: list(set(re.findall(r'GO:\d+', (x)))))
df_protein

Unnamed: 0,uniprot_id,sequence,go_all,go_bp,go_cc,go_mf,go_ids
0,A0A087WZT3,MELSAEYLREKLQRDLEAEHVLPSPGGVGQVRGETAASETQLGS,,,,,[]
1,A0A087X1C5,MGLEALVPLAMIVAIFLLLVDLMHRHQRWAARYPPGPLPLPGLGNL...,cytoplasm [GO:0005737]; intracellular membrane...,arachidonate metabolic process [GO:0019369]; x...,cytoplasm [GO:0005737]; intracellular membrane...,aromatase activity [GO:0070330]; heme binding ...,"[GO:0005506, GO:0020037, GO:0016020, GO:000573..."
2,A0A087X296,MSRSLLLWFLLFLLLLPPLPVLLADPGAPTPVNPCCYYPCQHQGIC...,endoplasmic reticulum membrane [GO:0005789]; G...,prostaglandin biosynthetic process [GO:0001516...,endoplasmic reticulum membrane [GO:0005789]; G...,dioxygenase activity [GO:0051213]; heme bindin...,"[GO:0046872, GO:0020037, GO:0001516, GO:000466..."
3,A0A0B4J2F0,MFRRLTFAQLLFATVLGIAGGVYIFQPVFEQYAKDQKELKEKMQLV...,mitochondrial outer membrane [GO:0005741]; mit...,regulation of endoplasmic reticulum unfolded p...,mitochondrial outer membrane [GO:0005741]; mit...,,"[GO:0005739, GO:1900101, GO:0005741, GO:0006986]"
4,A0A0C5B5G6,MRWQEMGYIFYPRKLR,extracellular space [GO:0005615]; mitochondrio...,activation of protein kinase activity [GO:0032...,extracellular space [GO:0005615]; mitochondrio...,DNA binding [GO:0003677]; DNA-binding transcri...,"[GO:0032147, GO:0003677, GO:0001649, GO:004361..."
...,...,...,...,...,...,...,...
70687,X6RL83,MLQEWLAAVGDDYAAVVWRPEGEPRFYPDEEGPKHWTKERHQFLME...,,,,,[]
70688,X6RLN4,EVKGLFKSENCPKVISCEFAHNSNWYITFQSDTDAQQAFKYLREEV...,cytosol [GO:0005829],,cytosol [GO:0005829],,[GO:0005829]
70689,X6RLR1,MAGLTDLQRLQARVEELERWVYGPGGARGSRKVADGLVKVQVALGN...,cytosol [GO:0005829]; dynactin complex [GO:000...,cytoskeleton-dependent cytokinesis [GO:0061640],cytosol [GO:0005829]; dynactin complex [GO:000...,,"[GO:0005829, GO:0061640, GO:0005869, GO:0005730]"
70690,X6RLV5,MSGYSSDRDRGRDRGFGAPRFGGSRAGPLSGKKFGNPGEKLVKKKW...,,,,,[]


In [None]:
# Map full GO terms to GO slim terms
def map_to_goslim(go_terms):
    slim_mapping = {}
    slim_list = []
    for go_id in go_terms:
        if go_id in go_dag:
            slim_terms = mapslim(go_id, go_dag, goslim_dag)
            slim_mapping[go_id] = list(slim_terms[0])
            slim_list.extend(list(slim_terms[0]))
    return list(set(slim_list))#, slim_mapping

# map_to_goslim(['GO:0005764', 'GO:0016020']) --> ['GO:0005764']

df_protein['go_slim_ids'] = df_protein['go_ids'].apply(map_to_goslim)
df_protein = df_protein.drop(columns=['go_all', 'go_bp', 'go_cc', 'go_mf', 'go_ids'])
df_protein = df_protein[df_protein['go_slim_ids'].str.len() > 0].reset_index(drop=True)
df_protein

Unnamed: 0,uniprot_id,sequence,go_slim_ids
0,A0A087X1C5,MGLEALVPLAMIVAIFLLLVDLMHRHQRWAARYPPGPLPLPGLGNL...,"[GO:0016491, GO:0005739, GO:0043226, GO:0006629]"
1,A0A087X296,MSRSLLLWFLLFLLLLPPLPVLLADPGAPTPVNPCCYYPCQHQGIC...,"[GO:0043226, GO:0016209, GO:0006629, GO:000579..."
2,A0A0B4J2F0,MFRRLTFAQLLFATVLGIAGGVYIFQPVFEQYAKDQKELKEKMQLV...,[GO:0005739]
3,A0A0C5B5G6,MRWQEMGYIFYPRKLR,"[GO:0003677, GO:0005739, GO:0005615, GO:000635..."
4,A0A0K2S4Q6,MTQRAGAAMLPSALLLLCVPGCLTVSGPSTVMGAVGESLSVQCRYE...,"[GO:0005886, GO:0005576, GO:0002376, GO:0048870]"
...,...,...,...
43520,X6RJD6,MSRSLLLWFLLFLLLLPPLPVLLADPGAPTPAGLWTWLRNSLRPSP...,"[GO:0016491, GO:0016209]"
43521,X6RK96,TSVNYLDSAFRNIRNLGIVSVTSTDISSLYAKAQHVARRHYGCNIV...,"[GO:0140098, GO:0016740, GO:0006399, GO:0003723]"
43522,X6RL45,MVRCYVEIVEKLPERRPDPATIEGCAQLKPNNYLLAWHTPFNEKGS...,[GO:0016787]
43523,X6RLN4,EVKGLFKSENCPKVISCEFAHNSNWYITFQSDTDAQQAFKYLREEV...,[GO:0005829]


In [None]:
df_protein['go_slim_names'] = df_protein['go_slim_ids'].apply(lambda lst: [go_dag[go_id].name for go_id in lst])
df_protein['go_slim_types'] = df_protein['go_slim_ids'].apply(lambda lst: [go_dag[go_id].namespace for go_id in lst])
df_protein

Unnamed: 0,uniprot_id,sequence,go_slim_ids,go_slim_names,go_slim_types
0,A0A087X1C5,MGLEALVPLAMIVAIFLLLVDLMHRHQRWAARYPPGPLPLPGLGNL...,"[GO:0016491, GO:0005739, GO:0043226, GO:0006629]","[oxidoreductase activity, mitochondrion, organ...","[molecular_function, cellular_component, cellu..."
1,A0A087X296,MSRSLLLWFLLFLLLLPPLPVLLADPGAPTPVNPCCYYPCQHQGIC...,"[GO:0043226, GO:0016209, GO:0006629, GO:000579...","[organelle, antioxidant activity, lipid metabo...","[cellular_component, molecular_function, biolo..."
2,A0A0B4J2F0,MFRRLTFAQLLFATVLGIAGGVYIFQPVFEQYAKDQKELKEKMQLV...,[GO:0005739],[mitochondrion],[cellular_component]
3,A0A0C5B5G6,MRWQEMGYIFYPRKLR,"[GO:0003677, GO:0005739, GO:0005615, GO:000635...","[DNA binding, mitochondrion, extracellular spa...","[molecular_function, cellular_component, cellu..."
4,A0A0K2S4Q6,MTQRAGAAMLPSALLLLCVPGCLTVSGPSTVMGAVGESLSVQCRYE...,"[GO:0005886, GO:0005576, GO:0002376, GO:0048870]","[plasma membrane, extracellular region, immune...","[cellular_component, cellular_component, biolo..."
...,...,...,...,...,...
43520,X6RJD6,MSRSLLLWFLLFLLLLPPLPVLLADPGAPTPAGLWTWLRNSLRPSP...,"[GO:0016491, GO:0016209]","[oxidoreductase activity, antioxidant activity]","[molecular_function, molecular_function]"
43521,X6RK96,TSVNYLDSAFRNIRNLGIVSVTSTDISSLYAKAQHVARRHYGCNIV...,"[GO:0140098, GO:0016740, GO:0006399, GO:0003723]","[catalytic activity, acting on RNA, transferas...","[molecular_function, molecular_function, biolo..."
43522,X6RL45,MVRCYVEIVEKLPERRPDPATIEGCAQLKPNNYLLAWHTPFNEKGS...,[GO:0016787],[hydrolase activity],[molecular_function]
43523,X6RLN4,EVKGLFKSENCPKVISCEFAHNSNWYITFQSDTDAQQAFKYLREEV...,[GO:0005829],[cytosol],[cellular_component]


In [25]:
df_protein_sliced = df_protein_sliced[df_protein_sliced['uniprot_id'].isin(df_protein['uniprot_id'])].reset_index(drop=True)
df_protein_sliced

Unnamed: 0,uniprot_id,sequence
0,A8CLL2,VMDNPLVMHQLRCNGVLEGIRICRKGFPNRILYGDFRQ
1,Q68DN1,MELTPGAQQQGINYQELTSGWQDVKSMMLVPEPTRKFPSGPLLTSV...
2,Q156A1,MQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQ...
3,A0A410SEU7,TELTFNYNLECLGNGKTVCK
4,A0A994J774,MATAPYNYSYIFKYIIIGDMGVGKSCLLHQFTEKK
...,...,...
181075,A0A0A0N0L1,M
181076,A0A0A0N0L1,AEKQKHDGRVKIGHYVLGDTLGVGTFGKVKNTR
181077,Q16330,M
181078,Q16330,LNSPTICQSYVGQAIEPTPKKFSQCYIIH


In [33]:
all_gos = [go for gos in df_protein['go_slim_types'] for go in gos]
len(Counter(all_gos)), len(all_gos), Counter(all_gos)

(3,
 151058,
 Counter({'cellular_component': 52485,
          'molecular_function': 51394,
          'biological_process': 47179}))

In [32]:
all_gos = [go for gos in df_protein['go_slim_names'] for go in gos]
len(Counter(all_gos)), len(all_gos), Counter(all_gos)

(132,
 151058,
 Counter({'nucleus': 9177,
          'plasma membrane': 7137,
          'cytosol': 6595,
          'transferase activity': 5938,
          'catalytic activity, acting on a protein': 5616,
          'nucleoplasm': 5291,
          'hydrolase activity': 5147,
          'organelle': 4776,
          'regulation of DNA-templated transcription': 4393,
          'DNA binding': 4354,
          'anatomical structure development': 3420,
          'extracellular region': 3228,
          'transporter activity': 2866,
          'molecular function regulator activity': 2836,
          'RNA binding': 2821,
          'immune system process': 2691,
          'transcription regulator activity': 2687,
          'molecular transducer activity': 2315,
          'mitochondrion': 2314,
          'cell differentiation': 2303,
          'lipid metabolic process': 2062,
          'oxidoreductase activity': 1916,
          'cytoskeletal protein binding': 1904,
          'vesicle-mediated transport'

In [34]:
for name, tokenizer in tqdm(list(tokenizer_list.items())):
    if 'pre' in name:
        df_protein_sliced[name] = [enc.tokens for enc in tokenizer.encode_batch(df_protein_sliced['sequence'])]
    else:
        df_protein[name] = [enc.tokens for enc in tokenizer.encode_batch(df_protein['sequence'])]

  0%|          | 0/6 [00:00<?, ?it/s]

100%|██████████| 6/6 [00:09<00:00,  1.64s/it]


In [None]:
df_protein_sliced = df_protein_sliced.groupby('uniprot_id').sum().reset_index()
df_protein = df_protein.set_index(['uniprot_id', 'sequence']).join(df_protein_sliced.set_index(['uniprot_id', 'sequence'])).reset_index()
df_protein

Unnamed: 0,uniprot_id,sequence,go_slim_ids,go_slim_names,go_slim_types,mutBPE blosum62 0.7 0.05 1600,mutBPE blosum62 0.7 0.05 6400,mutBPE blosum62 0.7 0.05 25600,mutBPE pre pam70 0.7 0.05 1600,mutBPE pre pam70 0.7 0.05 6400,mutBPE pre pam70 0.7 0.05 25600
0,A0A087X1C5,MGLEALVPLAMIVAIFLLLVDLMHRHQRWAARYPPGPLPLPGLGNL...,"[GO:0016491, GO:0005739, GO:0043226, GO:0006629]","[oxidoreductase activity, mitochondrion, organ...","[molecular_function, cellular_component, cellu...","[M, GL, EAL, V, PL, AMI, VA, IF, LLLV, DLM, H,...","[MGL, EALV, PL, AMI, VA, IF, LLLV, DLM, HRH, Q...","[MGL, EALV, PL, AMI, VA, IF, LLLV, DLM, HRH, Q...","[MGL, EAL, V, PL, AMI, VA, IF, LLLV, DL, MH, R...","[MGL, EALV, PL, AMI, VA, IF, LLLV, DL, MH, RH,...","[MGL, EALV, PL, AMI, VA, IF, LLLV, DL, MH, RH,..."
1,A0A087X296,MSRSLLLWFLLFLLLLPPLPVLLADPGAPTPVNPCCYYPCQHQGIC...,"[GO:0043226, GO:0016209, GO:0006629, GO:000579...","[organelle, antioxidant activity, lipid metabo...","[cellular_component, molecular_function, biolo...","[M, SR, S, LLL, W, FLL, FLL, LL, PPL, PV, LL, ...","[M, SR, SLLL, W, FLL, FLL, LL, PPL, PVLL, AD, ...","[MSR, SLLL, WFLL, FLLLL, PPL, PVLL, AD, PGA, P...","[M, SR, SLLL, W, FLL, FLL, LL, PPL, PV, LL, AD...","[MSR, SLLL, W, FLL, FLL, LL, PPL, PVLL, AD, PG...","[MSR, SLLL, WFLL, FLL, LL, PPL, PVLL, AD, PGA,..."
2,A0A0B4J2F0,MFRRLTFAQLLFATVLGIAGGVYIFQPVFEQYAKDQKELKEKMQLV...,[GO:0005739],[mitochondrion],[cellular_component],"[M, FR, RL, T, FA, QLL, FA, T, VLG, IA, GGV, Y...","[M, FR, RL, TFA, QLL, FA, T, VLG, IA, GGV, YI,...","[M, FRRL, TFA, QLL, FAT, VLG, IA, GGV, YI, FQ,...","[M, FR, RL, T, FA, QLL, FA, TVL, G, IA, GGV, Y...","[M, FR, RL, TFA, QLL, FA, TVL, GIA, GGV, YI, F...","[M, FRRL, TFA, QLL, FA, TVL, GIA, GGV, YI, FQ,..."
3,A0A0C5B5G6,MRWQEMGYIFYPRKLR,"[GO:0003677, GO:0005739, GO:0005615, GO:000635...","[DNA binding, mitochondrion, extracellular spa...","[molecular_function, cellular_component, cellu...","[M, RW, QE, MG, YI, FY, PR, KL, R]","[M, RW, QE, MG, YI, FY, PR, KL, R]","[MRW, QEMG, YI, FY, PRKL, R]","[M, RW, QE, MG, YI, FY, PR, KL, R]","[M, RW, QE, MG, YI, FY, PR, KL, R]","[M, RW, QE, MG, YI, FY, PRKL, R]"
4,A0A0K2S4Q6,MTQRAGAAMLPSALLLLCVPGCLTVSGPSTVMGAVGESLSVQCRYE...,"[GO:0005886, GO:0005576, GO:0002376, GO:0048870]","[plasma membrane, extracellular region, immune...","[cellular_component, cellular_component, biolo...","[M, TQ, R, AGAA, ML, PSA, LLLL, CV, PG, CL, TV...","[M, TQR, AGAA, ML, PSA, LLLL, CV, PG, CL, TV, ...","[M, TQR, AGAA, ML, PSA, LLLL, CV, PGCL, TVSG, ...","[M, TQ, R, AGAA, ML, PSA, LLLL, CV, PG, CL, TV...","[M, TQR, AGAA, ML, PSA, LLLL, CV, PG, CL, TV, ...","[M, TQR, AGAA, ML, PSA, LLLL, CVPG, CL, TVSG, ..."
...,...,...,...,...,...,...,...,...,...,...,...
43520,X6RJD6,MSRSLLLWFLLFLLLLPPLPVLLADPGAPTPAGLWTWLRNSLRPSP...,"[GO:0016491, GO:0016209]","[oxidoreductase activity, antioxidant activity]","[molecular_function, molecular_function]","[M, SR, S, LLL, W, FLL, FLL, LL, PPL, PV, LL, ...","[M, SR, SLLL, W, FLL, FLL, LL, PPL, PVLL, AD, ...","[MSR, SLLL, WFLL, FLLLL, PPL, PVLL, AD, PGA, P...","[M, SR, SLLL, W, FLL, FLL, LL, PPL, PV, LL, AD...","[MSR, SLLL, W, FLL, FLL, LL, PPL, PVLL, AD, PG...","[MSR, SLLL, WFLL, FLL, LL, PPL, PVLL, AD, PGA,..."
43521,X6RK96,TSVNYLDSAFRNIRNLGIVSVTSTDISSLYAKAQHVARRHYGCNIV...,"[GO:0140098, GO:0016740, GO:0006399, GO:0003723]","[catalytic activity, acting on RNA, transferas...","[molecular_function, molecular_function, biolo...","[TSV, N, YL, D, SA, FR, NI, R, NLG, IV, SV, TS...","[TSV, NYL, DSA, FR, NI, R, NLG, IV, SV, TST, D...","[TSV, NYL, DSA, FRNI, R, NLG, IVSV, TST, DI, S...","[TSV, N, YL, D, SA, FR, NI, R, NLG, IV, SV, TS...","[TSV, NYL, DSA, FR, NI, R, NLG, IV, SV, TST, D...","[TSV, NYL, DSA, FRNI, R, NLG, IV, SV, TST, DI,..."
43522,X6RL45,MVRCYVEIVEKLPERRPDPATIEGCAQLKPNNYLLAWHTPFNEKGS...,[GO:0016787],[hydrolase activity],[molecular_function],"[MV, RC, YV, EIV, EKL, PE, RR, PD, PA, TI, EG,...","[MV, RC, YV, EIV, EKL, PE, RR, PD, PA, TI, EG,...","[MV, RC, YV, EIV, EKL, PERR, PD, PATI, EG, CA,...","[MV, RC, YV, EIV, EKL, PE, RR, PD, PA, TI, EG,...","[MV, RC, YV, EIV, EKL, PE, RR, PD, PA, TI, EG,...","[MV, RC, YV, EIV, EKL, PERR, PDPA, TIEG, CA, Q..."
43523,X6RLN4,EVKGLFKSENCPKVISCEFAHNSNWYITFQSDTDAQQAFKYLREEV...,[GO:0005829],[cytosol],[cellular_component],"[EV, KGL, FK, SEN, C, P, KVI, SC, E, FA, HN, S...","[EV, KGL, FK, SEN, C, P, KVI, SC, EFA, HN, SN,...","[EV, KGL, FK, SEN, C, PKVI, SC, EFA, HN, SN, W...","[EV, KGL, FK, SEN, C, P, KVI, SC, E, FA, HN, S...","[EV, KGL, FK, SEN, C, P, KVI, SC, EFA, HN, SNW...","[EV, KGL, FK, SEN, CP, KVI, SC, EFA, HN, SNW, ..."
