Import dependencies

In [3]:
import pandas as pd
import json
from sklearn.model_selection import train_test_split

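Load the sequence data, drop sequences containing 'U', and split into train/test sets (the training sequences form the BPE corpus)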

In [4]:
# Load the UniRef export; only its 'results' list is used below.
# JSON is UTF-8 by specification, so do not depend on the platform's default encoding.
with open('../../RSRC/uniref_taxonomy_id_9606_AND_identity_2024_09_13.json', encoding='utf-8') as f:
    human_proteins_json = json.load(f)['results']

# One row per entry: its id and the sequence of its representative member.
# Built in a single step so `human_proteins_df` is never a plain list, and with
# explicit columns so an empty 'results' list still yields a well-formed frame.
human_proteins_df = pd.DataFrame(
    [
        {'id': prot['id'], 'sequence': prot['representativeMember']['sequence']['value']}
        for prot in human_proteins_json
    ],
    columns=['id', 'sequence'],
)

# Drop every sequence containing the letter 'U' (presumably selenocysteine — confirm).
# regex=False makes this an explicit literal substring test.
human_proteins_df = human_proteins_df[~human_proteins_df['sequence'].str.contains('U', regex=False)]

# Fixed random_state keeps the 80/20 split reproducible across runs.
df_ds_train, df_ds_test = train_test_split(human_proteins_df, test_size=0.2, random_state=42)

# Only the training sequences are handed to the BPE routines below.
corpus = df_ds_train['sequence']

Extract the initial symbols (the sorted set of single-letter residues that occur in the training corpus)

In [3]:
# Collect the distinct single-letter symbols of the training corpus, sorted.
# A set comprehension replaces the per-letter scan of a list with O(1) membership;
# the resulting sorted list is the same.
alphabet = sorted({letter for seq in corpus for letter in seq})
print(alphabet)

['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'X', 'Y']


Import functions for BPE

In [1]:
import bpe_functions

Convert every protein sequence into a SymList (a doubly linked list containing tokens as nodes)

In [5]:
# Convert the training corpus into SymList objects via the project helper.
# The result is indexed positionally below (sequences[0]).
sequences = bpe_functions.corpus_to_symlist_list(corpus)

In [6]:
# print() uses the object's str(); the recorded output shows the first SymList
# rendering as a plain residue string.
print(sequences[0])

MEVLRRSSVFAAEIMDAFDRCGDAADGLMSSSVWSAQTLASAPTGWWLHSAASAAS


Generate the min heap data structure for bookkeeping of all symbol pairs found in the data.

In [None]:
# Build the pair-count heap from all SymLists.
merge_heap = bpe_functions.sequences_to_heap(sequences)
# Show the first ten slots of the heap's backing array as strings.
# The recorded output is in heap order, not fully sorted by count.
list(map(str, merge_heap.heap[:10]))

['Pair: (L, L), Count: 153309',
 'Pair: (S, S), Count: 135477',
 'Pair: (L, S), Count: 111026',
 'Pair: (L, A), Count: 93549',
 'Pair: (S, L), Count: 115163',
 'Pair: (A, L), Count: 99568',
 'Pair: (E, E), Count: 107112',
 'Pair: (S, G), Count: 82440',
 'Pair: (G, G), Count: 73809',
 'Pair: (A, A), Count: 98339']

Example usage of `generate_mutations`:

In [6]:
from Bio.Align import substitution_matrices

# Load two standard substitution matrices by name.
blosum62 = substitution_matrices.load("BLOSUM62")
# NOTE(review): pam250 is not used in the visible cells — confirm whether it is still needed.
pam250 = substitution_matrices.load("PAM250")
# Variants of 'HMVL' under BLOSUM62 with a cutoff of 0.8. In the recorded output each
# variant is paired with a score, the unmodified string scores 1.0, and no score is below 0.8.
bpe_functions.generate_mutations('HMVL', blosum62, 0.8)

[('HMVL', 1.0),
 ('HMIL', 0.9523809523809523),
 ('HMVI', 0.9047619047619048),
 ('HMVM', 0.9047619047619048),
 ('HMVV', 0.8571428571428572),
 ('HMII', 0.8571428571428572),
 ('HMIM', 0.8571428571428572),
 ('HMLL', 0.8571428571428572),
 ('HMML', 0.8571428571428572),
 ('HLVL', 0.8571428571428572),
 ('HMVF', 0.8095238095238095),
 ('HMIV', 0.8095238095238095),
 ('HMAL', 0.8095238095238095),
 ('HMTL', 0.8095238095238095),
 ('HLIL', 0.8095238095238095),
 ('HIVL', 0.8095238095238095),
 ('HVVL', 0.8095238095238095)]

### Default BPE examples:

Add words until a desired vocabulary size is reached

In [7]:
# Default tokenizer, stopping on a target vocabulary size of 1000.
options = dict(
    corpus=corpus,
    stop_type="vocab_size",
    stop_parameter=1000,
)

new_vocabulary = bpe_functions.train_bpe(**options)
new_vocabulary

['A',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'K',
 'L',
 'M',
 'N',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'V',
 'W',
 'X',
 'Y',
 'LL',
 'SS',
 'EE',
 'AA',
 'SL',
 'PP',
 'AL',
 'VL',
 'GL',
 'SP',
 'GG',
 'EL',
 'RL',
 'TL',
 'EK',
 'SG',
 'RR',
 'DL',
 'KK',
 'QL',
 'PL',
 'SA',
 'PG',
 'IL',
 'SV',
 'ST',
 'EA',
 'SR',
 'FL',
 'KL',
 'ED',
 'PA',
 'SQ',
 'EV',
 'EG',
 'NL',
 'TV',
 'AV',
 'SD',
 'SK',
 'TG',
 'ER',
 'PV',
 'AG',
 'HL',
 'PR',
 'QQ',
 'SI',
 'TT',
 'SF',
 'EN',
 'RG',
 'VV',
 'EI',
 'DG',
 'EQ',
 'RK',
 'YL',
 'TA',
 'PQ',
 'DV',
 'ET',
 'KA',
 'SN',
 'KV',
 'CL',
 'KG',
 'RA',
 'RV',
 'QA',
 'QG',
 'PT',
 'DI',
 'SH',
 'QV',
 'PD',
 'KI',
 'ML',
 'FG',
 'CG',
 'NG',
 'KD',
 'SY',
 'KT',
 'FV',
 'RI',
 'SC',
 'SE',
 'NI',
 'RD',
 'RT',
 'NV',
 'PI',
 'FI',
 'TI',
 'MA',
 'RQ',
 'DA',
 'PE',
 'PK',
 'HG',
 'VG',
 'YG',
 'VA',
 'PF',
 'KQ',
 'DD',
 'NA',
 'FA',
 'WL',
 'FT',
 'PY',
 'MG',
 'KN',
 'QI',
 'PH',
 'QD',
 'RN',
 'II',
 'KE',
 'TD',
 'PN',
 'SW',
 'SM',
 'FQ',

Add words by cutoff frequency threshold

In [8]:
# Default tokenizer, stopping on a frequency cutoff of 10000.
# The exact stopping semantics are defined in bpe_functions.train_bpe.
options = dict(
    corpus=corpus,
    stop_type="freq_cutoff",
    stop_parameter=10000,
)

new_vocabulary = bpe_functions.train_bpe(**options)
new_vocabulary

['A',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'K',
 'L',
 'M',
 'N',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'V',
 'W',
 'X',
 'Y',
 'LL',
 'SS',
 'EE',
 'AA',
 'SL',
 'PP',
 'AL',
 'VL',
 'GL',
 'SP',
 'GG',
 'EL',
 'RL',
 'TL',
 'EK',
 'SG',
 'RR',
 'DL',
 'KK',
 'QL',
 'PL',
 'SA',
 'PG',
 'IL',
 'SV',
 'ST',
 'EA',
 'SR',
 'FL',
 'KL',
 'ED',
 'PA',
 'SQ',
 'EV',
 'EG',
 'NL',
 'TV',
 'AV',
 'SD',
 'SK',
 'TG',
 'ER',
 'PV',
 'AG',
 'HL',
 'PR',
 'QQ',
 'SI',
 'TT',
 'SF',
 'EN',
 'RG',
 'VV',
 'EI',
 'DG',
 'EQ',
 'RK',
 'YL',
 'TA',
 'PQ',
 'DV',
 'ET',
 'KA',
 'SN',
 'KV',
 'CL',
 'KG',
 'RA',
 'RV',
 'QA',
 'QG',
 'PT',
 'DI',
 'SH',
 'QV',
 'PD',
 'KI',
 'ML',
 'FG',
 'CG',
 'NG',
 'KD',
 'SY',
 'KT',
 'FV',
 'RI',
 'SC',
 'SE',
 'NI',
 'RD',
 'RT',
 'NV',
 'PI',
 'FI',
 'TI',
 'MA',
 'RQ',
 'DA',
 'PE',
 'PK',
 'HG',
 'VG',
 'YG',
 'VA',
 'PF',
 'KQ',
 'DD',
 'NA',
 'FA',
 'WL',
 'FT',
 'PY',
 'MG',
 'KN',
 'QI',
 'PH',
 'QD',
 'RN',
 'II',
 'KE',
 'TD',
 'PN',
 'SW',
 'SM',
 'FQ',

Add words by cutoff frequency proportion

In [9]:
# Default tokenizer, stopping on a frequency proportion of 0.01.
# The exact stopping semantics are defined in bpe_functions.train_bpe.
options = dict(
    corpus=corpus,
    stop_type="freq_proportion",
    stop_parameter=0.01,
)

new_vocabulary = bpe_functions.train_bpe(**options)
new_vocabulary

['A',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'K',
 'L',
 'M',
 'N',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'V',
 'W',
 'X',
 'Y',
 'LL',
 'SS',
 'EE',
 'AA',
 'SL',
 'PP',
 'AL',
 'VL',
 'GL',
 'SP',
 'GG',
 'EL',
 'RL',
 'TL',
 'EK',
 'SG',
 'RR',
 'DL',
 'KK',
 'QL',
 'PL',
 'SA',
 'PG',
 'IL',
 'SV',
 'ST',
 'EA',
 'SR',
 'FL',
 'KL',
 'ED',
 'PA',
 'SQ',
 'EV',
 'EG',
 'NL',
 'TV',
 'AV',
 'SD',
 'SK',
 'TG',
 'ER',
 'PV',
 'AG',
 'HL',
 'PR',
 'QQ',
 'SI',
 'TT',
 'SF',
 'EN',
 'RG',
 'VV',
 'EI',
 'DG',
 'EQ',
 'RK',
 'YL',
 'TA',
 'PQ',
 'DV',
 'ET',
 'KA',
 'SN',
 'KV',
 'CL',
 'KG',
 'RA',
 'RV',
 'QA',
 'QG',
 'PT',
 'DI',
 'SH',
 'QV',
 'PD',
 'KI',
 'ML',
 'FG',
 'CG',
 'NG',
 'KD',
 'SY',
 'KT',
 'FV',
 'RI',
 'SC',
 'SE',
 'NI',
 'RD',
 'RT',
 'NV',
 'PI',
 'FI',
 'TI',
 'MA',
 'RQ',
 'DA',
 'PE',
 'PK',
 'HG',
 'VG',
 'YG',
 'VA',
 'PF',
 'KQ',
 'DD',
 'NA',
 'FA',
 'WL',
 'FT',
 'PY',
 'MG',
 'KN',
 'QI',
 'PH',
 'QD',
 'RN',
 'II',
 'KE',
 'TD',
 'PN',
 'SW',
 'SM',
 'FQ',

### Mutated BPE examples:

Add words until a desired vocabulary size is reached

In [7]:
from Bio.Align import substitution_matrices

# "mutated" tokenizer with the vocab_size stop rule: BLOSUM62 is passed as the
# substitution matrix and 0.8 as the mutation cutoff.
options = dict(
    corpus=corpus,
    tokenizer_type="mutated",
    subs_matrix=substitution_matrices.load("BLOSUM62"),
    mutation_cutoff=0.8,
    stop_type="vocab_size",
    stop_parameter=1000,
)

new_vocabulary = bpe_functions.train_bpe(**options)
new_vocabulary

['A',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'K',
 'L',
 'M',
 'N',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'V',
 'W',
 'X',
 'Y',
 'LL',
 'SS',
 'EE',
 'AA',
 'SL',
 'PP',
 'AL',
 'VL',
 'IL',
 'GL',
 'SP',
 'GG',
 'EL',
 'RL',
 'TL',
 'EK',
 'SG',
 'RR',
 'DL',
 'KK',
 'QL',
 'PL',
 'PI',
 'PM',
 'SA',
 'PG',
 'SV',
 'SI',
 'ST',
 'EA',
 'SR',
 'FL',
 'KL',
 'ED',
 'PA',
 'SQ',
 'EV',
 'EI',
 'EG',
 'NL',
 'TV',
 'TI',
 'AV',
 'AI',
 'SD',
 'SK',
 'TG',
 'ER',
 'PV',
 'AG',
 'HL',
 'HI',
 'HM',
 'PR',
 'QQ',
 'SF',
 'EN',
 'RG',
 'TT',
 'VV',
 'VI',
 'IV',
 'DG',
 'EQ',
 'RK',
 'YL',
 'YI',
 'YM',
 'PQ',
 'TA',
 'SN',
 'CL',
 'CI',
 'CM',
 'KG',
 'DV',
 'DI',
 'ET',
 'KA',
 'KV',
 'KI',
 'RA',
 'RV',
 'RI',
 'QG',
 'QA',
 'PT',
 'PD',
 'SH',
 'QV',
 'QI',
 'FG',
 'KD',
 'CG',
 'NG',
 'SE',
 'ML',
 'RD',
 'SY',
 'FV',
 'FI',
 'KT',
 'SC',
 'NI',
 'NV',
 'RT',
 'PE',
 'PK',
 'QD',
 'AT',
 'QK',
 'HG',
 'QR',
 'YG',
 'PF',
 'DD',
 'VG',
 'IG',
 'SM',
 'FT',
 'WL',
 'WI',
 'WM',
 'QT',
 'KN',

In [8]:
# The recorded result (1004) exceeds the requested stop_parameter of 1000.
# NOTE(review): presumably the mutated tokenizer adds several variants per merge step — confirm in bpe_functions.
len(new_vocabulary)

1004