# General Tokenizer Statistics

In [None]:
import json
from tokenizers import Tokenizer
from itertools import product

base_path = "/cta/share/users/mutbpe/tokenizers/"


def generate_filepaths_labels(uniref_id, pret, subs, cut, minlen, maxlen, freq, vocab_sizes):
    """Build vocabulary file paths and display labels for a grid of tokenizer settings.

    Every argument is an iterable. The Cartesian product of all of them is
    enumerated in argument order (the last argument varies fastest), and one
    path/label pair is produced per combination.

    Args:
        uniref_id: Values interpolated right after ``uniref`` in the file name.
        pret: Flags; a truthy flag adds ``pre`` to the file name and label.
        subs: Names. ``"std"`` selects the plain BPE naming scheme, whose files
            are looked up under ``blosum62/``; any other value is used as the
            directory name and is included in the label.
        cut: Values interpolated into the mutbpe file name and label.
        minlen: Values interpolated into the mutbpe file name only.
        maxlen: Values interpolated into the mutbpe file name only.
        freq: Values interpolated into the mutbpe file name and label.
        vocab_sizes: Values used as the trailing size component of both the
            file name and the label.

    Returns:
        tuple[list[str], list[str]]: ``(paths, labels)``, index-aligned.

    Note:
        For ``"std"`` the cut/minlen/maxlen/freq values are not part of the
        path or label, so passing several values for them yields repeated
        identical entries.
    """
    paths = []
    labels = []
    grid = product(uniref_id, pret, subs, cut, minlen, maxlen, freq, vocab_sizes)
    for uid, pre, sub, c, ml, mxl, f, vs in grid:
        # Computed outside the f-strings: reusing the enclosing quote character
        # inside an f-string replacement field only parses on Python >= 3.12.
        file_tag = "pre" if pre else ""
        label_tag = " pre" if pre else ""
        if sub == "std":
            paths.append(base_path + f"blosum62/uniref{uid}{file_tag}_bpe_{vs}.json")
            labels.append(f"BPE{label_tag} {vs}")
        else:
            paths.append(base_path + f"{sub}/uniref{uid}{file_tag}_mutbpe_{c}_{ml}_{mxl}_{f}_{vs}.json")
            labels.append(f"evoBPE{label_tag} {sub} {c} {f} {vs}")
    return paths, labels

# Vocabulary sizes for which a vocabulary and a tokenizer are loaded.
vocab_sizes = [800, 1600, 3200, 6400, 12800, 25600, 51200]
paths, labels = generate_filepaths_labels(
    uniref_id=[50],
    pret=[True, False],
    subs=["std", "pam70", "blosum62"],
    cut=[0.7],
    minlen=[3],
    maxlen=[12],
    freq=[0.05],
    vocab_sizes=vocab_sizes,
)

# Both dictionaries are keyed by the display label.
vocab_dict = {}
tokenizer_dict = {}
for label, path in zip(labels, paths):
    # Parsed JSON content of the vocabulary file.
    with open(path) as vocab_file:
        vocab_dict[label] = json.load(vocab_file)
    # The tokenizer file sits in the same directory as the vocabulary file and
    # carries an "hf_" prefix on the file name.
    slash = path.rfind("/")
    hf_path = f"{path[:slash]}/hf_{path[slash + 1:]}"
    tokenizer_dict[label] = Tokenizer.from_file(hf_path)
list(vocab_dict.keys())
list(tokenizer_dict.keys())

['BPE pre 800',
 'BPE pre 1600',
 'BPE pre 3200',
 'BPE pre 6400',
 'BPE pre 12800',
 'BPE pre 25600',
 'BPE pre 51200',
 'evoBPE pre pam70 0.7 0.05 800',
 'evoBPE pre pam70 0.7 0.05 1600',
 'evoBPE pre pam70 0.7 0.05 3200',
 'evoBPE pre pam70 0.7 0.05 6400',
 'evoBPE pre pam70 0.7 0.05 12800',
 'evoBPE pre pam70 0.7 0.05 25600',
 'evoBPE pre pam70 0.7 0.05 51200',
 'evoBPE pre blosum62 0.7 0.05 800',
 'evoBPE pre blosum62 0.7 0.05 1600',
 'evoBPE pre blosum62 0.7 0.05 3200',
 'evoBPE pre blosum62 0.7 0.05 6400',
 'evoBPE pre blosum62 0.7 0.05 12800',
 'evoBPE pre blosum62 0.7 0.05 25600',
 'evoBPE pre blosum62 0.7 0.05 51200',
 'BPE 800',
 'BPE 1600',
 'BPE 3200',
 'BPE 6400',
 'BPE 12800',
 'BPE 25600',
 'BPE 51200',
 'evoBPE pam70 0.7 0.05 800',
 'evoBPE pam70 0.7 0.05 1600',
 'evoBPE pam70 0.7 0.05 3200',
 'evoBPE pam70 0.7 0.05 6400',
 'evoBPE pam70 0.7 0.05 12800',
 'evoBPE pam70 0.7 0.05 25600',
 'evoBPE pam70 0.7 0.05 51200',
 'evoBPE blosum62 0.7 0.05 800',
 'evoBPE blosum62 0

In [13]:
import sqlite3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from itertools import combinations
import seaborn as sns

db_file = "/cta/share/users/uniprot/human/human.db"
conn = sqlite3.connect(db_file)
try:
    # Sequences from `proteins` whose Entry is listed in `uniref50_distilled`.
    df_uniprot_human_seqs = pd.read_sql(
        "SELECT Sequence FROM proteins "
        "WHERE Entry IN (SELECT uniprot_accession FROM uniref50_distilled)",
        conn,
    )
finally:
    # Close explicitly so the handle is released even when the query raises
    # (sqlite3's `with conn:` only manages transactions, it does not close).
    conn.close()

# The X/B/U/Z count filter below is currently disabled: every sequence is kept.
# filtered_sequences = df_uniprot_human_seqs[
#     (df_uniprot_human_seqs["Sequence"].str.count("X") <= 1) &
#     (df_uniprot_human_seqs["Sequence"].str.count("B") <= 1) &
#     (df_uniprot_human_seqs["Sequence"].str.count("U") <= 1) &
#     (df_uniprot_human_seqs["Sequence"].str.count("Z") <= 1)
# ]["Sequence"].tolist()
filtered_sequences = df_uniprot_human_seqs["Sequence"].tolist()
print(len(filtered_sequences))

70901


In [14]:
db_file = "/cta/share/users/uniprot/human/human.db"
uniref_id = 50
# Rows belonging to a protein whose full sequence is longer than this are dropped.
MAX_PROTEIN_LENGTH = 3000

conn = sqlite3.connect(db_file)
try:
    # `uniref_id` is a notebook constant, so interpolating it into the table
    # names is safe here; do not feed external input into these queries.
    df_uniprot_human_seqs = pd.read_sql(f"SELECT * FROM uniref{uniref_id}_domain_sliced_plddt70", conn)
    df_protein = pd.read_sql(f"""SELECT Entry as uniprot_id, Sequence as sequence
                        FROM proteins
                        WHERE Entry IN (SELECT uniprot_accession FROM uniref{uniref_id}_distilled)""", conn)
finally:
    # Close explicitly so the handle is released even when a query raises
    # (sqlite3's `with conn:` only manages transactions, it does not close).
    conn.close()

# Remove every row whose parent protein exceeds the length limit.
long_protein_ids = df_protein.loc[
    df_protein["sequence"].str.len() > MAX_PROTEIN_LENGTH, "uniprot_id"
]
df_uniprot_human_seqs = df_uniprot_human_seqs[
    ~df_uniprot_human_seqs["uniprot_id"].isin(long_protein_ids)
]

# Keep sequences containing at most one occurrence of each of X, B, U and Z
# (presumably ambiguous / non-standard residue codes -- confirm with the data owner).
sliced_sequences = df_uniprot_human_seqs["sequence"]
few_special_letters = (
    (sliced_sequences.str.count("X") <= 1)
    & (sliced_sequences.str.count("B") <= 1)
    & (sliced_sequences.str.count("U") <= 1)
    & (sliced_sequences.str.count("Z") <= 1)
)
filtered_sequences2 = sliced_sequences[few_special_letters].tolist()
print(len(filtered_sequences2))

236550


In [15]:
# Only tokenizers whose label ends in one of these vocabulary sizes are run.
test_vocabs = ["800", "6400", "25600"]
encoded_dict = {}
for label, tokenizer in tokenizer_dict.items():
    if label.split()[-1] not in test_vocabs:
        continue
    # Labels containing "pre" are encoded on filtered_sequences2, all other
    # labels on filtered_sequences.
    corpus = filtered_sequences2 if "pre" in label else filtered_sequences
    encoded_dict[label] = [encoding.tokens for encoding in tokenizer.encode_batch(corpus)]

list(encoded_dict.keys())

['BPE pre 800',
 'BPE pre 6400',
 'BPE pre 25600',
 'evoBPE pre pam70 0.7 0.05 800',
 'evoBPE pre pam70 0.7 0.05 6400',
 'evoBPE pre pam70 0.7 0.05 25600',
 'evoBPE pre blosum62 0.7 0.05 800',
 'evoBPE pre blosum62 0.7 0.05 6400',
 'evoBPE pre blosum62 0.7 0.05 25600',
 'BPE 800',
 'BPE 6400',
 'BPE 25600',
 'evoBPE pam70 0.7 0.05 800',
 'evoBPE pam70 0.7 0.05 6400',
 'evoBPE pam70 0.7 0.05 25600',
 'evoBPE blosum62 0.7 0.05 800',
 'evoBPE blosum62 0.7 0.05 6400',
 'evoBPE blosum62 0.7 0.05 25600']

In [None]:
import vocabulary_functions as vf
from collections import Counter
from statistics import mean, stdev
# One pipe-separated row per encoded tokenizer. Starred columns are computed
# over the tokens actually produced by the encodings; unstarred columns are
# computed over the vocabulary itself.
print("Method|pretokenized|s_matrix|v_size|p_ratio|p_ratio*|m_ratio|m_ratio*|avg_t_length|avg_t_length*")
for k, cur_encodings in encoded_dict.items():
    # Label layout: "<method> [pre] [<matrix> <cut> <freq>] <vocab size>".
    parts = k.split()
    is_pre = "pre" in k
    is_evo = "evo" in k
    name = parts[0]
    vsize = parts[-1]
    pret = "Yes" if is_pre else "No"
    if is_evo:
        # The matrix name follows the optional "pre" marker.
        matrix = parts[2] if is_pre else parts[1]
    else:
        matrix = "-"

    # Vocabulary-level statistics.
    cur_vocab = vocab_dict[k]
    # For labels without "evo" the whole vocabulary is used as the parent set.
    parents = vf.get_parents(cur_vocab) if is_evo else cur_vocab
    mutateds = vf.get_mutated(cur_vocab)
    vocab_size = len(cur_vocab)
    parent_ratio = round(len(parents) / vocab_size, 3)
    mutated_ratio = round(len(mutateds) / vocab_size, 3)

    token_lengths = [len(token) for token in cur_vocab]
    avg_token_length = round(mean(token_lengths), 3)
    std_token_length = round(stdev(token_lengths), 3)

    # Usage-level statistics: flatten all encodings into one token stream.
    all_tokens = [token for encoding in cur_encodings for token in encoding]
    total_used = len(all_tokens)
    used_token_counts = Counter(all_tokens)
    used_parent_counts = vf.set_intersection(used_token_counts, parents)
    used_mutated_counts = vf.set_intersection(used_token_counts, mutateds)

    used_token_lengths = [len(token) for token in all_tokens]
    avg_used_token_length = round(mean(used_token_lengths), 3)
    std_used_token_length = round(stdev(used_token_lengths), 3)

    used_parent_ratio = round(sum(used_parent_counts.values()) / total_used, 3)
    used_mutated_ratio = round(sum(used_mutated_counts.values()) / total_used, 3)

    columns = [
        name,
        pret,
        matrix.upper(),
        vsize,
        parent_ratio,
        used_parent_ratio,
        mutated_ratio,
        used_mutated_ratio,
        f"{avg_token_length} ± {std_token_length}",
        f"{avg_used_token_length} ± {std_used_token_length}",
    ]
    out_str = "|".join(f"{column}" for column in columns)
    print(out_str)


Method|pretokenized|s_matrix|v_size|p_ratio|p_ratio*|m_ratio|m_ratio*|avg_t_length|avg_t_length*
['BPE', 'pre', '800']|Yes|-|800|1.0|1.0|0.0|0.0|2.513 ± 0.594|1.937 ± 0.562
['BPE', 'pre', '6400']|Yes|-|6400|1.0|1.0|0.0|0.0|3.4 ± 0.711|2.438 ± 0.711
['BPE', 'pre', '25600']|Yes|-|25600|1.0|1.0|0.0|0.0|3.891 ± 1.185|2.815 ± 0.912
['evoBPE', 'pre', 'pam70', '0.7', '0.05', '800']|Yes|PAM70|800|0.081|0.034|0.487|0.077|2.591 ± 0.646|1.891 ± 0.577
['evoBPE', 'pre', 'pam70', '0.7', '0.05', '6400']|Yes|PAM70|6400|0.145|0.117|0.787|0.267|3.347 ± 0.679|2.39 ± 0.679
['evoBPE', 'pre', 'pam70', '0.7', '0.05', '25600']|Yes|PAM70|25600|0.114|0.158|0.866|0.403|3.894 ± 0.952|2.737 ± 0.863
['evoBPE', 'pre', 'blosum62', '0.7', '0.05', '800']|Yes|BLOSUM62|800|0.062|0.029|0.524|0.085|2.6 ± 0.627|1.884 ± 0.585
['evoBPE', 'pre', 'blosum62', '0.7', '0.05', '6400']|Yes|BLOSUM62|6400|0.103|0.098|0.836|0.293|3.334 ± 0.654|2.386 ± 0.678
['evoBPE', 'pre', 'blosum62', '0.7', '0.05', '25600']|Yes|BLOSUM62|25600|0.074|