<a href="https://colab.research.google.com/github/boun-tabi-lifelu/PUMA/blob/main/Prog/examples/training_example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# !wget https://raw.githubusercontent.com/boun-tabi-lifelu/evolutionary-subword-tokenization/refs/heads/main/Prog/bpe_functions.py
# !wget https://raw.githubusercontent.com/boun-tabi-lifelu/evolutionary-subword-tokenization/refs/heads/main/Prog/vocabulary_functions.py
# !wget https://raw.githubusercontent.com/boun-tabi-lifelu/evolutionary-subword-tokenization/refs/heads/main/Prog/examples/sample.fasta
# !wget https://raw.githubusercontent.com/boun-tabi-lifelu/evolutionary-subword-tokenization/refs/heads/main/Prog/helper_classes.py
!pip install biopython
!pip install tokenizers

Collecting biopython
  Downloading biopython-1.86-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (13 kB)
Downloading biopython-1.86-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (3.2 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.3/3.2 MB[0m [31m9.4 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━[0m [32m2.6/3.2 MB[0m [31m37.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m31.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.86


In [None]:
def read_fasta_sequences(path):
    """Return the sequences stored in a FASTA file, in file order.

    A line starting with ">" opens a new record; every following non-blank
    line up to the next header is part of that record's sequence. Wrapped
    (multi-line) sequences are therefore joined back into a single string.
    Blank lines are skipped, and any text before the first header is ignored.

    Args:
        path: Path of the FASTA file to read.

    Returns:
        A list with one sequence string per record (headers are discarded).
    """
    records = []
    chunks = None  # stays None until the first ">" header has been seen
    with open(path, "r") as fasta_file:
        for raw_line in fasta_file:
            line = raw_line.strip()
            if not line:
                continue
            if line.startswith(">"):
                # A new header closes the record that was being collected.
                if chunks is not None:
                    records.append("".join(chunks))
                chunks = []
            elif chunks is not None:
                chunks.append(line)
    # The last record has no header after it, so it is flushed here.
    if chunks is not None:
        records.append("".join(chunks))
    return records


sequences = read_fasta_sequences("sample.fasta")
if not sequences:
    # Fail with a clear message instead of an IndexError on sequences[0].
    raise ValueError("No sequences found in sample.fasta")
print(f"Loaded {len(sequences)} sequences.")
print(f"First sequence: {sequences[0]}")

Loaded 1000 sequences.
First sequence: MELSAEYLREKLQRDLEAEHVLPSPGGVGQVRGETAASETQLGS


In [None]:
import bpe_functions, vocabulary_functions
import json

In [None]:
def training(args):
    """Train one tokenizer vocabulary and write it out in two JSON formats.

    Args:
        args: A 3-tuple ``(options, save_filepath, hf_filepath)``.
            ``options`` is a dict passed as keyword arguments to
            ``bpe_functions.train_bpe``; ``save_filepath`` is where the
            returned vocabulary is dumped as JSON; ``hf_filepath`` is the
            output path handed to
            ``vocabulary_functions.vocab_json_to_HF_json``.

    Returns:
        None. The results are the two files written to disk; progress
        messages are printed along the way.
    """
    train_kwargs, save_filepath, hf_filepath = args

    # Step 1: learn the vocabulary.
    print(f"Training started for {save_filepath}")
    learned_vocab = bpe_functions.train_bpe(**train_kwargs)
    print(f"Training finished for {save_filepath}")

    # Step 2: persist the vocabulary as indented JSON.
    with open(save_filepath, "w") as vocab_file:
        json.dump(learned_vocab, vocab_file, indent=2)

    # Step 3: convert the saved JSON file into the Hugging Face layout.
    print(f"Generating hugging face format for {save_filepath}")
    vocabulary_functions.vocab_json_to_HF_json(save_filepath, hf_filepath)

BPE training example

In [None]:

# Train on every sequence loaded from the FASTA file.
corpus = sequences

# Starting symbols: one single-letter amino-acid code per entry.
alphabet = list("ARNDCEQGHILKMFPSTWYVUOXBZJ")

# Stop criterion value; presumably the target vocabulary size -- confirm
# against bpe_functions.train_bpe.
vocab_size = 800

# Keyword arguments forwarded to bpe_functions.train_bpe by training().
argument = dict(
    corpus=corpus,
    alphabet=alphabet,
    tokenizer_type="default",
    stop_type="vocab_size",
    stop_parameter=vocab_size,
)

# Output files: the raw vocabulary and its Hugging Face conversion.
save_path = "bpe_model.json"
hugging_face_save_path = "hf_bpe_model.json"

training((argument, save_path, hugging_face_save_path))


Training started for bpe_model.json
Training finished for bpe_model.json
Generating hugging face format for bpe_model.json


PUMA training example

In [None]:
from Bio.Align import substitution_matrices

# Train on every sequence loaded from the FASTA file.
corpus = sequences

# Starting symbols: one single-letter amino-acid code per entry.
alphabet = list("ARNDCEQGHILKMFPSTWYVUOXBZJ")

# Stop criterion value; presumably the target vocabulary size -- confirm
# against bpe_functions.train_bpe.
vocab_size = 800

# Keyword arguments forwarded to bpe_functions.train_bpe by training().
# Compared with the plain BPE run, this selects the "mutated" tokenizer type
# and supplies a BLOSUM62 substitution matrix plus the mutation_* settings.
# The exact meaning of each mutation_* value is defined by train_bpe and is
# not visible here -- see bpe_functions for details.
argument = dict(
    corpus=corpus,
    alphabet=alphabet,
    subs_matrix=substitution_matrices.load("BLOSUM62"),
    tokenizer_type="mutated",
    mutation_cutoff=0.7,
    min_mutation_len=3,
    max_mutation_len=12,
    min_mutation_freq=0.05,
    stop_type="vocab_size",
    stop_parameter=vocab_size,
)

# Output files: the raw vocabulary and its Hugging Face conversion.
save_path = "puma_model.json"
hugging_face_save_path = "hf_puma_model.json"

training((argument, save_path, hugging_face_save_path))


Training started for puma_model.json
Training finished for puma_model.json
Generating hugging face format for puma_model.json


Usage example



In [None]:
from tokenizers import Tokenizer

In [None]:
# Segment the first loaded sequence with both trained tokenizers so the two
# segmentations can be compared side by side.
current_sequence = sequences[0]

# Load the Hugging Face-format files written by the two training cells.
bpe_tokenizer = Tokenizer.from_file("hf_bpe_model.json")
puma_tokenizer = Tokenizer.from_file("hf_puma_model.json")

# `.tokens` is the list of token strings produced by the encoding.
bpe_encoding = bpe_tokenizer.encode(current_sequence)
bpe_segmented = bpe_encoding.tokens
puma_encoding = puma_tokenizer.encode(current_sequence)
puma_segmented = puma_encoding.tokens

# One item per line: the raw sequence, then BPE tokens, then PUMA tokens.
print(current_sequence, bpe_segmented, puma_segmented, sep="\n")

MELSAEYLREKLQRDLEAEHVLPSPGGVGQVRGETAASETQLGS
['MEL', 'SA', 'EYL', 'R', 'EKL', 'QR', 'DL', 'EA', 'EH', 'VLP', 'SP', 'GG', 'VG', 'QV', 'RG', 'ET', 'AA', 'SET', 'QLG', 'S']
['M', 'EL', 'SA', 'E', 'YL', 'R', 'EKL', 'QR', 'DL', 'EA', 'EH', 'VL', 'PSP', 'GG', 'VG', 'QV', 'RG', 'ET', 'AA', 'S', 'ET', 'QLG', 'S']
