In [1]:
from datasets import load_dataset
from tokenizers import Tokenizer, models, trainers, pre_tokenizers, decoders, processors
from tests import *

In [2]:
def load_data():
    """
    Fetch the raw WikiText-2 corpus and return the training text.

    Calls `load_dataset("wikitext", name="wikitext-2-raw-v1")` and returns
    the "text" column of its "train" split.
    """
    splits = load_dataset("wikitext", name="wikitext-2-raw-v1")
    train_split = splits["train"]
    return train_split["text"]

In [3]:
def initialize_bpe_tokenizer():
    """
    Initialize a Byte Pair Encoding (BPE) tokenizer with a Whitespace pre-tokenizer.

    The BPE model is created with "[UNK]" as its unknown token so that
    characters never seen during training are encoded as "[UNK]" instead of
    being dropped from the output. The token must therefore be part of the
    special tokens handed to the trainer (it is in this file's default set).

    Returns:
        An untrained `Tokenizer` wrapping a BPE model.
    """
    # Without unk_token the model has nothing to emit for out-of-vocabulary
    # characters, so they would silently vanish from the encoding.
    tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
    tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
    return tokenizer

In [4]:
def train_bpe_tokenizer(tokenizer, texts, vocab_size=30000, min_frequency=2, special_tokens=None):
    """
    Fit the given BPE tokenizer on `texts` and hand the same object back.

    `texts` must support `len()` and slicing; it is fed to the trainer in
    consecutive slices of 1000 items. When `special_tokens` is None the
    set [PAD], [UNK], [CLS], [SEP], [MASK] is used.
    """
    default_specials = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
    specials = default_specials if special_tokens is None else special_tokens

    bpe_trainer = trainers.BpeTrainer(
        special_tokens=specials,
        min_frequency=min_frequency,
        vocab_size=vocab_size,
    )

    # Stream the corpus in fixed-size slices rather than one item at a time.
    chunk_size = 1000
    chunks = (
        texts[start : start + chunk_size]
        for start in range(0, len(texts), chunk_size)
    )

    tokenizer.train_from_iterator(chunks, bpe_trainer)
    return tokenizer

In [5]:
def configure_post_processing(tokenizer):
    """
    Configure the post-processing and decoding rules for the tokenizer.

    Installs a `TemplateProcessing` post-processor that wraps a single
    sequence as ``[CLS] A [SEP]`` and a pair as ``[CLS] A [SEP] B [SEP]``
    (the ``:1`` markers in the pair template apply to the second segment),
    then sets a `BPEDecoder` as the decoder. The tokenizer is modified in
    place and nothing is returned.

    Args:
        tokenizer: A trained tokenizer whose vocabulary contains "[CLS]"
            and "[SEP]".

    Raises:
        ValueError: If "[CLS]" or "[SEP]" has no id in the tokenizer's
            vocabulary (for example when training used a custom
            special-token list that omitted it).
    """
    # Resolve the ids up front: token_to_id yields None for a missing token,
    # and passing None into the template gives a far less helpful error.
    template_tokens = []
    for token in ("[CLS]", "[SEP]"):
        token_id = tokenizer.token_to_id(token)
        if token_id is None:
            raise ValueError(
                f"Special token {token} is not in the tokenizer vocabulary; "
                "include it in the special tokens used for training."
            )
        template_tokens.append((token, token_id))

    tokenizer.post_processor = processors.TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=template_tokens,
    )
    # NOTE(review): BPEDecoder works off an end-of-word suffix; the model in
    # this file does not appear to be trained with one, so decoded text may
    # lose word boundaries — confirm against the tokenizers decoder docs.
    tokenizer.decoder = decoders.BPEDecoder()

In [6]:
def save_tokenizer(tokenizer, filepath):
    """
    Save the tokenizer to the specified filepath.

    Args:
        tokenizer: Object exposing a `save(path)` method.
        filepath: Destination path, given either as a `str` or as any
            `os.PathLike` object (e.g. `pathlib.Path`).
    """
    import os

    # Normalise path-like objects to a plain path first, so callers can pass
    # pathlib.Path as well as str; a str input is passed through unchanged.
    tokenizer.save(os.fspath(filepath))

In [7]:
def test_tokenizer(tokenizer, text):
    """
    Test the tokenizer on a sample text and return the tokens and IDs.
    """
    output = tokenizer.encode(text)
    return output.tokens, output.ids

In [9]:
# End-to-end driver: load corpus -> build -> train -> post-process -> save -> smoke test.
# Load the training text (the "text" column of the WikiText-2 raw train split).
texts = load_data()


# Initialize an untrained BPE tokenizer with a Whitespace pre-tokenizer.
tokenizer = initialize_bpe_tokenizer()

# Train the tokenizer with the defaults (vocab_size=30000, min_frequency=2,
# default special tokens); the same tokenizer object is returned.
tokenizer = train_bpe_tokenizer(tokenizer, texts)

# Configure post-processing and decoding. This must run after training
# because it looks up the ids of [CLS] and [SEP] in the tokenizer.
configure_post_processing(tokenizer)

# Save the tokenizer to a JSON file in the current working directory.
save_tokenizer(tokenizer, "bpe_tokenizer-wikitext2.json")

# Encode a sample sentence and print the resulting tokens and ids.
test_text = "Natural Language Processing is fascinating."
tokens, ids = test_tokenizer(tokenizer, test_text)
print(f"Tokens: {tokens}")
print(f"IDs: {ids}")

# Compare against the expected tokens/ids; presumably this checker comes
# from `from tests import *` — verify.
# NOTE(review): the recorded run of this cell ends in an AssertionError from
# this call, and the recorded tokens spell "fac" + "inating", which does not
# match the current test_text — the expectation may target different input.
test_tokenizer_func(tokens, ids)





Tokens: ['[CLS]', 'Natural', 'Language', 'Pro', 'cess', 'ing', 'is', 'fac', 'inating', '.', '[SEP]']
IDs: [2, 9634, 19539, 2101, 1379, 1035, 1034, 2126, 17091, 18, 3]


AssertionError: Tokens generated and expected tokens do not match