# Tokenizers CPP

In this notebook I want to compare how long common tokenizers take to tokenize certain texts against my own implementation!

In [1]:
import tokenizers_cpp
from min_bpe.basic import BasicTokenizer
import time
from datasets import load_dataset
import random
from pathlib import Path

  from .autonotebook import tqdm as notebook_tqdm


# Train BPE Tokenizer

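I use MS MARCO to train my own BPE tokenizer. Because I will be using different text for inference, I want to mix the training, validation and test splits of MS MARCO to get as much training data as possible. (Note: the loading loop below currently iterates over the `test` split only; add `train` and `validation` to its list to use all of the data.)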

## Load Training Data

In [None]:
# Load the "v2.1" configuration of MS MARCO through Hugging Face `datasets`.
# The result is indexed by split name further down (e.g. msmarco["test"]).
msmarco = load_dataset(path="ms_marco", name="v2.1")

In [None]:
# Flatten the selected MS MARCO splits into one list of raw text strings.
text_data = []

for phase in ("test",):
    dataset = msmarco[phase]

    for entry in dataset:
        # Per entry the order is: every answer, every passage text, then the query.
        text_data.extend(entry["answers"])
        text_data.extend(entry["passages"]["passage_text"])
        text_data.append(entry["query"])

In [None]:
# Shuffle with a dedicated, seeded generator so the training file comes out the
# same on every run and the global `random` state is left untouched.
SHUFFLE_SEED = 42
random.Random(SHUFFLE_SEED).shuffle(text_data)

# Write each text followed by a newline. The encoding is pinned to UTF-8 so the
# output does not depend on the platform's locale default (which can fail on
# non-ASCII characters).
with open("training_text.txt", "w", encoding="utf-8") as f:
    f.writelines(f"{line}\n" for line in text_data)

## Train BPETokenizer

In [2]:
# Read the whole training corpus into memory as a single string. Decode it
# explicitly as UTF-8 instead of relying on the platform's locale default.
training_text = Path("training_text.txt").read_text(encoding="utf-8")

In [4]:
# Target vocabulary size, used for both tokenizers so the comparison is like-for-like.
# NOTE(review): 50257 matches GPT-2's vocabulary size -- presumably intentional; confirm.
vocab_size = 50_257

# The tokenizers_cpp tokenizer receives the vocabulary size at construction time ...
tok = tokenizers_cpp.BPETokenizer(vocab_size)

# ... while the min_bpe tokenizer receives it later, as an argument to train().
basic_tok = BasicTokenizer()

In [5]:
# Time one training run of the tokenizers_cpp BPE tokenizer.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
s = time.perf_counter()
tok.train(training_text)
e = time.perf_counter()

In [None]:
# Report the elapsed training time in seconds (e - s from the timing cell).
# `s` and `e` are reused by the other timing cell, so run this directly after
# the tokenizers_cpp training cell to report the right measurement.
print(f"Tokenizers CPP took {e - s:.3f} seconds!")

In [None]:
# Time one training run of the min_bpe BasicTokenizer on the same text and
# with the same vocabulary size.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
s = time.perf_counter()
basic_tok.train(training_text, vocab_size)
e = time.perf_counter()

In [None]:
# Report the elapsed training time in seconds (e - s from the timing cell).
# `s` and `e` are reused by the other timing cell, so run this directly after
# the BasicTokenizer training cell to report the right measurement.
print(f"Basic Tokenizer took {e - s:.3f} seconds!")