# Comparisons of Different BPE Implementations

## BPE From tiktoken

In [1]:
from importlib.metadata import version
print("tiktoken version:", version("tiktoken"))

tiktoken version: 0.9.0


In [2]:
import tiktoken

# Load tiktoken's built-in "gpt2" encoding.
tik_tokenizer = tiktoken.get_encoding("gpt2")
# Short sample sentence; the other tokenizers in this notebook encode this
# same `text` so that the resulting token ids can be compared.
text = "Did the quick brown fox jump over the lazy dogs?"

# allowed_special whitelists "<|endoftext|>" so that (per tiktoken's API) the
# marker may appear in the input and be encoded as a special token. The sample
# sentence contains no such marker, so the setting has no effect on this output.
integers = tik_tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print(integers)

[11633, 262, 2068, 7586, 21831, 4391, 625, 262, 16931, 6844, 30]


In [3]:
strings = tik_tokenizer.decode(integers)
print(strings)

Did the quick brown fox jump over the lazy dogs?


In [4]:
print(tik_tokenizer.n_vocab)

50257


## Original OpenAI BPE Implementation of GPT-2

In [5]:
from openai_gpt2_bpe import get_encoder, download_vocab

In [6]:
download_vocab()

Fetching encoder.json: 1.04Mit [00:11, 92.7kit/s]                                                   
Fetching vocab.bpe: 457kit [00:05, 83.3kit/s]                                                       


In [7]:
oai_tokenizer = get_encoder(model_name="gpt2_model", models_dir=".")

In [8]:
integers = oai_tokenizer.encode(text)
print(integers)

[11633, 262, 2068, 7586, 21831, 4391, 625, 262, 16931, 6844, 30]


In [9]:
strings = oai_tokenizer.decode(integers)
print(strings)

Did the quick brown fox jump over the lazy dogs?


## BPE via HuggingFace Transformers

In [10]:
import transformers
transformers.__version__

  from .autonotebook import tqdm as notebook_tqdm


'4.51.3'

In [11]:
from transformers import GPT2Tokenizer

hf_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

In [12]:
hf_tokenizer(strings)["input_ids"]

[11633, 262, 2068, 7586, 21831, 4391, 625, 262, 16931, 6844, 30]

In [13]:
from transformers import GPT2TokenizerFast

hf_tokenizer_fast = GPT2TokenizerFast.from_pretrained("gpt2")

In [14]:
hf_tokenizer_fast(strings)["input_ids"]

[11633, 262, 2068, 7586, 21831, 4391, 625, 262, 16931, 6844, 30]

## Sebastian's Local BPE Tokenizer

In [15]:
import os, sys, io, nbformat, types
from functools import lru_cache

In [18]:
def import_from_notebook(fullname="01b-bpe-from-scratch", names=("BPETokenizerLocal",)):
    """Load selected definitions from a notebook into a freshly created module.

    The notebook is looked up at ``../supplementary/<fullname>.ipynb`` relative
    to the current working directory. Every code cell whose source contains
    ``def <name>`` or ``class <name>`` for at least one requested name is
    executed exactly once, in notebook order, inside a new module. The module
    is registered in ``sys.modules`` under ``fullname`` and returned.

    Args:
        fullname: Notebook file name without the ``.ipynb`` extension; it is
            also used as the name of the created module.
        names: Names of the functions/classes to import. A single string is
            accepted as shorthand for a one-element list.

    Returns:
        types.ModuleType: The module whose namespace holds everything defined
        by the executed cells. A requested name that no cell defines is simply
        absent from the module.

    Raises:
        FileNotFoundError: If the notebook file does not exist.

    Warning:
        Matching cells are run with ``exec``, i.e. arbitrary code from the
        notebook is executed. Only use this with notebooks you trust.
    """
    if isinstance(names, str):
        names = [names]

    # Plain substring needles, e.g. "def foo" / "class foo". Note that a name
    # which is a prefix of another definition's name will match that cell too.
    markers = [f"{keyword} {name}" for name in names for keyword in ("def", "class")]

    path = os.path.normpath(
        os.path.join(os.getcwd(), "..", "supplementary", fullname + ".ipynb")
    )

    # Load the notebook
    if not os.path.exists(path):
        raise FileNotFoundError(f"Notebook not found at --> {path}")

    with open(path, "r", encoding="utf-8") as f:
        nb = nbformat.read(f, as_version=4)

    # Create the module that stores the imported functions and classes
    mod = types.ModuleType(fullname)
    sys.modules[fullname] = mod

    # Seed the module namespace with imports made available to the executed cells
    for statement in (
        "from collections import Counter, deque",
        "from functools import lru_cache",
        "import json",
    ):
        exec(statement, mod.__dict__)

    # Execute each code cell that defines at least one requested name. Using
    # any() guarantees a cell runs once even if it defines several of them.
    for cell in nb.cells:
        if cell.cell_type != "code":
            continue
        cell_code = cell.source
        if any(marker in cell_code for marker in markers):
            exec(cell_code, mod.__dict__)

    return mod
            
        

In [19]:
# Pull the BPETokenizerLocal class out of the module built from the notebook.
import_module = import_from_notebook()
BPETokenizerLocal = getattr(import_module, "BPETokenizerLocal", None)
if BPETokenizerLocal is None:
    # Fail here with a clear message instead of the opaque
    # "'NoneType' object is not callable" that the instantiation below would raise.
    raise ImportError(
        "BPETokenizerLocal was not found in the imported notebook module "
        f"{import_module.__name__!r}"
    )

# Load the vocabulary and merge rules from the gpt2_model directory
# (presumably the same encoder.json / vocab.bpe files used by the OpenAI
# encoder earlier in this notebook).
tokenizer_gpt2 = BPETokenizerLocal()
tokenizer_gpt2.load_vocab_and_merges_from_openai(
    vocab_path=os.path.join("gpt2_model", "encoder.json"),
    bpe_merges_path=os.path.join("gpt2_model", "vocab.bpe"),
)

In [20]:
integers = tokenizer_gpt2.encode(text)

print(integers)

[11633, 262, 2068, 7586, 21831, 4391, 625, 262, 16931, 6844, 30]


## Performance Benchmarks

In [25]:
# NOTE: this rebinds `text` (previously the short sample sentence) to the full
# contents of the file, so the %timeit cells below measure encoding of the
# whole document rather than the one-line example.
with open("../data/the-law-bastiat.txt", "r", encoding="utf-8") as f:
    text = f.read()

In [26]:
# Using the original OpenAI GPT-2 tokenizer
%timeit oai_tokenizer.encode(text)

16.9 ms ± 56.5 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [27]:
# Using the tiktoken Tokenizer
%timeit tik_tokenizer.encode(text)

3.32 ms ± 30.9 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [28]:
# Using the HuggingFace GPT-2 tokenizer
%timeit hf_tokenizer(text)["input_ids"]

44.2 ms ± 148 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [31]:
# HuggingFace Tokenizer with max length and truncation
%timeit hf_tokenizer(text, max_length=5145, truncation=True)["input_ids"]

44.7 ms ± 420 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [33]:
# HuggingFace Tokenizer - Fast version
%timeit hf_tokenizer_fast(text)["input_ids"]

Token indices sequence length is longer than the specified maximum sequence length for this model (21939 > 1024). Running this sequence through the model will result in indexing errors


17.8 ms ± 225 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [34]:
# HuggingFace - Fast Tokenizer with max length and truncation
%timeit hf_tokenizer_fast(text, max_length=5145, truncation=True)["input_ids"]

18.5 ms ± 116 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
