# Comparisons of Different BPE Implementations

## BPE From tiktoken

In [1]:
from importlib.metadata import version
# Print the installed tiktoken version so the outputs below can be tied to it.
print("tiktoken version:", version("tiktoken"))

tiktoken version: 0.9.0


In [2]:
import tiktoken

# Load the encoding registered under the name "gpt2" in tiktoken.
tik_tokenizer = tiktoken.get_encoding("gpt2")
# Sample sentence; `text` is reused by later cells in this notebook.
text = "Did the quick brown fox jump over the lazy dogs?"

# allowed_special whitelists <|endoftext|> so it may appear in the input;
# the sample sentence itself does not contain it.
integers = tik_tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print(integers)

[11633, 262, 2068, 7586, 21831, 4391, 625, 262, 16931, 6844, 30]


In [3]:
# Round-trip check: decoding the token IDs should reproduce the input sentence.
strings = tik_tokenizer.decode(integers)
print(strings)

Did the quick brown fox jump over the lazy dogs?


In [4]:
# Vocabulary size reported by the tiktoken "gpt2" encoding.
print(tik_tokenizer.n_vocab)

50257


## Original OpenAI BPE Implementation of GPT-2

In [5]:
from openai_gpt2_bpe import get_encoder, download_vocab

In [6]:
# Download encoder.json and vocab.bpe (see the progress output of this cell).
# NOTE(review): presumably these are the files get_encoder loads, and they are
# written under ./gpt2_model — confirm in openai_gpt2_bpe.
download_vocab()

Fetching encoder.json: 1.04Mit [00:20, 51.5kit/s]                                                   
Fetching vocab.bpe: 457kit [00:04, 93.1kit/s]                                                       


In [7]:
# Build the encoder from the local model directory.
# NOTE(review): assumes the vocabulary files live in ./gpt2_model — confirm.
oai_tokenizer = get_encoder(model_name="gpt2_model", models_dir=".")

In [10]:
# Encode the same sentence with this encoder; the printed IDs are expected to
# match the tiktoken result. Note: this rebinds `integers` from the tiktoken cell.
integers = oai_tokenizer.encode(text)
print(integers)

[11633, 262, 2068, 7586, 21831, 4391, 625, 262, 16931, 6844, 30]


In [11]:
# Round-trip check for this encoder. Note: this rebinds `strings` from the
# tiktoken cell.
strings = oai_tokenizer.decode(integers)
print(strings)

Did the quick brown fox jump over the lazy dogs?


## BPE via HuggingFace Transformers

In [12]:
import transformers
# Show the installed transformers version so the outputs below can be tied to it.
transformers.__version__

  from .autonotebook import tqdm as notebook_tqdm


'4.51.3'

In [13]:
from transformers import GPT2Tokenizer

# Load the pretrained "gpt2" tokenizer through the GPT2Tokenizer class
# (GPT2TokenizerFast is loaded separately in a later cell).
hf_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

In [14]:
# Tokenize the original `text` rather than the decoded `strings`, so this
# tokenizer is fed exactly the same input as the others and the comparison does
# not depend on another implementation's decode round trip.
hf_tokenizer(text)["input_ids"]

[11633, 262, 2068, 7586, 21831, 4391, 625, 262, 16931, 6844, 30]

In [16]:
from transformers import GPT2TokenizerFast

# Load the pretrained "gpt2" tokenizer through the GPT2TokenizerFast class,
# kept in a separate variable from the GPT2Tokenizer instance.
hf_tokenizer_fast = GPT2TokenizerFast.from_pretrained("gpt2")

In [17]:
# Tokenize the original `text` rather than the decoded `strings`, so this
# tokenizer is fed exactly the same input as the others and the comparison does
# not depend on another implementation's decode round trip.
hf_tokenizer_fast(text)["input_ids"]

[11633, 262, 2068, 7586, 21831, 4391, 625, 262, 16931, 6844, 30]

## Sebastian's Local BPE Tokenizer

In [18]:
import os, sys, io, nbformat, types

In [None]:
def import_from_notebook():
    def import_definitions_from_notebook(fullname, names):
        current_dir = os.getcwd()
        path = os.path.join(current_dir, "..", "01b-bpe-from-scratch", fullname + ".ipynb")
        path = os.path.normpath(path)
        
        # Load the NB
        if not os.path.exists(path):
            raise FileNotFoundError(f"Notebook not found at --> {path}")
        
        with io.open(path, "r", encoding="utf-8") as f:
            nb = nbformat.read(f, as_version=4)
            
        # Create module to store the imported funcs and classes
        mod = types.ModuleType(fullname)
        sys.modules[fullname] = mod
            
        