# LLM tokenizer

A tokenizer can be seen as a mapping of words to numbers. Well, not exactly words, but sometimes words, sometimes fragments of words.

In [92]:
!pip install regex tiktoken

Collecting tiktoken
  Downloading tiktoken-0.8.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Collecting requests>=2.26.0 (from tiktoken)
  Using cached requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting charset-normalizer<4,>=2 (from requests>=2.26.0->tiktoken)
  Using cached charset_normalizer-3.4.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (34 kB)
Collecting idna<4,>=2.5 (from requests>=2.26.0->tiktoken)
  Using cached idna-3.10-py3-none-any.whl.metadata (10 kB)
Collecting urllib3<3,>=1.21.1 (from requests>=2.26.0->tiktoken)
  Using cached urllib3-2.2.3-py3-none-any.whl.metadata (6.5 kB)
Collecting certifi>=2017.4.17 (from requests>=2.26.0->tiktoken)
  Using cached certifi-2024.12.14-py3-none-any.whl.metadata (2.3 kB)
Downloading tiktoken-0.8.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m8.4 MB/s[

In [4]:
import urllib.request
from pathlib import Path

if not Path("tokenizer.json").exists():
    with open("HF_TOKEN") as f:
        HF_TOKEN = f.read()


    display("Downloading llama 3.2 tokenizer")

    opener = urllib.request.build_opener()
    opener.addheaders = [("Authorization", f"Bearer {HF_TOKEN}")]
    urllib.request.install_opener(opener)

    tokenizer_json_path, headers = urllib.request.urlretrieve('https://huggingface.co/meta-llama/Llama-3.2-1B/raw/main/tokenizer.json', "tokenizer.json")
else:
    tokenizer_json_path = "tokenizer.json"

Load the tokenizer data from the JSON file.

The split regex will be used to split the text into tokens

the vocabulary maps the token strings to their corresponding ids

In [5]:
import json
with open(tokenizer_json_path) as f:
    tokenizer_data = json.load(f)
split = next(filter(lambda t: t["type"] == "Split", tokenizer_data["pre_tokenizer"]["pretokenizers"]))
split_regex = split["pattern"]["Regex"]
vocab = {k.replace("Ġ", " ").encode("utf-8"): v for k, v in tokenizer_data["model"]["vocab"].items()}

display(f"Splitting regex: {split_regex}")
display(f"Vocabulary length: {len(vocab)}")

"Splitting regex: (?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"

'Vocabulary length: 128000'

Let's try to split a text with this regex

In [6]:
vocab_inv = {v: k for k, v in vocab.items()}

In [7]:
import regex

tok_regex = regex.compile(split_regex)

In [8]:
text = "I'm pretty much fucked. That's my considered opinion. Fucked. Six days into what should be one of the greatest two months of my life, and it's turned into a nightmare."
tokens = [t.strip() for t in tok_regex.findall(text)]
tokens

['I',
 "'m",
 'pretty',
 'much',
 'fucked',
 '.',
 'That',
 "'s",
 'my',
 'considered',
 'opinion',
 '.',
 'Fucked',
 '.',
 'Six',
 'days',
 'into',
 'what',
 'should',
 'be',
 'one',
 'of',
 'the',
 'greatest',
 'two',
 'months',
 'of',
 'my',
 'life',
 ',',
 'and',
 'it',
 "'s",
 'turned',
 'into',
 'a',
 'nightmare',
 '.']

In [33]:
def bpe_encode(
    mergeable_ranks: dict[bytes, int], input: str, visualise: str | None = "colour"
) -> list[int]:
    parts = [bytes([b]) for b in input]
    while True:
        # See the intermediate merges play out!
        # if visualise:
        #     if visualise in ["colour", "color"]:
        #         visualise_tokens(parts)
        #     elif visualise == "simple":
        #         print(parts)

        # Iterate over all pairs and find the pair we want to merge the most
        min_idx = None
        min_rank = None
        for i, pair in enumerate(zip(parts[:-1], parts[1:])):
            rank = mergeable_ranks.get(pair[0] + pair[1])
            if rank is not None and (min_rank is None or rank < min_rank):
                min_idx = i
                min_rank = rank

        # If there were no pairs we could merge, we're done!
        if min_rank is None:
            break
        assert min_idx is not None

        # Otherwise, merge that pair and leave the rest unchanged. Then repeat.
        parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2 :]

    if visualise:
        print()

    tokens = [mergeable_ranks[part] for part in parts]
    return tokens

In [34]:
bpe_encode(vocab, b"hello", None)

[b'hello']


[15339]

In [12]:
vocab_inv[15339]

b'hello'

In [129]:
encoded = []
for token in tokens:
    enc = bpe_encode(vocab, token, None)
    print(enc)
    encoded.extend(enc)

display(encoded)

[40]
[2846]
[34055]
[59178]
[69, 40458]
[13]
[4897]
[596]
[2465]
[25742, 291]
[454, 37400]
[13]
[37, 40458]
[13]
[42560]
[14097]
[18614]
[12840]
[5562]
[1395]
[606]
[1073]
[1820]
[70, 11423]
[20375]
[50814]
[1073]
[2465]
[14789]
[11]
[438]
[275]
[596]
[42286]
[18614]
[64]
[9471, 28755]
[13]


[40,
 2846,
 34055,
 59178,
 69,
 40458,
 13,
 4897,
 596,
 2465,
 25742,
 291,
 454,
 37400,
 13,
 37,
 40458,
 13,
 42560,
 14097,
 18614,
 12840,
 5562,
 1395,
 606,
 1073,
 1820,
 70,
 11423,
 20375,
 50814,
 1073,
 2465,
 14789,
 11,
 438,
 275,
 596,
 42286,
 18614,
 64,
 9471,
 28755,
 13]

In [132]:
decoded = ""

for t in encoded:
    decoded += vocab_inv[t] + " "
decoded

"I 'm pretty much f ucked . That 's my consider ed op inion . F ucked . Six days into what should be one of the g reatest two months of my life , and it 's turned into a night mare . "

In [94]:
import tiktoken
cl100k_base = tiktoken.get_encoding("cl100k_base")

In [98]:

encoded = []
for token in tok_regex.findall(text):
    enc = bpe_encode(cl100k_base._mergeable_ranks, token, None)
    print(enc)
    encoded.extend(enc)

display(encoded)

KeyError: 'I'

In [99]:
cl100k_base._pat_str

"'(?i:[sdmt]|ll|ve|re)|[^\\r\\n\\p{L}\\p{N}]?+\\p{L}++|\\p{N}{1,3}+| ?[^\\s\\p{L}\\p{N}]++[\\r\\n]*+|\\s++$|\\s*[\\r\\n]|\\s+(?!\\S)|\\s"

In [100]:
split_regex

"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"

In [101]:
regex.compile(cl100k_base._pat_str).findall(text)

['I',
 "'m",
 ' pretty',
 ' much',
 ' fucked',
 '.',
 ' That',
 "'s",
 ' my',
 ' considered',
 ' opinion',
 '.',
 ' Fucked',
 '.',
 ' Six',
 ' days',
 ' into',
 ' what',
 ' should',
 ' be',
 ' one',
 ' of',
 ' the',
 ' greatest',
 ' two',
 ' months',
 ' of',
 ' my',
 ' life',
 ',',
 ' and',
 ' it',
 "'s",
 ' turned',
 ' into',
 ' a',
 ' nightmare',
 '.']

In [102]:
" pretty".encode("utf-8")

b' pretty'

In [103]:
import tiktoken._educational as tike

In [105]:
tke = tike.SimpleBytePairEncoding(pat_str=cl100k_base._pat_str, mergeable_ranks=cl100k_base._mergeable_ranks)

In [106]:
tke.encode(text)

[48;5;167mI[0m

[48;5;167m'[48;5;179mm[0m
[48;5;167m'm[0m

[48;5;167m [48;5;179mp[48;5;185mr[48;5;77me[48;5;80mt[48;5;68mt[48;5;134my[0m
[48;5;167m [48;5;179mp[48;5;185mre[48;5;80mt[48;5;68mt[48;5;134my[0m
[48;5;167m p[48;5;185mre[48;5;80mt[48;5;68mt[48;5;134my[0m
[48;5;167m pre[48;5;80mt[48;5;68mt[48;5;134my[0m
[48;5;167m pre[48;5;80mt[48;5;68mty[0m
[48;5;167m pret[48;5;68mty[0m
[48;5;167m pretty[0m

[48;5;167m [48;5;179mm[48;5;185mu[48;5;77mc[48;5;80mh[0m
[48;5;167m m[48;5;185mu[48;5;77mc[48;5;80mh[0m
[48;5;167m m[48;5;185mu[48;5;77mch[0m
[48;5;167m m[48;5;185much[0m
[48;5;167m much[0m

[48;5;167m [48;5;179mf[48;5;185mu[48;5;77mc[48;5;80mk[48;5;68me[48;5;134md[0m
[48;5;167m f[48;5;185mu[48;5;77mc[48;5;80mk[48;5;68me[48;5;134md[0m
[48;5;167m f[48;5;185mu[48;5;77mc[48;5;80mk[48;5;68med[0m
[48;5;167m f[48;5;185mu[48;5;77mck[48;5;68med[0m
[48;5;167m f[48;5;185muck[48;5;68med[0m
[48;5;167m fuck[4

[40,
 2846,
 5128,
 1790,
 28252,
 13,
 3011,
 596,
 856,
 6646,
 9647,
 13,
 64662,
 13,
 19198,
 2919,
 1139,
 1148,
 1288,
 387,
 832,
 315,
 279,
 12474,
 1403,
 4038,
 315,
 856,
 2324,
 11,
 323,
 433,
 596,
 6656,
 1139,
 264,
 38911,
 13]

In [133]:
encoded = []
for token in tok_regex.findall(text):
    enc = tike.bpe_encode(cl100k_base._mergeable_ranks, bytes(token, "utf-8"), None)
    encoded.extend(enc)

display(encoded)

[40,
 2846,
 5128,
 1790,
 28252,
 13,
 3011,
 596,
 856,
 6646,
 9647,
 13,
 64662,
 13,
 19198,
 2919,
 1139,
 1148,
 1288,
 387,
 832,
 315,
 279,
 12474,
 1403,
 4038,
 315,
 856,
 2324,
 11,
 323,
 433,
 596,
 6656,
 1139,
 264,
 38911,
 13]

In [134]:
tke.decode(encoded)

"I'm pretty much fucked. That's my considered opinion. Fucked. Six days into what should be one of the greatest two months of my life, and it's turned into a nightmare."

In [135]:
tike.SimpleBytePairEncoding.from_tiktoken("tokenizer.json")

ValueError: Unknown encoding tokenizer.json.
Plugins found: ['tiktoken_ext.openai_public']
tiktoken version: 0.8.0 (are you on latest?)

In [143]:
vocab_bytes = { k.encode("utf_8"): v for k, v in vocab.items()}

In [144]:
t2  =tike.SimpleBytePairEncoding(pat_str=split_regex, mergeable_ranks=vocab_bytes)

In [145]:
enc2  =t2.encode(text)

[48;5;167mI[0m

[48;5;167m'[48;5;179mm[0m
[48;5;167m'm[0m

[48;5;167m [48;5;179mp[48;5;185mr[48;5;77me[48;5;80mt[48;5;68mt[48;5;134my[0m
[48;5;167m [48;5;179mp[48;5;185mre[48;5;80mt[48;5;68mt[48;5;134my[0m
[48;5;167m [48;5;179mpre[48;5;80mt[48;5;68mt[48;5;134my[0m
[48;5;167m [48;5;179mpre[48;5;80mt[48;5;68mty[0m
[48;5;167m [48;5;179mpret[48;5;68mty[0m
[48;5;167m [48;5;179mpretty[0m



KeyError: b' '

In [1]:
import tiktoken

In [4]:
import tiktoken.load


tiktoken.load.load_tiktoken_bpe("tokenizer.json")

ImportError: blobfile is not installed. Please install it by running `pip install blobfile`.