In [1]:
%load_ext autoreload
%autoreload 2

In [44]:
import tiktoken
import regex as re

from gpt_from_scratch.dataset_loaders import (
    tinystories_loader,
    tinyshakespeare_loader,
)
from gpt_from_scratch import (
    file_utils,
    tokenizer_utils,
    byte_pair_encoding_tokenizer,
)


In [4]:
# reference tokenizer to use for special tokens
tokenizer = tiktoken.get_encoding('gpt2')

In [6]:
tokenizer._special_tokens

{'<|endoftext|>': 50256}

In [40]:
tokenizer._mergeable_ranks?

[0;31mType:[0m        dict
[0;31mString form:[0m {b'!': 0, b'"': 1, b'#': 2, b'$': 3, b'%': 4, b'&': 5, b"'": 6, b'(': 7, b')': 8, b'*': 9, b'+':  <...> inated': 50251, b' regress': 50252, b' Collider': 50253, b' informants': 50254, b' gazed': 50255}
[0;31mLength:[0m      50256
[0;31mDocstring:[0m  
dict() -> new empty dictionary
dict(mapping) -> new dictionary initialized from a mapping object's
    (key, value) pairs
dict(iterable) -> new dictionary initialized as if via:
    d = {}
    for k, v in iterable:
        d[k] = v
dict(**kwargs) -> new dictionary initialized with the name=value pairs
    in the keyword argument list.  For example:  dict(one=1, two=2)

In [11]:
# load tinystories
tinystories_version = tinystories_loader.TinyStoriesVersion.V2

tinystories_filepaths = tinystories_loader.download_tinystories(
    tinystories_version,
)

Downloading TinyStoriesV2-GPT4-train.txt...
Downloaded TinyStoriesV2-GPT4-train.txt to /Users/bronsonschoen/.cache/huggingface/hub/datasets--roneneldan--TinyStories/snapshots/f54c09fd23315a6f9c86f9dc80f725de7d8f9c64/TinyStoriesV2-GPT4-train.txt
Downloading TinyStoriesV2-GPT4-valid.txt...
Downloaded TinyStoriesV2-GPT4-valid.txt to /Users/bronsonschoen/.cache/huggingface/hub/datasets--roneneldan--TinyStories/snapshots/f54c09fd23315a6f9c86f9dc80f725de7d8f9c64/TinyStoriesV2-GPT4-valid.txt


In [94]:
# select a subset to actually test out tokenizer on

# note: full tinystories training set is 15,600,056 lines
num_samples = 100000

input_text_lines = file_utils.head(
    filepath=tinystories_filepaths.train,
    n=num_samples,
)

# join it together, since this is small enough for us to keep in memory
# note: removes empty lines
input_text = '\n'.join([x for x in input_text_lines if x != ''])

In [95]:
# TODO(bschoen): More general handling of special tokens? Is this okay? We actually
#                still want to split this (so it needs to be in the regex pattern)
#                but we don't want them merged during `bpe_merge`
#
#                This means we can assume that special tokens come in already split
#                as an exact match (even if in a public api we'd want to construct
#                this regex automatically for the user given special tokens)
regex_pattern_str = '|'.join([
    # Match whole words
    #
    #   \b    - Represents a word boundary (transition from a non-word char to a word char or vice versa)
    #   \w+   - Matches one or more word characters (letters, digits, or underscores)
    #   \b    - Another word boundary to ensure we match whole words
    #
    r'\b\w+\b', 
    #
    # Match single punctuation marks
    #
    #   []       - Character set: match any single character listed inside the brackets
    #   .,!?;:"  - The actual characters we want to match (various punctuation marks)
    #
    r'[.,!?;:"]',
    # 
    # Match one or more whitespace characters  (spaces, tabs)
    #
    r'\s+',
    #
    # Match the newline character
    #
    r'\n',
    #
    # Match the special end-of-text token exactly
    #
    r'<\|endoftext\|>',
])

regex_pattern = re.compile(regex_pattern_str)

In [96]:
# show what it looks like (this also sanity checks that we get back the original string)
split_string = regex_pattern.findall("Jack and Jill went up the hill\nIt was raining")

print(tokenizer_utils.get_colored_tokenization_of_split_string(split_string))

[42m[97mJack[0m[46m[97m [0m[41m[97mand[0m[43m[97m [0m[44m[97mJill[0m[45m[97m [0m[42m[97mwent[0m[46m[97m [0m[41m[97mup[0m[43m[97m [0m[44m[97mthe[0m[45m[97m [0m[42m[97mhill[0m[46m[97m
[0m[41m[97mIt[0m[43m[97m [0m[44m[97mwas[0m[45m[97m [0m[42m[97mraining[0m


In [102]:
vocab_size = 2048

tokenizer = byte_pair_encoding_tokenizer.BytePairEncodingWordTokenizer.from_input_text(
    input_text=input_text,
    regex_split_pattern_string=regex_pattern_str,
    vocab_size=vocab_size,
)

Constructing vocab_size=2048 from 620164 unmerged words...
[vocab: 257 / 2048] Merging	(b'h', b'e')	(count: 40486)	-> new token: 512
[vocab: 258 / 2048] Merging	(b'a', b'n')	(count: 20944)	-> new token: 513
[vocab: 259 / 2048] Merging	(b't', b'he')	(count: 20276)	-> new token: 514
[vocab: 260 / 2048] Merging	(b'e', b'd')	(count: 15744)	-> new token: 515
[vocab: 261 / 2048] Merging	(b't', b'o')	(count: 14655)	-> new token: 516
[vocab: 262 / 2048] Merging	(b'an', b'd')	(count: 13376)	-> new token: 517
[vocab: 263 / 2048] Merging	(b'i', b'n')	(count: 11563)	-> new token: 518
[vocab: 264 / 2048] Merging	(b'i', b't')	(count: 9535)	-> new token: 519
[vocab: 265 / 2048] Merging	(b'r', b'e')	(count: 9453)	-> new token: 520
[vocab: 266 / 2048] Merging	(b'w', b'a')	(count: 9191)	-> new token: 521
[vocab: 267 / 2048] Merging	(b'o', b'u')	(count: 8947)	-> new token: 522
[vocab: 268 / 2048] Merging	(b'e', b'n')	(count: 8349)	-> new token: 523
[vocab: 269 / 2048] Merging	(b'h', b'a')	(count: 8228)	-

KeyboardInterrupt: 

In [None]:
encoded = tokenizer.encode('Jack and Jill went up the hill')
print(tokenizer.decode(encoded))

Jack and Jill went up the hill


In [None]:
tokenizer.encode('Jack and Jill went up the hill')

[74, 691, 32, 517, 32, 74, 667, 32, 650, 32, 590, 32, 514, 32, 104, 667]

In [None]:
# show some examples from the vocabulary
list(tokenizer.merges.items())[-10:]

[(b'say', 758),
 (b'other', 759),
 (b'pr', 760),
 (b'ouse', 761),
 (b'make', 762),
 (b'ight', 763),
 (b'wor', 764),
 (b'small', 765),
 (b'Ben', 766),
 (b'asked', 767)]

In [None]:
# show how it looks for the first N characters to sanity check
example_text = input_text[:100]

# note: this includes a call to `get_colored_tokenization`
tokenizer_utils.show_token_mapping(tokenizer=tokenizer, input_string=example_text)

Input:		Once upon a time there was a little boy named Ben. Ben loved to explore the world around him. He saw
Tokenized:	[44m[97mOnce[0m[45m[97m [0m[42m[97mupon[0m[41m[97m [0m[43m[97ma[0m[46m[97m [0m[44m[97mtime[0m[45m[97m [0m[42m[97mthere[0m[41m[97m [0m[43m[97mwas[0m[46m[97m [0m[44m[97ma[0m[45m[97m [0m[42m[97mlittle[0m[41m[97m [0m[43m[97mboy[0m[46m[97m [0m[44m[97mnamed[0m[45m[97m [0m[42m[97mBen[0m[41m[97m.[0m[43m[97m [0m[46m[97mBen[0m[44m[97m [0m[45m[97mloved[0m[42m[97m [0m[41m[97mto[0m[43m[97m [0m[46m[97mex[0m[44m[97mpl[0m[45m[97mo[0m[42m[97mre[0m[41m[97m [0m[43m[97mthe[0m[46m[97m [0m[44m[97mwor[0m[45m[97mld[0m[42m[97m [0m[41m[97mar[0m[43m[97mound[0m[46m[97m [0m[44m[97mhim[0m[45m[97m.[0m[42m[97m [0m[41m[97mHe[0m[43m[97m [0m[46m[97msaw[0m
Token ID | Token Bytes | Token String
---------+-------------+--------------
     660 | [38;5;2m4F[0m [3

In [None]:
encoded = tokenizer.encode('Jack and Jill went up the hill')
print(tokenizer.decode(encoded))

JackandJillwentupthehill


In [None]:
tokenizer_utils.BytePairEncodingTokenizer.from_input_bytes(
    input_bytes=input_text.encode('utf-8'),
    vocab_size=1000,
)