In [1]:
%load_ext autoreload
%autoreload 2

In [142]:
import tiktoken
import regex as re

from gpt_from_scratch.dataset_loaders import (
    tinystories_loader,
    tinyshakespeare_loader,
)
from gpt_from_scratch import (
    file_utils,
    tokenizer_utils,
    byte_pair_encoding_tokenizer,
)


In [4]:
# reference tokenizer to use for special tokens
tokenizer = tiktoken.get_encoding('gpt2')

In [6]:
tokenizer._special_tokens

{'<|endoftext|>': 50256}

In [40]:
tokenizer._mergeable_ranks?

[0;31mType:[0m        dict
[0;31mString form:[0m {b'!': 0, b'"': 1, b'#': 2, b'$': 3, b'%': 4, b'&': 5, b"'": 6, b'(': 7, b')': 8, b'*': 9, b'+':  <...> inated': 50251, b' regress': 50252, b' Collider': 50253, b' informants': 50254, b' gazed': 50255}
[0;31mLength:[0m      50256
[0;31mDocstring:[0m  
dict() -> new empty dictionary
dict(mapping) -> new dictionary initialized from a mapping object's
    (key, value) pairs
dict(iterable) -> new dictionary initialized as if via:
    d = {}
    for k, v in iterable:
        d[k] = v
dict(**kwargs) -> new dictionary initialized with the name=value pairs
    in the keyword argument list.  For example:  dict(one=1, two=2)

In [11]:
# load tinystories
tinystories_version = tinystories_loader.TinyStoriesVersion.V2

tinystories_filepaths = tinystories_loader.download_tinystories(
    tinystories_version,
)

Downloading TinyStoriesV2-GPT4-train.txt...
Downloaded TinyStoriesV2-GPT4-train.txt to /Users/bronsonschoen/.cache/huggingface/hub/datasets--roneneldan--TinyStories/snapshots/f54c09fd23315a6f9c86f9dc80f725de7d8f9c64/TinyStoriesV2-GPT4-train.txt
Downloading TinyStoriesV2-GPT4-valid.txt...
Downloaded TinyStoriesV2-GPT4-valid.txt to /Users/bronsonschoen/.cache/huggingface/hub/datasets--roneneldan--TinyStories/snapshots/f54c09fd23315a6f9c86f9dc80f725de7d8f9c64/TinyStoriesV2-GPT4-valid.txt


In [138]:
# select a subset to actually test out tokenizer on

# note: full tinystories training set is 15,600,056 lines
num_samples = 100000

input_text_lines = file_utils.head(
    filepath=tinystories_filepaths.train,
    n=num_samples,
)

# join it together, since this is small enough for us to keep in memory
# note: removes empty lines
input_text = '\n'.join([x for x in input_text_lines if x != ''])

In [139]:
# TODO(bschoen): More general handling of special tokens? Is this okay? We actually
#                still want to split this (so it needs to be in the regex pattern)
#                but we don't want them merged during `bpe_merge`
#
#                This means we can assume that special tokens come in already split
#                as an exact match (even if in a public api we'd want to construct
#                this regex automatically for the user given special tokens)
regex_pattern_str = '|'.join([
    # Match whole words
    #
    #   \b    - Represents a word boundary (transition from a non-word char to a word char or vice versa)
    #   \w+   - Matches one or more word characters (letters, digits, or underscores)
    #   \b    - Another word boundary to ensure we match whole words
    #
    r'\b\w+\b', 
    #
    # Match single punctuation marks
    #
    #   []       - Character set: match any single character listed inside the brackets
    #   .,!?;:"  - The actual characters we want to match (various punctuation marks)
    #
    r'[.,!?;:"]',
    # 
    # Match one or more whitespace characters  (spaces, tabs)
    #
    r'\s+',
    #
    # Match the newline character
    #
    r'\n',
    #
    # Match the special end-of-text token exactly
    #
    r'<\|endoftext\|>',
])

regex_pattern = re.compile(regex_pattern_str)

In [140]:
# show what it looks like (this also sanity checks that we get back the original string)
split_string = regex_pattern.findall('''Jack and Jill went up the hill\nIt was raining\n<|endoftext|>\nBill said, "Let's go to the park!"''')

print(tokenizer_utils.get_colored_tokenization_of_split_string(split_string))

[41m[97mJack[0m[46m[97m [0m[42m[97mand[0m[44m[97m [0m[43m[97mJill[0m[45m[97m [0m[41m[97mwent[0m[46m[97m [0m[42m[97mup[0m[44m[97m [0m[43m[97mthe[0m[45m[97m [0m[41m[97mhill[0m[46m[97m
[0m[42m[97mIt[0m[44m[97m [0m[43m[97mwas[0m[45m[97m [0m[41m[97mraining[0m[46m[97m
[0m[42m[97m<|endoftext|>[0m[44m[97m
[0m[43m[97mBill[0m[45m[97m [0m[41m[97msaid[0m[46m[97m,[0m[42m[97m [0m[44m[97m"[0m[43m[97mLet[0m[45m[97ms[0m[41m[97m [0m[46m[97mgo[0m[42m[97m [0m[44m[97mto[0m[43m[97m [0m[45m[97mthe[0m[41m[97m [0m[46m[97mpark[0m[42m[97m![0m[44m[97m"[0m


In [141]:
vocab_size = 2048

tokenizer = byte_pair_encoding_tokenizer.BytePairEncodingWordTokenizer.from_input_text(
    input_text=input_text,
    regex_split_pattern_string=regex_pattern_str,
    vocab_size=vocab_size,
    special_tokens={'<|endoftext|>'},
)

Constructing vocab_size=2048 from 6182573 unmerged words...
[vocab: 258 / 2048] Merging	(b'h', b'e')	(count: 404479)	-> new token: 258
[vocab: 259 / 2048] Merging	(b'a', b'n')	(count: 207108)	-> new token: 259
[vocab: 260 / 2048] Merging	(b't', b'he')	(count: 203147)	-> new token: 260
[vocab: 261 / 2048] Merging	(b'e', b'd')	(count: 158162)	-> new token: 261
[vocab: 262 / 2048] Merging	(b't', b'o')	(count: 145165)	-> new token: 262
[vocab: 263 / 2048] Merging	(b'an', b'd')	(count: 132078)	-> new token: 263
[vocab: 264 / 2048] Merging	(b'i', b'n')	(count: 115299)	-> new token: 264
[vocab: 265 / 2048] Merging	(b'r', b'e')	(count: 97192)	-> new token: 265
[vocab: 266 / 2048] Merging	(b'i', b't')	(count: 93800)	-> new token: 266
[vocab: 267 / 2048] Merging	(b'w', b'a')	(count: 93387)	-> new token: 267
[vocab: 268 / 2048] Merging	(b'o', b'u')	(count: 92826)	-> new token: 268
[vocab: 269 / 2048] Merging	(b'h', b'a')	(count: 82750)	-> new token: 269
[vocab: 270 / 2048] Merging	(b'T', b'he')	(

In [143]:
encoded = tokenizer.encode('Jack and Jill went up the hill')
print(tokenizer.decode(encoded))

Jack and Jill went up the hill


In [144]:
tokenizer.encode('Jack and Jill went up the hill')

[957, 32, 263, 32, 1890, 32, 399, 32, 335, 32, 260, 32, 1058]

In [145]:
# show some examples from the vocabulary
list(tokenizer.merges.items())[-10:]

[(b'ina', 2039),
 (b'bridge', 2040),
 (b'easy', 2041),
 (b'adventures', 2042),
 (b'sheep', 2043),
 (b'sour', 2044),
 (b'Pe', 2045),
 (b'takes', 2046),
 (b'drew', 2047),
 (b'ps', 2048)]

In [146]:
# show how it looks for the first N characters to sanity check
example_text = input_text[:100]

# note: this includes a call to `get_colored_tokenization`
tokenizer_utils.show_token_mapping(tokenizer=tokenizer, input_string=example_text)

Input:		Once upon a time there was a little boy named Ben. Ben loved to explore the world around him. He saw
Tokenized:	[42m[97mOnce[0m[41m[97m [0m[46m[97mupon[0m[44m[97m [0m[43m[97ma[0m[45m[97m [0m[42m[97mtime[0m[41m[97m [0m[46m[97mthere[0m[44m[97m [0m[43m[97mwas[0m[45m[97m [0m[42m[97ma[0m[41m[97m [0m[46m[97mlittle[0m[44m[97m [0m[43m[97mboy[0m[45m[97m [0m[42m[97mnamed[0m[41m[97m [0m[46m[97mBen[0m[44m[97m.[0m[43m[97m [0m[45m[97mBen[0m[42m[97m [0m[41m[97mloved[0m[46m[97m [0m[44m[97mto[0m[43m[97m [0m[45m[97mexplore[0m[42m[97m [0m[41m[97mthe[0m[46m[97m [0m[44m[97mworld[0m[43m[97m [0m[45m[97maround[0m[42m[97m [0m[41m[97mhim[0m[46m[97m.[0m[44m[97m [0m[43m[97mHe[0m[45m[97m [0m[42m[97msaw[0m
Token ID | Token Bytes | Token String
---------+-------------+--------------
     400 | [38;5;2m4F[0m [38;5;2m6E[0m [38;5;2m63[0m [38;5;2m65[0m | 'Once'
          [48;5

In [150]:
# save to file
tokenizer_filepath = f'tokenizer__vocab_{vocab_size}_samples_{num_samples}_dataset_tinystories.pkl'
file_utils.serialize_dataclass_to_pickle_file(tokenizer, tokenizer_filepath)

In [151]:
tokenizer_from_pickle = file_utils.deserialize_dataclass_from_pickle_file(
    cls=byte_pair_encoding_tokenizer.BytePairEncodingWordTokenizer,
    file_path=tokenizer_filepath,
)

[957, 32, 263, 32, 1890, 32, 399, 32, 335, 32, 260, 32, 1058]

In [152]:
# check that round tripping works
tokenizer_utils.show_token_mapping(
    tokenizer=tokenizer_from_pickle,
    input_string='Jack and Jill went up the hill',
)

Input:		Jack and Jill went up the hill
Tokenized:	[42m[97mJack[0m[41m[97m [0m[44m[97mand[0m[43m[97m [0m[45m[97mJill[0m[46m[97m [0m[42m[97mwent[0m[41m[97m [0m[44m[97mup[0m[43m[97m [0m[45m[97mthe[0m[46m[97m [0m[42m[97mhill[0m
Token ID | Token Bytes | Token String
---------+-------------+--------------
     957 | [38;5;2m4A[0m [38;5;2m61[0m [38;5;2m63[0m [38;5;2m6B[0m | 'Jack'
          [48;5;1m[38;5;15mJack[0m and Jill went up the hill
          U+004A LATIN CAPITAL LETTER J (1 bytes: [38;5;2m4A[0m)
          U+0061 LATIN SMALL LETTER A (1 bytes: [38;5;2m61[0m)
          U+0063 LATIN SMALL LETTER C (1 bytes: [38;5;2m63[0m)
          U+006B LATIN SMALL LETTER K (1 bytes: [38;5;2m6B[0m)
      32 | [38;5;2m20[0m | ' '
          Jack[48;5;1m[38;5;15m [0mand Jill went up the hill
          U+0020 SPACE (1 bytes: [38;5;2m20[0m)
     263 | [38;5;2m61[0m [38;5;2m6E[0m [38;5;2m64[0m | 'and'
          Jack [48;5;1m[38;5;15man