In [85]:
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

%load_ext autoreload
%autoreload 2

In [9]:
chr(0)
# This is a control character with no visible representation — 
# it's often used as a string terminator in C-style strings
# or as padding/null bytes in binary formats.

'\x00'

In [10]:
print(chr(0))

 


In [11]:
chr(0)

'\x00'

In [16]:
"this is a test" + chr(0) + "string"

'this is a test\x00string'

In [17]:
print("this is a test" + chr(0) + "string")

this is a test string


In [18]:
2**8

256

In [25]:
2**32

4294967296

In [21]:
test_string = "hello! こんにちは!"
utf8_encoded = test_string.encode("utf-8")
utf16_encoded = test_string.encode("utf-16")
utf32_encoded = test_string.encode("utf-32")

In [24]:
print(f"utf8 length: {len(utf8_encoded)}")
print(f"utf16 length: {len(utf16_encoded)}")
print(f"utf32 length: {len(utf32_encoded)}")

utf8 length: 23
utf16 length: 28
utf32 length: 56


In [26]:
def decode_utf8_bytes_to_str_wrong(bytestring: bytes):
    return "".join([bytes([b]).decode("utf-8") for b in bytestring])

In [27]:
decode_utf8_bytes_to_str_wrong("hello".encode("utf-8"))

'hello'

In [29]:
decode_utf8_bytes_to_str_wrong("hello! こんにちは!".encode("utf-8"))

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe3 in position 0: unexpected end of data

In [30]:
bytes([0xC2–0xDF] [0x80–0xBF])

SyntaxError: invalid character '–' (U+2013) (4254768052.py, line 1)

In [32]:
bytes([0xC0, 0xAF]).decode("utf-8")

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xc0 in position 0: invalid start byte

In [1]:
PAT = r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""

In [2]:
import regex as re
from collections import Counter

In [35]:
re.findall(PAT, "some text that i'll pre-tokenize")

['some', ' text', ' that', ' i', "'ll", ' pre', '-', 'tokenize']

In [37]:
list(re.finditer(PAT, "some text that i'll pre-tokenize"))

[<regex.Match object; span=(0, 4), match='some'>,
 <regex.Match object; span=(4, 9), match=' text'>,
 <regex.Match object; span=(9, 14), match=' that'>,
 <regex.Match object; span=(14, 16), match=' i'>,
 <regex.Match object; span=(16, 19), match="'ll">,
 <regex.Match object; span=(19, 23), match=' pre'>,
 <regex.Match object; span=(23, 24), match='-'>,
 <regex.Match object; span=(24, 32), match='tokenize'>]

In [38]:
pre_tokens = re.findall(PAT, "some text that i'll pre-tokenize")

In [39]:
pre_token_bytes = [x.encode("utf-8") for x in pre_tokens]
pre_token_bytes

[b'some', b' text', b' that', b' i', b"'ll", b' pre', b'-', b'tokenize']

In [3]:
# BPE training example
text = "low low low low low lower lower widest widest widest newest newest newest newest newest newest"

In [11]:
pre_tokens = [x.strip() for x in text.split(" ")]
pre_token_freqs = Counter(pre_tokens)
print(pre_token_freqs)

Counter({'newest': 6, 'low': 5, 'widest': 3, 'lower': 2})


In [15]:
pre_token_byte_freqs = {tuple([c for c in key]) : count for key,count in pre_token_freqs.items() }
pre_token_byte_freqs

{('l', 'o', 'w'): 5,
 ('l', 'o', 'w', 'e', 'r'): 2,
 ('w', 'i', 'd', 'e', 's', 't'): 3,
 ('n', 'e', 'w', 'e', 's', 't'): 6}

In [16]:
def get_pair_freqs(pre_token_byte_freqs):
    char_pair_freqs = Counter()
    for char_sequence, freq in pre_token_byte_freqs.items():
        for char_pair in zip(char_sequence, char_sequence[1:]):
            char_pair_freqs[char_pair] += freq
    return char_pair_freqs

In [17]:
def update_byte_freq_with_max(pre_token_byte_freqs, max_char_pair):
    pre_token_byte_freqs_updated = {}
    for char_sequence, freq in pre_token_byte_freqs.items():
        char_seq_updated = []
        i = 0
        while i < len(char_sequence):
            if i == len(char_sequence) - 1:
                char_seq_updated.append(char_sequence[i])
                break
            char_pair = (char_sequence[i], char_sequence[i+1])        
            if char_pair == max_char_pair:
                char_seq_updated.append("".join(max_char_pair))
                i += 1
            else:
                char_seq_updated.append(char_sequence[i])
            i += 1
        pre_token_byte_freqs_updated[tuple(char_seq_updated)] = freq
    return pre_token_byte_freqs_updated

def merge_update(pre_token_byte_freqs):
    char_pair_freqs = get_pair_freqs(pre_token_byte_freqs)
    # x[1] is the freq count of the char pair.
    # break ties by the lexicographically greater (e.g. alphabetically pair) wins.
    max_char_pair = max(char_pair_freqs.items(), key = lambda x: (x[1], x[0]))[0]
    pre_token_byte_freqs_updated = update_byte_freq_with_max(pre_token_byte_freqs, max_char_pair)
    return pre_token_byte_freqs_updated, "".join(max_char_pair)


merges = []
for i in range(6):
    print(f"Merge [{i}]\npre_token_byte_freqs: {pre_token_byte_freqs}")
    pre_token_byte_freqs, merged_chars = merge_update(pre_token_byte_freqs)
    print(f"\tmerge: {merged_chars}\n\tpre_token_byte_freqs: {pre_token_byte_freqs}")

    merges.append(merged_chars)

Merge [0]
pre_token_byte_freqs: {('l', 'o', 'w'): 5, ('l', 'o', 'w', 'e', 'r'): 2, ('w', 'i', 'd', 'e', 's', 't'): 3, ('n', 'e', 'w', 'e', 's', 't'): 6}
	merge: st
	pre_token_byte_freqs: {('l', 'o', 'w'): 5, ('l', 'o', 'w', 'e', 'r'): 2, ('w', 'i', 'd', 'e', 'st'): 3, ('n', 'e', 'w', 'e', 'st'): 6}
Merge [1]
pre_token_byte_freqs: {('l', 'o', 'w'): 5, ('l', 'o', 'w', 'e', 'r'): 2, ('w', 'i', 'd', 'e', 'st'): 3, ('n', 'e', 'w', 'e', 'st'): 6}
	merge: est
	pre_token_byte_freqs: {('l', 'o', 'w'): 5, ('l', 'o', 'w', 'e', 'r'): 2, ('w', 'i', 'd', 'est'): 3, ('n', 'e', 'w', 'est'): 6}
Merge [2]
pre_token_byte_freqs: {('l', 'o', 'w'): 5, ('l', 'o', 'w', 'e', 'r'): 2, ('w', 'i', 'd', 'est'): 3, ('n', 'e', 'w', 'est'): 6}
	merge: ow
	pre_token_byte_freqs: {('l', 'ow'): 5, ('l', 'ow', 'e', 'r'): 2, ('w', 'i', 'd', 'est'): 3, ('n', 'e', 'w', 'est'): 6}
Merge [3]
pre_token_byte_freqs: {('l', 'ow'): 5, ('l', 'ow', 'e', 'r'): 2, ('w', 'i', 'd', 'est'): 3, ('n', 'e', 'w', 'est'): 6}
	merge: low
	pre_t

In [20]:
import os

In [19]:
data_dir = "/media/bryan/ssd01/data/cs336"

In [24]:
train_dataset_fpath = os.path.join(data_dir, "TinyStoriesV2-GPT4-valid.txt")

In [25]:
with open(train_dataset_fpath, 'r', encoding='utf-8') as f:
    train_dataset_text = f.read()

In [28]:
train_dataset_text[:500]

'u don\'t have to be scared of the loud dog, I\'ll protect you". The mole felt so safe with the little girl. She was very kind and the mole soon came to trust her. He leaned against her and she kept him safe. The mole had found his best friend.\n<|endoftext|>\nOnce upon a time, in a warm and sunny place, there was a big pit. A little boy named Tom liked to play near the pit. One day, Tom lost his red ball. He was very sad.\nTom asked his friend, Sam, to help him search for the ball. They looked high a'

In [29]:
special_tokens = ["<|endoftext|>",]

In [49]:
def split_text_on_special_tokens(text, special_tokens = ("<|endoftext|>",)):
    delimiter = re.escape("|".join(special_tokens))
    return re.split(delimiter, text)

In [43]:
re.escape(train_dataset_text[:300])

'u\\ don\'t\\ have\\ to\\ be\\ scared\\ of\\ the\\ loud\\ dog,\\ I\'ll\\ protect\\ you"\\.\\ The\\ mole\\ felt\\ so\\ safe\\ with\\ the\\ little\\ girl\\.\\ She\\ was\\ very\\ kind\\ and\\ the\\ mole\\ soon\\ came\\ to\\ trust\\ her\\.\\ He\\ leaned\\ against\\ her\\ and\\ she\\ kept\\ him\\ safe\\.\\ The\\ mole\\ had\\ found\\ his\\ best\\ friend\\.\\\n<\\|endoftext\\|>\\\nOnce\\ upon\\ a\\ time,\\ in\\ a\\ warm\\ and\\ sunny\\ place,'

In [50]:
text_split = split_text_on_special_tokens(train_dataset_text[:500])

In [51]:
text_split

['u don\'t have to be scared of the loud dog, I\'ll protect you". The mole felt so safe with the little girl. She was very kind and the mole soon came to trust her. He leaned against her and she kept him safe. The mole had found his best friend.\n',
 '\nOnce upon a time, in a warm and sunny place, there was a big pit. A little boy named Tom liked to play near the pit. One day, Tom lost his red ball. He was very sad.\nTom asked his friend, Sam, to help him search for the ball. They looked high a']

In [77]:
tuple(map(int, " They".encode("utf-8")))

(32, 84, 104, 101, 121)

In [62]:
encoded_bytes = " They".encode("utf-8")
for i in range(len(encoded_bytes)):
    print(encoded_bytes[i])

32
84
104
101
121


In [78]:
vocab: dict[int, bytes] = {x: bytes([x]) for x in range(256)}

In [81]:
vocab[84] + vocab[104]

b'Th'

In [84]:
bytes([86]).decode("utf-8")

'V'

In [76]:
list(map(int, "<|endoftext|>".encode("utf-8")))

[60, 124, 101, 110, 100, 111, 102, 116, 101, 120, 116, 124, 62]

In [86]:
import pickle

In [112]:
with open("/media/bryan/ssd01/expr/llm_from_scratch/tokenization/bpe_10k_tinystories.pkl", "rb") as f:
    bpe_data = pickle.load(f)

In [113]:
bpe_data.keys()

dict_keys(['vocab', 'merges'])

In [103]:
len(bpe_data["vocab"])

10000

In [104]:
max_token_len = 0
max_token = None
for token in bpe_data["vocab"].values():
    if len(token) > max_token_len:
        max_token_len = len(token)
        max_token = token

print(f"max_token: {max_token}, length: {max_token_len}")

max_token: b' accomplishment', length: 15


In [106]:

bpe_data["merges"][:10]

with 

[(b' ', b't'),
 (b'h', b'e'),
 (b' ', b'a'),
 (b' ', b's'),
 (b' ', b'w'),
 (b'n', b'd'),
 (b' t', b'he'),
 (b'e', b'd'),
 (b' ', b'b'),
 (b' t', b'o')]

In [99]:
with open("/media/bryan/ssd01/expr/llm_from_scratch/tokenization/bpe_32k_owt_train.pkl", "rb") as f:
    bpe_data = pickle.load(f)

In [100]:
max_token_len = 0
max_token = None
for token in bpe_data["vocab"].values():
    if len(token) > max_token_len:
        max_token_len = len(token)
        max_token = token

print(f"max_token: {max_token}, length: {max_token_len}")

max_token: b'\xc3\x83\xc3\x82\xc3\x83\xc3\x82\xc3\x83\xc3\x82\xc3\x83\xc3\x82\xc3\x83\xc3\x82\xc3\x83\xc3\x82\xc3\x83\xc3\x82\xc3\x83\xc3\x82\xc3\x83\xc3\x82\xc3\x83\xc3\x82\xc3\x83\xc3\x82\xc3\x83\xc3\x82\xc3\x83\xc3\x82\xc3\x83\xc3\x82\xc3\x83\xc3\x82\xc3\x83\xc3\x82', length: 64


In [109]:
import pickle
import os
import json
def pickle_as_text(pkl_file_path: str):
    dirname = os.path.dirname(pkl_file_path)
    basename = os.path.splitext(os.path.basename(pkl_file_path))[0]
    print(f"Loading {pkl_file_path}")
    with open(pkl_file_path, "rb") as f:
        bpe_data = pickle.load(f)
    
    for key, data in bpe_data.items():
        file_path = os.path.join(dirname, f"{basename}_{key}.json")
        print(f"Saving {key} to {file_path}")
        with open(file_path, "w") as f:
            json.dump(data, f, indent=4)

In [110]:
pickle_as_text("/media/bryan/ssd01/expr/llm_from_scratch/tokenization/bpe_10k_tinystories.pkl")

Loading /media/bryan/ssd01/expr/llm_from_scratch/tokenization/bpe_10k_tinystories.pkl
Saving vocab to /media/bryan/ssd01/expr/llm_from_scratch/tokenization/bpe_10k_tinystories_vocab.json


TypeError: Object of type bytes is not JSON serializable

In [114]:
vocab = bpe_data["vocab"]

In [117]:
bytes_to_token_id = {v:k for k,v in vocab.items()}

In [119]:
import regex

special_tokens = ["<|endoftext|>", "SPECIAL"]
text = "This is some text<|endoftext|>followed by SPECIALmore text<|endoftext|>end."

# Escape special characters in tokens and join into a regex pattern
pattern = "(" + "|".join(map(regex.escape, special_tokens)) + ")"

# Split the text while keeping the delimiters
chunks = regex.split(pattern, text)

# Optionally remove empty strings (e.g., from split at the beginning)
chunks = [chunk for chunk in chunks if chunk]

print(chunks)

['This is some text', '<|endoftext|>', 'followed by ', 'SPECIAL', 'more text', '<|endoftext|>', 'end.']


In [134]:
re.findall?

In [138]:
bytes1, bytes2 = bpe_data["merges"][-10]

In [139]:
bytes2[:3]

b'iskers'

In [141]:
bytes2[:3] == b"isk"

True

In [142]:
len(bytes1)

3

In [143]:
bytes2[1:3]

b'sk'

In [145]:
token = list(bytes2)
token

[105, 115, 107, 101, 114, 115]

In [146]:
token[0] == bytes2[0]

True

In [147]:
list(map(int, bytes2))

[105, 115, 107, 101, 114, 115]

In [148]:
token  =  b"bytes"
A = list(token)
B = list(map(int, token))

In [154]:
A[2:5] = [1000]

In [156]:
test_bytes = "hello! こんにちは!".encode("utf-8")

In [157]:
test_bytes

b'hello! \xe3\x81\x93\xe3\x82\x93\xe3\x81\xab\xe3\x81\xa1\xe3\x81\xaf!'

In [158]:
list(test_bytes)

[104,
 101,
 108,
 108,
 111,
 33,
 32,
 227,
 129,
 147,
 227,
 130,
 147,
 227,
 129,
 171,
 227,
 129,
 161,
 227,
 129,
 175,
 33]

In [159]:
replacement_char = '\uFFFD'

In [160]:
type(replacement_char)

str

In [161]:
replacement_char = chr(0xFFFD)

In [163]:
type(replacement_char)

str

In [165]:
chr(0x1F600)

'😀'

In [167]:
import regex as re
PRETOKENIZATION_REGEX = r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""

In [168]:
text = "This is some text<|endoftext|>followed by SPECIALmore text<|endoftext|>end."

In [169]:
re.findall(PRETOKENIZATION_REGEX, text)

['This',
 ' is',
 ' some',
 ' text',
 '<|',
 'endoftext',
 '|>',
 'followed',
 ' by',
 ' SPECIALmore',
 ' text',
 '<|',
 'endoftext',
 '|>',
 'end',
 '.']

In [170]:
bytes1 = b's'

In [172]:
bytes1_list = list(bytes1)
bytes1_list

[115]

In [173]:
bytes(bytes1_list)

b's'

In [181]:
token = b"bytes"
split_bytes = [bytes([b]) for b in token]
print(split_bytes)

[b'b', b'y', b't', b'e', b's']


In [182]:
for b in token:
    print(b)

98
121
116
101
115


In [186]:
# with open("/home/bryan/src/LLM-from-scratch/tests/fixtures/tinystories_sample.txt", "r") as f:
#     for text in f:
#         print(text)

In [191]:
b'\xFF'.decode("utf-8", errors="replace")

'�'