### Chapter 2 Code: Building a tokenizer


In [1]:
pip install tiktoken


Collecting tiktoken
  Downloading tiktoken-0.8.0-cp312-cp312-win_amd64.whl.metadata (6.8 kB)
Downloading tiktoken-0.8.0-cp312-cp312-win_amd64.whl (883 kB)
   ---------------------------------------- 0.0/883.8 kB ? eta -:--:--
   --------------------------------------- 883.8/883.8 kB 10.0 MB/s eta 0:00:00
Installing collected packages: tiktoken
Successfully installed tiktoken-0.8.0
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [7]:
import tiktoken
import torch
import os
import urllib.request
import re

### Loading raw text

In [5]:
# Download the short story only when no local copy exists yet, so repeated
# runs of the notebook do not hit the network again.
if not os.path.exists("the-verdict.txt"):
    file_path = "the-verdict.txt"
    url = (
        "https://raw.githubusercontent.com/"
        "rasbt/LLMs-from-scratch/main/"
        "ch02/01_main-chapter-code/the-verdict.txt"
    )
    urllib.request.urlretrieve(url, file_path)

In [6]:
# Read the whole story into memory. The encoding is passed explicitly because
# open() otherwise falls back to a locale-dependent default, which makes the
# decoded text differ between machines.
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    text = f.read()

# Preview the first 500 characters as a sanity check.
print(text[:500])

I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no great surprise to me to hear that, in the height of his glory, he had dropped his painting, married a rich widow, and established himself in a villa on the Riviera. (Though I rather thought it would have been Rome or Florence.)

"The height of his glory"--that was what the women called it. I can hear Mrs. Gideon Thwing--his last Chicago sitter--deploring his unaccountable abdication. "Of course it'


### Creating tokens using regex and converting them into token IDs

In [9]:
# Split on punctuation, double dashes and whitespace. The capture group makes
# re.split keep the delimiters, so punctuation becomes tokens of its own.
tokens = re.split(r'([,.:;?_!"()\']|--|\s)', text)
# Drop empty strings and whitespace-only pieces left over from the split.
tokens = [t.strip() for t in tokens if t.strip()]
print(tokens[:20])
# Report the total token count as well (the recorded output shows it).
print(len(tokens))


['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was']
4690


In [11]:
# The vocabulary is the set of distinct tokens in sorted order.
words = sorted(set(tokens))
l = len(words)
print(l)
# Pair every token with its position in the sorted list to get its integer ID.
vocab = dict(zip(words, range(l)))


1130


In [19]:
# putting it all in class
class SimpleTokenizer:
    """Word-level tokenizer backed by a fixed token -> ID vocabulary."""

    def __init__(self, vocab):
        """Store the vocabulary and build the inverse ID -> token lookup.

        Args:
            vocab: dict mapping token strings to integer IDs.
        """
        self.vocab = vocab
        self.int_to_str = {v:k for k,v in self.vocab.items()}

    def encode(self, text):
        """Split ``text`` into tokens and return the list of their IDs.

        Raises:
            KeyError: if a token is not present in the vocabulary.
        """
        # The capture group keeps punctuation and "--" as separate tokens.
        tokens = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        # Discard the empty / whitespace-only pieces produced by the split.
        tokens = [t.strip() for t in tokens if t.strip()]
        return [self.vocab[token] for token in tokens]

    def decode(self, tokens):
        """Turn a sequence of token IDs back into a string.

        Raises:
            KeyError: if an ID is not present in the vocabulary.
        """
        # Whitespace is dropped during encoding, so tokens have to be joined
        # with a single space; joining with "" would glue the words together.
        text = " ".join(self.int_to_str[token] for token in tokens)
        # Remove the space that the join put in front of punctuation.
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
        return text

In [20]:
# example
tokenizer = SimpleTokenizer(vocab)
encoded = tokenizer.encode(text)
print(encoded[:10])



[53, 44, 149, 1003, 57, 38, 818, 115, 256, 486]


In [23]:
tokenizer.decode(encoded[:10])

'I HAD always thought Jack Gisburn rather a cheap genius'

In [22]:
#Code from book.
class SimpleTokenizerV1:
    """Word-level tokenizer that maps tokens to IDs and back via a fixed vocabulary."""

    def __init__(self, vocab):
        """Keep the token -> ID mapping and derive the reverse ID -> token mapping."""
        self.str_to_int = vocab
        self.int_to_str = dict((idx, tok) for tok, idx in vocab.items())

    def encode(self, text):
        """Return the IDs of the tokens found in ``text``.

        A token missing from the vocabulary raises ``KeyError``.
        """
        # Split first (delimiters are kept by the capture group), then drop
        # the empty and whitespace-only pieces.
        pieces = []
        for raw in re.split(r'([,.:;?_!"()\']|--|\s)', text):
            cleaned = raw.strip()
            if cleaned:
                pieces.append(cleaned)
        return [self.str_to_int[piece] for piece in pieces]

    def decode(self, ids):
        """Rebuild a string from token IDs, separated by single spaces."""
        joined = " ".join(self.int_to_str[i] for i in ids)
        # Strip the whitespace that ended up before the listed punctuation.
        return re.sub(r'\s+([,.?!"()\'])', r'\1', joined)
    
# Rebind `tokenizer` to the book's implementation, encode the full story and
# show the first ten IDs.
tokenizer = SimpleTokenizerV1(vocab)
encoded = tokenizer.encode(text)
print(encoded[:10])
# The last expression is displayed by the notebook: the first ten IDs decoded
# back to text.
tokenizer.decode(encoded[:10])


[53, 44, 149, 1003, 57, 38, 818, 115, 256, 486]


'I HAD always thought Jack Gisburn rather a cheap genius'

In [24]:
# adding special tokens
# Each special token receives the next free ID. setdefault leaves an already
# registered token untouched, so re-running this cell neither renumbers the
# special tokens nor increases `l` a second time.
for special in ('<PAD>', '<UNK>', '<BOS>', '<EOS>'):
    vocab.setdefault(special, len(vocab))
# Keep `l` in sync with the extended vocabulary size.
l = len(vocab)



In [27]:
# writing the tokenizer class so that it knows when and how to use the special tokens
class SimpleTokenizerV2:
    """Word-level tokenizer that maps unknown tokens to '<UNK>' and can add
    '<BOS>' / '<EOS>' markers around the encoded sequence."""

    def __init__(self, vocab):
        """Keep the token -> ID mapping and derive the reverse ID -> token mapping."""
        self.str_to_int = vocab
        self.int_to_str = dict((idx, tok) for tok, idx in vocab.items())

    def encode(self, text, add_bos=False, add_eos=False):
        """Return the token IDs for ``text``.

        Args:
            text: string to tokenize.
            add_bos: when true, put the '<BOS>' ID in front of the result.
            add_eos: when true, put the '<EOS>' ID at the end of the result.
        """
        ids = []
        for raw in re.split(r'([,.:;?_!"()\']|--|\s)', text):
            piece = raw.strip()
            if not piece:
                # Empty and whitespace-only pieces carry no token.
                continue
            # Tokens outside the vocabulary fall back to the '<UNK>' ID.
            ids.append(self.str_to_int.get(piece, self.str_to_int['<UNK>']))
        if add_bos:
            ids.insert(0, self.str_to_int['<BOS>'])
        if add_eos:
            ids.append(self.str_to_int['<EOS>'])
        return ids

    def decode(self, ids):
        """Rebuild a string from token IDs, separated by single spaces."""
        joined = " ".join(self.int_to_str[i] for i in ids)
        # Strip the whitespace that ended up before the listed punctuation.
        return re.sub(r'\s+([,.?!"()\'])', r'\1', joined)
    
# Rebind `tokenizer` to the special-token aware version and encode the story
# wrapped in <BOS> ... <EOS>.
tokenizer = SimpleTokenizerV2(vocab)
encoded = tokenizer.encode(text, add_bos=True, add_eos=True)
print(encoded[:10])
# The last expression is displayed by the notebook: the first 100 IDs decoded
# back to text.
tokenizer.decode(encoded[:100])


[1132, 53, 44, 149, 1003, 57, 38, 818, 115, 256]


'<BOS> I HAD always thought Jack Gisburn rather a cheap genius -- though a good fellow enough -- so it was no great surprise to me to hear that, in the height of his glory, he had dropped his painting, married a rich widow, and established himself in a villa on the Riviera.( Though I rather thought it would have been Rome or Florence.)" The height of his glory" -- that was what the women called it. I can hear Mrs. Gideon Thwing -- his last Chicago sitter'

In [30]:
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."

# The joined sample gets its own name so that the full story held in `text`
# is not overwritten; cells that encode `text` then give the same result when
# they are re-run after this one.
sample_text = " <EOS> ".join((text1, text2))

# Words that are not in the story vocabulary come back as <UNK>.
encoded = tokenizer.encode(sample_text, add_bos=True, add_eos=True)
print(encoded[:10])
tokenizer.decode(encoded)




[1132, 1131, 5, 355, 1126, 628, 975, 10, 1133, 55]


'<BOS> <UNK>, do you like tea? <EOS> In the sunlit terraces of the <UNK>. <EOS>'

### Byte Pair Encoding and Data Sampling
Using the tiktoken library.

In [31]:
# Rebind `tokenizer` to tiktoken's "gpt2" encoding; from here on the cells
# use this object instead of the hand-written tokenizer classes.
tokenizer = tiktoken.get_encoding("gpt2")

In [35]:
# Reload the story. The encoding is passed explicitly because open()
# otherwise falls back to a locale-dependent default.
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    text = f.read()

# Encode the whole story with the current tokenizer and report the token count.
enc_text = tokenizer.encode(text)
print(len(enc_text))
# Work on the sequence that remains after skipping the first 100 token IDs.
enc_sample = enc_text[100:]


5145


In [36]:
context_size = 8
# x holds the first `context_size` IDs; y is the same window moved one
# position to the right, i.e. the next-token targets for x.
x, y = enc_sample[:context_size], enc_sample[1:context_size + 1]
print(x, y, sep="\n")

[5469, 438, 14363, 938, 4842, 1650, 353, 438]
[438, 14363, 938, 4842, 1650, 353, 438, 2934]


In [37]:
# For every prefix length from 1 to context_size, show the decoded prefix and
# the decoded token that follows it.
for i in range(1, context_size + 1):
    context, target = enc_sample[:i], enc_sample[i]
    print(f"{tokenizer.decode(context)} ==> {tokenizer.decode([target])}")

wing ==> --
wing-- ==> his
wing--his ==>  last
wing--his last ==>  Chicago
wing--his last Chicago ==>  sit
wing--his last Chicago sit ==> ter
wing--his last Chicago sitter ==> --
wing--his last Chicago sitter-- ==> de


In [38]:
# Implement a simple PyTorch data loader that iterates over the input dataset and returns inputs together with targets shifted by one token
from torch.utils.data import Dataset, DataLoader

class TextDataset(Dataset):
    """Sliding-window dataset of (input, target) token-ID pairs.

    Every sample is a window of ``context_size`` token IDs; its target is the
    same window moved one position to the right.
    """

    def __init__(self, text, tokenizer, context_size=8,stride=1):
        """Tokenize ``text`` and cut it into overlapping windows.

        Args:
            text: raw text to tokenize.
            tokenizer: object whose ``encode(text, allowed_special=...)``
                returns a list of token IDs.
            context_size: number of token IDs per sample.
            stride: distance between the starts of consecutive windows.
        """
        self.input_ids = []
        self.target_ids = []
        # GPT-2's end-of-text marker is spelled "<|endoftext|>". "<EOS>" is
        # not a special token of that encoding, so allowing it had no effect
        # and a text containing "<|endoftext|>" was still rejected.
        token_ids = tokenizer.encode(text, allowed_special={"<|endoftext|>"})

        # Stop early enough that the shifted target window still fits.
        for i in range(0, len(token_ids)-context_size, stride):
            self.input_ids.append(token_ids[i:i+context_size])
            self.target_ids.append(token_ids[i+1:i+context_size+1])

    def __len__(self):
        """Return the number of windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``idx``-th (input, target) pair as two LongTensors."""
        return torch.LongTensor(self.input_ids[idx]), torch.LongTensor(self.target_ids[idx])
    
def create_data_loader(text, tokenizer, context_size=8, stride=128, batch_size=4, shuffle=True):
    """Build a DataLoader over sliding-window samples of ``text``.

    Args:
        text: raw text to tokenize.
        tokenizer: tokenizer handed to ``TextDataset``; when ``None`` the
            tiktoken "gpt2" encoding is used.
        context_size: number of token IDs per sample.
        stride: distance between the starts of consecutive windows.
        batch_size: number of samples per batch.
        shuffle: whether the loader shuffles the samples (default ``True``).

    Returns:
        A ``DataLoader`` over the ``TextDataset`` built from ``text``.
    """
    # Use the tokenizer the caller passed in; previously the argument was
    # unconditionally replaced and therefore ignored.
    if tokenizer is None:
        tokenizer = tiktoken.get_encoding("gpt2")
    dataset = TextDataset(text, tokenizer, context_size, stride)

    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)




In [40]:
# Build a loader with 8 samples per batch and windows of 8 token IDs whose
# starts are 128 tokens apart.
dataloader = create_data_loader(text, tokenizer, context_size=8, stride=128, batch_size=8)
# Print only the first batch: inputs first, then the targets.
for x, y in dataloader:
    print(x)
    print(y)
    break

tensor([[ 3347, 27846,   503,  2048,  4628, 24882,   379,   262],
        [  257,  2726,  6227,   284,  1833,   683,  1365,    13],
        [  286,   616,  4286,   705,  1014,   510,    26,   475],
        [   11,   508,   550, 18459,  1068,   284,  1577,   257],
        [ 2612,  4369,    11,   523,   326,   612,   550,   587],
        [ 1021,   757,   438, 10919,   257,   410,  5040,   329],
        [  616,   705, 23873,  2350,     6, 14707,   588,   257],
        [  314,   550,  1775,   683,    11,   523,  1690,    11]])
tensor([[27846,   503,  2048,  4628, 24882,   379,   262,  8812],
        [ 2726,  6227,   284,  1833,   683,  1365,    13,   198],
        [  616,  4286,   705,  1014,   510,    26,   475,   314],
        [  508,   550, 18459,  1068,   284,  1577,   257, 23844],
        [ 4369,    11,   523,   326,   612,   550,   587,   645],
        [  757,   438, 10919,   257,   410,  5040,   329,   257],
        [  705, 23873,  2350,     6, 14707,   588,   257,  2156],
        [

### Creating token embeddings and encoding word positions


In [None]:
# using torch embedding
# tiktoken's Encoding object reports its vocabulary size through `n_vocab`;
# it has no `encoder` attribute, so `len(tokenizer.encoder)` cannot run.
vocab_size = tokenizer.n_vocab
embedding_dim = 4
# Seed the generator so the randomly initialised weights are reproducible.
torch.manual_seed(0)
# One trainable vector of size `embedding_dim` per token ID.
embedding = torch.nn.Embedding(vocab_size, embedding_dim)