## Building a basic tokenizer

In [1]:
from importlib.metadata import version

print("torch version: ", version("torch"))
print("tiktoken version: ", version("tiktoken"))

torch version:  2.4.1
tiktoken version:  0.7.0


In [2]:
# Download "The Verdict" by Edith Wharton (the sample text used throughout
# this notebook) once; later runs reuse the local copy.
import os
import urllib.request

# Defined before the existence check so that the check and the download target
# always refer to the same path, and so `file_path` exists on every run
# (previously it was only assigned when a download actually happened).
file_path = "the-verdict.txt"

if not os.path.exists(file_path):
    url = ("https://raw.githubusercontent.com/rasbt/"
           "LLMs-from-scratch/main/ch02/01_main-chapter-code/"
           "the-verdict.txt")
    urllib.request.urlretrieve(url, file_path)

In [3]:
# Load the whole story into memory as one string
with open("the-verdict.txt", "r", encoding="utf-8") as story_file:
    raw_text = story_file.read()

print("Total number of character:", len(raw_text))
# Show the first 99 characters as a quick sanity check of the load
print(raw_text[:99])

Total number of character: 20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


In [4]:
# Tokenization, first attempt: split on whitespace only.
# The capturing group makes re.split keep each whitespace separator in the
# result instead of discarding it.
import re

text = "Hello, world. This, is a test."
result = re.split(pattern=r'(\s)', string=text)

print(result)

['Hello,', ' ', 'world.', ' ', 'This,', ' ', 'is', ' ', 'a', ' ', 'test.']


In [5]:
# Tokenization, second attempt: split on commas and periods as well
result = re.split(r'([,.]|\s)', text)
print(result)

# The split leaves empty strings and bare whitespace behind; strip every piece
# and keep only the ones that are non-empty afterwards
stripped_pieces = (piece.strip() for piece in result)
result = [piece for piece in stripped_pieces if piece]
print(result)

['Hello', ',', '', ' ', 'world', '.', '', ' ', 'This', ',', '', ' ', 'is', ' ', 'a', ' ', 'test', '.', '']
['Hello', ',', 'world', '.', 'This', ',', 'is', 'a', 'test', '.']


In [6]:
# The pattern above only handles commas and periods; widen it to cover the
# common punctuation marks and the double dash

text = "Hello, world. Is this-- a text?"

result = []
for piece in re.split(r'([,.:;?_!"()\']|--|\s)', text):
    piece = piece.strip()
    # skip the empty / whitespace-only pieces produced by the split
    if piece:
        result.append(piece)
print(result)

['Hello', ',', 'world', '.', 'Is', 'this', '--', 'a', 'text', '?']


In [7]:
# The pattern looks general enough now, so run it over the full story

token_pattern = r'([,.:;?_!"()\']|--|\s)'
preprocessed = [
    piece.strip()
    for piece in re.split(token_pattern, raw_text)
    if piece.strip()
]
print(preprocessed[:30])

print("Total tokens: ", len(preprocessed))

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']
Total tokens:  4690


In [8]:
# Collect the distinct tokens and put them in sorted order

all_words = list(set(preprocessed))
all_words.sort()
vocab_size = len(all_words)

print(vocab_size)

# number of unique tokens - 1130
# each token has one fixed position in this sorted list, so every time it is
# looked up it will be mapped to the same integer on the input side

1130


In [9]:
# Initialize the vocabulary: map every token to its index in the sorted list
vocab = dict(zip(all_words, range(len(all_words))))

In [10]:
# Print the vocabulary entries up to and including id 50
for token, token_id in vocab.items():
    print(token, token_id)
    if token_id >= 50:
        break

! 0
" 1
' 2
( 3
) 4
, 5
-- 6
. 7
: 8
; 9
? 10
A 11
Ah 12
Among 13
And 14
Are 15
Arrt 16
As 17
At 18
Be 19
Begin 20
Burlington 21
But 22
By 23
Carlo 24
Chicago 25
Claude 26
Come 27
Croft 28
Destroyed 29
Devonshire 30
Don 31
Dubarry 32
Emperors 33
Florence 34
For 35
Gallery 36
Gideon 37
Gisburn 38
Gisburns 39
Grafton 40
Greek 41
Grindle 42
Grindles 43
HAD 44
Had 45
Hang 46
Has 47
He 48
Her 49
Hermia 50


In [11]:
# Let's put it all together into a class


class SimpleTokenizerV1():
    """Word-level tokenizer backed by a fixed token -> id vocabulary.

    Text is split on whitespace, common punctuation marks and "--". Every
    resulting token must already be a key of the vocabulary.
    """

    def __init__(self, vocab):
        """Keep `vocab` (token -> id) and derive the reverse id -> token map."""
        self.str_to_int = vocab
        self.int_to_str = dict((idx, token) for token, idx in vocab.items())

    def encode(self, text):
        """Split `text` into tokens and return the list of their ids.

        Raises:
            KeyError: if a token is not present in the vocabulary.
        """
        ids = []
        for piece in re.split(r'([,.:;?_!"()\']|--|\s)', text):
            token = piece.strip()
            # the split yields empty / whitespace-only pieces; ignore them
            if token:
                ids.append(self.str_to_int[token])
        return ids

    def decode(self, ids):
        """Map `ids` back to tokens and join them into one string.

        Tokens are joined with single spaces; the space in front of
        , . ? ! " ( ) ' is then removed again.
        """
        tokens = (self.int_to_str[idx] for idx in ids)
        joined = " ".join(tokens)
        return re.sub(r'\s+([,.?!"()\'])', r'\1', joined)

In [12]:
# let's test the tokenizer we've defined above

tokenizer = SimpleTokenizerV1(vocab)

# A two-line passage; the line break and indentation inside the literal are
# whitespace, which encode() splits on and then discards.
text = """"It's the last he painted, you know," 
           Mrs. Gisburn said with pardonable pride."""
# encode() succeeds here, so every token of the passage is in the vocabulary
ids = tokenizer.encode(text)
print(ids)

[1, 56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]


In [13]:
# decode() joins the tokens with single spaces and only removes the space in
# front of punctuation, so the result is close to, but not exactly, the input
tokenizer.decode(ids)

'" It\' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.'

In [14]:
# Round trip: encode, then decode. The words come back unchanged, but the
# spacing is not identical to the input (the output has `" It' s`, and the
# line break of the original is gone)

tokenizer.decode(tokenizer.encode(text))

'" It\' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.'

## Note about special tokens and GPT2 Tokenization

- Many LLMs use additional special tokens such as `<bos>`, `<eos>` and `<pad>` to give the model extra context
- GPT-2 uses `<|endoftext|>` as its only special token; it serves both as the end-of-sequence marker and as the padding token.
- Usually `<|endoftext|>` is used to concatenate two unrelated pieces of text
- Also, GPT-2 uses byte pair encoding (BPE), which breaks text down to the subword level so that nothing is out of vocabulary

In [15]:
# SimpleTokenizerV1 has no handling for out-of-vocabulary text: a word that
# never occurs in the story has no entry in `vocab`, so encode() raises.
tokenizer = SimpleTokenizerV1(vocab)
text = "Hello, do you like tea. Is this-- a test?"

# The KeyError is the point of this cell. Catch and display it so that the
# notebook still runs top to bottom with "Restart & Run All".
try:
    tokenizer.encode(text)
except KeyError as missing_token:
    print("KeyError:", missing_token)

KeyError: 'Hello'

In [16]:
# Add two special tokens to handle these situations: an end-of-text marker
# and a placeholder for unknown words

all_tokens = sorted(set(preprocessed)) + ["<|endoftext|>", "<|unk|>"]

# rebuild the vocabulary; the special tokens are last, so they get the two
# highest ids
vocab = dict(zip(all_tokens, range(len(all_tokens))))

In [17]:
# size of the extended vocabulary
len(vocab)

1132

In [18]:
# Show the last five entries to see which ids the special tokens received
for token, token_id in list(vocab.items())[-5:]:
    print(token, token_id)

younger 1127
your 1128
yourself 1129
<|endoftext|> 1130
<|unk|> 1131


In [19]:
# Update the tokenizer class created earlier with support for
# handling tokens out of the vocabulary

class SimpleTokenizerV2():
    """Word-level tokenizer that maps unknown tokens to "<|unk|>".

    Splitting and decoding work as in SimpleTokenizerV1; the difference is
    that encode() substitutes "<|unk|>" for any token missing from the
    vocabulary instead of failing on it.
    """

    def __init__(self, vocab):
        """Keep `vocab` (token -> id) and derive the reverse id -> token map."""
        self.str_to_int = vocab
        self.int_to_str = dict((idx, token) for token, idx in vocab.items())

    def encode(self, text):
        """Split `text` into tokens and return the list of their ids.

        Raises:
            KeyError: if an unknown token is met and the vocabulary has no
                "<|unk|>" entry to fall back on.
        """
        ids = []
        for piece in re.split(r'([,.:;?_!"()\']|--|\s)', text):
            token = piece.strip()
            # the split yields empty / whitespace-only pieces; ignore them
            if not token:
                continue
            if token not in self.str_to_int:
                token = "<|unk|>"
            ids.append(self.str_to_int[token])
        return ids

    def decode(self, ids):
        """Map `ids` back to tokens and join them into one string.

        Tokens are joined with single spaces; the space in front of
        , . ? ! " ( ) ' is then removed again.
        """
        tokens = (self.int_to_str[idx] for idx in ids)
        joined = " ".join(tokens)
        return re.sub(r'\s+([,.?!"()\'])', r'\1', joined)

In [20]:
# Try the new tokenizer on two unrelated snippets glued together with the
# end-of-text marker
tokenizer = SimpleTokenizerV2(vocab)

text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."

text = " <|endoftext|> ".join([text1, text2])
print(text)

Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.


In [21]:
tokenizer.encode(text)

# "Hello" and "palace" are not in the vocabulary, so both are encoded as
# <|unk|> - 1131
# <|endoftext|> is in the vocabulary and is encoded as its own id - 1130

[1131, 5, 355, 1126, 628, 975, 10, 1130, 55, 988, 956, 984, 722, 988, 1131, 7]

In [22]:
# The unknown words cannot be recovered: they decode to the <|unk|> placeholder
tokenizer.decode(tokenizer.encode(text))

'<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.'

## Playing around with BytePairEncoding

- GPT-2 uses BPE as its tokenizer
- BPE allows the model to break words that are not in its predefined vocabulary down into smaller units - subwords or individual characters - enabling it to handle out-of-vocabulary words


In [23]:
import importlib
import tiktoken

print("tiktoken version: ", version("tiktoken"))

tiktoken version:  0.7.0


In [24]:
# Switch to the GPT-2 byte-pair encoding provided by tiktoken.
# Note: this rebinds `tokenizer`, which until now held a SimpleTokenizerV2.

tokenizer = tiktoken.get_encoding("gpt2")

In [25]:
# NOTE: the two adjacent string literals are concatenated without a space in
# between, so the text actually reads "...terracesof someunknownPlace."
# (visible in the decoded output further below).
text = (
    "Hello, do you like tea? <|endoftext|> In the sunlit terraces"
     "of someunknownPlace."
)

integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})

print(integers)

# `allowed_special` lets "<|endoftext|>" in the text be encoded as its single
# special-token id (50256 in the output).
# Without it, tiktoken by default raises an error when the text contains a
# special token. Per the tiktoken docs, it is only split into ordinary subword
# tokens if the check is switched off with disallowed_special=().

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 1659, 617, 34680, 27271, 13]


In [26]:
# Turn the ids back into text
strings = tokenizer.decode(integers)
print(strings)

# we've got the same text back as the original, including the made-up word
# "someunknownPlace"

Hello, do you like tea? <|endoftext|> In the sunlit terracesof someunknownPlace.


## Data Sampling with a Sliding Window

- LLMs are trained to predict one token at a time, so we prepare the training data accordingly: for every position, the next token in the sequence is the target to predict

In [27]:
# Tokenize the complete story with the BPE tokenizer
with open("the-verdict.txt", "r", encoding="utf-8") as story_file:
    raw_text = story_file.read()

enc_text = tokenizer.encode(raw_text)
# number of BPE tokens in the story
print(len(enc_text))

5145


In [33]:
# Work on the token sequence from position 50 onwards
enc_sample = enc_text[50:]

context_size = 4
# x is a window of `context_size` tokens; y is the same window shifted one
# position to the right, i.e. the next token for every position of x
x, y = enc_sample[:context_size], enc_sample[1:context_size+1]

print("x: ", x)
print("y:     ", y)

x:  [290, 4920, 2241, 287]
y:      [4920, 2241, 287, 257]


In [34]:
# Spell out the individual next-token predictions contained in one window:
# first as token ids, then as decoded text

for split_at in range(1, context_size+1):
    context, desired = enc_sample[:split_at], enc_sample[split_at]

    print(context, '---->', desired)
    print(tokenizer.decode(context), '---->', tokenizer.decode([desired]))
    print()

[290] ----> 4920
 and ---->  established

[290, 4920] ----> 2241
 and established ---->  himself

[290, 4920, 2241] ----> 287
 and established himself ---->  in

[290, 4920, 2241, 287] ----> 257
 and established himself in ---->  a



## Creating a simple dataloader to create the dataset above

In [35]:
import torch

print("torch version: ", version("torch"))

torch version:  2.4.1


In [36]:
# the dataloader yields samples in the format we intend to train on:
# (input, target) pairs

# each input is a window of token ids (the context)
# its target is the same window shifted one token to the right

from torch.utils.data import Dataset, DataLoader


class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-id tensors.

    The text is tokenized once. A window of `max_length` tokens is then moved
    over the token sequence in steps of `stride`; each window is an input and
    the window shifted by one token is its target.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        """Tokenize `txt` and pre-compute every input/target window.

        Args:
            txt: the raw text to tokenize.
            tokenizer: object whose `encode(txt, allowed_special=...)` returns
                a list of token ids.
            max_length: number of tokens per window.
            stride: distance between the starts of consecutive windows.
        """
        # tokenize the entire text
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # Stop early enough that the shifted target window still fits
        window_starts = range(0, len(token_ids) - max_length, stride)

        self.input_ids = [
            torch.tensor(token_ids[start:start + max_length])
            for start in window_starts
        ]
        self.target_ids = [
            torch.tensor(token_ids[start + 1:start + max_length + 1])
            for start in window_starts
        ]

    def __len__(self):
        """Number of windows in the dataset."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the (input, target) tensor pair of window `idx`."""
        return self.input_ids[idx], self.target_ids[idx]

In [38]:
def create_dataloader_v1(txt, batch_size=4, max_length=256,
                        stride=128, shuffle=True, drop_last=True,
                        num_workers=0):
    """Build a DataLoader of sliding-window (input, target) batches for `txt`.

    The text is tokenized with tiktoken's "gpt2" encoding and wrapped in a
    GPTDatasetV1 using `max_length` and `stride`. The remaining arguments are
    passed to the DataLoader unchanged.
    """
    gpt2_tokenizer = tiktoken.get_encoding("gpt2")

    return DataLoader(
        GPTDatasetV1(txt, gpt2_tokenizer, max_length, stride),
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

In [42]:
# batch_size=1 and shuffle=False make it easy to follow the windows one by
# one; with max_length=4 and stride=3 consecutive windows share one token
dataloader = create_dataloader_v1(
    raw_text, batch_size=1, max_length=4, stride=3, shuffle=False
)

data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

[tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]


In [43]:
# Advance the same iterator: the next window starts 3 tokens (the stride)
# after the first one
second_batch = next(data_iter)
print(second_batch)

[tensor([[1464, 1807, 3619,  402]]), tensor([[1807, 3619,  402,  271]])]


In [44]:
# Print the next ten batches to watch the window slide through the text
for _ in range(10):
    print(next(data_iter))
    print()

[tensor([[  402,   271, 10899,  2138]]), tensor([[  271, 10899,  2138,   257]])]

[tensor([[ 2138,   257,  7026, 15632]]), tensor([[  257,  7026, 15632,   438]])]

[tensor([[15632,   438,  2016,   257]]), tensor([[ 438, 2016,  257,  922]])]

[tensor([[ 257,  922, 5891, 1576]]), tensor([[ 922, 5891, 1576,  438]])]

[tensor([[1576,  438,  568,  340]]), tensor([[438, 568, 340, 373]])]

[tensor([[ 340,  373,  645, 1049]]), tensor([[ 373,  645, 1049, 5975]])]

[tensor([[1049, 5975,  284,  502]]), tensor([[5975,  284,  502,  284]])]

[tensor([[ 502,  284, 3285,  326]]), tensor([[ 284, 3285,  326,   11]])]

[tensor([[326,  11, 287, 262]]), tensor([[  11,  287,  262, 6001]])]

[tensor([[ 262, 6001,  286,  465]]), tensor([[ 6001,   286,   465, 13476]])]



In [46]:
# With stride == max_length the windows do not overlap; batch_size=8 stacks
# eight of them into one tensor
dataloader = create_dataloader_v1(
    raw_text,
    batch_size=8,
    max_length=4,
    stride=4,
    shuffle=False,
)

data_iter = iter(dataloader)
inputs, target = next(data_iter)

print("Inputs: \n", inputs)
print("Targets: \n", target)

Inputs: 
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])
Targets: 
 tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])


## Creating token embeddings

In [47]:
# Four example token ids; the same ids are looked up one by one further below
input_ids = torch.tensor([2, 3, 5, 1])

In [48]:
# A deliberately tiny embedding table: 6 possible token ids, 3 dimensions each
vocab_size, output_dim = 6, 3

# Fix the seed so that the randomly initialized weights are reproducible
torch.manual_seed(123)
embedding_layer = torch.nn.Embedding(
    num_embeddings=vocab_size,
    embedding_dim=output_dim,
)

In [49]:
print(embedding_layer.weight)

# the embedding layer is just a lookup table - it converts each token id into
# a specific vector of continuous numbers (one row of this weight matrix)

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)


In [53]:
# Get the embedding for each of the ids 2, 3, 5 and 1

for token_id in (2, 3, 5, 1):
    print(f"{token_id} : ", embedding_layer(torch.tensor(token_id)))

# the embedding layer is basically a lookup: the vector returned for id i is
# row i of the weight matrix printed above
# this gives the same result as one-hot encoding the id and multiplying that
# one-hot vector with the weight matrix, without ever building the one-hot
# vector

2 :  tensor([ 1.2753, -0.2010, -0.1606], grad_fn=<EmbeddingBackward0>)
3 :  tensor([-0.4015,  0.9666, -1.1481], grad_fn=<EmbeddingBackward0>)
5 :  tensor([-2.8400, -0.7849, -1.4096], grad_fn=<EmbeddingBackward0>)
1 :  tensor([0.9178, 1.5810, 1.3010], grad_fn=<EmbeddingBackward0>)


### Encoding token positions

- embedding layer converts ids into vector representation irrespective of where they are located.
- positional embedding is combined with the token embedding vector to form the input embedding to the LLM

In [54]:
# Size of the GPT-2 BPE vocabulary: ids 0..50256, where 50256 is
# <|endoftext|> (the id it received in the tiktoken encoding output earlier).
# The previous value 505257 was a typo that made the table ~10x too large.
vocab_size = 50257
# dimensionality of each token vector
output_dim = 256

# one trainable `output_dim`-sized row per token id
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [55]:
# One batch of 8 non-overlapping windows (stride == max_length) to embed
max_length = 4
dataloader = create_dataloader_v1(
    raw_text,
    batch_size=8,
    max_length=max_length,
    stride=max_length,
    shuffle=False,
)
data_iter = iter(dataloader)
inputs, target = next(data_iter)

In [56]:
# The batch holds 8 windows of 4 token ids each
print("Token Ids \n", inputs)
print("\nInputs shape: \n", inputs.shape)

Token Ids 
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

Inputs shape: 
 torch.Size([8, 4])


In [57]:
# let's create token embeddings: every token id in the batch is replaced by
# its 256-dimensional vector, so (8, 4) becomes (8, 4, 256)
token_embeddings = token_embedding_layer(inputs)
print(token_embeddings.shape)

torch.Size([8, 4, 256])


In [58]:
# GPT-2 uses absolute position embeddings, so we create a second embedding
# layer with one row per position in the context window

context_length = max_length
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)


In [59]:
# Look up one embedding vector for every position 0 .. context_length - 1
position_ids = torch.arange(context_length)
pos_embeddings = pos_embedding_layer(position_ids)
print(pos_embeddings.shape)

torch.Size([4, 256])


In [60]:
# pos_embeddings has shape (4, 256) and token_embeddings (8, 4, 256); the
# addition broadcasts the same position vectors across all 8 windows
input_embeddings = token_embeddings + pos_embeddings
print(input_embeddings.shape)

torch.Size([8, 4, 256])
