# Creating Tokens

In [1]:
with open("./the-verdict.txt", "r", encoding="UTF-8") as f:
    raw_text = f.read()

In [2]:
len(raw_text)

20479

In [3]:
raw_text[:99]

'I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no '

### Splitting using re

In [4]:
import re
text = "Hello, world. This, is a test."
result = re.split(r'(\s)', text)
print(result)

['Hello,', ' ', 'world.', ' ', 'This,', ' ', 'is', ' ', 'a', ' ', 'test.']


Tokens still have punctuation attached. \s separates tokens at each whitespace character encountered. Moreover, each space is also counted as a token.

<strong>Note:</strong> We will not lowercase the text because the capitalization helps LLMs distinguish between proper nouns and common nouns, understand sentence structure, and learn to generate text with proper capitalization.

In [5]:
result = re.split(r'([,.]|\s)', text)
result = [item for item in result if item.strip()] # Removing space tokens
result

['Hello', ',', 'world', '.', 'This', ',', 'is', 'a', 'test', '.']

<strong>Note:</strong> Removing whitespaces reduces memory and computation. However, when training on data sensitive to space such as Python code, we will keep the spaces.

In [6]:
# Tokenize the full story: split on punctuation, double dashes and whitespace,
# then discard the empty / whitespace-only pieces that the split leaves behind
raw_pieces = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
preprocessed = [piece.strip() for piece in raw_pieces if piece.strip()]
len(preprocessed)

4690

In [8]:
print(preprocessed[:30])

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']


### Creating Token IDs

In [9]:
# Creating list of all unique tokens and sorting them alphabetically to determine vocab size
all_words = sorted(set(preprocessed))
vocab_size = len(all_words)
print(vocab_size)

1130


In [10]:
# Creating vocab
vocab = {token:integer for integer, token in enumerate(all_words)}
for i, item in enumerate(vocab.items()):
    print(item)
    if i>= 50:
        break

('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)
('.', 7)
(':', 8)
(';', 9)
('?', 10)
('A', 11)
('Ah', 12)
('Among', 13)
('And', 14)
('Are', 15)
('Arrt', 16)
('As', 17)
('At', 18)
('Be', 19)
('Begin', 20)
('Burlington', 21)
('But', 22)
('By', 23)
('Carlo', 24)
('Chicago', 25)
('Claude', 26)
('Come', 27)
('Croft', 28)
('Destroyed', 29)
('Devonshire', 30)
('Don', 31)
('Dubarry', 32)
('Emperors', 33)
('Florence', 34)
('For', 35)
('Gallery', 36)
('Gideon', 37)
('Gisburn', 38)
('Gisburns', 39)
('Grafton', 40)
('Greek', 41)
('Grindle', 42)
('Grindles', 43)
('HAD', 44)
('Had', 45)
('Hang', 46)
('Has', 47)
('He', 48)
('Her', 49)
('Hermia', 50)


In [17]:
class SimpleTokenizerV1:
    """Word-level tokenizer backed by a fixed vocabulary.

    Text is split on whitespace, on "--" and on the punctuation characters
    , . : ; ? _ ! " ( ) ' -- the same pattern that was used to build the vocabulary --
    and every resulting token is looked up in ``vocab``.

    Args:
        vocab: dict mapping token string -> integer token ID.
    """

    def __init__(self, vocab):
        # Token string -> token ID; kept on the instance for use in encode()
        self.str_to_int = vocab

        # Inverse vocabulary (token ID -> token string) for use in decode()
        self.int_to_str = {i:s for s, i in vocab.items()}

    def encode(self, text):
        """Convert ``text`` into a list of token IDs.

        Raises:
            KeyError: if a token is not part of the vocabulary.
        """
        # ':' and ';' must be split off here as well, otherwise "word;" stays a single
        # token that cannot be found in a vocabulary built with the full pattern
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        # Drop empty strings and whitespace-only pieces produced by the split
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids

    def decode(self, ids):
        """Convert a list of token IDs back into a single string."""
        text = " ".join([self.int_to_str[i] for i in ids])

        # Removes the space that join() inserted before the specified punctuation
        text = re.sub(r'\s+([,.:;?!"()\'])', r'\1', text)
        return text

In [18]:
tokenizer = SimpleTokenizerV1(vocab)
text = """"It's the last he painted, you know," 
       Mrs. Gisburn said with pardonable pride."""
ids = tokenizer.encode(text)
print(ids)

[1, 56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]


In [19]:
print(tokenizer.decode(ids))

" It' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.


In [20]:
text = "Hello, do you like tea?"
print(tokenizer.encode(text))

KeyError: 'Hello'

<strong>Note:</strong> This problem occurred because this word is not part of the vocab we created. To handle unknown words, we will add 2 new tokens - <|unk|> and <|endoftext|>.

<|unk|>: we will modify the tokenizer to use this token whenever it encounters an unknown word.<br>
<|endoftext|>: To separate two unrelated text sources.

In [21]:
# Append the two special tokens after the sorted base vocabulary so they get the highest IDs
all_tokens = sorted(set(preprocessed)) + ["<|endoftext|>", "<|unk|>"]
vocab = {token: token_id for token_id, token in enumerate(all_tokens)}

print(len(vocab))

1132


In [22]:
for i, item in enumerate(list(vocab.items())[-5:]):
    print(item)

('younger', 1127)
('your', 1128)
('yourself', 1129)
('<|endoftext|>', 1130)
('<|unk|>', 1131)


In [28]:
class SimpleTokenizerV2:
    """Word-level tokenizer that maps out-of-vocabulary tokens to ``<|unk|>``.

    Text is split on whitespace, on "--" and on the punctuation characters
    , . : ; ? _ ! " ( ) ' -- the same pattern that was used to build the vocabulary.

    Args:
        vocab: dict mapping token string -> integer token ID. It must contain
            the ``"<|unk|>"`` entry, since unknown tokens are looked up under that key.
    """

    def __init__(self, vocab):
        # Token string -> token ID, used by encode()
        self.str_to_int = vocab
        # Token ID -> token string, used by decode()
        self.int_to_str = {i:s for s, i in vocab.items()}

    def encode(self, text):
        """Convert ``text`` into a list of token IDs, using ``<|unk|>`` for unknown tokens."""
        # ':' and ';' are split off too; otherwise "word;" would stay one token and
        # be replaced by <|unk|> even though "word" and ";" are both in the vocabulary
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        # Drop empty strings and whitespace-only pieces produced by the split
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        # Replaces unknown words by <|unk|> tokens
        preprocessed = [item if item in self.str_to_int else "<|unk|>" for item in preprocessed]
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids

    def decode(self, ids):
        """Convert a list of token IDs back into a single string."""
        text = " ".join([self.int_to_str[i] for i in ids])
        # Removes the space that join() inserted before the specified punctuation
        text = re.sub(r'\s+([,.:;?!"()\'])', r'\1', text)
        return text

In [24]:
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."
text = " <|endoftext|> ".join((text1, text2))
print(text)

Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.


In [29]:
tokenizer = SimpleTokenizerV2(vocab)
print(tokenizer.encode(text))

[1131, 5, 355, 1126, 628, 975, 10, 1130, 55, 988, 956, 984, 722, 988, 1131, 7]


In [30]:
print(tokenizer.decode(tokenizer.encode(text)))

<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.


### Byte Pair Encoding

In [31]:
!pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.11.0-cp312-cp312-win_amd64.whl.metadata (6.9 kB)
Downloading tiktoken-0.11.0-cp312-cp312-win_amd64.whl (884 kB)
   ---------------------------------------- 0.0/884.3 kB ? eta -:--:--
   --------------------------------------- 884.3/884.3 kB 10.0 MB/s eta 0:00:00
Installing collected packages: tiktoken
Successfully installed tiktoken-0.11.0


In [32]:
import tiktoken

In [34]:
tokenizer = tiktoken.get_encoding("gpt2")

In [35]:
text = (
    "Hello, do you like tea? <|endoftext|> In the sunlit terraces"
     "of someunknownPlace.")
integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print(integers)

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 1659, 617, 34680, 27271, 13]


In [36]:
strings = tokenizer.decode(integers)
print(strings)

Hello, do you like tea? <|endoftext|> In the sunlit terracesof someunknownPlace.


The GPT-2 BPE tokenizer loaded above has a total vocabulary size of 50,257, with <|endoftext|> being assigned the largest token ID (50256).

In [38]:
# Implementing on our dataset
enc_text = tokenizer.encode(raw_text)
len(enc_text)

5145

## Creating input-target pairs

In [39]:
enc_sample = enc_text[50:]

<strong>To create input-target pairs, we create 2 variables, x and y, where x contains the input and y the target, which is input shifted by 1.</strong>

In [41]:
# A context size of 4 means that the model is trained to look at a sequence of 4 tokens
# to predict the next token in the sequence.
# The input x is the first 4 tokens [1, 2, 3, 4] and the target y is the same window shifted by one: [2, 3, 4, 5]
context_size = 4
x = enc_sample[:context_size]
y = enc_sample[1:context_size+1]
print(f"x: {x}")
print(f"y:      {y}")

x: [290, 4920, 2241, 287]
y:      [4920, 2241, 287, 257]


In [42]:
for i in range(1, context_size+1):
    context = enc_sample[:i]
    desired = enc_sample[i]
    print(context, "---->", desired)

[290] ----> 4920
[290, 4920] ----> 2241
[290, 4920, 2241] ----> 287
[290, 4920, 2241, 287] ----> 257


Everything to the left of the arrow (---->) refers to the input an LLM would receive, and the token ID on the right side of the arrow represents the target token ID that the LLM is supposed to predict.

In [44]:
for i in range(1, context_size+1):
    context = enc_sample[:i]
    desired = enc_sample[i]
    print(tokenizer.decode(context), "---->", tokenizer.decode([desired]))

 and ---->  established
 and established ---->  himself
 and established himself ---->  in
 and established himself in ---->  a


<strong>Note:</strong> Here each input-target pair corresponds to 1 prediction task. Since our context window is 4 here, we have 4 prediction tasks.

To feed these input-target pairs to an LLM, we need to create a data loader that can return the input-target pairs as PyTorch tensors.

Note: BPE encoder performed the tokenization and conversion to token id in a single step.

In [45]:
import torch
from torch.utils.data import DataLoader, Dataset 

In [81]:
class GPTDatasetV1(Dataset):
    """Dataset of (input, target) token-ID windows cut from one tokenized text.

    The text is encoded once with ``tokenizer.encode``. A window of ``max_length``
    tokens is then taken every ``stride`` tokens; the target of each window is the
    same window shifted one token to the right.
    """

    def __init__(self, text, tokenizer, max_length, stride):
        encoded = tokenizer.encode(text)

        # Start offsets of the sliding window. Stopping at len - max_length guarantees
        # that the shifted target window still fits inside the encoded text.
        starts = range(0, len(encoded) - max_length, stride)

        self.input_ids = [torch.tensor(encoded[s:s + max_length]) for s in starts]
        self.target_ids = [torch.tensor(encoded[s + 1:s + max_length + 1]) for s in starts]

    def __len__(self):
        # Number of windows that were cut from the text
        return len(self.input_ids)

    def __getitem__(self, idx):
        # One sample = (input window, target window)
        return self.input_ids[idx], self.target_ids[idx]

In [82]:
def create_dataloader_v1(text, batch_size=4, max_length=256, stride=128,
                         shuffle=True, drop_last=True, num_workers=0):
    """Build a DataLoader over sliding-window (input, target) pairs of ``text``.

    The text is tokenized with tiktoken's "gpt2" encoding and chunked by
    GPTDatasetV1 using ``max_length`` and ``stride``; the remaining arguments
    are forwarded to ``torch.utils.data.DataLoader``.
    """
    gpt2_tokenizer = tiktoken.get_encoding("gpt2")
    windows = GPTDatasetV1(text, gpt2_tokenizer, max_length, stride)

    loader_options = dict(
        batch_size=batch_size,
        shuffle=shuffle,
        # drop_last=True discards a final batch that is shorter than batch_size,
        # which is meant to prevent loss spikes during training
        drop_last=drop_last,
        num_workers=num_workers,
    )

    # The DataLoader fetches samples through the dataset's __getitem__ and stacks
    # them into batches -- in our case batches of input-target pairs
    return DataLoader(windows, **loader_options)

In [51]:
dataloader = create_dataloader_v1(raw_text, batch_size=1, max_length=4, stride=1, shuffle=False)
data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

[tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]


In [52]:
second_batch = next(data_iter)
print(second_batch)

[tensor([[ 367, 2885, 1464, 1807]]), tensor([[2885, 1464, 1807, 3619]])]


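<strong>Note:</strong> With a stride of 1, as above, consecutive samples overlap in all but one token. If we increase the stride to 4 (equal to the context length), we utilize the data set fully (we don't skip a single word) but also avoid any overlap between the samples, since more overlap can lead to overfitting.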

### Creating Token Embeddings

In [53]:
# Example
input_ids = torch.tensor([2, 3, 5, 1])

In [56]:
vocab_size = 6
output_dim = 3

# Fixed seed so the randomly initialized weights are reproducible
torch.manual_seed(123)

# Embedding is a simple lookup table that stores embeddings of a fixed dictionary and size
# Used to store word embeddings and retrieve them using indices.
# Input is a list of indices and output is the corresponding word embeddings.
embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [57]:
embedding_layer.weight

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)

These are random values which will be later optimized.<br>
Note that the weight matrix has 6 rows and 3 columns.

In [58]:
embedding_layer(torch.tensor([3]))

tensor([[-0.4015,  0.9666, -1.1481]], grad_fn=<EmbeddingBackward0>)

It is the same as the 4th row of the weight matrix (index 3, since indices start at 0).

### Positional Embedding

In [65]:
vocab_size = 50257
output_dim = 256

token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [84]:
max_length = 4
dataloader = create_dataloader_v1(
    raw_text, batch_size=8, max_length=max_length,
   stride=max_length, shuffle=False
)
data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("Token IDs:\n", inputs)
print("\nInputs shape:\n", inputs.shape)

Token IDs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

Inputs shape:
 torch.Size([8, 4])


In [85]:
token_embeddings = token_embedding_layer(inputs)
print(token_embeddings.shape)

torch.Size([8, 4, 256])


For each token ID, we have created an embedding of size 256. Hence the output has shape 8 x 4 x 256 (batch size x context length x embedding dimension).

In [86]:
context_length = max_length
# One embedding vector per position 0 .. context_length-1. The width must be output_dim
# (the same as the token embeddings) so that both can be added element-wise.
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)
pos_embeddings = pos_embedding_layer(torch.arange(context_length))
pos_embeddings.shape

torch.Size([4, 256])

The context_length is a variable that represents the supported input size of the LLM. Here, we choose it similar to the maximum length of the input text. In practice, input text can be longer than the supported context length, in which case we have to truncate the text.

In [87]:
input_embeddings = token_embeddings + pos_embeddings
input_embeddings.shape

torch.Size([8, 4, 256])