In [1]:
## Download text file

import urllib.request
url = ("https://raw.githubusercontent.com/rasbt/"
        "LLMs-from-scratch/main/ch02/01_main-chapter-code/"
        "the-verdict.txt")

file_path = "the-verdict-txt"
urllib.request.urlretrieve(url, file_path)

('the-verdict-txt', <http.client.HTTPMessage at 0x1fce28c70a0>)

In [2]:
# Read some lines in short story as a sample
with open("the-verdict-txt", "r", encoding="utf-8") as f:
    raw_text = f.read()
    
print("Total number of characters: ", len(raw_text))
print(raw_text[:99]) # gives first 100 characters of the file

Total number of characters:  20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


Our task is to tokenize these 20,479 character into individual words and special characters so that we can then turn into embeddings for LLM training.

We can use re.split for understandig purpose. We can see how this step gives list of individual words, whitespaces and punctuation characters.

In [3]:
import re
text = "Hello, Aditya. This is a test."
result = re.split(r'(\s)', text)
print(result)

['Hello,', ' ', 'Aditya.', ' ', 'This', ' ', 'is', ' ', 'a', ' ', 'test.']


but still we have white space and that is of no use, also we will refrain from changing all characters small,as it can affect LLM training

In [4]:
text = "Hello, Aditya, Is this-- a test?"
result = re.split(r'([,.:;?_!"()\']|--|\s)', text)
result = [item.strip() for item in result if item.strip()] # to remove whitespace
print(result)

['Hello', ',', 'Aditya', ',', 'Is', 'this', '--', 'a', 'test', '?']


Now we have our basic tokenizer scheme, let's apply it to our short story

In [5]:
preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
preprocessed = [item.strip() for item in preprocessed if item.strip()] # to remove whitespace
print(len(preprocessed))

4690


outputs 4690, which is number of tokens in this text without whitespaces. Let's see first 30 tokens-

In [6]:
print(preprocessed[:30])

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']


##### Now next task-
##### Converting tokens into token IDS

convert tokens to an integer representation to produce token IDs, this is an intermediate step before embedding


In [7]:
all_words = sorted(set(preprocessed)) # list of all unique tokens and sort them alphabatically
vocab_size = len(all_words)
print(vocab_size)

1130


Our vocabulary size is 1130, now create the vocabulary and print fist 51 entries to see.
Below dictionary contains individual tokens associated with unique integer label.

In [8]:
vocab = {token:integer for integer, token in enumerate(all_words)}
for i, item in enumerate(vocab.items()):
    print(item)
    if i >= 50:
        break

('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)
('.', 7)
(':', 8)
(';', 9)
('?', 10)
('A', 11)
('Ah', 12)
('Among', 13)
('And', 14)
('Are', 15)
('Arrt', 16)
('As', 17)
('At', 18)
('Be', 19)
('Begin', 20)
('Burlington', 21)
('But', 22)
('By', 23)
('Carlo', 24)
('Chicago', 25)
('Claude', 26)
('Come', 27)
('Croft', 28)
('Destroyed', 29)
('Devonshire', 30)
('Don', 31)
('Dubarry', 32)
('Emperors', 33)
('Florence', 34)
('For', 35)
('Gallery', 36)
('Gideon', 37)
('Gisburn', 38)
('Gisburns', 39)
('Grafton', 40)
('Greek', 41)
('Grindle', 42)
('Grindles', 43)
('HAD', 44)
('Had', 45)
('Hang', 46)
('Has', 47)
('He', 48)
('Her', 49)
('Hermia', 50)


Now, we have to apply above created vocabulary to convert new text into tokens Ids.

Let's create tokenizer class with `encode` method that split text into tokens and carries out string-to-integrer mapping to produce token Ids via vocabulary. Alos `decode` method that carries out reverse integer-to-string mapping, and convert the tokens IDs back into text.

In [9]:
## Implementing a Simple text tokenizer
class SimpleTokenizerV1:
    def __init__(self, vocab):
        """
        Stores the vocabulary as class attribute for access in the encode and decode method
        """
        self.str_to_int = vocab
        self.int_to_str = {i:s for s, i in vocab.items()} # inverse vocabualry that maps Ids to text
        
    def encode(self, text):
        preprocessed = re.split(r'([,.?_!"()\']|--|\s)', text)
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        ids = [self.str_to_int[s] for s in preprocessed] 
        return ids
    
    def decode(self, ids):
        text = " ".join([self.int_to_str[i] for i in ids]) # convert IDs into text
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text) # remove spaces before specified punctuation
        return text 

From this class, we can initiate new tokenizer objects via existing vocabulary. Let' see example from our short story and take some lines.

In [10]:
tokenizer = SimpleTokenizerV1(vocab)
text = """"It's the last he painted, you know,"
       Mrs. Gisburn said with pardonable pride."""
ids = tokenizer.encode(text)
print(ids)



[1, 56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]


In [11]:
# back to text
print(tokenizer.decode(ids))

" It' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.


In [12]:
"""
# more example
text = "The brown dog playfully chased the swift fox"
print(tokenizer.encode(text))
"""
# it will thow error becuase our short story does not have word dog, hello etc


'\n# more example\ntext = "The brown dog playfully chased the swift fox"\nprint(tokenizer.encode(text))\n'

#### Adding special context tokens

Now, we need to modify the tokenizer to handle unknown words. And for this we use special tokens. These special tokens can include markers for unknown words and document boundaries, fro ex - we can support two new tokens <|unk|> and <|endoftext|>. The unk token can be used when word is not from vocabulary. endoftext tokens are prepended to each susequent text source.

In [13]:
# let's add these tokens by modifying our vocab
all_tokens = sorted(list(set(preprocessed)))
all_tokens.extend(["<|endoftext|>", "<|unk|>"])
vocab = {token:integer for integer, token in enumerate(all_tokens)}

print(len(vocab.items()))

1132


new vocabulary size is 1,132 (the previous vocabulary size was 1,130). Let's quick check and print last five entries

In [14]:
for i, item in enumerate(list(vocab.items())[-5:]):
    print(item)

('younger', 1127)
('your', 1128)
('yourself', 1129)
('<|endoftext|>', 1130)
('<|unk|>', 1131)


In [15]:
## Implementing a Simple text tokenizer with 2 more tokens
class SimpleTokenizerV2:
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = { i:s for s,i in vocab.items()}
    def encode(self, text):
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        preprocessed = [
        item.strip() for item in preprocessed if item.strip()
        ]
        preprocessed = [item if item in self.str_to_int
                        else "<|unk|>" for item in preprocessed]
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids
    def decode(self, ids):
        text = " ".join([self.int_to_str[i] for i in ids])
        text = re.sub(r'\s+([,.:;?!"()\'])', r'\1', text)
        return text

V2 will replace unknown word with unk tokens. Let's try 

In [16]:
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."
text = " <|endoftext|> ".join((text1, text2))
print(text)

Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.


In [17]:
# tokenize
tokenizer = SimpleTokenizerV2(vocab)
print(tokenizer.encode(text))

[1131, 5, 355, 1126, 628, 975, 10, 1130, 55, 988, 956, 984, 722, 988, 1131, 7]


In [18]:
print(tokenizer.decode(tokenizer.encode(text)))

<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.


There are more special tokens used in LLM's like BOS-Beginning of seq, EOS- end of seq, PAD- padding- take care of texts of varying lengths.

Tokenizer use for GPT does not use any of these tokens it only uses endoftext for simplicity. Also, unk is also not used by GPT, Instead GPT model uses a `byte pair encoding` tokenizer which breaks words into subwords units. We will see this concept now.

The code will be based on tiktoken 0.7.0 
BPE tokenizer is used to train GPT 2,3. It is complex so we will access using tiktoken

In [19]:
pip install tiktoken

Note: you may need to restart the kernel to use updated packages.


In [20]:
from importlib.metadata import version
import tiktoken
print("tiktoken version: ", version("tiktoken"))

tiktoken version:  0.8.0


Now, we can insantiate the BPE tokenizer, it will work same as SimpleTokenizerV2 via an encode method

In [21]:
tokenizer = tiktoken.get_encoding("gpt2")

In [22]:
text = ("Hello, do you like tea? <|endoftext|> In the sunlit terraces"
        "of someunknownPlace." )
integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print(integers)

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 1659, 617, 34680, 27271, 13]


In [23]:
# convert back to text from above token IDs
strings = tokenizer.decode(integers)
print(strings)

Hello, do you like tea? <|endoftext|> In the sunlit terracesof someunknownPlace.


A quick observation - 
First, the <|endoftext|> token is assigned a relatively large token ID, namely, 50256. In fact, the BPE tokenizer which was used to train models such as GPT-2, GPT-3, and the original model used in ChatGPT, has a total vocabulary size of 50,257, with <|endoftext|> being assigned the largest token ID.


Second, the BPE tokenizer encodes and decodes unknown words, such as someunknownPlace, correctly. The BPE tokenizer can handle any unknown word. How does it achieve this without using <|unk|> tokens?

BPE - breaks down unknown words that are not in vocabualry into smaller subword unit or individual characters, enabeling it to handle out of vocabualry words. So for unknown words it can represent it as a sequence of subword tokens or characters.

In [24]:
# Exercise 2.1
text = ("Akwirw ier")
integer = tokenizer.encode(text)
print(integer)

[33901, 86, 343, 86, 220, 959]


In [25]:
words = tokenizer.decode(integer)
print(words)

Akwirw ier


##### Next task - Data Sampling with a sliding window

Now we have to generate input-target pairs required for training an LLM. AS we know LLM are pretrained by predicting the next word in a text. So the diagonal elements will be target.

Implement data loader that fetches the input-target pair from training dataset using a sliding window approach. 

First let's tokenize whole "The verdict" story using BPE

In [26]:
with open("the-verdict-txt", "r", encoding="utf-8") as f:
    raw_text = f.read()
    
enc_text = tokenizer.encode(raw_text)
print(len(enc_text))

5145


Total number of tokens in training set are 5145 after BPE tokenizer. We remove first 50 tokens from dataset fro demonstration,as there is more intersting text passage in next steps:

In [27]:
enc_sample = enc_text[50:]

To create input-target pair for next prediction task is to create two variable, x and y, where x contain input tokens and y target. Shifted input by 1.

In [28]:
context_size = 4
x = enc_sample[:context_size]
y = enc_sample[1:context_size+1]

print(f"x:  {x}")
print(f"y:       {y}")

x:  [290, 4920, 2241, 287]
y:       [4920, 2241, 287, 257]


By processsing the inputs alog with the target, which are the inputs shifted by one position, we can create the next-word prediction tasks

In [29]:
for i in range(1, context_size + 1):
    context = enc_sample[:i]
    desired = enc_sample[i]
    print(context, "----->", desired)

[290] -----> 4920
[290, 4920] -----> 2241
[290, 4920, 2241] -----> 287
[290, 4920, 2241, 287] -----> 257


### Dataset for batched input and targets

For the efficient data loader implementation, we will use PyTorch's build-in `Dataset` and `DataLoader` classes.

In [30]:
import torch
from torch.utils.data import Dataset, DataLoader

class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []
        
        token_ids = tokenizer.encode(txt) # Tokenizes the entire text
        for i in range(0, len(token_ids) - max_length, stride):
            # Uses a sliding window to chunk the book into overlapping sequences of max_length
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i + 1: i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))
            
            
    def __len__(self):
        return len(self.input_ids)  #returns the total number of rows 
    
    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]  # returns a single row from the dataset

##### DataLoader to generate batches with input with pairs

Following code uses the GPTDatasetV1 to load the inputs in batches via PyTorch `DataLoader`

In [31]:
def create_dataloader_v1(txt, batch_size=4, max_length=256, stride=128, shuffle=True, drop_last=True, num_workers=0):
    tokenizer = tiktoken.get_encoding("gpt2")
    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride) # create dataset
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last, # drop_last=True drops the last batch if it is shorter than the specified batch size, prevent loss
        num_workers=num_workers # number of CPU processes to use for preprocessing
    )
    
    return dataloader

Test the dataloader with batch size 1 for an LLM with context size 4 to develope an intiution of how GPTDatasetV1 class and create_dataloader_v1 function

In [32]:
with open("the-verdict-txt","r", encoding="utf-8") as f:
    raw_text = f.read()
    
dataloader = create_dataloader_v1(
    raw_text, batch_size=1, max_length=4, stride=1, shuffle=False)

data_iter = iter(dataloader)
# convert dataloader into a python iterator to fetch the next entry via built in next() function
first_batch = next(data_iter)
print(first_batch)

[tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]


so we can see that, first_batch variable contains two tensors: first - stores the input tokens IDs and second- stores the target token IDs. Here max_length is 4, each token have 4 token IDs. It is small, usually input size is 256 to train LLMs.


to undertstand meaning of stride, let's fetch another batch from this dataset.

In [33]:
second_batch = next(data_iter)
print(second_batch)

[tensor([[ 367, 2885, 1464, 1807]]), tensor([[2885, 1464, 1807, 3619]])]


we compare the first and second batches, we can see that the second batch’s token IDs are shifted by one position (for example, the second ID in the first batch’s input is 367, which is the first ID of the second batch’s input). The stride setting dictates the number of positions the inputs shift across batches, emulating a sliding window approach.

let's see how dataloader work with batch size greater than 1

In [34]:
dataloader = create_dataloader_v1(
    raw_text, batch_size=8, max_length=4, stride=4, shuffle=False
)
data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("Inputs: \n", inputs)
print("\nTargets: \n", targets)

Inputs: 
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

Targets: 
 tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])


##### Creating token embeddings

Last step in preparing the input text for LLM training is to convert token Ids into embedding vectors. These embedding weights are initialized with random values. A continous vector representation or embedding is necessary as GPT like LLM are deep neural network trained with the backpropagation algorithm.

Let's see how the token ID to embedding vector conversion works with simple example, take input tokens Ids 2, 3, 5 and 1

In [35]:
input_ids = torch.tensor([2, 3, 5, 1])

For simplicity, assume we have small vocabulary of 6 words and we want to create embedding of size 3 

In [36]:
vocab_size = 6
output_dim = 3

Now we can instantiate an embedding layer in PyTorch, set random seed = 123 for reproducibility.

In [37]:
torch.manual_seed(123)
embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
print(embedding_layer.weight)

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)


It gives small random values. These values are optimized during LLM training as part of the LLM optimization itself. We can see, 6 rows and 3 cols, there is one row for each of the 6 possible tokens in vocabulaty and 1 col for each of three embedding dims.
Now, let's apply to a token ID to get embedding vector.

In [39]:
print(embedding_layer(torch.tensor([3])))

tensor([[-0.4015,  0.9666, -1.1481]], grad_fn=<EmbeddingBackward0>)


In [40]:
# for our example
print(embedding_layer(input_ids))

tensor([[ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-2.8400, -0.7849, -1.4096],
        [ 0.9178,  1.5810,  1.3010]], grad_fn=<EmbeddingBackward0>)


#### Emcoding word positions
we can use two broad categories of position-aware embeddings: relative positional embeddings and absolute positional embeddings. Absolute positional embeddings are directly associated with specific positions in a sequence. For each position
in the input sequence, a unique embedding is added to the token’s embedding to convey its exact location.

Instead of focusing on the absolute position of a token, the emphasis of relative positional embeddings is on the relative position or distance between tokens. This means the model learns the relationships in terms of “how far apart” rather than “at which exact position.” The advantage here is that the model can generalize better to sequences
of varying lengths, even if it hasn’t seen such lengths during training.
Both types of positional embeddings aim to augment the capacity of LLMs to understand the order and relationships between tokens, ensuring more accurate and context-aware predictions. The choice between them often depends on the specific
application and the nature of the data being processed.
OpenAI’s GPT models use absolute positional embeddings that are optimized during the training process rather than being fixed or predefined like the positional encodings in the original transformer model. This optimization process is part of the
model training itself. For now, let’s create the initial positional embeddings to create the LLM inputs.

Previously, we focused on very small embedding sizes for simplicity. Now, let’s consider more realistic and useful embedding sizes and encode the input tokens into a 256-dimensional vector representation, which is smaller than what the original GPT-3
model used (in GPT-3, the embedding size is 12,288 dimensions) but still reasonable for experimentation. Furthermore, we assume that the token IDs were created by the BPE tokenizer we implemented earlier, which has a vocabulary size of 50,257:

In [41]:
vocab_size = 50257
output_dim = 256
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

If we sample data from `dataloader`, we embed each token in each other into 256 dim vector. If we have a batch size of 8 with 4 tokens each, the result will be 8 X 4 X 256 tensor.

In [42]:
max_length = 4
dataloader = create_dataloader_v1(
    raw_text, batch_size=8, max_length=max_length, stride=max_length, shuffle=False
)

data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("Token IDs:\n", inputs)
print("\nInputs shape:\n", inputs.shape)

Token IDs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

Inputs shape:
 torch.Size([8, 4])


we can see, the token ID tensor is 8 X 4 dim, meaning data batch consists of 8 text samples with 4 token each. Let's now use embedding layer to embed these tokens Ids into 256 dim vectors.

In [43]:
token_embeddings = token_embedding_layer(inputs)
print(token_embeddings.shape)

torch.Size([8, 4, 256])


For GPT's absolute embedding approach, we need to create another embedding layer that has same embedding dimension as `token_embedding_layer`

In [44]:
context_length = max_length
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)
pos_embeddings = pos_embedding_layer(torch.arange(context_length))
print(pos_embeddings.shape)

torch.Size([4, 256])


As we can see, the positional embedding tensor consists of four 256-dimensional vectors. We can now add these directly to the token embeddings, where PyTorch will add the 4 × 256–dimensional pos_embeddings tensor to each 4 × 256–dimensional token
embedding tensor in each of the eight batches:

In [45]:
input_embeddings = token_embeddings + pos_embeddings
print(input_embeddings.shape)

torch.Size([8, 4, 256])
