# Chapter 2: Working with Data

We will first download the data if it doesn't already exist. We want the text stored in the file `the-verdict.txt`.

In [9]:
import os

from src.chapter2.data_sampling import first_batch

# Download the short story once and cache it at ./data/the-verdict.txt;
# if the file is already present the download is skipped.
if os.path.exists("./data/the-verdict.txt"):
    print("Data already exists")
else:
    import requests

    # A timeout keeps the cell from hanging forever if the server never answers.
    data = requests.get(
        "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/refs/heads/main/ch02/01_main-chapter-code/the-verdict.txt",
        timeout=30,
    )
    if data.status_code == 200:
        os.makedirs("./data", exist_ok=True)
        # Write explicitly as UTF-8: the file is read back with
        # encoding="utf-8", and the platform default encoding may differ.
        with open("./data/the-verdict.txt", "w", encoding="utf-8") as f:
            f.write(data.text)
        print("Data downloaded successfully")
    else:
        print("Failed to download data")

Data already exists


We will now read the data from the file

In [12]:
# Load the whole story into memory as a single string.
with open("data/the-verdict.txt", mode="r", encoding="utf-8") as story_file:
    raw_text = story_file.read()

# Report the size and show the first 99 characters as a sanity check.
print("Total number of character:", len(raw_text))
preview = raw_text[:99]
print(preview)

Total number of character: 20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


We will now split this text using the below regex (no need to memorize this regex)

In [16]:
import re
# Split on punctuation, double dashes and whitespace. The capture group makes
# re.split keep the delimiters, so punctuation marks become tokens themselves.
split_pieces = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)

# Discard empty strings and whitespace-only pieces.
preprocessed = [piece for piece in split_pieces if piece.strip()]

print("Total number of tokens:", len(preprocessed))
print(preprocessed[:30])

Total number of tokens: 4690
['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']


We now create token ids for these tokens by assigning a unique integer to each token.

In [20]:
# Unique tokens in sorted order; a token's position in this list is its id.
all_words = sorted(set(preprocessed))
vocab_size = len(all_words)
print("Vocab size is:", vocab_size)

# Dictionary mapping each token to its integer id.
vocab = dict(zip(all_words, range(vocab_size)))

# Show the first entries of the mapping.
for idx, token in enumerate(all_words[:51]):
    print(f"({token:>10s} -> {idx:>3d})")

Vocab size is: 1130
(         ! ->   0)
(         " ->   1)
(         ' ->   2)
(         ( ->   3)
(         ) ->   4)
(         , ->   5)
(        -- ->   6)
(         . ->   7)
(         : ->   8)
(         ; ->   9)
(         ? ->  10)
(         A ->  11)
(        Ah ->  12)
(     Among ->  13)
(       And ->  14)
(       Are ->  15)
(      Arrt ->  16)
(        As ->  17)
(        At ->  18)
(        Be ->  19)
(     Begin ->  20)
(Burlington ->  21)
(       But ->  22)
(        By ->  23)
(     Carlo ->  24)
(   Chicago ->  25)
(    Claude ->  26)
(      Come ->  27)
(     Croft ->  28)
( Destroyed ->  29)
(Devonshire ->  30)
(       Don ->  31)
(   Dubarry ->  32)
(  Emperors ->  33)
(  Florence ->  34)
(       For ->  35)
(   Gallery ->  36)
(    Gideon ->  37)
(   Gisburn ->  38)
(  Gisburns ->  39)
(   Grafton ->  40)
(     Greek ->  41)
(   Grindle ->  42)
(  Grindles ->  43)
(       HAD ->  44)
(       Had ->  45)
(      Hang ->  46)
(       Has ->  47)
(        He ->  48)


Now let's define a simple tokenizer that encapsulates this functionality. Given a vocab, it should be able to convert text to token ids (encode) and convert token ids back to text (decode).


In [33]:
class SimpleTokenizerV1:
    """Word-level tokenizer backed by a fixed token -> id vocabulary.

    Text is split on punctuation, double dashes and whitespace, and every
    resulting token is looked up in the vocabulary.

    Args:
        vocab: dict mapping token strings to integer ids.
    """

    # Must match the pattern used to build the vocabulary from the raw text
    # (including ':' and ';'); otherwise a token such as "word;" is never
    # split and cannot be found in the vocabulary.
    _SPLIT_PATTERN = r'([,.:;?_!"()\']|--|\s)'
    # Whitespace that precedes a punctuation mark in the space-joined output.
    _SPACE_BEFORE_PUNCT = r'\s+([,.:;?!"()\'])'

    def __init__(self, vocab):
        self.str_to_int = vocab
        # Inverse mapping, used by decode().
        self.int_to_str = {v: k for k, v in vocab.items()}

    def encode(self, text):
        """Convert text into a list of token ids.

        Raises:
            KeyError: if a token in ``text`` is not present in the vocabulary.
        """
        pieces = re.split(self._SPLIT_PATTERN, text)
        # Drop empty strings and whitespace-only pieces.
        tokens = [piece for piece in pieces if piece.strip()]
        return [self.str_to_int[token] for token in tokens]

    def decode(self, token_ids):
        """Convert a sequence of token ids back into a string.

        Tokens are joined with single spaces, then the space in front of
        punctuation marks is removed.
        """
        text = " ".join(self.int_to_str[token_id] for token_id in token_ids)
        return re.sub(self._SPACE_BEFORE_PUNCT, r'\1', text)

Let's test the implemented code with a sample text.

In [35]:
# Round-trip a sentence from the story through the V1 tokenizer.
tokenizer = SimpleTokenizerV1(vocab)
text = """"It's the last he painted, you know,"
       Mrs. Gisburn said with pardonable pride."""
ids = tokenizer.encode(text)
print(ids)
print(tokenizer.decode(ids))

# The code below will not work: encode() looks every token up directly in the
# vocabulary, so a word that is missing from it raises KeyError.
# text = "Hello, do you like tea?"
# print(tokenizer.encode(text))

[1, 56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]
" It' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.


In [49]:
# Rebuild the vocabulary with two special tokens appended after the sorted
# regular tokens, so they receive the two highest ids.
all_tokens = sorted(set(preprocessed)) + ["<|endoftext|>", "<|unk|>"]
vocab = dict(zip(all_tokens, range(len(all_tokens))))
print(f"New vocab size is {len(vocab)}")

# Show the last five entries, which include the special tokens.
for token, token_id in list(vocab.items())[-5:]:
    print(f"({token:>10s} -> {token_id:>3d})")

New vocab size is 1132
(   younger -> 1127)
(      your -> 1128)
(  yourself -> 1129)
(<|endoftext|> -> 1130)
(   <|unk|> -> 1131)


We will now create a V2 tokenizer that is the same as V1 but handles unknown tokens and accepts concatenated texts as input, separated by the ``<|endoftext|>`` token.

In [50]:
class SimpleTokenizerV2:
    """Word-level tokenizer that maps out-of-vocabulary tokens to ``<|unk|>``.

    Text is split on punctuation, double dashes and whitespace. Tokens found
    in the vocabulary are mapped to their ids; any other token is mapped to
    the id of ``<|unk|>``.

    Args:
        vocab: dict mapping token strings to integer ids. It must contain
            ``<|unk|>`` for unknown tokens to be encodable.
    """

    # Must match the pattern used to build the vocabulary from the raw text
    # (including ':' and ';'); otherwise a token such as "word;" is never
    # split and is silently encoded as <|unk|>.
    _SPLIT_PATTERN = r'([,.:;?_!"()\']|--|\s)'
    # Whitespace that precedes a punctuation mark in the space-joined output.
    _SPACE_BEFORE_PUNCT = r'\s+([,.:;?!"()\'])'

    def __init__(self, vocab):
        self.str_to_int = vocab
        # Inverse mapping, used by decode().
        self.int_to_str = {v: k for k, v in vocab.items()}
        self.unknown_token = "<|unk|>"

    def encode(self, text):
        """Convert text into a list of token ids.

        Raises:
            KeyError: if an unknown token is met and ``<|unk|>`` is not in
                the vocabulary.
        """
        pieces = re.split(self._SPLIT_PATTERN, text)
        # Drop empty strings and whitespace-only pieces.
        tokens = [piece for piece in pieces if piece.strip()]
        # The <|unk|> id is only looked up when it is actually needed.
        return [
            self.str_to_int[token]
            if token in self.str_to_int
            else self.str_to_int[self.unknown_token]
            for token in tokens
        ]

    def decode(self, token_ids):
        """Convert a sequence of token ids back into a string.

        Tokens are joined with single spaces, then the space in front of
        punctuation marks is removed.
        """
        text = " ".join(self.int_to_str[token_id] for token_id in token_ids)
        return re.sub(self._SPACE_BEFORE_PUNCT, r'\1', text)

Let's test this new tokenizer.

In [53]:
# Two unrelated texts joined by the end-of-text marker.
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."
text = " <|endoftext|> ".join([text1, text2])
print(text)

# Words missing from the vocabulary come back as <|unk|> after the round trip.
tokenizer = SimpleTokenizerV2(vocab)
token_ids = tokenizer.encode(text)
print(token_ids)
print(tokenizer.decode(token_ids))

Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.
[1131, 5, 355, 1126, 628, 975, 10, 1130, 55, 988, 956, 984, 722, 988, 1131, 7]
<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.


## Byte Pair

In [86]:
from importlib.metadata import version
# Print the installed tiktoken version as reported by the package metadata.
print("tiktoken version: ",version("tiktoken"))

tiktoken version:  0.8.0


Create the gpt-2 tokenizer and use it

In [75]:
import tiktoken
# Load the GPT-2 byte-pair-encoding tokenizer shipped with tiktoken.
tokenizer = tiktoken.get_encoding("gpt2")

# NOTE(review): the two adjacent string literals are concatenated without a
# separating space, so the text contains "terracesof" - confirm this is intended.
text = (
    "Hello, do you like tea? <|endoftext|> In the sunlit terraces"
    "of someunknownPlace."
)
print(f"Text is: \n{text}")

# <|endoftext|> has to be allowed explicitly to be encoded as a special token.
integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print(f"Total vocabulary size for {tokenizer.name} is {tokenizer.n_vocab}")
print("Encoded tokens are: ", integers, sep="\n")

decoded_text = tokenizer.decode(integers)
print("Decoding the above encoded tokens give:", decoded_text, sep="\n")

Text is: 
Hello, do you like tea? <|endoftext|> In the sunlit terracesof someunknownPlace.
Total vocabulary size for gpt2 is 50257
Encoded tokens are: 
[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 1659, 617, 34680, 27271, 13]
Decoding the above encoded tokens give:
Hello, do you like tea? <|endoftext|> In the sunlit terracesof someunknownPlace.


### Data sampling with a sliding window

Start by encoding the entire text from the text file

In [77]:
# Re-read the story and tokenize all of it with the current `tokenizer`.
with open("./data/the-verdict.txt", mode="r", encoding="utf-8") as story_file:
    raw_text = story_file.read()

enc_text = tokenizer.encode(raw_text)
total_tokens = len(enc_text)
print(f"Total number of tokens in the text: {total_tokens}")

Total number of tokens in the text: 5145


Simple example of how the next word prediction data can be generated by creating two variables x and y

In [83]:
# Skip the first 50 tokens and work on the remainder of the encoded text.
enc_sample = enc_text[50:]
context_size = 4

# x is the input window; y is the same window shifted one token to the right.
x = enc_sample[:context_size]
y = enc_sample[1:context_size + 1]
print(f"x: {x}")
print(f"y:      {y}")

separator = "-" * 50
print(separator)

# Every prefix of the window paired with the token that follows it.
prediction_pairs = [
    (enc_sample[:end], enc_sample[end]) for end in range(1, context_size + 1)
]
for context, desired in prediction_pairs:
    print(context, "---->", desired)

print(separator)
print("\t\t\tDecoded text")
print(separator)

# The same pairs again, decoded back to text.
for context, desired in prediction_pairs:
    print(tokenizer.decode(context), "---->", tokenizer.decode([desired]))

x: [290, 4920, 2241, 287]
y:      [4920, 2241, 287, 257]
--------------------------------------------------
[290] ----> 4920
[290, 4920] ----> 2241
[290, 4920, 2241] ----> 287
[290, 4920, 2241, 287] ----> 257
--------------------------------------------------
			Decoded text
--------------------------------------------------
 and ---->  established
 and established ---->  himself
 and established himself ---->  in
 and established himself in ---->  a


Let's create a PyTorch data loader.

In [8]:
import torch
from torch.utils.data import Dataset, DataLoader

class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-id tensors.

    The text is tokenized once, then cut into windows of ``max_length``
    tokens whose start positions are ``stride`` tokens apart. The target of
    each window is the same window shifted one token to the right.

    Args:
        txt: text to tokenize.
        tokenizer: object exposing ``encode(txt)``; assumed to return a
            sequence of integer token ids.
        max_length: number of tokens per window.
        stride: distance between the starts of consecutive windows.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        token_ids = tokenizer.encode(txt)
        # Stop early enough that every target window is still full length.
        window_starts = range(0, len(token_ids) - max_length, stride)
        self.input_ids = [
            torch.tensor(token_ids[start:start + max_length])
            for start in window_starts
        ]
        self.target_ids = [
            torch.tensor(token_ids[start + 1:start + max_length + 1])
            for start in window_starts
        ]

    def __len__(self):
        """Return the number of windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` tensor pair at position ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]


def create_dataloader_v1(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0):
    """Build a DataLoader over GPT-2-tokenized sliding windows of ``txt``.

    Args:
        txt: text to tokenize and window.
        batch_size: forwarded to ``DataLoader``.
        max_length: window length passed to ``GPTDatasetV1``.
        stride: window stride passed to ``GPTDatasetV1``.
        shuffle: forwarded to ``DataLoader``.
        drop_last: forwarded to ``DataLoader``.
        num_workers: forwarded to ``DataLoader``.

    Returns:
        A ``DataLoader`` wrapping a ``GPTDatasetV1`` built from ``txt``.
    """
    import tiktoken

    dataset = GPTDatasetV1(
        txt,
        tiktoken.get_encoding("gpt2"),
        max_length,
        stride,
    )
    loader_options = {
        "batch_size": batch_size,
        "num_workers": num_workers,
        "shuffle": shuffle,
        "drop_last": drop_last,
    }
    return DataLoader(dataset=dataset, **loader_options)


Now instantiate this dataloader with the text file we have

In [11]:
with open("data/the-verdict.txt", mode="r", encoding="utf-8") as story_file:
    raw_text = story_file.read()

# Batch size 1, window of 4 tokens, stride 1, no shuffling.
dataloader = create_dataloader_v1(
    raw_text,
    batch_size=1,
    max_length=4,
    stride=1,
    shuffle=False,
)
data_iter = iter(dataloader)

# NOTE: this rebinds `first_batch`, which is also the name imported from
# src.chapter2.data_sampling at the top of the notebook.
first_batch = next(data_iter)
print(f"First batch {first_batch}")

First batch [tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]


In [12]:
# Pull the next batch from the same iterator and label it correctly.
second_batch = next(data_iter)
print(f"Second batch {second_batch}")


First batch [tensor([[ 367, 2885, 1464, 1807]]), tensor([[2885, 1464, 1807, 3619]])]


Let's look at a larger batch size and stride.

In [13]:
# Stride equal to the window length, so consecutive input windows do not overlap.
dataloader = create_dataloader_v1(
    raw_text,
    batch_size=8,
    max_length=4,
    stride=4,
    shuffle=False,
)
data_iter = iter(dataloader)

# Each batch is an (inputs, targets) pair.
inputs, targets = next(data_iter)
print("Inputs:\n", inputs)
print("\nTargets:\n", targets)


Inputs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

Targets:
 tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])


### Creating token embeddings

Assume a vocab of 6 words (GPT has 50,257) where each embedding has 3 dimensions (GPT-3 has 12,288 dimensions). The embeddings are essentially lookup tables of size (vocab_size, embedding_dim), where vocab_size is the number of unique tokens and embedding_dim is the dimension of the embedding vector for each token. In this case the table will have the shape (6, 3). The embedding matrix is initialized randomly and then trained during the training process.

In [14]:
# Toy setup: a 6-token vocabulary embedded into 3 dimensions.
vocab_size = 6
output_dim = 3

# Token ids that will be looked up in the embedding table.
input_ids = torch.tensor([2, 3, 5, 1])

# Seed the RNG before the layer is created so its random weights are reproducible.
torch.manual_seed(123)
embedding_layer = torch.nn.Embedding(
    num_embeddings=vocab_size,
    embedding_dim=output_dim,
)
print(f"Embedding layer weights are: {embedding_layer.weight}")

Embedding layer weights are: Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)


In [16]:
# Look up the embeddings for the input tokens: each id in input_ids selects
# the matching row of the embedding layer's weight matrix.
embedding_layer(input_ids)

tensor([[ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-2.8400, -0.7849, -1.4096],
        [ 0.9178,  1.5810,  1.3010]], grad_fn=<EmbeddingBackward0>)

While this lookup works, the position information of the tokens is lost. To retain the position information, we will use positional embeddings. The positional embeddings are added to the token embeddings to create a final embedding that contains both the token and position information.

Two types of position embeddings are
1. Absolute positional embeddings: where each position has a unique embedding vector.
2. Relative positional embeddings: where the position of a token is relative to the other tokens in the sequence.

OpenAI's GPT models use absolute positional embeddings. The positional embeddings are added to the token embeddings to create a final embedding that contains both the token and position information. These embeddings are learned during the training process and are not fixed.

Let us now create the embeddings layer for the vocabulary size same as GPT but the output dimension of 256

In [18]:
# Vocabulary size of the GPT-2 tokenizer and a 256-dimensional embedding.
vocab_size = 50257
output_dim = 256
token_embedding_layer = torch.nn.Embedding(
    num_embeddings=vocab_size,
    embedding_dim=output_dim,
)

Let's get a batch of input ids and look up the embeddings for them.

In [22]:
max_length = 4
dataloader = create_dataloader_v1(
    raw_text,
    batch_size=8,
    max_length=max_length,
    stride=4,
    shuffle=False,
)
data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("Token IDs:\n", inputs)
print("\nToken IDs shape:\n", inputs.shape)

# Embed every token id in the batch; this adds a trailing embedding dimension.
token_embeddings = token_embedding_layer(inputs)
print(f"\nToken Embeddings shape:\n{token_embeddings.shape}")


Token IDs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

Token IDs shape:
 torch.Size([8, 4])

Token Embeddings shape:
torch.Size([8, 4, 256])


We will need another set of embeddings to represent the positions. The number of distinct position embeddings will be the same as the fixed number of tokens in an input sequence. Each of them has the same dimension as a token embedding, because we add the two embeddings together to get the final embedding.

Thus, in our case, since max_length (which is the same as context_length) is 4, we will have 4 positional embeddings, each of dimension 256.

In [27]:
# One embedding vector per position in the context window.
context_length = max_length
pos_embedding_layer = torch.nn.Embedding(
    num_embeddings=context_length,
    embedding_dim=output_dim,
)

# Look up positions 0 .. context_length - 1.
position_ids = torch.arange(context_length)
pos_embeddings = pos_embedding_layer(position_ids)
print(f"Positional Embedding layer weights are: {pos_embeddings.shape}")

Positional Embedding layer weights are: torch.Size([4, 256])


In [30]:
# Notice that we are adding a tensor of shape (4, 256) to a tensor of shape
# (8, 4, 256). This works because of broadcasting in PyTorch: the same
# positional embeddings are added to every sequence in the batch.
input_embeddings = token_embeddings + pos_embeddings
print(f"Input Embeddings shape: {input_embeddings.shape}")

Input Embeddings shape: torch.Size([8, 4, 256])
