# Chapter 2: Working with Text

Packages used in this notebook

In [3]:
from importlib.metadata import version

# Report the installed version of each package this notebook relies on.
for label, package in (("torch version:", "torch"), ("tiktoken version:", "tiktoken")):
    print(label, version(package))

torch version: 2.5.1
tiktoken version: 0.8.0


2.2. Tokenizing the text

In [1]:
# Helper for loading a text file into memory
def read_file(file_path):
    """Return the entire contents of a UTF-8 encoded text file.

    Args:
        file_path: Location of the file to read.

    Returns:
        The decoded file contents as one string.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return handle.read()

import os

# Directory that holds the corpus. The default is the original author's
# checkout; set the LLM_DATA_DIR environment variable to point at your own
# data directory instead of editing this cell.
DEFAULT_DATA_DIR = '/mnt/c/Users/haanh/building-llm-from-scratch/data'
data_dir = os.environ.get('LLM_DATA_DIR', DEFAULT_DATA_DIR)
file_path = os.path.join(data_dir, 'the-verdict.txt')

# `data` holds the whole story as a single string
data = read_file(file_path)
print("The number of characters in the file:", len(data))

# Get the first 100 characters in the data
print(data[:100])

The number of characters in the file: 20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no g


In [2]:
# Break the text apart at whitespace, at "--" and at punctuation marks.
# The capture group keeps the separators themselves in the result.
import re

token_pattern = re.compile(r'([,.:;?_!"()\']|--|\s)')
stripped_pieces = (piece.strip() for piece in token_pattern.split(data))
# Drop pieces that were empty or whitespace-only
preprocessed = [piece for piece in stripped_pieces if piece]
print(preprocessed[:30])

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']


In [3]:
# Get the number of tokens (whitespace-only pieces were already dropped
# when `preprocessed` was built)
print(len(preprocessed))

4690


Note: This tokenization pipeline can also be done with nltk through word_tokenize()


Sample code:

In [None]:
import nltk
from nltk.tokenize import word_tokenize

# Ensure you have the necessary resources.
# Older NLTK releases load the 'punkt' models, while NLTK 3.9+ looks for
# 'punkt_tab' when word_tokenize() runs, so fetch both to work on either.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

text = "It's snowing? Are you cold?"

# Tokenize the text
tokens = word_tokenize(text)

print(tokens)

2.3. Converting tokens into token IDs
Next, we convert the text tokens into token IDs that we can process via embedding layers later

In [8]:
# The vocabulary is every distinct token in the data, in sorted order
all_words = list(set(preprocessed))
all_words.sort()
vocab_size = len(all_words)

# Report how many distinct tokens were found
print(vocab_size)

1130


In [10]:
# Map each token to its position in the sorted token list
vocab = dict(zip(all_words, range(len(all_words))))

In [11]:
# Print out the first 16 (token, ID) pairs in the vocabulary
for item in list(vocab.items())[:16]:
    print(item)

('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)
('.', 7)
(':', 8)
(';', 9)
('?', 10)
('A', 11)
('Ah', 12)
('Among', 13)
('And', 14)
('Are', 15)


We will now create a simple tokenizer class, where the encode function turns words into IDs and the decode function converts IDs back into their respective words.

Putting it all in a tokenizer class where:
- The encode function turns text into token IDs
- The decode function turns the token IDs back into text

In [12]:
class SimpleTokenizerV1:
    """Minimal word-level tokenizer backed by a fixed vocabulary.

    Text is split on whitespace, on ``--`` and on the punctuation characters
    ``, . : ; ? _ ! " ( ) '``. Each non-empty piece is looked up in the
    vocabulary.

    Args:
        vocab: Mapping from token string to integer ID.
    """

    def __init__(self, vocab):
        self.str_to_int = vocab
        # Inverse mapping (ID -> token string) used by decode()
        self.int_to_str = {i: s for s, i in vocab.items()}

    def encode(self, text):
        """Convert ``text`` into a list of token IDs.

        Args:
            text: The string to tokenize.

        Returns:
            A list with one integer ID per token, in order of appearance.

        Raises:
            KeyError: If a token of ``text`` is not in the vocabulary.
        """
        # The capture group keeps the separators so punctuation becomes tokens
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        # Drop empty and whitespace-only pieces
        preprocessed = [
            item.strip() for item in preprocessed if item.strip()
        ]
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids

    def decode(self, ids):
        """Convert a sequence of token IDs back into a string.

        Args:
            ids: Iterable of integer token IDs.

        Returns:
            The tokens joined by single spaces, with the space in front of
            ``, . : ; ? ! " ( ) '`` removed.

        Raises:
            KeyError: If an ID is not in the vocabulary.
        """
        text = " ".join([self.int_to_str[i] for i in ids])
        # Remove the space that the join put in front of punctuation. ':' and
        # ';' are included because encode() splits them off as tokens too;
        # without them "word:" would come back as "word :".
        text = re.sub(r'\s+([,.:;?!"()\'])', r'\1', text)
        return text

In [14]:
# Example: encode a short passage with the V1 tokenizer.
# The recorded run succeeds, so every token of this passage is in `vocab`.
tokenizer = SimpleTokenizerV1(vocab)

# Triple-quoted literal: the line break and indentation are part of the string
text = """"It's the last he painted, you know," 
           Mrs. Gisburn said with pardonable pride."""

ids = tokenizer.encode(text)

print(ids)

[1, 56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]


In [16]:
# Map the IDs back to text. The round trip is not exact: the output below has
# extra spaces after the opening quote and after the apostrophe.
sample_text = tokenizer.decode(ids)

print(sample_text)

" It' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.


In [17]:
# Another way: chain encode() and decode() in a single expression
sample_text = tokenizer.decode(tokenizer.encode(text))

print(sample_text)

" It' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.


2.4. Adding special context tokens
Here, we will add special tokens to signify unknown words and end of text. These tokens help provide more context for the LLMs.

In GPT-2, the <|endoftext|> tokens are used between two independent sources of text. 

In [18]:
# Append the <|endoftext|> and <|unk|> special tokens after the sorted
# corpus tokens, so they receive the two highest IDs
all_tokens = sorted(set(preprocessed)) + ["<|endoftext|>", "<|unk|>"]

vocab = dict(zip(all_tokens, range(len(all_tokens))))

In [19]:
# Number of entries in the vocab, special tokens included
len(vocab)

1132

In [20]:
# Show the final 5 (token, ID) pairs of the vocab
for item in list(vocab.items())[-5:]:
    print(item)

('younger', 1127)
('your', 1128)
('yourself', 1129)
('<|endoftext|>', 1130)
('<|unk|>', 1131)


We also need to adjust the tokenizer accordingly so that it knows when and how to use the new <|unk|> token

In [21]:
class SimpleTokenizerV2:
    """Word-level tokenizer that maps out-of-vocabulary tokens to ``<|unk|>``.

    Args:
        vocab: Mapping from token string to integer ID. It must contain
            ``<|unk|>`` for unknown tokens to be encodable.
    """

    def __init__(self, vocab):
        self.str_to_int = vocab
        # Reverse lookup table (ID -> token string) for decode()
        self.int_to_str = dict((idx, token) for token, idx in vocab.items())

    def encode(self, text):
        """Return the token IDs of ``text``.

        Pieces missing from the vocabulary are replaced by ``<|unk|>``
        before the ID lookup.
        """
        ids = []
        for piece in re.split(r'([,.:;?_!"()\']|--|\s)', text):
            token = piece.strip()
            if not token:
                # Empty or whitespace-only piece: not a token
                continue
            if token not in self.str_to_int:
                token = "<|unk|>"
            ids.append(self.str_to_int[token])
        return ids

    def decode(self, ids):
        """Return the text for ``ids``, joining tokens with single spaces."""
        joined = " ".join(self.int_to_str[token_id] for token_id in ids)
        # Pull punctuation back onto the preceding token
        return re.sub(r'\s+([,.:;?!"()\'])', r'\1', joined)

In [22]:
# Example: two unrelated sentences joined by the <|endoftext|> marker
tokenizer = SimpleTokenizerV2(vocab)

text1 = "This Chirstmas is cold, but we have Wallace & Gromit!"
text2 = "Totally! And Feathers McGraw, too! Cracking!"

text = " <|endoftext|> ".join([text1, text2])

print(text)

This Chirstmas is cold, but we have Wallace & Gromit! <|endoftext|> Totally! And Feathers McGraw, too! Cracking!


In [24]:
# Encode the combined text. In the vocab printed earlier, 1131 is <|unk|> and
# 1130 is <|endoftext|>, so each 1131 below is a token missing from the vocab.
ids = tokenizer.encode(text)
print(ids)

[97, 1131, 584, 1131, 5, 239, 1131, 530, 1131, 1131, 1131, 0, 1130, 1131, 0, 14, 1131, 1131, 5, 1020, 0, 1131, 0]


In [25]:
# Round trip through the V2 tokenizer: out-of-vocabulary tokens come back as
# the literal <|unk|> marker, so the original words are not recoverable.
sample_text = tokenizer.decode(tokenizer.encode(text))

print(sample_text)

This <|unk|> is <|unk|>, but <|unk|> have <|unk|> <|unk|> <|unk|>! <|endoftext|> <|unk|>! And <|unk|> <|unk|>, too! <|unk|>!


2.5. Byte Pair Encoding (BPE):
In brief, Byte Pair Encoding (BPE) builds the vocabulary by iteratively merging frequent characters into subwords and
frequent subwords into words.


For example: With BPE, the word "surely" can be broken into 2 tokens: "sure" and "ly"


The advantages of BPE are as follows:
- Reducing Out-Of-Vocabulary (OOV) issues: BPE breaks down rare or unseen words into smaller subword units, ensuring that even OOV words can be represented effectively. Because any word can be composed from subwords, a compact, fixed-size vocabulary is enough to cover open-ended text, reducing issues related to OOV words.
- Improving morphological representation: BPE can effectively capture meaningful subword units like prefixes, roots, and suffixes, which are essential for understanding the structure of words, particularly in morphologically rich languages.
- Reducing sparsity in language models: A vocabulary composed of subwords ensures that most sequences of tokens have been seen during training. This reduces sparsity in the training data and helps models generalize better.


To improve the robustness of vocabulary representation, BPE-dropout can be used instead of BPE. BPE-dropout is a tokenization technique that introduces stochasticity into the BPE merge process. Unlike standard BPE, which always applies the same set of merge operations to tokenize a word or text, BPE-dropout randomly skips certain merge operations during tokenization.


We can use different tokenizers through the tiktoken library.

In [26]:
# GPT-2 tokenizer
# NOTE: this rebinds `tokenizer`, which previously held a SimpleTokenizerV2.
import tiktoken

tokenizer = tiktoken.get_encoding("gpt2")

In [27]:
# NOTE(review): the two adjacent string literals are concatenated with no
# separator, so the text reads "terracesof" (visible in the decoded output).
# Presumably a space was intended; confirm before changing it, because the
# recorded token IDs depend on the exact text.
text = (
    "Hello, do you like tea? <|endoftext|> In the sunlit terraces"
     "of someunknownPlace."
)

# <|endoftext|> is passed explicitly as an allowed special token
ids = tokenizer.encode(text, allowed_special={"<|endoftext|>"})

print(ids)

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 1659, 617, 34680, 27271, 13]


In [29]:
strings = tokenizer.decode(ids)

print(strings)

Hello, do you like tea? <|endoftext|> In the sunlit terracesof someunknownPlace.


2.6. Data sampling with a sliding window
We train LLMs to generate one word at a time, so we want to prepare the training data accordingly where the next word in a sequence represents the target to predict

In [None]:
# Tokenize "The Verdict" with GPT-2's tokenizer
# (`data` is the full text loaded at the top of the notebook)
enc_text = tokenizer.encode(data)
# Total number of token IDs in the encoded text
print(len(enc_text))

5145


- For each text chunk, we want the inputs and targets
- Since we want the model to predict the next word, the targets are the inputs shifted by one position to the right

In [32]:
# Work with the token IDs from position 50 onward (skips the first 50 tokens)
enc_sample = enc_text[50:]

In [33]:
# Number of tokens in each input window
context_size = 4

# x: the first `context_size` tokens; y: the same window moved one token ahead,
# so y[k] is the token that follows x[k] in the text
x = enc_sample[:context_size]
y = enc_sample[1:context_size+1]

# The padding in the second label lines the two printed lists up
print(f"x: {x}")
print(f"y:      {y}")

x: [290, 4920, 2241, 287]
y:      [4920, 2241, 287, 257]


One by one, the predictions would look as follows:

In [34]:
# Each prefix of length i is a context; the token right after it is the target
for i in range(1, context_size+1):
    context = enc_sample[:i]
    desired = enc_sample[i]

    print(context, "---->", desired)

[290] ----> 4920
[290, 4920] ----> 2241
[290, 4920, 2241] ----> 287
[290, 4920, 2241, 287] ----> 257


In [35]:
# Same context/target pairs as above, decoded back to text for readability
for i in range(1, context_size+1):
    context = enc_sample[:i]
    desired = enc_sample[i]

    # decode() takes a list of IDs, hence the single target is wrapped in a list
    print(tokenizer.decode(context), "---->", tokenizer.decode([desired]))

 and ---->  established
 and established ---->  himself
 and established himself ---->  in
 and established himself in ---->  a


Create dataset and dataloader that extract chunks from the input text dataset

In [36]:
import torch
from torch.utils.data import Dataset, DataLoader


class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-ID tensors.

    The text is tokenized once, then cut into windows of ``max_length``
    tokens whose start positions are ``stride`` tokens apart. Each target is
    its input window moved one token further along the text.

    Args:
        txt: Raw text to tokenize.
        tokenizer: Object whose ``encode(txt, allowed_special=...)`` returns
            a sequence of token IDs.
        max_length: Number of tokens in every input and target window.
        stride: Distance in tokens between consecutive window starts.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        # Encode the full text up front
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # A window may only start where a complete target (one token further
        # along) still fits inside token_ids
        window_starts = range(0, len(token_ids) - max_length, stride)

        self.input_ids = [
            torch.tensor(token_ids[start:start + max_length])
            for start in window_starts
        ]
        self.target_ids = [
            torch.tensor(token_ids[start + 1:start + max_length + 1])
            for start in window_starts
        ]

    def __len__(self):
        """Number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` tensor pair at position ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]

In [37]:
def create_dataloader_v1(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0):
    """Build a DataLoader over sliding-window chunks of ``txt``.

    Args:
        txt: Raw text to tokenize with the GPT-2 encoding.
        batch_size: Number of samples per batch.
        max_length: Number of tokens per sample window.
        stride: Distance in tokens between consecutive window starts.
        shuffle: Passed through to ``DataLoader``.
        drop_last: Passed through to ``DataLoader``.
        num_workers: Passed through to ``DataLoader``.

    Returns:
        A ``DataLoader`` wrapping a ``GPTDatasetV1`` built from ``txt``.
    """
    # Options handed to the DataLoader unchanged
    loader_options = dict(
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

    # The GPT-2 encoding turns the text into token IDs for the dataset
    gpt2_encoding = tiktoken.get_encoding("gpt2")
    windows = GPTDatasetV1(txt, gpt2_encoding, max_length, stride)

    return DataLoader(windows, **loader_options)

Testing the dataloader with a batch size of 1 for an LLM with a context of size 4

In [38]:
# shuffle=False keeps the windows in text order, so batches can be compared
# by eye; stride=1 moves the window forward one token per sample
dataloader = create_dataloader_v1(
    data, batch_size=1, max_length=4, stride=1, shuffle=False
)

# Pull batches manually; each batch is an [inputs, targets] pair of tensors
data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

[tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]


In [39]:
# Next batch from the same iterator: with stride=1 it is the previous window
# moved forward by one token
second_batch = next(data_iter)
print(second_batch)

[tensor([[ 367, 2885, 1464, 1807]]), tensor([[2885, 1464, 1807, 3619]])]


We increase the stride here so that we don't have overlaps between the batches, since more overlap could lead to increased overfitting

Exercise 2.2: 
(1) max_length=2 and stride=2
(2) max_length=8 and stride=2

In [40]:
# For (1): max_length=2 and stride=2. The stride equals the window length,
# so consecutive input rows share no tokens.
dataloader = create_dataloader_v1(
    data, batch_size=4, max_length=2, stride=2, shuffle=False
)
data_iter = iter(dataloader)
# Each batch unpacks into an inputs tensor and a targets tensor
inputs, targets = next(data_iter)
print("Inputs:\n", inputs)
print("\nTargets:\n", targets)

Inputs:
 tensor([[  40,  367],
        [2885, 1464],
        [1807, 3619],
        [ 402,  271]])

Targets:
 tensor([[  367,  2885],
        [ 1464,  1807],
        [ 3619,   402],
        [  271, 10899]])


In [41]:
# For (2): max_length=8 and stride=2. The stride is smaller than the window
# length, so each row repeats 6 tokens of the row before it.
dataloader = create_dataloader_v1(
    data, batch_size=8, max_length=8, stride=2, shuffle=False
)
data_iter = iter(dataloader)
# Each batch unpacks into an inputs tensor and a targets tensor
inputs, targets = next(data_iter)
print("Inputs:\n", inputs)
print("\nTargets:\n", targets)

Inputs:
 tensor([[   40,   367,  2885,  1464,  1807,  3619,   402,   271],
        [ 2885,  1464,  1807,  3619,   402,   271, 10899,  2138],
        [ 1807,  3619,   402,   271, 10899,  2138,   257,  7026],
        [  402,   271, 10899,  2138,   257,  7026, 15632,   438],
        [10899,  2138,   257,  7026, 15632,   438,  2016,   257],
        [  257,  7026, 15632,   438,  2016,   257,   922,  5891],
        [15632,   438,  2016,   257,   922,  5891,  1576,   438],
        [ 2016,   257,   922,  5891,  1576,   438,   568,   340]])

Targets:
 tensor([[  367,  2885,  1464,  1807,  3619,   402,   271, 10899],
        [ 1464,  1807,  3619,   402,   271, 10899,  2138,   257],
        [ 3619,   402,   271, 10899,  2138,   257,  7026, 15632],
        [  271, 10899,  2138,   257,  7026, 15632,   438,  2016],
        [ 2138,   257,  7026, 15632,   438,  2016,   257,   922],
        [ 7026, 15632,   438,  2016,   257,   922,  5891,  1576],
        [  438,  2016,   257,   922,  5891,  1576,   43

We can see that there are overlaps between batches. This can lead to overfitting during training. Increasing the stride can solve this issue.