# Tokenization

# 2.1 Building a Simple Tokenizer - with regex | With Example

In [153]:
import re

text = "Hello, world. This, is a test.?"

result = re.split(r'([,.:;?_!"()\']|--|\s)', text)
result = [item for item in result if item.strip()]
print("Text :", result)


Text : ['Hello', ',', 'world', '.', 'This', ',', 'is', 'a', 'test', '.', '?']


# 2.2 Converting tokens into token IDs

In [154]:
all_words = sorted(set(result))
vocab_size = len(all_words)
vocab = {token:integer for integer,token in enumerate(all_words)}
print("Vocab Size : ",vocab_size)
print(vocab)

Vocab Size :  9
{',': 0, '.': 1, '?': 2, 'Hello': 3, 'This': 4, 'a': 5, 'is': 6, 'test': 7, 'world': 8}


# 2.3 Putting it now all together into a tokenizer class

In [155]:
class SimpleTokenizerV1:
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = {i:s for s,i in vocab.items()}
    
    def encode(self, text):
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
                                
        preprocessed = [
            item.strip() for item in preprocessed if item.strip()
        ]
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids
        
    def decode(self, ids):
        text = " ".join([self.int_to_str[i] for i in ids])
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
        return text



tokenizer = SimpleTokenizerV1(vocab)
ids = tokenizer.encode(text)
print("Tokens Ids: ",ids)
print("Decoded Tokens: ",tokenizer.decode(ids))

Tokens Ids:  [3, 0, 8, 1, 4, 0, 6, 5, 7, 1, 2]
Decoded Tokens:  Hello, world. This, is a test.?


# 2.4 Adding special context tokens 
* It's useful to add some "special" tokens for unknown words and to denote the end of a text

In [156]:
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."
text = " <|endoftext|> ".join((text1, text2))

 # Repeating above steps again below
result = re.split(r'([,.:;?_!"()\']|--|\s)', text)
result = [item for item in result if item.strip()]
print("Text :", result)


all_tokens = sorted(list(set(result)))
all_tokens.extend(["<|endoftext|>", "<|unk|>"])
vocab_size = len(all_tokens)
vocab = {token:integer for integer,token in enumerate(all_tokens)}
print("Vocab Size -",len(vocab.items()))


class SimpleTokenizerV2:
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = { i:s for s,i in vocab.items()}
    
    def encode(self, text):
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        preprocessed = [
            item if item in self.str_to_int 
            else "<|unk|>" for item in preprocessed
        ]

        ids = [self.str_to_int[s] for s in preprocessed]
        return ids
        
    def decode(self, ids):
        text = " ".join([self.int_to_str[i] for i in ids])
        text = re.sub(r'\s+([,.:;?!"()\'])', r'\1', text)
        return text
    


tokenizer = SimpleTokenizerV2(vocab)
print("Tokens Ids : ", tokenizer.encode(text))
print("Decoded Tokens :",tokenizer.decode(tokenizer.encode(text)))


Text : ['Hello', ',', 'do', 'you', 'like', 'tea', '?', '<|endoftext|>', 'In', 'the', 'sunlit', 'terraces', 'of', 'the', 'palace', '.']
Vocab Size - 16
Tokens Ids :  [4, 0, 6, 14, 7, 11, 3, 15, 5, 13, 10, 12, 8, 13, 9, 1]
Decoded Tokens : Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.




# 2.5 BytePair encoding 
* GPT-2 used BytePair encoding (BPE) as its tokenizer
* it allows the model to break down words that aren't in its predefined vocabulary into smaller subword units or even individual characters, enabling it to handle out-of-vocabulary words
* For instance, if GPT-2's vocabulary doesn't have the word "unfamiliarword," it might tokenize it as ["unfam", "iliar", "word"] or some other subword breakdown, depending on its trained BPE merges
* In this chapter, we are using the BPE tokenizer from OpenAI's open-source tiktoken library,


In [157]:
import importlib
import tiktoken

print("tiktoken version:", importlib.metadata.version("tiktoken"))
tokenizer = tiktoken.get_encoding("gpt2")


text = (
    "Hello, do you like tea? <|endoftext|> In the sunlit terraces"
    "of someunknownPlace."
)

ids = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print("Tokens Ids :  ", ids)
print("Decoded Tokens :",tokenizer.decode(ids))



tiktoken version: 0.11.0
Tokens Ids :   [15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 1659, 617, 34680, 27271, 13]
Decoded Tokens : Hello, do you like tea? <|endoftext|> In the sunlit terracesof someunknownPlace.


# 2.6 Data sampling with a sliding window

In [158]:
enc_sample = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print("Tokens Ids :  ", enc_sample)
context_size = 4

x = enc_sample[:context_size]
y = enc_sample[1:context_size+1]

print("\n")
print(f"x: {x}")
print(f"y:        {y}")
print("\n")
for i in range(1, context_size+1):
    context = enc_sample[:i]
    desired = enc_sample[i]
    print(context, "---->", desired, " || ", tokenizer.decode(context), "---->", tokenizer.decode([desired]))


Tokens Ids :   [15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 1659, 617, 34680, 27271, 13]


x: [15496, 11, 466, 345]
y:        [11, 466, 345, 588]


[15496] ----> 11  ||  Hello ----> ,
[15496, 11] ----> 466  ||  Hello, ---->  do
[15496, 11, 466] ----> 345  ||  Hello, do ---->  you
[15496, 11, 466, 345] ----> 588  ||  Hello, do you ---->  like


* We implement a simple data loader that iterates over the input dataset and returns the inputs and targets shifted by one
* Create dataset and dataloader that extract chunks from the input text dataset

In [159]:

import torch
print("PyTorch version:", torch.__version__)


from torch.utils.data import Dataset, DataLoader

class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []

        # Tokenize the entire text
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})
        assert len(token_ids) > max_length, "Number of tokenized inputs must at least be equal to max_length+1"

        # Use a sliding window to chunk the book into overlapping sequences of max_length
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i + 1: i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]
    
def create_dataloader_v1(txt, batch_size=4, max_length=256, stride=128, shuffle=True, drop_last=True,num_workers=0):

    # Initialize the tokenizer
    tokenizer = tiktoken.get_encoding("gpt2")

    # Create dataset
    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)

    # Create dataloader
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers
    )

    return dataloader

max_length = 4
dataloader = create_dataloader_v1(
    text, batch_size=1, max_length=4, stride=1, shuffle=False
)


# Note - More overlap could lead to increased overfitting
dataloader = create_dataloader_v1(text, batch_size=3, max_length=4, stride=4, shuffle=False)

data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("Inputs:\n", inputs)
print("\nTargets:\n", targets)


PyTorch version: 2.2.2
Inputs:
 tensor([[15496,    11,   466,   345],
        [  588,  8887,    30,   220],
        [50256,   554,   262,  4252]])

Targets:
 tensor([[   11,   466,   345,   588],
        [ 8887,    30,   220, 50256],
        [  554,   262,  4252, 18250]])


# 2.7 Creating token embeddings

In [160]:
output_dim = 256
token_embedding_layer = torch.nn.Embedding(tokenizer.n_vocab, output_dim)
print("Embedding Layer's Weight: \n ",token_embedding_layer.weight)

print("Token IDs:\n", inputs)
print("\nInputs shape:", inputs.shape)
token_embeddings = token_embedding_layer(inputs)
print("Token Embeddings Shape:", token_embeddings.shape)

# print("\n After Embedding Input Token: \n",token_embedding_layer(inputs))

Embedding Layer's Weight: 
  Parameter containing:
tensor([[ 7.3013e-01, -6.2206e-02,  1.5696e+00,  ..., -1.5445e-01,
          6.1283e-01,  8.4908e-02],
        [ 3.7902e-01,  5.7276e-01,  4.8475e-01,  ..., -2.9849e-01,
         -4.6095e-01, -3.4968e-01],
        [-1.1645e+00, -3.3439e-01,  8.5036e-01,  ...,  4.5088e-02,
          4.3319e-01,  1.3975e+00],
        ...,
        [ 6.3251e-01, -2.4393e-01,  2.6448e-01,  ..., -4.4956e-01,
          9.0974e-02,  3.8077e-01],
        [ 2.2503e+00, -5.7059e-01,  2.7960e-01,  ...,  1.7222e-01,
         -9.5370e-04, -6.5163e-02],
        [-1.3264e+00,  4.4279e-01, -1.5576e-01,  ...,  9.7685e-01,
          2.6135e-01, -6.0856e-01]], requires_grad=True)
Token IDs:
 tensor([[15496,    11,   466,   345],
        [  588,  8887,    30,   220],
        [50256,   554,   262,  4252]])

Inputs shape: torch.Size([3, 4])
Token Embeddings Shape: torch.Size([3, 4, 256])


# 2.8 Encoding word positions
* Embedding layer convert IDs into identical vector representations regardless of where they are located in the input sequence
* Positional embeddings are combined with the token embedding vector to form the input embeddings for a large language model

In [161]:
context_length = max_length
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)
pos_embeddings = pos_embedding_layer(torch.arange(max_length))
input_embeddings = token_embeddings + pos_embeddings
print(input_embeddings.shape)

torch.Size([3, 4, 256])
