In [1]:
## Imports

%load_ext autoreload
%autoreload 2

import re
import tiktoken
import torch
from torch.utils.data import Dataset, DataLoader

In [2]:
with open("../data/the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()
print("Total number of character:", len(raw_text))
print(raw_text[:99])

Total number of character: 20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


### Section 2.2 : Tokenization -> will work by first splitting the text on all sorts of punctuations and whitespaces, and dropping the whitespaces.

In [3]:
preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
preprocessed = [item.strip() for item in preprocessed if item.strip()]
print(len(preprocessed))

4690


In [4]:
print(preprocessed[:30])

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']


## Section 2.3 : Tokens to Token IDs -> take all unique items in the preprocessed text, sort, then create a dict of item to item index in the sorted list. We can then create complete tokenizer classes by using these vocabs as input. It will have a function to encode the text using the vocab dict, and a function to decode a list of ids to the corresponding text using the inverse dictionary.

In [5]:
all_words = sorted(set(preprocessed))
vocab_size = len(all_words)
print(vocab_size)

1130


In [6]:
vocab = {token:integer for integer,token in enumerate(all_words)}
print({k:v for k,v in vocab.items() if v<51})

{'!': 0, '"': 1, "'": 2, '(': 3, ')': 4, ',': 5, '--': 6, '.': 7, ':': 8, ';': 9, '?': 10, 'A': 11, 'Ah': 12, 'Among': 13, 'And': 14, 'Are': 15, 'Arrt': 16, 'As': 17, 'At': 18, 'Be': 19, 'Begin': 20, 'Burlington': 21, 'But': 22, 'By': 23, 'Carlo': 24, 'Chicago': 25, 'Claude': 26, 'Come': 27, 'Croft': 28, 'Destroyed': 29, 'Devonshire': 30, 'Don': 31, 'Dubarry': 32, 'Emperors': 33, 'Florence': 34, 'For': 35, 'Gallery': 36, 'Gideon': 37, 'Gisburn': 38, 'Gisburns': 39, 'Grafton': 40, 'Greek': 41, 'Grindle': 42, 'Grindles': 43, 'HAD': 44, 'Had': 45, 'Hang': 46, 'Has': 47, 'He': 48, 'Her': 49, 'Hermia': 50}


#### Complete tokenizer class

In [7]:
class SimpleTokenizerV1:
    def __init__(self, vocab):
        self.str_to_int = vocab            
        self.int_to_str = {i:s for s,i in vocab.items()}        

    def encode(self, text):      
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        preprocessed = [
            item.strip() for item in preprocessed if item.strip()
        ]
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids

    def decode(self, ids):         
        text = " ".join([self.int_to_str[i] for i in ids]) 
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)    
        return text

In [8]:
tokenizer = SimpleTokenizerV1(vocab = vocab) 
text = '''
    "It's the last he painted, you know," 
       Mrs. Gisburn said with pardonable pride.
'''
ids = tokenizer.encode(text)
print(ids)

[1, 56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]


In [9]:
print(tokenizer.decode(ids))

" It' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.


## Section 2.4 : Adding special context tokens -> extend the vocab by adding tokens for unknown texts and end of texts. Otherwise the dict call will error out for unknown texts (text that does not exist in the vocab natively)

In [10]:
all_tokens = sorted(list(set(preprocessed)))
all_tokens.extend(["<|endoftext|>", "<|unk|>"])
vocab = {token:integer for integer,token in enumerate(all_tokens)}

print(len(vocab.items()))

1132


In [11]:
print({k:v for k,v in vocab.items() if v > len(vocab.items())-5})

{'your': 1128, 'yourself': 1129, '<|endoftext|>': 1130, '<|unk|>': 1131}


#### New Tokenizer class

In [12]:
class SimpleTokenizerV2:
    def __init__(self, vocab):
        self.str_to_int = vocab            
        self.int_to_str = {i:s for s,i in vocab.items()}        

    def encode(self, text):      
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        preprocessed = [
            item.strip() for item in preprocessed if item.strip()
        ]
        preprocessed = [
            item if item in self.str_to_int else "<|unk|>" for item in preprocessed
        ]
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids

    def decode(self, ids):         
        text = " ".join([self.int_to_str[i] for i in ids]) 
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)    
        return text

In [13]:
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."
text = " <|endoftext|> ".join((text1, text2))
print(text)

Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.


In [14]:
tokenizer = SimpleTokenizerV2(vocab)
print(tokenizer.encode(text))

[1131, 5, 355, 1126, 628, 975, 10, 1130, 55, 988, 956, 984, 722, 988, 1131, 7]


In [15]:
print(tokenizer.decode(tokenizer.encode(text)))

<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.


## Section 2.5 : Byte pair encoding -> this can deal with any unknown word by splitting into smaller subtokens (could be single characters, combination of characters etc depending upon their frequency of usage)

In [16]:
tokenizer = tiktoken.get_encoding("gpt2")

In [17]:
text = (
    "Hello, do you like tea? <|endoftext|> In the sunlit terraces"
     "of someunknownPlace."
)
integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print(integers)

strings = tokenizer.decode(integers)
print(strings)

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 1659, 617, 34680, 27271, 13]
Hello, do you like tea? <|endoftext|> In the sunlit terracesof someunknownPlace.


In [18]:
tokenizer.encode(text="Akwirw ier")

[33901, 86, 343, 86, 220, 959]

In [19]:
tokenizer.decode(tokenizer.encode(text="Akwirw ier"))

'Akwirw ier'

#### In short, BPE builds its vocabulary by iteratively merging frequent characters into subwords and frequent subwords into words. For example, BPE starts with adding all individual single characters to its vocabulary (“a,” “b,” etc.). In the next stage, it merges character combinations that frequently occur together into subwords. For example, “d” and “e” may be merged into the subword “de,” which is common in many English words like “define,” “depend,” “made,” and “hidden.” The merges are determined by a frequency cutoff.

## Section 2.6 : Data sampling with a sliding window - LLMs learn to predict the next word, one word at a time. So create input-target pairs for training

####  Let's encode the raw text using BPE first and take a sample by removing the first 50 words

In [20]:
enc_text = tokenizer.encode(raw_text)
print(len(enc_text))
enc_sample = enc_text[50:]

5145


#### We need to work with texts of a certain context size (context will help define meaning to the sentences/phrases). Let's say context size is 4. Then our input container "x" is a collection of 4 consective text items, while the target container "y" is the same but shifted by one position to the right.

In [21]:
context_size = 4         
x = enc_sample[:context_size]
y = enc_sample[1:context_size+1]
print(f"x: {x}")
print(f"y:      {y}")

x: [290, 4920, 2241, 287]
y:      [4920, 2241, 287, 257]


#### For each such input-target pair, we can then set up the next word prediction task by iterating over, appending one item at a time.

In [22]:
for i in range(1, context_size+1):
    context = enc_sample[:i]
    desired = enc_sample[i]
    print(context, "---->", desired)

[290] ----> 4920
[290, 4920] ----> 2241
[290, 4920, 2241] ----> 287
[290, 4920, 2241, 287] ----> 257


In [23]:
for i in range(1, context_size+1):
    context = enc_sample[:i]
    desired = enc_sample[i]
    print(tokenizer.decode(context), "---->", tokenizer.decode([desired]))

 and ---->  established
 and established ---->  himself
 and established himself ---->  in
 and established himself in ---->  a


#### Wrapping everything above into a single dataloader class that will 
    - take as input the text, the tokenizer, the context size(max length) and a stride parameter
    - tokenize the text
    - then iterate over the token ids to create the input and output container tensors
    - with additional functions to return len of the inputs and input-target pairs

In [24]:
class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []

        token_ids = tokenizer.encode(txt)    

        for i in range(0, len(token_ids) - max_length, stride):     
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i + 1: i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):    
        return len(self.input_ids)

    def __getitem__(self, idx):         
        return self.input_ids[idx], self.target_ids[idx]

In [25]:
def create_dataloader_v1(
    txt, 
    batch_size=4, 
    max_length=256,
    stride=128, 
    shuffle=True, 
    drop_last=True,
    num_workers=0
):
    tokenizer = tiktoken.get_encoding("gpt2")                         
    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)   
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,     
        num_workers=num_workers     
    )
    ### drop_last=True drops the last batch if it is shorter 
    ### than the specified batch_size to prevent loss spikes during training.
    return dataloader

In [26]:
dataloader = create_dataloader_v1(
    raw_text, 
    batch_size=1, 
    max_length=4, 
    stride=1, 
    shuffle=False
)
data_iter = iter(dataloader)     
first_batch = next(data_iter)
print(first_batch)
second_batch = next(data_iter)
print(second_batch)

[tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]
[tensor([[ 367, 2885, 1464, 1807]]), tensor([[2885, 1464, 1807, 3619]])]


##### The above makes the stride parameter clear : the input shifts by the stride amount, here = 1, from one batch to the next.Test out below with different context sizes and stride values.

In [27]:
dataloader = create_dataloader_v1(
    raw_text, 
    batch_size=1, 
    max_length=2, 
    stride=2, 
    shuffle=False
)
data_iter = iter(dataloader)     
first_batch = next(data_iter)
print(first_batch)
second_batch = next(data_iter)
print(second_batch)

[tensor([[ 40, 367]]), tensor([[ 367, 2885]])]
[tensor([[2885, 1464]]), tensor([[1464, 1807]])]


In [28]:
dataloader = create_dataloader_v1(
    raw_text, 
    batch_size=1, 
    max_length=8, 
    stride=4, 
    shuffle=False
)
data_iter = iter(dataloader)     
first_batch = next(data_iter)
print(first_batch)
second_batch = next(data_iter)
print(second_batch)

[tensor([[  40,  367, 2885, 1464, 1807, 3619,  402,  271]]), tensor([[  367,  2885,  1464,  1807,  3619,   402,   271, 10899]])]
[tensor([[ 1807,  3619,   402,   271, 10899,  2138,   257,  7026]]), tensor([[ 3619,   402,   271, 10899,  2138,   257,  7026, 15632]])]


In [29]:
#### Now experiment with batch size as well. 

In [30]:
dataloader = create_dataloader_v1(
    raw_text, batch_size=8, max_length=4, stride=2,
    shuffle=False
)

data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("Inputs:\n", inputs)
print("\nTargets:\\n", targets)

Inputs:
 tensor([[   40,   367,  2885,  1464],
        [ 2885,  1464,  1807,  3619],
        [ 1807,  3619,   402,   271],
        [  402,   271, 10899,  2138],
        [10899,  2138,   257,  7026],
        [  257,  7026, 15632,   438],
        [15632,   438,  2016,   257],
        [ 2016,   257,   922,  5891]])

Targets:\n tensor([[  367,  2885,  1464,  1807],
        [ 1464,  1807,  3619,   402],
        [ 3619,   402,   271, 10899],
        [  271, 10899,  2138,   257],
        [ 2138,   257,  7026, 15632],
        [ 7026, 15632,   438,  2016],
        [  438,  2016,   257,   922],
        [  257,   922,  5891,  1576]])


#### In the above, there are batch_size number of rows in input and target (here = 8). For each corresponding row, target is shifted by 1 position compared to input. The stride determines the shift between consecutive rows of inputs (here = 2).

## Section 2.7 : Creating token embeddings -> use torch.nn.Embedding on the vocab size and chosen output dimension of embedding vector

In [31]:
input_ids = torch.tensor([2, 3, 5, 1])
vocab_size = 6
output_dim = 3
torch.manual_seed(123)
embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
print(embedding_layer.weight)

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)


#### In this toy example, there are 6 possible tokens as determined by vocab_size, and we have chosen each embedding vector to have a dimension of 3 as determined by output_dim. Hence, the torch.nn.Embedding creates a randomized tensor of shape = vocab_size * output_dim, representing each of these token embedding vectors. So a token id = 2 will correspond to the third row of this tensor, as verified below.

In [32]:
print(embedding_layer(torch.tensor([2])))

tensor([[ 1.2753, -0.2010, -0.1606]], grad_fn=<EmbeddingBackward0>)


In [33]:
print(embedding_layer(input_ids))

tensor([[ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-2.8400, -0.7849, -1.4096],
        [ 0.9178,  1.5810,  1.3010]], grad_fn=<EmbeddingBackward0>)


## Section 2.8 Encoding word positions

In [34]:
vocab_size = 50257
output_dim = 256
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [35]:
max_length = 4
dataloader = create_dataloader_v1(
    raw_text, batch_size=8, max_length=max_length,
   stride=max_length, shuffle=False
)
data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("Token IDs:\n", inputs)
print("\nInputs shape:\\n", inputs.shape)

Token IDs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

Inputs shape:\n torch.Size([8, 4])


In [36]:
token_embeddings = token_embedding_layer(inputs)
print(token_embeddings.shape)

torch.Size([8, 4, 256])


#### Absolute positional encoding 

In [37]:
context_length = max_length
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)
pos_embeddings = pos_embedding_layer(torch.arange(context_length))
print(pos_embeddings.shape)

torch.Size([4, 256])


In [38]:
input_embeddings = token_embeddings + pos_embeddings
print(input_embeddings.shape)

torch.Size([8, 4, 256])


## To summarize : 
- read text file and convert to tokens
- create token id to token dictionary and reverse as well.
- Use torch to create token embedding vectors using vocab size and chosen output dimension of vectors
- self-attention is position-agnostic, so same tokens from different parts of the text have same embedding vectors. We can create positional embedding vectors.
- absolute positional embedding and relative positional embeddings.
- add the token embedding vector to positional embedding vector to create the final input embeddings, which will be a tensor of dimensions = batch_size * context size * output dim 