## Tokenization and input embeddings (recap of the previous section)

In [49]:
import tiktoken
import torch
from torch.utils.data import Dataset, DataLoader


class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-id tensors.

    The text is encoded once with ``tokenizer.encode`` (allowing the
    ``<|endoftext|>`` special token). Windows of ``max_length`` token ids are
    then taken every ``stride`` tokens; the target of each window is the same
    window shifted one token to the right.

    Args:
        txt: Text to encode.
        tokenizer: Object exposing ``encode(text, allowed_special=...)`` and
            returning a sequence of token ids.
        max_length: Number of token ids per input/target window.
        stride: Step between the start positions of consecutive windows.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        encoded = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # The upper bound leaves room for the one-token shift, so every
        # target window is as long as its input window.
        window_starts = range(0, len(encoded) - max_length, stride)

        self.input_ids = [
            torch.tensor(encoded[start : start + max_length])
            for start in window_starts
        ]
        self.target_ids = [
            torch.tensor(encoded[start + 1 : start + max_length + 1])
            for start in window_starts
        ]

    def __len__(self):
        """Return the number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensor pair at ``idx``."""
        pair = (self.input_ids[idx], self.target_ids[idx])
        return pair

def create_dataloader_v1(txt, batch_size=2, max_length=256,
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0):
    """Build a ``DataLoader`` of sliding-window (input, target) batches.

    ``txt`` is encoded with tiktoken's ``"gpt2"`` encoding and wrapped in a
    ``GPTDatasetV1``; the remaining arguments are forwarded to ``DataLoader``.

    Args:
        txt: Text to tokenize and window.
        batch_size: Number of windows per batch.
        max_length: Number of token ids per window.
        stride: Step between the starts of consecutive windows.
        shuffle: Forwarded to ``DataLoader``.
        drop_last: Forwarded to ``DataLoader``.
        num_workers: Forwarded to ``DataLoader``.

    Returns:
        The configured ``torch.utils.data.DataLoader``.
    """
    windows = GPTDatasetV1(
        txt,
        tiktoken.get_encoding("gpt2"),
        max_length,
        stride,
    )

    loader_options = dict(
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )
    return DataLoader(windows, **loader_options)

# Demo text that is tokenized and windowed by create_dataloader_v1.
raw_text = "Yours journey start with one steps."
max_length = 4
# NOTE(review): `data_loader` is not referenced again in the visible part of
# the notebook; the `dataloader` built further down (batch_size=1) is the one
# that is actually consumed. Confirm whether this first loader is still needed.
data_loader = create_dataloader_v1(raw_text, batch_size=2, max_length=max_length, stride=max_length, shuffle=False)

vocab_size = 50257  # presumably the GPT-2 BPE vocabulary size; confirm against the tokenizer
output_dim = 3  # embedding width shared by the token and positional embedding layers

# Lookup table with one output_dim-sized row per vocabulary id.
# The weights are randomly initialised and no seed is set in this cell, so the
# values printed below differ from run to run.
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)


# stride == max_length gives non-overlapping windows; batch_size=1 puts a
# single window in each batch.
dataloader = create_dataloader_v1(
    raw_text, batch_size=1, max_length=max_length,
    stride=max_length, shuffle=False
)
data_iter = iter(dataloader)
inputs, targets = next(data_iter)  # only the first batch is used

token_embeddings = token_embedding_layer(inputs)
# NOTE(review): a bare expression that is not the last line of a cell is not
# displayed, so this shape lookup has no visible effect.
token_embeddings.shape

context_length = max_length
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)

# One embedding row per position 0..max_length-1; the addition broadcasts the
# positional rows over the batch dimension of token_embeddings.
pos_embedding = pos_embedding_layer(torch.arange(max_length))
input_embeddings = token_embeddings + pos_embedding

print("inputs =", inputs)
print(input_embeddings)

inputs = tensor([[  56, 4662, 7002,  923]])
tensor([[[ 0.5002, -1.5425, -1.2515],
         [ 1.4524,  0.8233,  1.5379],
         [ 0.1790,  0.7805,  0.2986],
         [-4.3992, -0.5013, -0.0914]]], grad_fn=<AddBackward0>)


## Simple self-attention

In [50]:
# Dot products between the first token's embedding and two other tokens of
# the single batch element (batch index 0).
# NOTE(review): `input2` and `input3` select positions 2 and 3 (0-based), i.e.
# the third and fourth tokens; position 1 is never used. The printed labels
# "Input 1:" / "Input 2:" actually show dot(input2, input1) and
# dot(input3, input1). Confirm the intended indices and labels.
input1 = input_embeddings[0][0]
input2 = input_embeddings[0][2]
input3 = input_embeddings[0][3]
result1 = torch.dot(input2, input1)
result2 = torch.dot(input3, input1)
print("Input 1:", result1, "Input 2:", result2)

Input 1: tensor(-1.4881, grad_fn=<DotBackward0>) Input 2: tensor(-1.3127, grad_fn=<DotBackward0>)


In [51]:
import torch.nn.functional as F
# Drop the leading batch dimension (a no-op unless that dimension has size 1).
embeddings = torch.squeeze(input_embeddings, 0)
# Pairwise dot products between all token embeddings: entry [i][j] is the
# score of token i against token j.
attention_scores = embeddings @ torch.transpose(embeddings, 0, 1)
print("Attention Scores (dot products):")
print(attention_scores)


Attention Scores (dot products):
tensor([[ 4.1959, -2.4682, -1.4881, -1.3127],
        [-2.4682,  5.1524,  1.3617, -6.9430],
        [-1.4881,  1.3617,  0.7303, -1.2060],
        [-1.3127, -6.9430, -1.2060, 19.6129]], grad_fn=<MmBackward0>)


In [52]:
# Print every pairwise attention score as w<row><col> (1-based indices).
# The loop bounds are read from the tensor instead of a hard-coded 4, so the
# cell keeps working when the number of tokens (max_length) changes.
for i in range(attention_scores.size(0)):
    for j in range(attention_scores.size(1)):
        print(f"w{i+1}{j+1} = {attention_scores[i][j]:.4f}")

w11 = 4.1959
w12 = -2.4682
w13 = -1.4881
w14 = -1.3127
w21 = -2.4682
w22 = 5.1524
w23 = 1.3617
w24 = -6.9430
w31 = -1.4881
w32 = 1.3617
w33 = 0.7303
w34 = -1.2060
w41 = -1.3127
w42 = -6.9430
w43 = -1.2060
w44 = 19.6129


In [53]:
# Normalise each row of scores with softmax over the last dimension, so the
# weights in every row sum to 1.
attention_weights = attention_scores.softmax(dim=-1)
print("\nAttention Weights (after softmax):")
print(attention_weights)


Attention Weights (after softmax):
tensor([[9.9135e-01, 1.2648e-03, 3.3705e-03, 4.0164e-03],
        [4.7918e-04, 9.7744e-01, 2.2071e-02, 5.4591e-06],
        [3.4719e-02, 6.0010e-01, 3.1915e-01, 4.6032e-02],
        [8.1677e-10, 2.9303e-12, 9.0875e-10, 1.0000e+00]],
       grad_fn=<SoftmaxBackward0>)


In [47]:
# Print each attention weight as w<row><col> (1-based indices), four decimals.
# The loop bounds are read from the tensor instead of a hard-coded 4, so the
# cell keeps working when the number of tokens (max_length) changes.
# NOTE(review): this cell's execution count (In [47]) is lower than that of
# the cell that defines attention_weights (In [53]); re-run it after the
# softmax cell so the listing reflects the current weights.
print("\nIndividual attention weights:")
for i in range(attention_weights.size(0)):
    for j in range(attention_weights.size(1)):
        print(f"w{i+1}{j+1} = {attention_weights[i][j]:.4f}")


Individual attention weights:
w11 = 0.9556
w12 = 0.0000
w13 = 0.0000
w14 = 0.0000
w21 = 0.0000
w22 = 0.9999
w23 = 0.0001
w24 = 0.0000
w31 = 0.0938
w32 = 0.0154
w33 = 0.7404
w34 = 0.0106
w41 = 0.1423
w42 = 0.0174
w43 = 0.0364
w44 = 0.0540


Compute the context vectors: each row is the attention-weighted sum of all token embeddings.

In [48]:
# Each context vector is the attention-weighted sum of the rows of `embeddings`.
# NOTE(review): this cell ran as In [48], before the cells that define the
# current `attention_weights` and `embeddings` (In [51], In [53]). Re-run it
# after them; a 4x4 weight matrix times 4x3 embeddings yields a 4x3 result.
context_vectors = attention_weights @ embeddings
context_vectors

tensor([[ 0.9837,  0.2052,  3.3906],
        [-1.0465,  1.8418, -2.4984],
        [-0.7738,  1.2155,  1.4582],
        [ 1.7287,  1.2394,  1.7910],
        [ 2.9638,  1.1900,  1.1359],
        [-0.4400, -3.0991, -1.5225],
        [ 1.3644,  1.2937,  2.5405]], grad_fn=<MmBackward0>)