## Tokenization from the previous

In [13]:
import tiktoken
import torch
from torch.utils.data import Dataset, DataLoader

class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []
        
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i : i + max_length]
            target_chunk = token_ids[i + 1 : i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]

def create_dataloader_v1(txt, batch_size=2, max_length=256,
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0):
    
    tokenizer = tiktoken.get_encoding("gpt2")

    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)

    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers
    )

    return dataloader    

raw_text = "Yours journey start with one steps."
data_loader = create_dataloader_v1(raw_text, batch_size=2, max_length=4, stride=4, shuffle=False)

vocab_size = 50257
output_dim = 3

token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

max_length = 4
dataloader = create_dataloader_v1(
    raw_text, batch_size=1, max_length=max_length,
    stride=max_length, shuffle=False
)
data_iter = iter(dataloader)
inputs, targets = next(data_iter)

token_embeddings = token_embedding_layer(inputs)
token_embeddings.shape

context_length = max_length
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)

pos_embedding = pos_embedding_layer(torch.arange(max_length))
input_embeddings = token_embeddings + pos_embedding

print("inputs =", inputs)
print(input_embeddings)

inputs = tensor([[  56, 4662, 7002,  923]])
tensor([[[ 0.4072, -1.8805,  1.4397],
         [-0.5533,  0.3557,  0.1620],
         [-0.3950,  1.1908, -0.5788],
         [-0.4490,  0.2006,  0.0126]]], grad_fn=<AddBackward0>)


## Simple self-attention

In [28]:
input1 = input_embeddings[0][0]
input2 = input_embeddings[0][2]
input3 = input_embeddings[0][3]
result1 = torch.dot(input2, input1)
result2 = torch.dot(input3, input1)
print("Input 1:", result1, "Input 2:", result2)

Input 1: tensor(-3.2334, grad_fn=<DotBackward0>) Input 2: tensor(-0.5419, grad_fn=<DotBackward0>)


In [36]:
import torch.nn.functional as F
embeddings = input_embeddings.squeeze(0)
attention_scores = torch.matmul(embeddings, embeddings.transpose(0, 1))
print("Attention Scores (dot products):")
print(attention_scores)


Attention Scores (dot products):
tensor([[ 5.7747, -0.6610, -3.2334, -0.5419],
        [-0.6610,  0.4589,  0.5484,  0.3218],
        [-3.2334,  0.5484,  1.9090,  0.4089],
        [-0.5419,  0.3218,  0.4089,  0.2420]], grad_fn=<MmBackward0>)


In [37]:
for i in range(4):
    for j in range(4):
        print(f"w{i+1}{j+1} = {attention_scores[i][j]:.4f}")

w11 = 5.7747
w12 = -0.6610
w13 = -3.2334
w14 = -0.5419
w21 = -0.6610
w22 = 0.4589
w23 = 0.5484
w24 = 0.3218
w31 = -3.2334
w32 = 0.5484
w33 = 1.9090
w34 = 0.4089
w41 = -0.5419
w42 = 0.3218
w43 = 0.4089
w44 = 0.2420


In [34]:
attention_weights = F.softmax(attention_scores, dim=-1)
print("\nAttention Weights (after softmax):")
print(attention_weights)


Attention Weights (after softmax):
tensor([[9.9648e-01, 1.5977e-03, 1.2198e-04, 1.7996e-03],
        [9.9131e-02, 3.0379e-01, 3.3221e-01, 2.6487e-01],
        [3.9338e-03, 1.7267e-01, 6.7320e-01, 1.5020e-01],
        [1.2270e-01, 2.9105e-01, 3.1754e-01, 2.6871e-01]],
       grad_fn=<SoftmaxBackward0>)


In [35]:
# Print individual weights
print("\nIndividual attention weights:")
for i in range(4):
    for j in range(4):
        print(f"w{i+1}{j+1} = {attention_weights[i][j]:.4f}")


Individual attention weights:
w11 = 0.9965
w12 = 0.0016
w13 = 0.0001
w14 = 0.0018
w21 = 0.0991
w22 = 0.3038
w23 = 0.3322
w24 = 0.2649
w31 = 0.0039
w32 = 0.1727
w33 = 0.6732
w34 = 0.1502
w41 = 0.1227
w42 = 0.2910
w43 = 0.3175
w44 = 0.2687


compute the context vector

In [40]:
context_vectors = torch.matmul(attention_weights, embeddings)
context_vectors

tensor([[ 0.4041, -1.8728,  1.4348],
        [-0.3778,  0.3704,  0.0030],
        [-0.4273,  0.8858, -0.3541],
        [-0.3571,  0.3049,  0.0434]], grad_fn=<MmBackward0>)