## Creating the Query, Key, and Value matrices

In [1]:
import tiktoken
import torch
from torch.utils.data import Dataset, DataLoader


class GPTDatasetV1(Dataset):
    """Dataset of (input, target) token-id tensor pairs built from one text.

    The text is encoded once with ``tokenizer.encode`` and then split according
    to how its token count compares with ``max_length``:

    * more tokens than ``max_length``: overlapping windows of ``max_length``
      tokens, started every ``stride`` tokens; each target is its input window
      shifted one token to the right.
    * fewer tokens: a single sample, right-padded with token id 0 up to
      ``max_length``; the target is the padded input shifted left with a 0
      appended.
    * exactly ``max_length`` tokens: a single sample; the target is the input
      shifted left with the last token repeated in the final position.

    Args:
        txt: raw text to encode.
        tokenizer: object exposing ``encode(text, allowed_special=...)`` that
            returns a list of integer token ids.
        max_length: number of tokens in every sample.
        stride: step between the starts of consecutive windows.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})
        n_tokens = len(ids)

        # Build the plain-list (input, target) pairs first; tensors come last.
        if n_tokens > max_length:
            pairs = [
                (ids[start : start + max_length],
                 ids[start + 1 : start + max_length + 1])
                for start in range(0, n_tokens - max_length, stride)
            ]
        elif n_tokens < max_length:
            padded = ids + [0] * (max_length - n_tokens)
            pairs = [(padded, padded[1:] + [0])]
        else:
            pairs = [(ids, ids[1:] + [ids[-1]])]

        self.input_ids = [torch.tensor(source) for source, _ in pairs]
        self.target_ids = [torch.tensor(shifted) for _, shifted in pairs]

    def __len__(self):
        """Number of samples held by the dataset."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensors of sample ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]


def create_dataloader_v1(txt, batch_size=2, max_length=256,
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0):
    """Wrap ``txt`` in a ``GPTDatasetV1`` and return a ``DataLoader`` over it.

    Args:
        txt: raw text to tokenize with the "gpt2" tiktoken encoding.
        batch_size: samples per batch.
        max_length: tokens per sample, forwarded to ``GPTDatasetV1``.
        stride: window step, forwarded to ``GPTDatasetV1``.
        shuffle: forwarded to ``DataLoader``.
        drop_last: forwarded to ``DataLoader``.
        num_workers: forwarded to ``DataLoader``.

    Returns:
        A ``DataLoader`` yielding ``(inputs, targets)`` batches.
    """
    gpt2_encoding = tiktoken.get_encoding("gpt2")
    windowed_text = GPTDatasetV1(txt, gpt2_encoding, max_length, stride)

    loader_options = {
        "batch_size": batch_size,
        "shuffle": shuffle,
        "drop_last": drop_last,
        "num_workers": num_workers,
    }
    return DataLoader(windowed_text, **loader_options)



# Toy sentence used for the rest of the notebook, encoded with the "gpt2" encoding.
raw_text = "Yours journey start with one steps"
tokenizer = tiktoken.get_encoding("gpt2")
token_ids = tokenizer.encode(raw_text, allowed_special={"<|endoftext|>"})

# Context length equals the token count, so the whole sentence becomes one sample.
max_length = len(token_ids)
# Number of rows in the token-embedding table.
vocab_size = 50257
# Embedding width; rebound to a different value in a later cell.
output_dim = 3

# NOTE: no torch.manual_seed call precedes these layers in the code shown,
# so their initial weights (and the values displayed below) change between runs.
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
pos_embedding_layer = torch.nn.Embedding(max_length, output_dim)

# One batch holding one sample: stride == max_length and shuffling is off.
dataloader = create_dataloader_v1(
    raw_text, batch_size=1, max_length=max_length,
    stride=max_length, shuffle=False
)

data_iter = iter(dataloader)
inputs, targets = next(data_iter)

# Look up one vector per token id and one vector per position index 0..max_length-1.
token_embeddings = token_embedding_layer(inputs)
pos_embeddings = pos_embedding_layer(torch.arange(max_length))

# The position embeddings broadcast over the leading batch dimension.
input_embeddings = token_embeddings + pos_embeddings
# Show the single sample in the batch: one row per token.
input_embeddings[0]


tensor([[-0.6306, -1.9513,  1.6480],
        [-0.5873, -0.3170, -0.9106],
        [-1.1438, -0.2405,  0.4203],
        [ 0.4340,  1.4606,  1.0211],
        [ 0.7751,  0.6850,  1.1459],
        [ 0.0522, -2.0567, -1.3790],
        [-0.3948,  0.9785,  0.2447]], grad_fn=<SelectBackward0>)

In [7]:
# Unpack (number of tokens, embedding width) of the single sample in one step.
rows, cols = input_embeddings[0].shape
print(f"Input Embeddings Shape: {rows} rows, {cols} columns")

Input Embeddings Shape: 7 rows, 3 columns


In [16]:
# Rebind output_dim: from here on it is the width of the query/key/value
# projections (it was 3 when the embedding layers were created).
output_dim = 2
# Seed the RNG so the three randn draws below are repeatable.
torch.manual_seed(123)
# One (cols x output_dim) weight matrix per role, drawn in this order.
w_query = torch.nn.Parameter(torch.randn(cols, output_dim))
w_key = torch.nn.Parameter(torch.randn(cols, output_dim))
w_value = torch.nn.Parameter(torch.randn(cols, output_dim))

print(w_query)
print(w_key)
print(w_value)

Parameter containing:
tensor([[-0.1115,  0.1204],
        [-0.3696, -0.2404],
        [-1.1969,  0.2093]], requires_grad=True)
Parameter containing:
tensor([[-0.9724, -0.7550],
        [ 0.3239, -0.1085],
        [ 0.2103, -0.3908]], requires_grad=True)
Parameter containing:
tensor([[ 0.2350,  0.6653],
        [ 0.3528,  0.9728],
        [-0.0386, -0.8861]], requires_grad=True)


## Try a single row: "Yours"

In [18]:
# Project the first row of the sample through each weight matrix.
# Despite the w_ prefix these are the projected query/key/value vectors
# of that row (length output_dim), not weight matrices.
w_query1 = input_embeddings[0][0] @ w_query
w_key1 = input_embeddings[0][0] @ w_key
w_value1 = input_embeddings[0][0] @ w_value
print(w_query1)
print(w_key1)
print(w_value1)

tensor([-1.1809,  0.7381], grad_fn=<SqueezeBackward4>)
tensor([0.3277, 0.0438], grad_fn=<SqueezeBackward4>)
tensor([-0.9003, -3.7780], grad_fn=<SqueezeBackward4>)


## Attention score

In [19]:
# Unscaled attention score of the first row with itself:
# dot product of its query vector and its key vector.
atten_score11 = torch.dot(w_query1, w_key1)
atten_score11

tensor(-0.3547, grad_fn=<DotBackward0>)

In [None]:
# Project every row of the sample into key space and value space
# (one projected row per input row).
keys = input_embeddings[0] @ w_key
values = input_embeddings[0] @ w_value
# Display the value matrix as the cell result; its first row equals w_value1.
values



tensor([[-0.9003, -3.7780],
        [-0.2147,  0.1078],
        [-0.3698, -1.3673],
        [ 0.5779,  0.8049],
        [ 0.3795,  0.1666],
        [-0.6601, -0.7442],
        [ 0.2430,  0.4725]], grad_fn=<MmBackward0>)

**For the entire first row**

In [22]:
# Unscaled scores of the first query against every key: one score per row of keys.
atten_score1 = w_query1 @ keys.T
atten_score1

tensor([-0.3547,  0.2884, -0.7904, -0.9674, -0.4740,  1.7226, -0.8173],
       grad_fn=<SqueezeBackward4>)

In [24]:
# Divide the scores by sqrt(key width) (** binds tighter than /), then softmax
# over the last axis to turn them into weights.
atten_weight_1 = torch.softmax(atten_score1/keys.shape[1]**0.5, dim=-1)

In [25]:
# Sanity check: softmax output should sum to 1.
atten_weight_1.sum()

tensor(1., grad_fn=<SumBackward0>)

In [26]:
# Context vector of the first row: attention-weighted sum of all value rows.
# NOTE(review): "comtext" looks like a typo for "context"; the name is left as is.
comtext_vec1 = atten_weight_1 @ values
comtext_vec1

tensor([-0.3499, -0.6869], grad_fn=<SqueezeBackward4>)

## Compact class design for the entire process

In [35]:
import torch.nn as nn
class SelfAttention_v1(nn.Module):
    """Single-head scaled dot-product self-attention with raw weight matrices.

    The query, key and value projections are plain ``(d_in, d_out)``
    parameters drawn from ``torch.randn`` in that order.

    Args:
        d_in: size of the last dimension of the input.
        d_out: width of the query/key/value projections and of the output.
    """

    def __init__(self, d_in, d_out):
        super().__init__()
        self.w_query = torch.nn.Parameter(torch.randn(d_in, d_out))
        self.w_key = torch.nn.Parameter(torch.randn(d_in, d_out))
        self.w_value = torch.nn.Parameter(torch.randn(d_in, d_out))

    def forward(self, inputs):
        """Compute one context vector per input row.

        Args:
            inputs: tensor of shape ``(seq_len, d_in)``, or with leading batch
                dimensions ``(..., seq_len, d_in)``.

        Returns:
            Tensor of shape ``(..., seq_len, d_out)``.
        """
        queries = inputs @ self.w_query
        keys = inputs @ self.w_key
        values = inputs @ self.w_value

        # Swap only the last two axes. ``keys.T`` reverses every axis, which is
        # a matrix transpose only for 2-D input and breaks batched input.
        attn_scores = queries @ keys.transpose(-2, -1)
        # Scale by sqrt(d_out), read from the last axis so that leading batch
        # dimensions do not change the scaling factor.
        attn_weights = torch.softmax(attn_scores / keys.shape[-1] ** 0.5, dim=-1)

        context_vec = attn_weights @ values

        return context_vec


# Re-seed before constructing the module so its three randn draws start from
# the same RNG state as the hand-built w_query / w_key / w_value.
torch.manual_seed(123)
sa_v1 = SelfAttention_v1(cols,output_dim)    

In [36]:
# Context vectors for every row of the sample; row 0 can be compared with comtext_vec1.
sa_v1(input_embeddings[0])

tensor([[-0.3499, -0.6869],
        [-0.1021, -0.6654],
        [-0.1479, -0.6058],
        [-0.2591, -0.5528],
        [-0.3058, -0.6036],
        [-0.1788, -0.8294],
        [-0.1150, -0.5333]], grad_fn=<MmBackward0>)

**Updated to use `nn.Linear` layers**

In [37]:
class SelfAttention_v2(nn.Module):
    """Single-head scaled dot-product self-attention using ``nn.Linear`` layers.

    The query, key and value projections are bias-free linear layers mapping
    ``d_in`` features to ``d_out`` features.

    Args:
        d_in: size of the last dimension of the input.
        d_out: width of the query/key/value projections and of the output.
    """

    def __init__(self, d_in, d_out):
        super().__init__()
        self.w_query = torch.nn.Linear(d_in, d_out, bias=False)
        self.w_key = torch.nn.Linear(d_in, d_out, bias=False)
        self.w_value = torch.nn.Linear(d_in, d_out, bias=False)

    def forward(self, inputs):
        """Compute one context vector per input row.

        Args:
            inputs: tensor of shape ``(seq_len, d_in)``, or with leading batch
                dimensions ``(..., seq_len, d_in)``.

        Returns:
            Tensor of shape ``(..., seq_len, d_out)``.
        """
        queries = self.w_query(inputs)
        keys = self.w_key(inputs)
        values = self.w_value(inputs)

        # Swap only the last two axes. ``keys.T`` reverses every axis, which is
        # a matrix transpose only for 2-D input and breaks batched input.
        attn_scores = queries @ keys.transpose(-2, -1)
        # Scale by sqrt(d_out), read from the last axis so that leading batch
        # dimensions do not change the scaling factor.
        attn_weights = torch.softmax(attn_scores / keys.shape[-1] ** 0.5, dim=-1)

        context_vec = attn_weights @ values

        return context_vec


In [38]:
# Seed before construction so the Linear layers' initial weights are repeatable.
torch.manual_seed(123)
sa_v2 = SelfAttention_v2(cols,output_dim)    
# Context vectors for every row of the sample.
sa_v2(input_embeddings[0])

tensor([[-0.0493,  0.3184],
        [ 0.2791,  0.2625],
        [ 0.1756,  0.2637],
        [-0.0586,  0.1474],
        [-0.1230,  0.1648],
        [ 0.2659,  0.3405],
        [ 0.1293,  0.1971]], grad_fn=<MmBackward0>)