# Transformers from scratch 

In [30]:
from typing import Optional

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn.functional as f
from IPython.display import Image
from torch import Tensor
from torch import nn


***Self Attention Function***

This function implements a single attention head. It takes three tensors: Q (query), K (key) and V (value), each of shape (batch_size, sequence_length, num_features), where num_features is the dimension of each feature vector. Q, K and V are not learnable parameters themselves; they are produced by learnable linear (feed-forward) layers applied to the input embeddings after positional encoding has been added.

$\mathrm{Attention}(Q,K,V) = \mathrm{softmax}\left(\frac{QK^T}{\sqrt{d_k}}\right)V$

The original paper also adds masking:

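$\mathrm{Attention}(Q,K,V) = \mathrm{softmax}\left(\frac{QK^T}{\sqrt{d_k}} + M\right)V$, where $M$ is an additive mask. $M$ is coded using a triangular function (for example `torch.triu`), so that masked positions receive $-\infty$ before the softmax and therefore get zero attention weight.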

In [31]:
# Display the attention illustration hosted at the URL below.
Image(url='https://miro.medium.com/v2/resize:fit:1400/format:webp/1*BzhKcJJxv974OxWOVqUuQQ.png')

In [32]:
def self_attention(Q: Tensor, K: Tensor, V: Tensor, mask: Optional[Tensor] = None) -> Tensor:
    """Scaled dot-product attention for a single head.

    Computes ``softmax(Q @ K^T / sqrt(d_k) + M) @ V`` where ``d_k`` is the
    size of the last dimension of ``Q``.

    Args:
        Q: Queries of shape ``(..., query_len, d_k)``.
        K: Keys of shape ``(..., key_len, d_k)``.
        V: Values of shape ``(..., key_len, d_v)``.
        mask: Optional mask broadcastable to ``(..., query_len, key_len)``.
            A floating-point mask is added to the scaled scores (use ``0`` to
            keep a position and ``-inf`` to block it). A boolean mask keeps
            positions that are ``True`` and blocks positions that are ``False``.
            ``None`` (the default) applies no masking.

    Returns:
        Tensor of shape ``(..., query_len, d_v)``: for every query position,
        the attention-weighted average of the value vectors.
    """
    # matmul on the last two dims behaves like bmm for 3-D inputs but also
    # accepts extra leading (batch / head) dimensions.
    scores = torch.matmul(Q, K.transpose(-2, -1)) / Q.size(-1) ** 0.5

    if mask is not None:
        if mask.dtype == torch.bool:
            # Blocked positions get -inf so that softmax assigns them weight 0.
            scores = scores.masked_fill(~mask, float("-inf"))
        else:
            scores = scores + mask

    # Normalise over the key dimension so each query's weights sum to 1.
    weights = f.softmax(scores, dim=-1)
    return torch.matmul(weights, V)

In [33]:
# Test the function on random inputs of shape (batch, sequence_length, num_features).
# torch.randn is used instead of Tensor(30, 50, 64): the latter only allocates
# uninitialised memory, so its values are arbitrary and may contain NaN/inf.
torch.manual_seed(0)  # make the random inputs reproducible
q = torch.randn(30, 50, 64)
k = torch.randn(30, 50, 64)
v = torch.randn(30, 50, 64)
result = self_attention(q, k, v)
assert result.shape == (30, 50, 64)  # attention keeps the shape of the values
print(result.shape)


torch.Size([30, 50, 64])


***Multi-head Attention Class***

This takes in three tensors named Q (query), K (key) and V (value). Each is passed through its own linear layer, and self-attention is then computed in parallel across several heads, which is why it is called multi-head attention.

In [78]:
class SelfAttention(nn.Module):
    """Multi-head attention built on top of ``self_attention``.

    The query, key and value inputs are each projected by their own linear
    layer, split along the feature dimension into ``number_of_heads`` heads,
    attended independently per head, and the head outputs are concatenated
    back to ``model_dimension`` features.

    Note: no output projection is applied after concatenating the heads.

    Args:
        number_of_heads: Number of attention heads; must be positive and
            divide ``model_dimension`` evenly.
        model_dimension: Size of the feature dimension of the inputs.

    Raises:
        ValueError: If ``model_dimension`` is not divisible by
            ``number_of_heads`` or ``number_of_heads`` is not positive.
    """

    def __init__(self, number_of_heads, model_dimension):
        super().__init__()
        if number_of_heads < 1 or model_dimension % number_of_heads != 0:
            raise ValueError(
                f"model_dimension ({model_dimension}) must be divisible by "
                f"a positive number_of_heads ({number_of_heads})"
            )
        self.number_of_heads = number_of_heads
        self.head_dimension = model_dimension // number_of_heads
        self.q_l = nn.Linear(model_dimension, model_dimension)
        self.k_l = nn.Linear(model_dimension, model_dimension)
        self.v_l = nn.Linear(model_dimension, model_dimension)

    def _split_heads(self, x):
        """Reshape (batch, seq, model_dim) to (batch * heads, seq, head_dim)."""
        batch_size, seq_len, _ = x.shape
        x = x.reshape(batch_size, seq_len, self.number_of_heads, self.head_dimension)
        # Move the head axis next to the batch axis, then fold it into the batch
        # so that a 3-D batched attention function handles all heads at once.
        return x.transpose(1, 2).reshape(
            batch_size * self.number_of_heads, seq_len, self.head_dimension
        )

    def forward(self, q, k, v):
        """Apply multi-head attention.

        Args:
            q: Query input of shape (batch, query_len, model_dimension).
            k: Key input of shape (batch, key_len, model_dimension).
            v: Value input of shape (batch, key_len, model_dimension).

        Returns:
            A tuple ``(attention, v_shape)`` where ``attention`` has shape
            (batch, query_len, model_dimension) and ``v_shape`` is the shape
            of the projected value tensor (returned for inspection).
        """
        q = self.q_l(q)
        k = self.k_l(k)
        v = self.v_l(v)

        batch_size, query_len, model_dimension = q.shape
        per_head = self_attention(
            self._split_heads(q), self._split_heads(k), self._split_heads(v)
        )

        # Undo the head folding: (batch * heads, query_len, head_dim)
        # -> (batch, query_len, heads * head_dim).
        attention = (
            per_head.reshape(batch_size, self.number_of_heads, query_len, self.head_dimension)
            .transpose(1, 2)
            .reshape(batch_size, query_len, model_dimension)
        )
        return attention, v.shape
    

In [85]:
# Batch size = 30 (30 sentences passed at once), at most 50 words per sentence,
# and each word is represented by a 512-dimensional vector (the model dimension D_l).
# torch.randn is used instead of Tensor(30, 50, 512): the latter only allocates
# uninitialised memory, so its values are arbitrary and may contain NaN/inf.
q = torch.randn(30, 50, 512)
k = torch.randn(30, 50, 512)
v = torch.randn(30, 50, 512)
model = SelfAttention(8, 512)

In [86]:
# Display the module's repr to list its registered layers.
model

SelfAttention(
  (q_l): Linear(in_features=512, out_features=512, bias=True)
  (k_l): Linear(in_features=512, out_features=512, bias=True)
  (v_l): Linear(in_features=512, out_features=512, bias=True)
)

In [87]:
# Call the module itself rather than .forward() so that nn.Module hooks run.
# NOTE: `v` is rebound here from the value tensor to the returned shape (the
# next cell displays it); re-run the setup cell before re-running this one.
attention, v = model(q, k, v)

In [88]:
# `v` now holds the shape returned as the second element of the forward call above.
v

torch.Size([30, 50, 512])