**Import libraries**

In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

In [None]:
# Hyperparameters for the attention walkthrough.
# d_model is the size of each word vector leaving the attention unit
# (i.e. after coming out as a value vector).
sequence_length = 4  # number of words in the input sentence
batch_size = 1  # number of sentences processed in parallel
input_dim = 512  # size of each word vector entering the attention unit
d_model = 512

# Randomly sample an input that stands in for the sum of the input embedding
# vectors and the positional encoding.
x = torch.randn(batch_size, sequence_length, input_dim)


In [None]:
# Debugging aid: show the sampled input and its shape.
print(x)
x.shape

tensor([[[ 0.2643, -0.4835,  0.1705,  ...,  0.2310,  0.2089, -0.7768],
         [ 0.3003, -0.5567, -0.3078,  ..., -0.2815, -0.3583, -1.0123],
         [-0.3623,  1.3805,  0.7164,  ..., -0.5174,  1.1290,  0.5393],
         [ 1.3606,  0.2813, -0.4229,  ..., -0.4427,  0.8692, -0.4544]]])


torch.Size([1, 4, 512])

In [None]:
# Define the qkv_layer: a single linear projection that produces, for every
# word (token), the three vectors q, k and v at once (hence 3 * d_model
# output features).

qkv_layer = nn.Linear(in_features=input_dim, out_features=3 * d_model)

In [None]:
# Pass the input through qkv_layer to generate the q, k and v values for
# every word in a single projection.

qkv = qkv_layer(x)

It should be noted that it is not necessary to split the input vector into 8 attention heads first and then pass each piece through its own linear layer. Instead, you can pass the whole 512-length vector through a single linear layer. After that, you can split the result into heads, carry out QK^T, scaling and softmax, and then obtain the value vectors. Finally, concatenate the heads again and pass the result through a linear layer.

It is not going to make any difference, following the laws of linear transformations.

In [None]:
# Inspect the size of qkv for debugging/understanding.
# The size will be 1 x 4 x 1536 (batch dimension x 4 words x 512*3): the
# q, k and v values for each word packed together in the last dimension.

qkv.shape

torch.Size([1, 4, 1536])

In [None]:
# We use 8 attention heads, so each head works with 512/8 = 64 features.
# Reshape qkv so that its last dimension is broken down into
# (number of heads) x (3 times the head dimension).

num_heads = 8
head_dim = d_model // num_heads
qkv = torch.reshape(
    qkv,
    (batch_size, sequence_length, num_heads, 3 * head_dim),
)

In [None]:
# See the size of the reshaped qkv
# The size is going to be 1 x 4 x 8 x 192
# here, 1 is the batch size, 4 words, 8 attention heads, size of each head
# 512/8 = 64
# multiply with 3 = 192 (each for q,k,v / word)

qkv.shape


torch.Size([1, 4, 8, 192])

In [None]:
# Swap the sequence and head dimensions for ease in parallel processing:
# [batch_size, sequence_length, num_heads, 3*head_dim]
#   -> [batch_size, num_heads, sequence_length, 3*head_dim]
qkv = qkv.transpose(1, 2)

qkv.shape

torch.Size([1, 8, 4, 192])

In [None]:
# Split the tensor into three equal parts along its last dimension
# (hence dim=-1) to obtain the query, key and value tensors individually.

q, k, v = torch.chunk(qkv, chunks=3, dim=-1)
q.shape
k.shape
v.shape


torch.Size([1, 8, 4, 64])

In [None]:
# Perform the attention mechanism, just as in self-attention
# (see the Self Attention notebook).

# Size of a single head's vector: the last dimension of q.
d_k = q.size(-1)

# Carry out q @ k^T and scale by sqrt(d_k) so that the variance of the scores
# stays small. The transpose swaps the last two dimensions of k (sequence
# length and head dimension).
scaled = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k)
scaled.shape

torch.Size([1, 8, 4, 4])

In [None]:
# Let's talk about masking.
# In the encoder, we do not require masking.
# In the decoder we do: we do not want a word to see the relevance of the
# words that come after it, i.e. the behaviour must not be bi-directional.
# The decoder generates words one at a time, so while building context for a
# word only the words before it are available -- the later words do not exist
# yet, so there is nothing to gain context from.
#
# The mask is built as follows:
# 1) Create a tensor with the same shape as `scaled` (1 x 8 x 4 x 4),
#    filled with negative infinity.
# 2) Keep only the entries strictly above the main diagonal (diagonal=1);
#    the diagonal and everything below it become zero.
# The mask is then added to `scaled`, softmax is applied to the result, and
# the attention weights are multiplied with the value vectors.

# full_like makes the mask follow the dtype and device of `scaled`.
mask = torch.full_like(scaled, float('-inf'))
mask = torch.triu(mask, diagonal=1)
mask[0][1]  # mask for input to a single head



tensor([[0., -inf, -inf, -inf],
        [0., 0., -inf, -inf],
        [0., 0., 0., -inf],
        [0., 0., 0., 0.]])

In [None]:
# Add the mask to the scaled scores in place: entries where the mask is -inf
# become -inf, entries where the mask is 0 are left unchanged.
scaled += mask

In [None]:
# Carry out softmax to convert the scores into probabilities,
# i.e. values between 0 and 1.
# Softmax is applied over the last dimension.

attention = scaled.softmax(dim=-1)
attention.shape


torch.Size([1, 8, 4, 4])

In [None]:
# Multiply the attention matrix with the value vectors.
# This gives the final transformed word embeddings, which are contextually
# rich.

values = attention @ v
values.shape

torch.Size([1, 8, 4, 64])