## Multi-Headed Attention


### Imports


In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import numpy as np
import math

### Initials


In [2]:
# Seed PyTorch's RNG so that a fresh "Restart & Run All" draws the same random
# input and the same layer initialisations every time. Outputs saved before the
# seed was introduced will show different numbers.
torch.manual_seed(0)

sentence_length = 50     # second axis of x
batch_size = 30          # first axis of x
input_dim = 512          # last axis of x (size of each vector)
output_dim = input_dim   # the attention block keeps the vector size unchanged

# Random stand-in for a batch of embedded sentences.
x = torch.randn(batch_size, sentence_length, input_dim)
x.shape

torch.Size([30, 50, 512])

In [3]:
# One linear projection produces queries, keys and values in a single pass,
# which is why its output is three times as wide as output_dim.
qkv_layer = nn.Linear(in_features=input_dim, out_features=3 * output_dim)
qkv = qkv_layer(x)

# The last axis of qkv now packs q, k and v together.
qkv.size()

torch.Size([30, 50, 1536])

In [4]:
n_heads = 8
# Width of a single head's slice of q (or of k, or of v).
head_dim = output_dim // n_heads

# Split the last axis into n_heads groups. Every group holds 3 * head_dim values:
# the n-th group carries the n-th head's part of the query, key and value.
qkv = torch.reshape(qkv, (batch_size, sentence_length, n_heads, 3 * head_dim))
qkv.shape

torch.Size([30, 50, 8, 192])

In [5]:
# Swap the sentence_length and n_heads axes so that, for every (batch, head) pair,
# the two trailing axes are (sentence_length, 3 * head_dim). All heads can then be
# processed in parallel by batched matrix operations.
qkv = qkv.transpose(1, 2)
qkv.shape


torch.Size([30, 8, 50, 192])

In [6]:
# Cut the last axis into three equal pieces: queries, keys and values.
q, k, v = torch.chunk(qkv, chunks=3, dim=-1)
(q.shape, k.shape, v.shape)

(torch.Size([30, 8, 50, 64]),
 torch.Size([30, 8, 50, 64]),
 torch.Size([30, 8, 50, 64]))

### Attention Mechanism for single thread


In [7]:
# Swap the two trailing axes of k so it can be matrix-multiplied with q.
k_t = torch.transpose(k, -2, -1)
k_t.shape

torch.Size([30, 8, 64, 50])

In [8]:
# Scaled dot-product scores: (q @ k_t) divided by sqrt(d_k),
# where d_k is the size of the last axis of q.
d_k = q.size(-1)

product = (q @ k_t) / math.sqrt(d_k)
product.shape

torch.Size([30, 8, 50, 50])

In [9]:
# Masking
# Start from a tensor of -inf with the same shape as the scores, then keep only the
# part strictly above the main diagonal (triu with diagonal=1 zeroes the rest).
# Adding this to the scores blocks every position from attending to later positions.
mask = torch.full(product.size(), float('-inf'))
mask = torch.triu(mask, diagonal=1)
# Only the last expression of a cell is displayed, so the stray `mask[0][0]`
# statement that used to sit here had no effect and was removed.
mask.shape

torch.Size([30, 8, 50, 50])

In [10]:
# Boolean switch: True applies the look-ahead mask built above, False skips it.
to_mask = True

# Test truthiness rather than `is not None`: with the identity check a flag set to
# False (or 0) would still apply the mask, because False is not None.
if to_mask:
    product = product + mask

# `product` already contains the mask at this point, so show it directly instead of
# adding the mask a second time just for display.
product[0][0]

tensor([[-0.1269,    -inf,    -inf,  ...,    -inf,    -inf,    -inf],
        [-0.2754,  0.6308,    -inf,  ...,    -inf,    -inf,    -inf],
        [-0.1359,  0.5563, -0.4251,  ...,    -inf,    -inf,    -inf],
        ...,
        [-0.5237,  0.0218, -0.2747,  ..., -0.1744,    -inf,    -inf],
        [-0.6314,  0.0886, -0.0256,  ...,  0.0645, -0.0662,    -inf],
        [-0.1783, -0.1951,  0.0603,  ..., -0.0255, -0.3918,  0.2879]],
       grad_fn=<SelectBackward0>)

In [11]:
# Normalise the scores along the last axis so every row becomes a probability
# distribution; entries that were -inf end up as 0.
attention = torch.softmax(product, dim=-1)
print(attention.shape)
attention[0][0]

torch.Size([30, 8, 50, 50])


tensor([[1.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.2878, 0.7122, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.2669, 0.5333, 0.1999,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0112, 0.0193, 0.0144,  ..., 0.0159, 0.0000, 0.0000],
        [0.0106, 0.0217, 0.0193,  ..., 0.0212, 0.0186, 0.0000],
        [0.0165, 0.0162, 0.0209,  ..., 0.0192, 0.0133, 0.0262]],
       grad_fn=<SelectBackward0>)

In [12]:
# Weight the value vectors by the attention probabilities.
values = attention @ v
values.shape

torch.Size([30, 8, 50, 64])

In [13]:
# Next the heads are concatenated. So far `values` is organised per head; to get one
# vector per position of each sentence, the head axis must sit next to head_dim.
# Swapping axes 1 and 2 gives (batch, sentence_length, n_heads, head_dim).
values = values.transpose(1, 2)
values.shape

torch.Size([30, 50, 8, 64])

In [14]:
# Concatenate the heads: merge the (n_heads, head_dim) axes into one axis of size
# n_heads * head_dim. The sizes come from the named configuration variables rather
# than the literals 30, 50 and 8*64, so this cell keeps working when they change.
values = values.reshape(batch_size, sentence_length, n_heads * head_dim)
values.shape

torch.Size([30, 50, 512])

In [15]:
# Output projection applied to the concatenated heads; it keeps the width at output_dim.
linear_layer = nn.Linear(in_features=output_dim, out_features=output_dim)
out = linear_layer(values)

In [16]:
# Shape of the attention block's final output.
out.size()

torch.Size([30, 50, 512])

In [17]:
# Clear the interactive namespace so the next section starts from scratch;
# that section re-imports torch for this reason.
%reset -f

## Positional Encoding


In [18]:
import torch 
import torch.nn as nn


In [19]:
# The code below is inefficient, and there are better ways to write it.
# I came up with this logic myself while working through the concept,
# so I am keeping it in the final notebook despite the inefficiency.

In [20]:
d_model = 6  # embeddings
sequence_length = 10

# Column vector of positions 0 .. sequence_length - 1, shape (sequence_length, 1).
position = torch.arange(0, sequence_length, 1, dtype=torch.float32).unsqueeze(1)

# The paper's denominator uses the exponent 2*i / d_model; the even numbers
# 0, 2, 4, ... play the role of 2*i.
even_numbers = torch.arange(0, d_model, 2, dtype=torch.float32)
denominator = torch.pow(10_000, even_numbers / d_model)

# Fill the table one entry at a time: even columns get the sine, odd columns the
# cosine, and each (even, odd) column pair shares the divisor denominator[i // 2].
position_embedding = torch.zeros(sequence_length, d_model)
for index in range(sequence_length):
    pos = position[index]
    for i in range(d_model):
        wave = torch.sin if i % 2 == 0 else torch.cos
        position_embedding[index][i] = wave(pos / denominator[i // 2])

position_embedding

tensor([[ 0.0000,  1.0000,  0.0000,  1.0000,  0.0000,  1.0000],
        [ 0.8415,  0.5403,  0.0464,  0.9989,  0.0022,  1.0000],
        [ 0.9093, -0.4161,  0.0927,  0.9957,  0.0043,  1.0000],
        [ 0.1411, -0.9900,  0.1388,  0.9903,  0.0065,  1.0000],
        [-0.7568, -0.6536,  0.1846,  0.9828,  0.0086,  1.0000],
        [-0.9589,  0.2837,  0.2300,  0.9732,  0.0108,  0.9999],
        [-0.2794,  0.9602,  0.2749,  0.9615,  0.0129,  0.9999],
        [ 0.6570,  0.7539,  0.3192,  0.9477,  0.0151,  0.9999],
        [ 0.9894, -0.1455,  0.3629,  0.9318,  0.0172,  0.9999],
        [ 0.4121, -0.9111,  0.4057,  0.9140,  0.0194,  0.9998]])

In [21]:
# below is the efficient code

In [22]:
import torch

def positional_encoding(sequence_length, d_model):
    """Build the sinusoidal positional-encoding matrix.

    Even columns hold sin(pos / 10000**(2i / d_model)) and odd columns hold the
    cosine of the same angle, where 2i is the even column index of the pair.

    Args:
        sequence_length: Number of positions, i.e. rows of the result.
        d_model: Embedding width, i.e. columns of the result. Odd widths are
            supported: the final sine column then has no cosine partner.

    Returns:
        A float32 tensor of shape (sequence_length, d_model).
    """
    # Create position tensor with shape (sequence_length, 1)
    position = torch.arange(0, sequence_length, 1, dtype=torch.float32).unsqueeze(1)

    # One divisor per (sin, cos) column pair: 10_000 ** (even_index / d_model)
    denominator = torch.pow(10_000, torch.arange(0, d_model, 2, dtype=torch.float32) / d_model)
    # Kept from the original: shows the divisors every time the function runs.
    print(denominator)

    # Shape (sequence_length, ceil(d_model / 2)) via broadcasting.
    angles = position / denominator

    pe = torch.zeros(sequence_length, d_model)
    pe[:, 0::2] = torch.sin(angles)  # even indices
    # There are only d_model // 2 odd columns. For an odd d_model that is one fewer
    # than the number of angle columns, so trim the angles to avoid a shape mismatch.
    pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])  # odd indices

    return pe

# Demo: encode a short sequence with a small embedding width.
sequence_length = 10  # number of positions (tokens) to encode
d_model = 6           # embedding width
position_embedding = positional_encoding(sequence_length=sequence_length, d_model=d_model)

print(position_embedding)  # prints the full (10, 6) encoding matrix


tensor([  1.0000,  21.5443, 464.1590])
tensor([[ 0.0000,  1.0000,  0.0000,  1.0000,  0.0000,  1.0000],
        [ 0.8415,  0.5403,  0.0464,  0.9989,  0.0022,  1.0000],
        [ 0.9093, -0.4161,  0.0927,  0.9957,  0.0043,  1.0000],
        [ 0.1411, -0.9900,  0.1388,  0.9903,  0.0065,  1.0000],
        [-0.7568, -0.6536,  0.1846,  0.9828,  0.0086,  1.0000],
        [-0.9589,  0.2837,  0.2300,  0.9732,  0.0108,  0.9999],
        [-0.2794,  0.9602,  0.2749,  0.9615,  0.0129,  0.9999],
        [ 0.6570,  0.7539,  0.3192,  0.9477,  0.0151,  0.9999],
        [ 0.9894, -0.1455,  0.3629,  0.9318,  0.0172,  0.9999],
        [ 0.4121, -0.9111,  0.4057,  0.9140,  0.0194,  0.9998]])


In [23]:
# Clear the interactive namespace so the next section starts from scratch;
# that section re-imports torch for this reason.
%reset -f

## Layer Normalization


In [24]:
import torch
import torch.nn as nn

In [25]:
# Tiny random example: 1 batch entry, 2 words, 3 embedding values per word.
inputs = torch.rand(1, 2, 3)

batch, words, embeddings = inputs.size()
# Put the word axis first, giving (words, batch, embeddings).
inputs_per_word = inputs.transpose(0, 1)
inputs_per_word.shape

torch.Size([2, 1, 3])

In [26]:
# gamma (scale) and beta (shift) are introduced because without them the normalised
# outputs would always lie in one fixed range. Making them learnable lets the data
# decide the range, which increases the representational power.
# NOTE(review): the shape is taken from the last two axes of inputs_per_word, i.e.
# (batch, embeddings), so it includes the batch axis (size 1 here). Confirm that is intended.
parameter_shape = inputs_per_word.shape[-2:]
gamma = nn.Parameter(torch.ones(parameter_shape))
beta = nn.Parameter(torch.zeros(parameter_shape))
parameter_shape

torch.Size([1, 3])

In [27]:
# Mean over the two trailing axes; keepdim keeps them as size-1 axes for broadcasting.
mean = torch.mean(inputs_per_word, dim=(-2, -1), keepdim=True)
(mean, mean.shape)

(tensor([[[0.5155]],
 
         [[0.5518]]]),
 torch.Size([2, 1, 1]))

In [28]:
# Small constant added to the variance before the square root so the later
# division never hits zero.
epsilon = 1e-5

# Population variance (unbiased=False) and mean over the two trailing axes, in one call.
var, mean = torch.var_mean(inputs_per_word, dim=(-2, -1), unbiased=False, keepdim=True)

std = torch.sqrt(var + epsilon)
std

tensor([[[0.3074]],

        [[0.1467]]])

In [29]:
# Normalise: subtract the mean and divide by the standard deviation.
y = torch.div(inputs_per_word - mean, std)
y

tensor([[[ 1.2248e+00, -2.2416e-04, -1.2246e+00]],

        [[-9.0923e-01,  1.3923e+00, -4.8308e-01]]])

In [30]:
# Apply the learnable scale and shift. With gamma all ones and beta all zeros
# the values are the same as y.
out = torch.add(torch.mul(gamma, y), beta)
out

tensor([[[ 1.2248e+00, -2.2416e-04, -1.2246e+00]],

        [[-9.0923e-01,  1.3923e+00, -4.8308e-01]]], grad_fn=<AddBackward0>)

In [31]:
# Clear the interactive namespace so the next section starts from scratch.
%reset -f

## Encoder


In [32]:
sequence_length = 50
batch_size = 30
d_model = 512 # embeddings
num_heads = 8  # number of attention heads
dropout = 0.1
to_mask = False
feed_forward_hidden = 2048
num_encoder_layers = 5