# Self-Attention, the Backbone of Transformers

## Self-Attention without trainable parameters

In [1]:
# Imports
import torch

### Let's take a simple example: a 6-word sentence

#### sentence is: How self attention works in Transformer

In [4]:
# Every word is represented by a word embedding of length 3.
# We draw a random embedding for each of the 6 words: one row per word.

word_embeddings = torch.randn((6, 3))
word_embeddings

tensor([[-0.3178, -0.8048,  0.7659],
        [ 0.6087,  0.6313,  0.4881],
        [-0.5776,  0.5329,  1.2401],
        [-0.7568,  1.0494, -0.4434],
        [ 1.8883, -1.5365, -0.5007],
        [-1.2233, -0.2219, -1.2253]])

In [5]:
# Row i of word_embeddings represents the i-th word of the example sentence,
# in sentence order.
# To understand how self-attention works, we follow a single word through it.

# Calculating attention for the word "self" (second word, index 1)
word_embed_self = word_embeddings[1, :]
word_embed_self

tensor([0.6087, 0.6313, 0.4881])

In [11]:
# We calculate a similarity score between "self" and every word embedding in
# the same sentence (including "self" itself).
# The score is the dot product of the two embeddings. A single matrix-vector
# product computes all of these dot products at once, without a Python loop
# and without re-wrapping the results in a new tensor.
sim_score = torch.matmul(word_embeddings, word_embed_self)
sim_score

tensor([-0.3276,  1.0072,  0.5901, -0.0147, -0.0648, -1.4828])

In [12]:
# As expected, the word 'self' has the highest similarity score with itself,
# and the word 'attention' has the second highest.
# However, the word embeddings are chosen randomly, just for illustration.

# Now let's calculate attention weights by normalizing the scores.
# There are many ways to normalize; a naive one is to divide each score by
# the sum of all scores.
sim_weights = sim_score / torch.sum(sim_score)
sim_weights

tensor([ 1.1195, -3.4418, -2.0162,  0.0502,  0.2215,  5.0668])

In [16]:
# Dividing by the sum does not look good: the scores can be negative, so the
# resulting weights fall outside [0, 1] and can become very large, which
# would destabilize the model weights during backprop.
# Softmax from torch is a better choice.
sim_weights = torch.softmax(sim_score, dim=-1)  # `dim` is the documented keyword
sim_weights

tensor([0.0972, 0.3694, 0.2434, 0.1329, 0.1264, 0.0306])

In [17]:
# Softmax maps the scores into the range between 0 and 1 such that they sum to 1
# (like a probability distribution).
torch.sum(sim_weights)

tensor(1.0000)

In [20]:
# To generate the context vector for the word 'self', we calculate the weighted
# sum of all word embeddings, using the attention weights of 'self'.
# A for loop is used to keep the process intuitive, instead of a matrix multiplication.

# Start from a zero vector with the same length as one embedding, then add
# each word's embedding scaled by its attention weight. Sizes are taken from
# the tensors themselves rather than hard-coded.
context_vector = torch.zeros(word_embeddings.shape[1])
for weight, embedding in zip(sim_weights, word_embeddings):
    context_vector += weight * embedding
context_vector
# As we can see, the context vector for 'self' is a weighted sum of all word embeddings in the sentence.

tensor([0.1540, 0.2231, 0.3968])

In [None]:
# Now let's repeat the process for all words at once: first compute the
# similarity score of every word against every word in the sentence.
similarty_mat = word_embeddings @ word_embeddings.T    # same as torch.matmul(word_embeddings, word_embeddings.T)
similarty_mat

# A 6x6 matrix: row i holds the similarity of word i with each word in the sentence.

tensor([[ 1.3354, -0.3276,  0.7045, -0.9436,  0.2530, -0.3711],
        [-0.3276,  1.0072,  0.5901, -0.0147, -0.0648, -1.4828],
        [ 0.7045,  0.5901,  2.1554,  0.4466, -2.5304, -0.9311],
        [-0.9436, -0.0147,  0.4466,  1.8706, -2.8194,  1.2363],
        [ 0.2530, -0.0648, -2.5304, -2.8194,  6.1770, -1.3556],
        [-0.3711, -1.4828, -0.9311,  1.2363, -1.3556,  3.0472]])

In [None]:
# Normalization: apply softmax along the last dimension so that each row
# (the weights of one word over all words in the sentence) sums to 1.
similarty_weights = torch.softmax(similarty_mat, dim=-1)   # `dim` is the documented keyword
similarty_weights

# We can compare the similarity weights of 'self' (row 1) with the ones computed previously:
# tensor([0.0972, 0.3694, 0.2434, 0.1329, 0.1264, 0.0306])                      -> previously computed
# [9.7219e-02, 3.6939e-01, 2.4339e-01, 1.3294e-01, 1.2644e-01, 3.0624e-02],     -> row 1 of similarty_weights

tensor([[4.2655e-01, 8.0862e-02, 2.2698e-01, 4.3673e-02, 1.4451e-01, 7.7419e-02],
        [9.7219e-02, 3.6939e-01, 2.4339e-01, 1.3294e-01, 1.2644e-01, 3.0624e-02],
        [1.3955e-01, 1.2446e-01, 5.9548e-01, 1.0782e-01, 5.4933e-03, 2.7190e-02],
        [3.0096e-02, 7.6201e-02, 1.2086e-01, 5.0202e-01, 4.6118e-03, 2.6622e-01],
        [2.6600e-03, 1.9358e-03, 1.6447e-04, 1.2318e-04, 9.9458e-01, 5.3247e-04],
        [2.6468e-02, 8.7083e-03, 1.5120e-02, 1.3207e-01, 9.8898e-03, 8.0774e-01]])

In [32]:
torch.sum(similarty_weights[0], dim=-1)       # sanity check: the weights of the first word sum to 1

tensor(1.)

In [33]:
# Context matrix: row i is the context vector of word i, i.e. the sum of all
# word embeddings weighted by the attention weights of word i.

context_matrix = similarty_weights @ word_embeddings
context_matrix

tensor([[-0.0723, -0.3647,  0.4611],
        [ 0.1540,  0.2231,  0.3968],
        [-0.4170,  0.3823,  0.8222],
        [-0.7299,  0.5489, -0.3410],
        [ 1.8775, -1.5290, -0.4955],
        [-1.0813, -0.0636, -1.0100]])

In [34]:
# NOTE:
    # The word embeddings are randomly chosen, so the generated similarty_weights might not make sense.
    # In a real implementation, we would use positional embeddings along with the word embeddings
    # (typically added together), but in this example we used only word embeddings.
    # Also, in a real implementation, we would use matrix multiplication on the combined word and
    # positional embeddings, instead of a for loop over individual words.
    # Also, as in this example, a real implementation uses softmax to turn the similarity scores into
    # attention weights; those weights are then applied to the embeddings with a matrix multiplication.
    
    # This tutorial has a single attention head for illustration purposes.
    # In a real implementation, we would have multiple attention heads, and each head would have its own weights and biases.
    # We would also use scaled dot-product attention, i.e. divide the scores by the square root of the key dimension before the softmax.