# Self-Attention in Transformer

### The Transformer architecture follows the same intuition, but in a slightly different way.

In [2]:
import torch

In [3]:
# We will follow the "Attention Is All You Need" paper to illustrate the self-attention mechanism.
# In the transformer architecture:
    # attention is applied using 3 different weight matrices: the query, key and value matrices
    # instead of using just one weight matrix with the input matrix, the transformer architecture follows the algorithm below
        # context_vector_i = softmax((x_i @ query_matrix) @ (x @ key_matrix).T / constant_value) @ (x @ value_matrix)
        # where constant_value = sqrt(dimension of the key vectors)

In [4]:
# Let's understand it with an example.
# Example sentence: "Our LLM journey starts here"  (5 words)

# Fix the RNG seed so that Restart Kernel -> Run All reproduces the same
# random numbers every time (the value 123 itself is arbitrary).
torch.manual_seed(123)

# Input embedding matrix: one row per word, 3 values per word.
x = torch.randn(5, 3)
x

tensor([[ 0.2191, -0.9015, -0.2318],
        [ 0.5199, -0.9474, -0.0275],
        [-0.5163, -0.0144,  0.2551],
        [ 0.2500,  0.3028,  0.6914],
        [-1.6208,  0.7710,  0.3911]])

In [5]:
use_gred = False    # gradients are off to keep printed outputs clutter-free; set True for training
d_in = d_out = 3    # input embedding size and projection size (all three matrices are 3x3 here)

# Query, key and value weight matrices, each of shape (d_in, d_out).
# The shapes are derived from d_in/d_out so that editing the line above is
# enough to change the projection sizes.
# These become trainable parameters when use_gred is True.
q = torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad=use_gred)
k = torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad=use_gred)
v = torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad=use_gred)

In [9]:
# Project the embedding of the word "LLM" with the query, key and value matrices.
x_1 = x[1]  # row 1 of x is the embedding of "LLM" (second word of the sentence)

q_x1 = torch.matmul(x_1, q)
k_x1 = torch.matmul(x_1, k)
v_x1 = torch.matmul(x_1, v)

q_x1, k_x1, v_x1

(tensor([-0.5213, -0.6311,  0.0208]),
 tensor([ 0.3267, -0.1724, -0.0928]),
 tensor([0.1461, 0.1484, 0.1638]))

In [10]:
# Unnormalized attention score of "LLM" attending to "LLM":
# the dot product of its query vector with its own key vector.
attn_score_11 = torch.dot(q_x1, k_x1)
attn_score_11

tensor(-0.0635)

In [11]:
# Generalize the computation to the whole sentence:
# project every word to its key vector, then score the "LLM" query
# against all of those keys at once.
keys = torch.matmul(x, k)
attn_score_1 = torch.matmul(q_x1, keys.t())
attn_score_1

tensor([ 0.2838, -0.0635,  0.3444, -0.6775,  0.9437])

In [12]:
# Attention weights for the word "LLM":
# divide the scores by the square root of the key dimension, then apply softmax.
constant_vaule = keys.size(-1) ** 0.5
attn_weight_1 = (attn_score_1 / constant_vaule).softmax(dim=-1)
attn_weight_1


tensor([0.2044, 0.1673, 0.2117, 0.1174, 0.2992])

In [13]:
# Context vector for the word "LLM": the value vectors of all words,
# combined using the attention weights of "LLM".
values = torch.matmul(x, v)
context_vector_1 = torch.matmul(attn_weight_1, values)
context_vector_1

tensor([-0.3535, -0.2929, -0.0785])

### Attention of all words against all words in the sentence

In [6]:
# Query, key and value projections for every word at once (one row per word).
q_all, k_all, v_all = (torch.matmul(x, w) for w in (q, k, v))

In [None]:
# Attention scores for every pair of words: entry (i, j) is query i dotted with key j.
# You can match the value at index (1, 1) with the previously computed
# "LLM" -> "LLM" attention score.
att_score = torch.matmul(q_all, k_all.t())
att_score

tensor([[ 4.6778e-01, -9.7939e-02,  4.8369e-01, -1.1508e+00,  1.3818e+00],
        [ 2.8379e-01, -6.3466e-02,  3.4444e-01, -6.7752e-01,  9.4370e-01],
        [ 2.5976e-02,  1.7880e-02, -1.0293e-03, -2.9809e-02, -1.1422e-02],
        [-5.9335e-01,  1.2894e-01, -5.1991e-01,  1.5237e+00, -1.5767e+00],
        [-6.3557e-02,  6.4684e-02, -1.9389e-01,  1.9309e-01, -5.1222e-01]])

In [None]:
# Normalize the scores: scale by the square root of the key dimension and
# apply softmax along the last dimension. The sum of row 0 is displayed
# next to the weights.
atten_wights = (att_score / k_all.size(-1) ** 0.5).softmax(dim=-1)
atten_wights, atten_wights[0].sum()

tensor([[0.2075, 0.1497, 0.2095, 0.0815, 0.3518],
        [0.2044, 0.1673, 0.2117, 0.1174, 0.2992],
        [0.2030, 0.2020, 0.1998, 0.1965, 0.1986],
        [0.1329, 0.2017, 0.1387, 0.4513, 0.0753],
        [0.2026, 0.2182, 0.1879, 0.2350, 0.1564]])

In [None]:
# Context matrix after attention: attention weights applied to all value vectors.
# Row 1 (the word "LLM") can be checked against the context vector
# computed earlier for that single word.
contex_matrix = torch.matmul(atten_wights, v_all)
contex_matrix

tensor([[-0.4343, -0.3608, -0.1297],
        [-0.3535, -0.2929, -0.0785],
        [-0.1872, -0.1532,  0.0273],
        [ 0.1347,  0.1094,  0.2661],
        [-0.1114, -0.0897,  0.0759]])

In [14]:
# Shape of the key projections: (number of words, key dimension).
k_all.size()

torch.Size([5, 3])