**A simple self-attention mechanism without trainable weights**

The mechanism consists of three main steps *(the inputs are token embedding vectors)*:
1. calculating the attention scores
2. calculating the attention weights
3. computing the context vector


In [1]:

import torch
# Embedding matrix for the sentence "your journey starts with one step":
# one row per token, three values per token.
inputs=torch.tensor([
    [0.43, 0.15, 0.89], # your        (x^1)
    [0.55, 0.87, 0.66], # journey     (x^2)
    [0.57, 0.85, 0.64],  # starts     (x^3)
    [0.22, 0.58, 0.33],  # with       (x^4)
    [0.77, 0.25, 0.10],  # one        (x^5)
    [0.05, 0.80, 0.55]  # step        (x^6)
])

In [2]:
# Show the embedding matrix: 6 rows (tokens) by 3 columns (embedding values).
print(inputs)

tensor([[0.4300, 0.1500, 0.8900],
        [0.5500, 0.8700, 0.6600],
        [0.5700, 0.8500, 0.6400],
        [0.2200, 0.5800, 0.3300],
        [0.7700, 0.2500, 0.1000],
        [0.0500, 0.8000, 0.5500]])


In [3]:
# Attention scores for the query x^2: the dot product of the query with
# every input token (the query itself included).

query = inputs[1]  # the 2nd input token acts as the query
attn_scores_2 = torch.stack([torch.dot(token, query) for token in inputs])
print(attn_scores_2)



tensor([0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865])


In [4]:
# Turn the raw scores into attention weights by dividing every score by the
# sum of all scores; the second printed value confirms the weights sum to 1.
attn_weights_2_tmp = attn_scores_2.div(attn_scores_2.sum())

print(attn_weights_2_tmp, attn_weights_2_tmp.sum(), sep="\n")



tensor([0.1455, 0.2278, 0.2249, 0.1285, 0.1077, 0.1656])
tensor(1.0000)


In [5]:
# A better way to obtain attention weights is to normalize
# the scores with the softmax function.

def softmax_naive(x):
    """Hand-written softmax along dimension 0.

    Exponentiates every element of ``x`` and divides by the sum of those
    exponentials taken over ``dim=0``, so the returned values are positive
    and sum to 1 along that dimension.

    Args:
        x: Tensor of raw scores.

    Returns:
        Tensor with the same shape as ``x`` holding the normalized weights.

    Note:
        No max-subtraction is applied before exponentiating, so very large
        inputs can overflow in ``torch.exp``.
    """
    exp_x = torch.exp(x)
    return exp_x / exp_x.sum(dim=0)

# Apply the hand-written softmax to the scores of the query x^2, then show
# the weights followed by their sum.
attn_weights_2_naive = softmax_naive(attn_scores_2)

print(attn_weights_2_naive, attn_weights_2_naive.sum(), sep="\n")


tensor([0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581])
tensor(1.)


In [6]:
# In practice, use PyTorch's built-in softmax instead of the hand-written
# one to avoid overflow/underflow problems.

attn_weights_2 = attn_scores_2.softmax(dim=0)

print(attn_weights_2)
print(attn_weights_2.sum())


tensor([0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581])
tensor(1.)


In [7]:
# Context vector for the query x^2: scale every input embedding by its
# attention weight, then add the scaled vectors together.


query = inputs[1]  # the 2nd input token
context_vec_2 = torch.zeros(query.shape)

for weight_i, x_i in zip(attn_weights_2, inputs):
    context_vec_2 += weight_i * x_i

print(context_vec_2)
    

tensor([0.4419, 0.6515, 0.5683])
