**Attention Mechanisms**

Simplified Self Attention

In [18]:
import torch

# Toy 3-dimensional embeddings, one row per token of the sentence
# "Your journey starts with one step" (row k holds x^(k+1)).
inputs = torch.tensor([
    [0.43, 0.15, 0.89],  # x^1: Your
    [0.55, 0.87, 0.66],  # x^2: journey
    [0.57, 0.85, 0.64],  # x^3: starts
    [0.22, 0.58, 0.33],  # x^4: with
    [0.77, 0.25, 0.10],  # x^5: one
    [0.05, 0.80, 0.55],  # x^6: step
])

Computing Attention scores

In [19]:
# The query is the token whose relation to every token (itself included) we score.
query = inputs[1]  # x^2, "journey"

# One score slot per token (row) of `inputs`; every slot is filled in the loop below.
attention_scores_2 = torch.empty(inputs.shape[0])

for position, embedding in enumerate(inputs, start=1):
    # Dot product of this token's embedding with the query embedding.
    score = torch.dot(embedding, query)
    attention_scores_2[position - 1] = score
    print(f"Similarity score between journey and {position}th word: {score:.4f}")
print(f"Attention scores: {attention_scores_2}")

Similarity score between journey and 1th word: 0.9544
Similarity score between journey and 2th word: 1.4950
Similarity score between journey and 3th word: 1.4754
Similarity score between journey and 4th word: 0.8434
Similarity score between journey and 5th word: 0.7070
Similarity score between journey and 6th word: 1.0865
Attention scores: tensor([0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865])


Computing Attention Weights

In [20]:
# Simplest normalization: divide every score by the total, x_i / sum(x),
# which makes the resulting weights add up to 1.
score_total = attention_scores_2.sum()
attention_weights_2_tmp = attention_scores_2 / score_total
print(attention_weights_2_tmp)
print(attention_weights_2_tmp.sum())  # expected: 1

tensor([0.1455, 0.2278, 0.2249, 0.1285, 0.1077, 0.1656])
tensor(1.0000)


In [21]:
# Hand-written softmax: e^x / sum(e^x).
# Softmax yields weights that are positive and sum to 1.

def softmax_naive(x, dim=0):
    """Hand-written softmax along `dim`: exp(x) / sum(exp(x)).

    The maximum along `dim` is subtracted before exponentiating. This leaves
    the mathematical result unchanged (the common factor cancels in the
    ratio) but keeps `torch.exp` from overflowing to `inf` on large scores,
    which would otherwise turn the output into `nan`.

    Args:
        x: tensor of scores.
        dim: dimension to normalize over. Defaults to 0, matching the
            original single-argument call.

    Returns:
        A tensor shaped like `x` whose entries along `dim` sum to 1.
    """
    if x.numel() == 0:
        # Nothing to normalize; max() is undefined on an empty tensor.
        return torch.exp(x)
    shifted = x - x.max(dim=dim, keepdim=True).values
    exp_x = torch.exp(shifted)
    # keepdim=True lets the sum broadcast back over `dim` for any tensor rank.
    return exp_x / exp_x.sum(dim=dim, keepdim=True)

# Apply the hand-written softmax to the scores of the "journey" query.
attention_weights_2_naive = softmax_naive(attention_scores_2)
# Show the weights, then their total (expected: 1), one per line.
print(attention_weights_2_naive, attention_weights_2_naive.sum(), sep="\n")

tensor([0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581])
tensor(1.)


In [22]:
# Same normalization using PyTorch's built-in softmax.

attention_weights_2 = torch.softmax(attention_scores_2, dim=0)
print(attention_weights_2)
print(attention_weights_2.sum())  # expected: 1

tensor([0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581])
tensor(1.)


Computing Context Vector

In [34]:
# The context vector for a query is the attention-weighted sum of all token embeddings.

query = inputs[1]  # x^2, "journey"
context_vector_2 = torch.zeros(query.size())  # running sum, one entry per embedding dimension
for position, (weight, embedding) in enumerate(zip(attention_weights_2, inputs), start=1):
    # Scale the embedding by its corresponding attention weight, then accumulate.
    context_vector_2.add_(weight * embedding)
    print(f"Context vector after adding {position}th word, multiplying {weight} and word {embedding}: {context_vector_2}")

print(context_vector_2)

# The result is the context vector belonging to the query token "journey".

Context vector after adding 1th word, multiplying 0.13854756951332092 and tensor([0.4300, 0.1500, 0.8900]): tensor([0.0596, 0.0208, 0.1233])
Context vector after adding 2th word, multiplying 0.2378913015127182 and tensor([0.5500, 0.8700, 0.6600]): tensor([0.1904, 0.2277, 0.2803])
Context vector after adding 3th word, multiplying 0.23327402770519257 and tensor([0.5700, 0.8500, 0.6400]): tensor([0.3234, 0.4260, 0.4296])
Context vector after adding 4th word, multiplying 0.12399158626794815 and tensor([0.2200, 0.5800, 0.3300]): tensor([0.3507, 0.4979, 0.4705])
Context vector after adding 5th word, multiplying 0.10818186402320862 and tensor([0.7700, 0.2500, 0.1000]): tensor([0.4340, 0.5250, 0.4813])
Context vector after adding 6th word, multiplying 0.15811361372470856 and tensor([0.0500, 0.8000, 0.5500]): tensor([0.4419, 0.6515, 0.5683])
tensor([0.4419, 0.6515, 0.5683])


Computing Attention Scores for all Inputs

In [30]:
# Pairwise scores for all queries: entry (i, j) is the dot product of
# token i (acting as the query) with token j.

# Size the square score matrix from the data rather than hard-coding 6x6,
# so the cell keeps working if the sentence length changes.
attention_scores = torch.empty(len(inputs), len(inputs))
for i, x_i in enumerate(inputs):
    for j, x_j in enumerate(inputs):
        attention_scores[i, j] = torch.dot(x_i, x_j)
print(f"Attention scores: {attention_scores}")
# Row i holds the scores of query token i against every token in the sentence.

Attention scores: tensor([[0.9995, 0.9544, 0.9422, 0.4753, 0.4576, 0.6310],
        [0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865],
        [0.9422, 1.4754, 1.4570, 0.8296, 0.7154, 1.0605],
        [0.4753, 0.8434, 0.8296, 0.4937, 0.3474, 0.6565],
        [0.4576, 0.7070, 0.7154, 0.3474, 0.6654, 0.2935],
        [0.6310, 1.0865, 1.0605, 0.6565, 0.2935, 0.9450]])


In [36]:
# Same score matrix in a single matrix product instead of nested loops.

attention_scores = torch.matmul(inputs, inputs.T)  # (6, 3) x (3, 6) -> (6, 6)
print(f"Attention scores: {attention_scores}")

Attention scores: tensor([[0.9995, 0.9544, 0.9422, 0.4753, 0.4576, 0.6310],
        [0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865],
        [0.9422, 1.4754, 1.4570, 0.8296, 0.7154, 1.0605],
        [0.4753, 0.8434, 0.8296, 0.4937, 0.3474, 0.6565],
        [0.4576, 0.7070, 0.7154, 0.3474, 0.6654, 0.2935],
        [0.6310, 1.0865, 1.0605, 0.6565, 0.2935, 0.9450]])


Computing Attention Weights for all inputs

In [33]:
# Turn every query's scores into weights in one call.
# dim=-1 applies softmax along the last dimension (across the columns),
# so that the entries of each row add up to 1.

attention_weights = torch.softmax(attention_scores, dim=-1)
print(f"Attention weights: {attention_weights}")

# Spot check on a single row.
row2_sum = attention_weights[1].sum()
print(f"Sum of attention weights for the 2nd query (2nd row): {row2_sum}")

Attention weights: tensor([[0.2098, 0.2006, 0.1981, 0.1242, 0.1220, 0.1452],
        [0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581],
        [0.1390, 0.2369, 0.2326, 0.1242, 0.1108, 0.1565],
        [0.1435, 0.2074, 0.2046, 0.1462, 0.1263, 0.1720],
        [0.1526, 0.1958, 0.1975, 0.1367, 0.1879, 0.1295],
        [0.1385, 0.2184, 0.2128, 0.1420, 0.0988, 0.1896]])
Sum of attention weights for the 2nd query (2nd row): 1.0


Computing Context Vectors for all inputs

In [39]:
# Shapes of the two operands of the matrix product that builds the context vectors.
print(attention_weights.size())
inputs.size()

torch.Size([6, 6])


torch.Size([6, 3])

In [40]:
# Each row of the result is one token's context vector: the sum of all
# embeddings weighted by that token's row of attention weights.
context_vectors = torch.matmul(attention_weights, inputs)
print(f"Context vectors: {context_vectors}")

Context vectors: tensor([[0.4421, 0.5931, 0.5790],
        [0.4419, 0.6515, 0.5683],
        [0.4431, 0.6496, 0.5671],
        [0.4304, 0.6298, 0.5510],
        [0.4671, 0.5910, 0.5266],
        [0.4177, 0.6503, 0.5645]])


**Self Attention with Trainable Weights**