# Simple Attention Mechanism
**Without Trainable Weights**

In [49]:
import torch

In [50]:
# Problem size for the walkthrough: six tokens, each embedded in four dimensions.
NUM_TOKENS = 6
EMBEDDING_DIM = 4

# Seed PyTorch's global random number generator so the random tensors are reproducible.
torch.manual_seed(0)

<torch._C.Generator at 0x2c2ca869eb0>

In [51]:
# Stand-in token embeddings sampled from a standard normal distribution:
# one row per token, one column per embedding dimension.
vector_embeddings = torch.randn((NUM_TOKENS, EMBEDDING_DIM))

print(f"Input Vector Embeddings:\n{vector_embeddings}\n")
print(f"Shape of Input Vector Embeddings: {vector_embeddings.shape}\n")

Input Vector Embeddings:
tensor([[-1.1258, -1.1524, -0.2506, -0.4339],
        [ 0.8487,  0.6920, -0.3160, -2.1152],
        [ 0.4681, -0.1577,  1.4437,  0.2660],
        [ 0.1665,  0.8744, -0.1435, -0.1116],
        [ 0.9318,  1.2590,  2.0050,  0.0537],
        [ 0.6181, -0.4128, -0.8411, -2.3160]])

Shape of Input Vector Embeddings: torch.Size([6, 4])



## For one token

#### Query Vector

In [52]:
# Row index of the token whose embedding is used as the query in the single-token walkthrough.
QUERY_VECTOR_INDEX = 2

In [53]:
# The query is the embedding row of the selected token.
query = vector_embeddings[QUERY_VECTOR_INDEX, :]

print(f"Query Vector (Index {QUERY_VECTOR_INDEX}):\n{query}\n")


Query Vector (Index 2):
tensor([ 0.4681, -0.1577,  1.4437,  0.2660])



#### Attention Scores

In [54]:
# Score every token against the query: the dot product of the two embeddings
# is the unnormalized attention score.
attention_scores = torch.empty(NUM_TOKENS)

for token_idx, token_embedding in enumerate(vector_embeddings):
    attention_scores[token_idx] = torch.dot(query, token_embedding)

print(f"Attention Scores:\n{attention_scores}\n")
print(f"Shape of Attention Scores: {attention_scores.shape}\n")

Attention Scores:
tensor([-0.8224, -0.7308,  2.3989, -0.2968,  3.1464, -1.4760])

Shape of Attention Scores: torch.Size([6])



#### Attention Weights

In [55]:
# Normalization using just sum:
# divide every score by the total of all scores. The weights then add up to ~1,
# but negative scores produce negative weights, so this is not a usable distribution.
score_total = attention_scores.sum()
attention_weights = attention_scores / score_total

print(f"Attention Weights by using sum normalization:\n{attention_weights}\n")
print(f"Sum of Attention Weights: {torch.sum(attention_weights)}\n")

Attention Weights by using sum normalization:
tensor([-0.3706, -0.3293,  1.0809, -0.1337,  1.4178, -0.6651])

Sum of Attention Weights: 0.9999999403953552



In [56]:
# Normalization using naive softmax:
# exponentiate the scores (every entry becomes positive), then divide by their total.
# No maximum is subtracted before exp, which is what makes this version "naive".
exp_attention_scores = attention_scores.exp()
attention_weights = exp_attention_scores / exp_attention_scores.sum()

print(f"Attention Weights by using naive softmax normalization:\n{attention_weights}\n")
print(f"Sum of Attention Weights: {torch.sum(attention_weights)}\n")

Attention Weights by using naive softmax normalization:
tensor([0.0122, 0.0133, 0.3045, 0.0206, 0.6431, 0.0063])

Sum of Attention Weights: 0.9999999403953552



In [57]:
# Normalization using PyTorch softmax:
# dim=0 normalizes along the first dimension of the score tensor.
attention_weights = attention_scores.softmax(dim=0)

print(f"Attention Weights by using PyTorch softmax normalization:\n{attention_weights}\n")
print(f"Sum of Attention Weights: {torch.sum(attention_weights)}\n")

Attention Weights by using PyTorch softmax normalization:
tensor([0.0122, 0.0133, 0.3045, 0.0206, 0.6431, 0.0063])

Sum of Attention Weights: 1.0



#### Context Vector Generation

In [58]:
# The context vector is the attention-weighted sum of all token embeddings.
# It is re-initialized to zeros here, so re-running the cell gives the same result.
context_vector = torch.zeros(EMBEDDING_DIM)
for weight, token_embedding in zip(attention_weights, vector_embeddings):
    context_vector += weight * token_embedding

print(f"Context Vector:\n{context_vector}\n")
print(f"Shape of Context Vector: {context_vector.shape}\n")

Context Vector:
tensor([0.7468, 0.7722, 1.7136, 0.0652])

Shape of Context Vector: torch.Size([4])



## All Tokens

#### Attention Scores

In [59]:
# using for loop:
# entry [row, col] is the dot product between the embeddings of token `row` and token `col`.
attention_scores = torch.empty((NUM_TOKENS, NUM_TOKENS))

for row, query_embedding in enumerate(vector_embeddings):
    for col, key_embedding in enumerate(vector_embeddings):
        attention_scores[row, col] = torch.dot(query_embedding, key_embedding)

print(f"Attention Scores Matrix using for loop:\n{attention_scores}\n")
print(f"Shape of Attention Scores Matrix: {attention_scores.shape}\n")

Attention Scores Matrix using for loop:
tensor([[ 2.8465, -0.7560, -0.8224, -1.1106, -3.0256,  0.9955],
        [-0.7560,  5.7732, -0.7308,  1.0278,  0.9148,  5.4036],
        [-0.8224, -0.7308,  2.3989, -0.2968,  3.1464, -1.4760],
        [-1.1106,  1.0278, -0.2968,  0.8253,  0.9623,  0.1211],
        [-3.0256,  0.9148,  3.1464,  0.9623,  6.4762, -1.7546],
        [ 0.9955,  5.4036, -1.4760,  0.1211, -1.7546,  6.6238]])

Shape of Attention Scores Matrix: torch.Size([6, 6])



In [60]:
# without using for loop:
# the single matrix product X @ X^T yields every pairwise dot product at once.
attention_scores = torch.matmul(vector_embeddings, vector_embeddings.transpose(0, 1))

print(f"Attention Scores Matrix without using for loop:\n{attention_scores}\n")
print(f"Shape of Attention Scores Matrix: {attention_scores.shape}\n")

Attention Scores Matrix without using for loop:
tensor([[ 2.8465, -0.7560, -0.8224, -1.1106, -3.0256,  0.9955],
        [-0.7560,  5.7732, -0.7308,  1.0278,  0.9148,  5.4036],
        [-0.8224, -0.7308,  2.3989, -0.2968,  3.1464, -1.4760],
        [-1.1106,  1.0278, -0.2968,  0.8253,  0.9623,  0.1211],
        [-3.0256,  0.9148,  3.1464,  0.9623,  6.4762, -1.7546],
        [ 0.9955,  5.4036, -1.4760,  0.1211, -1.7546,  6.6238]])

Shape of Attention Scores Matrix: torch.Size([6, 6])



In [61]:
# The @ operator is the infix shorthand for torch.matmul.
attention_scores = vector_embeddings @ vector_embeddings.t()

print(f"Attention Scores Matrix using @ operator:\n{attention_scores}\n")
print(f"Shape of Attention Scores Matrix: {attention_scores.shape}\n")

Attention Scores Matrix using @ operator:
tensor([[ 2.8465, -0.7560, -0.8224, -1.1106, -3.0256,  0.9955],
        [-0.7560,  5.7732, -0.7308,  1.0278,  0.9148,  5.4036],
        [-0.8224, -0.7308,  2.3989, -0.2968,  3.1464, -1.4760],
        [-1.1106,  1.0278, -0.2968,  0.8253,  0.9623,  0.1211],
        [-3.0256,  0.9148,  3.1464,  0.9623,  6.4762, -1.7546],
        [ 0.9955,  5.4036, -1.4760,  0.1211, -1.7546,  6.6238]])

Shape of Attention Scores Matrix: torch.Size([6, 6])



#### Attention Weights

In [62]:
# Apply softmax along dim=1 so that each row of the score matrix is normalized
# independently and sums to 1.
attention_weights = attention_scores.softmax(dim=1)

print(f"Attention Weights Matrix using PyTorch softmax normalization:\n{attention_weights}\n")
print(f"Shape of Attention Weights Matrix: {attention_weights.shape}\n")

Attention Weights Matrix using PyTorch softmax normalization:
tensor([[8.1184e-01, 2.2127e-02, 2.0705e-02, 1.5521e-02, 2.2868e-03, 1.2752e-01],
        [8.5367e-04, 5.8465e-01, 8.7545e-04, 5.0814e-03, 4.5387e-03, 4.0400e-01],
        [1.2151e-02, 1.3317e-02, 3.0454e-01, 2.0555e-02, 6.4311e-01, 6.3212e-03],
        [3.3280e-02, 2.8241e-01, 7.5097e-02, 2.3065e-01, 2.6451e-01, 1.1406e-01],
        [7.1562e-05, 3.6813e-03, 3.4291e-02, 3.8603e-03, 9.5784e-01, 2.5509e-04],
        [2.7633e-03, 2.2691e-01, 2.3339e-04, 1.1526e-03, 1.7664e-04, 7.6876e-01]])

Shape of Attention Weights Matrix: torch.Size([6, 6])



In [65]:
# dim argument: which dimension a reduction collapses
test_tensor = torch.randn((3, 4))
print(f"Test Tensor:\n{test_tensor}")
print(f"Shape of Test Tensor: {test_tensor.shape}\n")

# dim=0 collapses the rows, leaving one sum per column
sum_dim0 = test_tensor.sum(dim=0)
print(f"Sum over dim=0:\n{sum_dim0}")
print(f"Shape of Sum over dim=0: {sum_dim0.shape}\n")

# dim=1 collapses the columns, leaving one sum per row
sum_dim1 = test_tensor.sum(dim=1)
print(f"Sum over dim=1:\n{sum_dim1}")
print(f"Shape of Sum over dim=1: {sum_dim1.shape}\n")

# dim=-1 addresses the last dimension, which for this 2-D tensor is dim=1
sum_dim_neg1 = test_tensor.sum(dim=-1)
print(f"Sum over dim=-1:\n{sum_dim_neg1}")
print(f"Shape of Sum over dim=-1: {sum_dim_neg1.shape}\n")

Test Tensor:
tensor([[-1.0886, -0.2666,  0.1894, -0.2190],
        [ 2.0576, -0.0354,  0.0627, -0.7663],
        [ 1.0993,  2.7565,  0.1753, -0.9315]])
Shape of Test Tensor: torch.Size([3, 4])

Sum over dim=0:
tensor([ 2.0683,  2.4545,  0.4274, -1.9169])
Shape of Sum over dim=0: torch.Size([4])

Sum over dim=1:
tensor([-1.3848,  1.3186,  3.0996])
Shape of Sum over dim=1: torch.Size([3])

Sum over dim=-1:
tensor([-1.3848,  1.3186,  3.0996])
Shape of Sum over dim=-1: torch.Size([3])



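Intuitively, `dim` names the dimension that gets collapsed. For a 3x4 input, reducing over `dim=1` leaves one value per row (shape `[3]`, or 3x1 with `keepdim=True`), while reducing over `dim=0` leaves one value per column (shape `[4]`, or 1x4 with `keepdim=True`). The same rule applies to other reduction operations such as sum, mean, and max.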

#### Context Vectors

In [66]:
# Row i of the product is the attention-weighted sum of all token embeddings for token i.
context_vectors = torch.matmul(attention_weights, vector_embeddings)

print(f"Context Vectors:\n{context_vectors}\n")
print(f"Shape of Context Vectors: {context_vectors.shape}\n")

Context Vectors:
tensor([[-0.8020, -0.9597, -0.2854, -0.6905],
        [ 0.7504,  0.2468, -0.5151, -2.1728],
        [ 0.7468,  0.7722,  1.7136,  0.0652],
        [ 0.5927,  0.6328,  0.4122, -0.8675],
        [ 0.9124,  1.2063,  1.9680,  0.0518],
        [ 0.6651, -0.1623, -0.7185, -2.2617]])

Shape of Context Vectors: torch.Size([6, 4])



In [68]:
# Spell out the two operands of the matrix product that yields the context vectors.
# The first message has no placeholders, so it is a plain string rather than an f-string.
print("The above is the result of the following matrix multiplication:\n")
print(f"Attention Weights Matrix (Shape: {attention_weights.shape}) @ Vector Embeddings Matrix (Shape: {vector_embeddings.shape})\n")
# Show "@" between the operands: in PyTorch "*" is element-wise multiplication,
# whereas the operation being illustrated is a matrix product.
print(f"{attention_weights} \n@\n {vector_embeddings}\n")

The above is the result of followingg matrix multiplication:

Attention Weights Matrix (Shape: torch.Size([6, 6])) @ Vector Embeddings Matrix (Shape: torch.Size([6, 4]))

tensor([[8.1184e-01, 2.2127e-02, 2.0705e-02, 1.5521e-02, 2.2868e-03, 1.2752e-01],
        [8.5367e-04, 5.8465e-01, 8.7545e-04, 5.0814e-03, 4.5387e-03, 4.0400e-01],
        [1.2151e-02, 1.3317e-02, 3.0454e-01, 2.0555e-02, 6.4311e-01, 6.3212e-03],
        [3.3280e-02, 2.8241e-01, 7.5097e-02, 2.3065e-01, 2.6451e-01, 1.1406e-01],
        [7.1562e-05, 3.6813e-03, 3.4291e-02, 3.8603e-03, 9.5784e-01, 2.5509e-04],
        [2.7633e-03, 2.2691e-01, 2.3339e-04, 1.1526e-03, 1.7664e-04, 7.6876e-01]]) 
*
 tensor([[-1.1258, -1.1524, -0.2506, -0.4339],
        [ 0.8487,  0.6920, -0.3160, -2.1152],
        [ 0.4681, -0.1577,  1.4437,  0.2660],
        [ 0.1665,  0.8744, -0.1435, -0.1116],
        [ 0.9318,  1.2590,  2.0050,  0.0537],
        [ 0.6181, -0.4128, -0.8411, -2.3160]])

