**Attention weights and context vectors following the Transformer attention mechanism**

**Section 1: Define Input and Projection Matrices**

In [6]:
import tensorflow as tf
import numpy as np

# Input tensor: 3 rows of 4 features each (shape 3 x 4).
x = tf.constant(
    [
        [0.1, 0.2, 0.3, 0.4],
        [0.5, 0.6, 0.7, 0.8],
        [0.9, 1.0, 1.1, 1.2],
    ],
    dtype=tf.float32,
)

# Query projection (shape 4 x 3). Per the original note, these values are
# taken exactly from the manual -- including the 0.0 that ends the last row.
W_Q = tf.constant(
    [
        [0.1, 0.2, 0.3],
        [0.4, 0.5, 0.6],
        [0.7, 0.8, 0.9],
        [1.0, 1.1, 0.0],
    ],
    dtype=tf.float32,
)

# The key projection is the very same tensor as the query projection.
W_K = W_Q

# Value projection (shape 4 x 2).
W_V = tf.constant(
    [
        [0.1, 0.2],
        [0.3, 0.4],
        [0.5, 0.6],
        [0.7, 0.8],
    ],
    dtype=tf.float32,
)


**Section 2: Compute Queries, Keys, Values and Attention Scores**

In [7]:
# Project the input into query, key and value spaces (x @ W for each).
queries = tf.matmul(x, W_Q)
print("Queries Matrix: ", queries)

keys = tf.matmul(x, W_K)
print("Keys Matrix:", keys)

values = tf.matmul(x, W_V)
print("Values Matrix:", values)

# Raw attention scores: dot product of every query row with every key row,
# i.e. queries @ keys^T.
scores = tf.matmul(queries, keys, transpose_b=True)
print("Attention Scores Matrix: ", scores)

# Scale the scores by sqrt(d_k). The key dimension d_k is read from the last
# axis of `keys` instead of being hard-coded, so the scaling stays correct if
# the projection width changes. The cast matches the dtype of `scores` so the
# division does not mix dtypes.
d_k = tf.cast(tf.shape(keys)[-1], scores.dtype)
scaled = scores / tf.sqrt(d_k)
print("Attention Score Matrix Scaled: ", scaled)

Queries Matrix:  tf.Tensor(
[[0.70000005 0.8000001  0.42000002]
 [1.5799999  1.8400002  1.14      ]
 [2.46       2.88       1.86      ]], shape=(3, 3), dtype=float32)
Keys Matrix: tf.Tensor(
[[0.70000005 0.8000001  0.42000002]
 [1.5799999  1.8400002  1.14      ]
 [2.46       2.88       1.86      ]], shape=(3, 3), dtype=float32)
Values Matrix: tf.Tensor(
[[0.5       0.6      ]
 [1.14      1.4000001]
 [1.7800001 2.2      ]], shape=(3, 2), dtype=float32)
Attention Scores Matrix:  tf.Tensor(
[[ 1.3064002  3.0568004  4.8072004]
 [ 3.0568004  7.1816    11.3064   ]
 [ 4.8072004 11.3064    17.805601 ]], shape=(3, 3), dtype=float32)
Attention Score Matrix Scaled:  tf.Tensor(
[[ 0.7542505  1.7648445  2.7754385]
 [ 1.7648445  4.146299   6.5277534]
 [ 2.7754385  6.5277534 10.280068 ]], shape=(3, 3), dtype=float32)


**Section 3: Calculate Attention Weights and Context Vectors**

In [8]:
# Softmax along the last axis normalises each row of scaled scores into
# attention weights that sum to 1.
weights = tf.nn.softmax(logits=scaled, axis=-1)
print("Attention Weights Matrix:", weights)

# Each context vector is the attention-weighted sum of the value rows.
context = weights @ values
print("Context vectors for each word:", context.numpy(), sep="\n")

Attention Weights Matrix: tf.Tensor(
[[8.8538527e-02 2.4323590e-01 6.6822553e-01]
 [7.7575515e-03 8.3941586e-02 9.0830088e-01]
 [5.3761917e-04 2.2913132e-02 9.7654927e-01]], shape=(3, 3), dtype=float32)
Context vectors for each word:
[[1.5109997 1.8637496]
 [1.7163478 2.1204348]
 [1.7646476 2.1808093]]
