In [1]:
import tensorflow as tf
import numpy as np

# Toy input sequence: one row per word (3 words), one column per feature (4 features).
x = tf.constant(
    [[0.1, 0.2, 0.3, 0.4],
     [0.5, 0.6, 0.7, 0.8],
     [0.9, 1.0, 1.1, 1.2]],
    dtype=tf.float32,
)
# sep="\n" puts the header and the array on separate lines, exactly as two
# consecutive print calls would.
print("Input tensor x:", x.numpy(), sep="\n")


Input tensor x:
[[0.1 0.2 0.3 0.4]
 [0.5 0.6 0.7 0.8]
 [0.9 1.  1.1 1.2]]


In [2]:
# Fixed projection weights for the attention example.
# Query projection: 4 input features -> 3 query/key features.
W_Q = tf.constant(
    [[0.1, 0.2, 0.3],
     [0.4, 0.5, 0.6],
     [0.7, 0.8, 0.9],
     [1.0, 1.1, 1.2]],
    dtype=tf.float32,
)

# Value projection: 4 input features -> 2 value features.
W_V = tf.constant(
    [[0.1, 0.2],
     [0.3, 0.4],
     [0.5, 0.6],
     [0.7, 0.8]],
    dtype=tf.float32,
)

# Keys deliberately reuse the query projection in this example
# (W_K is the very same tensor object as W_Q, not a copy).
W_K = W_Q

print("Projection matrix W_Q:\n", W_Q.numpy())
print("Projection matrix W_V:\n", W_V.numpy())


Projection matrix W_Q:
 [[0.1 0.2 0.3]
 [0.4 0.5 0.6]
 [0.7 0.8 0.9]
 [1.  1.1 1.2]]
Projection matrix W_V:
 [[0.1 0.2]
 [0.3 0.4]
 [0.5 0.6]
 [0.7 0.8]]


In [3]:
# Project the same input x three times, once per role.
# The tuple order (W_Q, W_K, W_V) must match the unpacking order on the left.
queries, keys, values = (tf.matmul(x, weight) for weight in (W_Q, W_K, W_V))

print("Queries matrix:\n", queries.numpy())
print("Keys matrix:\n", keys.numpy())
print("Values matrix:\n", values.numpy())


Queries matrix:
 [[0.70000005 0.8000001  0.90000004]
 [1.5799999  1.8400002  2.1       ]
 [2.46       2.88       3.3000002 ]]
Keys matrix:
 [[0.70000005 0.8000001  0.90000004]
 [1.5799999  1.8400002  2.1       ]
 [2.46       2.88       3.3000002 ]]
Values matrix:
 [[0.5       0.6      ]
 [1.14      1.4000001]
 [1.78      2.2      ]]


In [4]:
# Raw attention scores: entry [i, j] is the dot product of query row i with key row j.
# transpose_b=True makes matmul use keys^T without creating a separate transposed tensor here.
scores = tf.linalg.matmul(a=queries, b=keys, transpose_b=True)

print("Attention scores matrix (Q*K^T):", scores.numpy(), sep="\n")


Attention scores matrix (Q*K^T):
[[ 1.9400002  4.4680004  6.996001 ]
 [ 4.4680004 10.292     16.116001 ]
 [ 6.996001  16.116001  25.236002 ]]


In [5]:
# Scaled dot-product step: divide the raw scores by sqrt(d_k), where d_k is the
# size of the last axis of `keys` (3 in this example). The size is read from the
# tensor and cast to float32 so the division stays in floating point.
scaled_scores = scores / tf.sqrt(
    tf.cast(tf.shape(keys)[-1], dtype=tf.float32)
)

# Softmax over the last axis turns each row of scaled scores into attention
# weights (one probability distribution per query).
weights = tf.nn.softmax(logits=scaled_scores, axis=-1)

print("Scaled attention scores:", scaled_scores.numpy(), sep="\n")
print("Attention weights (after softmax):", weights.numpy(), sep="\n")


Scaled attention scores:
[[ 1.1200596  2.5796013  4.039143 ]
 [ 2.5796013  5.942089   9.304578 ]
 [ 4.039143   9.304578  14.570013 ]]
Attention weights (after softmax):
[[4.1966923e-02 1.8062508e-01 7.7740800e-01]
 [1.1589993e-03 3.3449762e-02 9.6539128e-01]
 [2.6561420e-05 5.1404452e-03 9.9483299e-01]]


In [6]:
# Final attention output: each context row is the sum of the value rows,
# weighted by the matching row of attention weights.
context = tf.linalg.matmul(weights, values)

print("Context vectors (output of attention):", context.numpy(), sep="\n")


Context vectors (output of attention):
[[1.6106822 1.9883529]
 [1.7571087 2.171386 ]
 [1.776676  2.1958451]]
