**1. Importing Libraries**

In [None]:
import tensorflow as tf
import numpy as np

**2. Input Tensor Definition**

In [3]:
# Toy input sequence: 3 rows (one per word), each a 4-dimensional feature
# vector. Stored as float32 so every downstream matmul stays in float32.
x = tf.constant(
    value=[[0.1, 0.2, 0.3, 0.4],
           [0.5, 0.6, 0.7, 0.8],
           [0.9, 1.0, 1.1, 1.2]],
    dtype=tf.float32,
)

**3. Defining Projection Matrices**

In [4]:
# Fixed projection weights, copied exactly from the manual worked example
# (nothing here is trained or randomly initialised).

# Query projection: maps 4-dimensional inputs to 3-dimensional queries.
W_Q = tf.constant(
    value=[[0.1, 0.2, 0.3],
           [0.4, 0.5, 0.6],
           [0.7, 0.8, 0.9],
           [1.0, 1.1, 1.2]],
    dtype=tf.float32,
)

# Key projection reuses the query weights: W_K is the very same tensor
# object as W_Q (an alias, not a copy).
W_K = W_Q

# Value projection: maps 4-dimensional inputs to 2-dimensional values.
W_V = tf.constant(
    value=[[0.1, 0.2],
           [0.3, 0.4],
           [0.5, 0.6],
           [0.7, 0.8]],
    dtype=tf.float32,
)

**4. Computing Queries, Keys, and Values**

In [5]:
# Project the input into query, key and value spaces by plain matrix
# multiplication. `@` on tensors dispatches to tf.matmul.
queries = x @ W_Q
keys = x @ W_K
values = x @ W_V

# Show the three projections in the order they are used later on.
print("Queries Matrix: ", queries)
print("Keys Matrix:", keys)
print("Values Matrix:", values)

Queries Matrix:  tf.Tensor(
[[0.70000005 0.8000001  0.90000004]
 [1.5799999  1.8400002  2.1       ]
 [2.46       2.88       3.3000002 ]], shape=(3, 3), dtype=float32)
Keys Matrix: tf.Tensor(
[[0.70000005 0.8000001  0.90000004]
 [1.5799999  1.8400002  2.1       ]
 [2.46       2.88       3.3000002 ]], shape=(3, 3), dtype=float32)
Values Matrix: tf.Tensor(
[[0.5       0.6      ]
 [1.14      1.4000001]
 [1.78      2.2      ]], shape=(3, 2), dtype=float32)


**5. Calculating Attention Scores**

In [6]:
# Raw (unscaled) attention scores: the dot product of every query with every
# key. transpose_b=True multiplies by the transpose of `keys` without a
# separate transpose step, so scores[i, j] = queries[i] . keys[j].
scores = tf.linalg.matmul(a=queries, b=keys, transpose_b=True)
print("Attention Scores Matrix: ", scores)

Attention Scores Matrix:  tf.Tensor(
[[ 1.9400002  4.4680004  6.996001 ]
 [ 4.4680004 10.292     16.116001 ]
 [ 6.996001  16.116001  25.236002 ]], shape=(3, 3), dtype=float32)


**6. Scaling the Attention Scores**

In [7]:
# Scale the raw scores by sqrt(d_k), where d_k is the key dimension (the last
# axis of `keys`). Deriving d_k from the tensor instead of hard-coding 3.0
# keeps the scaling correct if the projection width is ever changed.
d_k = tf.cast(tf.shape(keys)[-1], scores.dtype)  # int32 -> float dtype of scores
scaled = scores / tf.sqrt(d_k)
print("Attention Score Matrix Scaled: ", scaled)

Attention Score Matrix Scaled:  tf.Tensor(
[[ 1.1200596  2.5796013  4.039143 ]
 [ 2.5796013  5.942089   9.304578 ]
 [ 4.039143   9.304578  14.570013 ]], shape=(3, 3), dtype=float32)


**7. Applying Softmax to Get Attention Weights**

In [11]:
# Normalise each row of scaled scores into attention weights: softmax over
# the last axis, so every row is non-negative and sums to 1.
weights = tf.nn.softmax(logits=scaled, axis=-1)
print("Attention Weights Matrix:", weights)

Attention Weights Matrix: tf.Tensor(
[[4.1966923e-02 1.8062508e-01 7.7740800e-01]
 [1.1589993e-03 3.3449762e-02 9.6539128e-01]
 [2.6561420e-05 5.1404452e-03 9.9483299e-01]], shape=(3, 3), dtype=float32)


**8. Computing the Final Context Vectors**

In [12]:
# Context vector for each word: the attention-weighted sum of the value
# vectors (row i of `weights` mixes the rows of `values`).
context = weights @ values

# Header and NumPy array on separate lines, as two consecutive prints would do.
print("Context vectors for each word:", context.numpy(), sep="\n")

Context vectors for each word:
[[1.6106822 1.9883529]
 [1.7571087 2.171386 ]
 [1.776676  2.1958451]]
