In [1]:
import tensorflow as tf
import numpy as np

# Dummy tokenizer and sentence
word2idx = {"i": 1, "like": 2, "pizza": 3, "<pad>": 0}
idx2word = {v: k for k, v in word2idx.items()}  # reverse lookup: id -> token

input_sentence = ["i", "like", "pizza"]
max_len = 5  # fixed sequence length fed to the embedding layer
pad_id = word2idx["<pad>"]

# Encode the tokens, then right-pad with the <pad> id up to max_len.
# The pad count is derived from the sentence length instead of being
# hard-coded, so it stays correct if input_sentence changes. A sentence
# longer than max_len is left unpadded (list * negative int is empty).
input_ids = [word2idx[word] for word in input_sentence]
input_ids += [pad_id] * (max_len - len(input_ids))
embedding_dim = 8
num_heads = 2

# Embedding
embedding_layer = tf.keras.layers.Embedding(input_dim=len(word2idx), output_dim=embedding_dim)
x = embedding_layer(tf.constant([input_ids]))  # (1, seq_len, embed_dim)
print(" Input Embeddings:\n", x.numpy())

# ─────────────────────────────────────────────
# Manual Single-Head Self-Attention
# ─────────────────────────────────────────────
def single_head_attention(x):
    """Run one head of scaled dot-product self-attention over ``x``.

    Three bias-free Dense projections (query, key, value) are created on
    every call, each as wide as the last axis of ``x``, so their weights
    are freshly initialised each time the function runs.

    Args:
        x: Input tensor; the script passes embeddings shaped
            (1, seq_len, embed_dim).

    Returns:
        A tuple ``(output, weights, Q, K, V)``: the attention-weighted
        values, the softmax attention weights, and the three projections.
    """
    model_dim = x.shape[-1]
    query_layer, key_layer, value_layer = [
        tf.keras.layers.Dense(model_dim, use_bias=False) for _ in range(3)
    ]

    # Apply the projections in query, key, value order.
    Q, K, V = query_layer(x), key_layer(x), value_layer(x)

    # Divide the dot products by sqrt(model_dim) before the softmax.
    scale = tf.math.sqrt(tf.cast(model_dim, tf.float32))
    logits = tf.matmul(Q, K, transpose_b=True) / scale
    attn = tf.nn.softmax(logits, axis=-1)  # normalise over the key axis
    context = tf.matmul(attn, V)

    return context, attn, Q, K, V

single_output, single_weights, Q_single, K_single, V_single = single_head_attention(x)

# Print the three single-head projections in Q, K, V order.
for _heading, _projection in (
    ("\n [Single-Head] Q matrix:\n", Q_single),
    ("\n [Single-Head] K matrix:\n", K_single),
    ("\n [Single-Head] V matrix:\n", V_single),
):
    print(_heading, _projection.numpy())

# ─────────────────────────────────────────────
# Manual Multi-Head Self-Attention
# ─────────────────────────────────────────────
def multi_head_attention(x, num_heads):
    """Run multi-head scaled dot-product self-attention over ``x``.

    Bias-free Dense projections for query, key, value and output are
    created on every call, so their weights are freshly initialised each
    time. The projected Q/K/V are split into ``num_heads`` heads of width
    ``d_model // num_heads``, attended per head, concatenated back and
    passed through the output projection. The unsplit Q/K/V matrices are
    printed as a side effect.

    Args:
        x: Rank-3 input tensor shaped (batch, seq_len, d_model).
        num_heads: Number of attention heads; must be positive and divide
            ``d_model`` evenly.

    Returns:
        A tuple ``(output, weights, Q_heads, K_heads, V_heads)``. The
        last three are shaped (batch, heads, seq, depth).

    Raises:
        ValueError: If ``num_heads`` is not positive or does not evenly
            divide the last dimension of ``x``.
    """
    batch_size, seq_len, d_model = x.shape

    # Validate up front: otherwise a bad head count surfaces as a
    # ZeroDivisionError or as an opaque reshape failure further down.
    if num_heads <= 0:
        raise ValueError(f"num_heads must be a positive integer, got {num_heads}")
    if d_model % num_heads != 0:
        raise ValueError(
            f"d_model ({d_model}) must be divisible by num_heads ({num_heads})"
        )
    depth = d_model // num_heads

    q_proj = tf.keras.layers.Dense(d_model, use_bias=False)
    k_proj = tf.keras.layers.Dense(d_model, use_bias=False)
    v_proj = tf.keras.layers.Dense(d_model, use_bias=False)
    out_proj = tf.keras.layers.Dense(d_model, use_bias=False)

    Q = q_proj(x)  # (batch, seq, d_model)
    K = k_proj(x)
    V = v_proj(x)

    def split_heads(tensor):
        # (batch, seq, d_model) -> (batch, heads, seq, depth)
        tensor = tf.reshape(tensor, (batch_size, seq_len, num_heads, depth))
        return tf.transpose(tensor, [0, 2, 1, 3])

    Q_heads = split_heads(Q)
    K_heads = split_heads(K)
    V_heads = split_heads(V)

    print("\n [Multi-Head] Q matrix (before split):\n", Q.numpy())
    print("\n [Multi-Head] K matrix (before split):\n", K.numpy())
    print("\n [Multi-Head] V matrix (before split):\n", V.numpy())

    # Each head scales by sqrt(depth), its own width, not sqrt(d_model).
    scores = tf.matmul(Q_heads, K_heads, transpose_b=True) / tf.math.sqrt(tf.cast(depth, tf.float32))
    weights = tf.nn.softmax(scores, axis=-1)
    attention = tf.matmul(weights, V_heads)

    def combine_heads(tensor):
        # (batch, heads, seq, depth) -> (batch, seq, d_model)
        tensor = tf.transpose(tensor, [0, 2, 1, 3])
        return tf.reshape(tensor, (batch_size, seq_len, d_model))

    concat_attention = combine_heads(attention)
    output = out_proj(concat_attention)

    return output, weights, Q_heads, K_heads, V_heads

multi_output, multi_weights, Q_multi, K_multi, V_multi = multi_head_attention(x, num_heads=num_heads)


# Print each split projection head by head, for the first batch element,
# in Q, K, V order.
for _title, _per_head in (
    ("\n [Multi-Head] Q matrix (split into heads):", Q_multi),
    ("\n [Multi-Head] K matrix (split into heads):", K_multi),
    ("\n [Multi-Head] V matrix (split into heads):", V_multi),
):
    print(_title)
    for h in range(num_heads):
        print(f"Head {h}:\n", _per_head[0, h].numpy())


# Final outputs of both attention variants, printed one after the other.
print("\n Single-Head Attention Output:\n", single_output.numpy())
print("\n Multi-Head Attention Output:\n", multi_output.numpy())

2025-07-24 06:40:52.149080: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753339252.412846      13 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753339252.498413      13 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


 Input Embeddings:
 [[[-0.00551504 -0.04743361  0.04183458 -0.00768981 -0.01358447
   -0.02088403 -0.00740484 -0.00991791]
  [ 0.04345195  0.00339695  0.0024263   0.00450151 -0.04237303
    0.03037925 -0.01545955  0.01773569]
  [ 0.02591239  0.03374423  0.00670849  0.01483304  0.04877156
   -0.03328855 -0.04937769 -0.04755886]
  [ 0.02456169 -0.02162606  0.03049548 -0.04758055  0.03713093
    0.03330494 -0.02457246 -0.02500652]
  [ 0.02456169 -0.02162606  0.03049548 -0.04758055  0.03713093
    0.03330494 -0.02457246 -0.02500652]]]

 [Single-Head] Q matrix:
 [[[-0.05226441  0.02968124  0.02343815  0.00114103  0.01197708
   -0.01017776  0.04240124 -0.04315875]
  [-0.02682922  0.00301197  0.02580999 -0.01038731 -0.00598572
   -0.00999494 -0.00701539  0.01630884]
  [ 0.01164746 -0.02270364  0.00372308 -0.01677837  0.00717917
    0.02732132 -0.03734814 -0.05170512]
  [ 0.01133449  0.01575701  0.03457785 -0.03391965 -0.00976276
   -0.04150211 -0.03647286 -0.05715483]
  [ 0.01133449  0.015757

2025-07-24 06:41:10.965465: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)
