In [1]:
import numpy as np

# Projection weights for queries, keys and values.
# Each matrix has 4 rows (one per input feature) and 3 columns
# (one per projected feature), so a 4-element word maps to a 3-element vector.
Q_w = np.array([
    [0.5, 0.1, 0.3],
    [0.2, 0.3, 0.1],
    [0.1, 0.2, 0.5],
    [0.3, 0.5, 0.2],
])

K_w = np.array([
    [0.4, 0.2, 0.3],
    [0.1, 0.4, 0.2],
    [0.3, 0.1, 0.4],
    [0.2, 0.3, 0.1],
])

V_w = np.array([
    [0.2, 0.4, 0.1],
    [0.3, 0.2, 0.4],
    [0.1, 0.3, 0.2],
    [0.4, 0.1, 0.3],
])

# Toy input sequence: 3 words, each described by 4 features (one row per word).
words = np.array([
    [1, 0, 0, 1],  # Word 1
    [0, 2, 2, 0],  # Word 2
    [1, 1, 0, 0],  # Word 3
])

# Function to compute queries, keys, and values
def compute_qkv(word, Q_w, K_w, V_w):
    """Project ``word`` with each of the three weight matrices.

    Parameters
    ----------
    word : array-like
        Input passed as the left operand of ``np.dot``.
    Q_w, K_w, V_w : array-like
        Query, key and value weight matrices (right operands of ``np.dot``).

    Returns
    -------
    tuple
        ``(Q, K, V)`` where each entry is ``np.dot(word, <matrix>)``.
    """
    # Same projection applied three times, once per weight matrix, in Q, K, V order.
    return tuple(np.dot(word, weights) for weights in (Q_w, K_w, V_w))

# Project every word (row of `words`) into its query, key and value vectors.
# Iterating over `words` yields the rows in order, so the triples line up with
# word 1, word 2 and word 3 respectively.
(Q1, K1, V1), (Q2, K2, V2), (Q3, K3, V3) = (
    compute_qkv(word, Q_w, K_w, V_w) for word in words
)

# Function to calculate scaled attention score (dot product between Q and K)
def scaled_attention_score(Q, K, dim):
    """Return the dot product of ``Q`` and ``K.T`` divided by ``sqrt(dim)``.

    Parameters
    ----------
    Q : array-like
        Query vector (or matrix of queries).
    K : numpy.ndarray
        Key vector (or matrix of keys); its ``.T`` attribute is used, which is
        a no-op for a 1-D vector.
    dim : int or float
        Value whose square root is used as the scaling divisor.

    Returns
    -------
    numpy scalar or numpy.ndarray
        The scaled score(s).
    """
    raw_score = np.dot(Q, K.T)
    scale = np.sqrt(dim)  # dimensionality scaling
    return raw_score / scale

# Length of a query/key vector; used as the scaling dimension (3 for this example).
dim = Q1.shape[0]

# Scaled attention scores of word 1's query against every word's key,
# in the order K1, K2, K3.
scores = np.array([scaled_attention_score(Q1, key, dim) for key in (K1, K2, K3)])

# Apply softmax to the attention scores
def softmax(x, axis=None):
    """Compute a numerically stable softmax of ``x``.

    Parameters
    ----------
    x : array-like
        Input scores.
    axis : int or None, optional
        Axis along which the exponentials are normalised. The default
        ``None`` normalises over *all* elements of ``x`` (the original
        behaviour, which is what a 1-D score vector needs). Pass ``axis=-1``
        to normalise each row of a 2-D score matrix independently.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``x`` whose entries sum to 1 along
        ``axis`` (or over the whole array when ``axis`` is ``None``).
    """
    x = np.asarray(x)
    # Subtracting the maximum leaves the result unchanged but keeps np.exp
    # from overflowing on large scores. keepdims=True lets the reduced array
    # broadcast back against x for any choice of axis.
    e_x = np.exp(x - np.max(x, axis=axis, keepdims=True))
    return e_x / np.sum(e_x, axis=axis, keepdims=True)

# Turn word 1's three scaled scores into attention weights that sum to 1.
softmax_scores = softmax(scores)

# Attention output for word 1: the values V1, V2, V3 weighted by the softmax scores.
weighted_sum = (
    softmax_scores[0] * V1
    + softmax_scores[1] * V2
    + softmax_scores[2] * V3
)

# Simplified residual connection. The raw input word has 4 features while the
# attention output has 3, so they cannot be added directly; this example adds
# word 1's value projection instead. V1 is exactly np.dot(words[0], V_w), so it
# is reused rather than recomputed.
residual_output = weighted_sum + V1

# Print results
print(f"Q1: {Q1}")
print(f"K1: {K1}")
print(f"V1: {V1}")
print(f"Scaled Attention Scores: {scores}")
print(f"Softmax Scores: {softmax_scores}")
print(f"Weighted Sum (Before Residual): {weighted_sum}")
print(f"Residual Output: {residual_output}")


Q1: [0.8 0.6 0.5]
K1: [0.6 0.5 0.4]
V1: [0.6 0.5 0.4]
Scaled Attention Scores: [0.56580326 1.0623245  0.58312377]
Softmax Scores: [0.27318918 0.44884865 0.27796217]
Weighted Sum (Before Residual): [0.66197351 0.75222054 0.78687514]
Residual Output: [1.26197351 1.25222054 1.18687514]
