## Self-Attention

In [138]:
import numpy as np

In [140]:
def softmax(x):
    """Return the softmax of ``x`` computed along its last axis.

    The maximum along the last axis is subtracted before exponentiating.
    The shift cancels out in the ratio, so the result is unchanged, but it
    keeps ``np.exp`` from overflowing when the inputs are large.
    """
    shifted = x - np.max(x, axis=-1, keepdims=True)
    numerators = np.exp(shifted)
    denominators = np.sum(numerators, axis=-1, keepdims=True)
    return numerators / denominators

In [142]:
def QKV(X, W_q, W_k, W_v):
    """Project ``X`` into query, key and value matrices.

    Each projection is ``np.dot(X, W)`` with the matching weight matrix.

    Returns:
        A tuple ``(Q, K, V)``.
    """
    return tuple(np.dot(X, weight) for weight in (W_q, W_k, W_v))

In [144]:
def self_attention(Q, K, V):
    """Compute scaled dot-product attention of ``Q`` and ``K`` over ``V``.

    The scores ``Q · K.T`` are divided by the square root of ``Q``'s last
    dimension, turned into weights with ``softmax`` and then used to mix
    the rows of ``V``.

    Note: ``K.T`` reverses *all* axes of ``K``, so this is written for
    2-D matrices rather than batched inputs.
    """
    scale = np.sqrt(Q.shape[-1])
    weights = softmax(np.dot(Q, K.T) / scale)
    return np.dot(weights, V)

In [146]:
# Toy example: the input and the query/key weights are 2x2 identity
# matrices, so Q and K equal X and only W_v shapes the values.
X = np.eye(2, dtype=int)
W_q = np.eye(2, dtype=int)
W_k = np.eye(2, dtype=int)
W_v = np.array([
    [1, 2],
    [3, 4],
])

Q, K, V = QKV(X, W_q, W_k, W_v)
output = self_attention(Q, K, V)

In [148]:
# Display the self-attention result computed for the toy example above.
print(output)

[[1.6604769 2.6604769]
 [2.3395231 3.3395231]]


## Multi-Head Attention

In [154]:
def hard_softmax(x):
    """Return a one-hot encoding of the arg-max along the last axis of ``x``.

    Every slice along the last axis gets a single 1 at the position of its
    largest entry and 0 elsewhere. When several entries tie for the
    maximum, the first one wins (``np.argmax`` semantics).

    Works for input of any rank >= 1 (a single score vector, a score
    matrix, or a batch of score matrices).

    Args:
        x: Array-like of scores.

    Returns:
        An array with the same shape and dtype as ``x`` holding the
        one-hot selections.
    """
    x = np.asarray(x)
    one_hot = np.zeros_like(x)
    # Keep the reduced axis so the indices line up with ``one_hot`` for
    # ``put_along_axis`` regardless of how many leading axes ``x`` has.
    winners = np.expand_dims(np.argmax(x, axis=-1), axis=-1)
    np.put_along_axis(one_hot, winners, 1, axis=-1)
    return one_hot
def QKV(X, W_q, W_k, W_v):
    """Return the query, key and value projections of ``X``.

    Each output is ``np.dot`` of ``X`` with the corresponding weight
    matrix, returned in the order ``(Q, K, V)``.
    """
    queries, keys, values = (
        np.dot(X, weights) for weights in (W_q, W_k, W_v)
    )
    return queries, keys, values
def self_attention(Q, K, V):
    """Hard-attention variant of scaled dot-product attention.

    The scaled scores ``Q · K.T / sqrt(d)`` are passed through
    ``hard_softmax`` instead of a softmax, so the weights used to mix the
    rows of ``V`` are one-hot rather than a smooth distribution.

    Note: ``K.T`` reverses all axes of ``K``, so this is written for 2-D
    matrices.
    """
    scale = np.sqrt(Q.shape[-1])
    selection = hard_softmax(np.dot(Q, K.T) / scale)
    return np.dot(selection, V)
    
def multi_head_attention(Q, K, V, n):
    """Run ``self_attention`` independently on ``n`` heads and concatenate.

    ``Q``, ``K`` and ``V`` are each cut into ``n`` equal chunks along their
    last axis; the i-th chunks of the three arrays form head ``i``. The
    per-head outputs are joined back together along the last axis.

    Args:
        Q: Query array; its last dimension must be divisible by ``n``.
        K: Key array, split the same way as ``Q``.
        V: Value array, split the same way as ``Q``.
        n: Number of heads.

    Returns:
        The concatenation of the per-head attention outputs along the
        last axis.

    Raises:
        AssertionError: If the last dimension of ``Q`` is not divisible by
            ``n``.
        ValueError: Raised by ``np.split`` if ``K`` or ``V`` cannot be
            divided into ``n`` equal chunks.
    """
    d = Q.shape[-1]
    assert d % n == 0, "Embedding dimension must be divisible by no. of heads"

    # Chunk i of Q, K and V together make up the inputs of head i.
    head_inputs = zip(
        np.split(Q, n, axis=-1),
        np.split(K, n, axis=-1),
        np.split(V, n, axis=-1),
    )
    heads = [self_attention(q, k, v) for q, k, v in head_inputs]
    return np.concatenate(heads, axis=-1)

# Toy example: queries, keys and values are all the 2x2 identity matrix,
# split into two heads of width 1.
Q = np.eye(2, dtype=int)
K = np.eye(2, dtype=int)
V = np.eye(2, dtype=int)
n = 2
output = multi_head_attention(Q, K, V, n)

In [156]:
# Display the multi-head attention result computed in the previous cell.
print(output)

[[1. 0.]
 [1. 1.]]
