In [1]:
import numpy as np



# Self-attention in depth

In [2]:
# An attention block is built from three ingredients: query, key and value.
# For the intuition behind query, key and value, see Andrew Ng's transformer video.
# Here we go through each of them in more depth.
#
# Example sentence: 'what is self attention' (4 words).
# Take one particular word, say "attention". It can be represented as an
# embedding vector of any dimension; here we use 24.

# Seed the global NumPy RNG so that every random draw in this notebook (and
# therefore every printed output) is reproducible on Restart & Run All.
np.random.seed(0)

d_k = 24  # embedding dimension of each word
seq_len = 4  # the example sentence has 4 words
x3 = np.random.randn(1, d_k)  # embedding of the word "attention", shape (1, d_k)

In [3]:
# Attention calculation for x3
#
# We focus on a single word, x3, and compute its attention output step by step.
# The first thing we need is the query vector of x3.

head_dim = 3  # dimension of the query, key and value vectors
wquery_3 = np.random.randn(d_k, head_dim)
# One key and one value per word of the sentence, i.e. seq_len rows
# (len(['what', 'is', 'self', 'attention']) == 4). They are drawn at random here
# instead of being projected from embeddings, to keep this first walkthrough short.
key = np.random.randn(seq_len, head_dim)
value = np.random.randn(seq_len, head_dim)
query_3 = np.dot(x3, wquery_3)  # query vector derived from x3, shape (1, head_dim)


# To get the raw attention score of x3 we take the dot product of query_3
# with the key of every word in the sentence.

score = []

# Intuition: the query is a "question" asked by x3 and each key is the "answer"
# offered by a word. The dot product (question . answer) measures how much
# relevant information that word holds for x3.
# The values can loosely be thought of as the reward in Reinforcement Learning.

# To make this more intuitive, we call query_3 the question.
question = query_3
for idx in range(seq_len):
    answer = key[idx, :]
    information_contains = np.dot(question, answer.T)
    # Scale by the square root of the query/key dimension (head_dim), which is
    # the length of the vectors being dotted -- not the embedding size d_k.
    scaled_information_contains = information_contains / np.sqrt(head_dim)
    score.append(scaled_information_contains)

score = np.array(score)  # one row per word
print(score.shape)


(4, 1)


In [4]:
# score has been divided by a sqrt factor, but it is not yet normalized into probabilities
print(score)

[[-0.06819091]
 [-0.64732525]
 [-0.71929792]
 [ 0.14924954]]


In [5]:
# Normalize the scores with a softmax taken down the word axis (axis 0),
# which turns them into positive weights that sum to 1.
exp_score = np.exp(score)
prob = exp_score / exp_score.sum(axis=0, keepdims=True)

In [6]:
# prob has one row per word: row i is the attention weight x3 puts on word i
print(prob)

[[0.30077514]
 [0.16854935]
 [0.15684467]
 [0.37383084]]


In [7]:
# prob[1] is the attention weight that x3 (the query word) gives to the word
# at index 1 (0-based), i.e. the second word of the sentence
print(prob[1].item())

0.16854935294368745


In [9]:
# Final step:
# combine the values ("rewards") of all words, each weighted by its attention probability.
A3 = np.zeros((1, head_dim))
for weight, reward in zip(prob, value):
    A3 = A3 + weight * reward

print(A3.shape)
print(A3)

# A3 is the single-head attention output for the word x3.

(1, 3)
[[ 0.06351373 -0.04478712  0.46063753]]


In [13]:
# So far everything was done only for x3, i.e. the word "attention".
# Let's now do it for all the words of the sentence.

x = np.random.randn(seq_len, d_k) # one embedding of size d_k per word, seq_len words in total
# NOTE(review): w_query and w_key are drawn uniformly from [0, 1) (np.random.rand)
# while w_value is standard normal (np.random.randn) -- confirm this mix is intended.
w_query = np.random.rand(d_k, head_dim)
w_key = np.random.rand(d_k, head_dim)
w_value = np.random.randn(d_k, head_dim)

In [14]:
# First, a deliberately inefficient (explicit-loop) implementation of self-attention.

A = [None for _ in range(seq_len)]  # A[i] will hold the attention output of word i


key = np.dot(x, w_key)      # one key per word, shape (seq_len, head_dim)
value = np.dot(x, w_value)  # one value per word, shape (seq_len, head_dim)

for idx in range(seq_len):
    # The query depends only on word idx, so it is computed once per word
    # rather than once per (idx, j) pair.
    query = np.dot(x[idx, :].reshape(1, -1), w_query)  # shape (1, head_dim)

    score = []
    for j in range(seq_len):
        # Word idx must be scored against the key of EVERY word j. Indexing the
        # key with idx here would give identical scores and a uniform softmax.
        answer = key[j, :]
        # Scaled dot product: divide by sqrt of the query/key dimension.
        information_contains = np.dot(query, answer) / np.sqrt(head_dim)
        score.append(information_contains)
    score = np.array(score)  # shape (seq_len, 1)
    # Softmax over the words (axis 0) -> attention weights of word idx.
    prob = np.exp(score) / np.sum(np.exp(score), axis=0, keepdims=True)


    # Final step:
    # combine the values ("rewards") of all words, weighted by the attention probabilities.
    attention = np.zeros((1, head_dim))
    for k in range(seq_len):
        reward = value[k, :]
        attention += prob[k] * reward
    A[idx] = attention

A = np.array(A).reshape(seq_len, -1)
print(A.shape)

# A now holds the attention representation of every word in the sentence, one row per word.
    

(4, 3)


In [16]:
# Now let's do all of the above in a vectorized implementation.
x = np.random.rand(seq_len, d_k)  # one embedding per word

w_query = np.random.rand(d_k, head_dim)
w_key = np.random.rand(d_k, head_dim)
w_value = np.random.randn(d_k, head_dim)


query = np.dot(x, w_query)  # (seq_len, head_dim)
key = np.dot(x, w_key)      # (seq_len, head_dim)
value = np.dot(x, w_value)  # (seq_len, head_dim)

# Instead of for loops, use matrix products for every remaining step.
# score[i, j] = (query of word i) . (key of word j), scaled by sqrt of the
# query/key dimension.
score = np.dot(query, key.T) / np.sqrt(head_dim)

# Softmax along axis 1: row i holds the weights word i assigns to all words,
# so each ROW must sum to 1. Subtracting the row maximum first leaves the
# result unchanged mathematically but prevents overflow in exp.
exp_score = np.exp(score - score.max(axis=1, keepdims=True))
probs = exp_score / exp_score.sum(axis=1, keepdims=True)

A = np.dot(probs, value)  # (seq_len, head_dim): attention output of every word
print(A.shape)

(4, 3)


# Multi-head attention

In [18]:
# Multi-head attention repeats the single-head computation several times,
# each time with its own independent set of projection weights.
#
# Say we want 8 heads instead of 1.


multi_heads = []
num_heads = 8    # this number is taken from the paper
x = np.random.rand(seq_len, d_k)  # one embedding per word
head_dim = d_k // num_heads  # each head works in a smaller subspace of the embedding



for head in range(num_heads):

    # Every head gets its own query/key/value projection matrices.
    w_query = np.random.rand(d_k, head_dim)
    w_key = np.random.rand(d_k, head_dim)
    w_value = np.random.randn(d_k, head_dim)


    query = np.dot(x, w_query)
    key = np.dot(x, w_key)
    value = np.dot(x, w_value)

    # score[i, j] = (query of word i) . (key of word j), scaled by sqrt of the
    # query/key dimension.
    score = np.dot(query, key.T) / np.sqrt(head_dim)
    # Softmax along axis 1 so that the weights each word assigns (one row)
    # sum to 1; the row maximum is subtracted for numerical stability.
    exp_score = np.exp(score - score.max(axis=1, keepdims=True))
    probs = exp_score / exp_score.sum(axis=1, keepdims=True)

    A = np.dot(probs, value)  # (seq_len, head_dim) output of this head
    multi_heads.append(A)

multi_heads = np.array(multi_heads)  # stacked as (num_heads, seq_len, head_dim)

# Instead of one head we now have 8 heads, giving a much richer representation.
print(multi_heads.shape)


(8, 4, 3)
