In [2]:
import numpy as np



#Self Attention In Depth 

In [3]:
# In attention block we have three key factors.
# query, key and value 
# to know what does query, key and value pls check Andrew Ng's transformer video
# Now lets go through all the above in more depth
# Example sentence 'what is self attention' this is having 4 words
# lets take a particular word say "attention"
# now "attention" can be represented as embedding vector  of any dimension, here we
# are taking it as 20

x3 = np.random.randn(1, 20)

In [4]:
# attention calculation for x3

# We will only focus on x3
# for x3 we need the query vector x3 
wquery_3 = np.random.randn(20, 20)
key = np.random.randn(4, 20) # 4 is the vocab size here i.e. len(['what', 'is', 'self', 'attention'])
value = np.random.randn(4, 20)
query_3 = np.dot(x3, wquery_3)  # query vector derived from x3 (1, 20)


# In order to get the raw attention vale for x3 we need to 
# dot product of query3 with all the keys from other words

score = []

# we will collect the keys from each word to know the information it has and multiply it
# with question we have 
# key roughly speaking is like answer so dot product of (question and answer) give you 
# amount information the previous or next word contains
# values can be treated as reward of Reinfocement Learning

# just to make it more intutive we name query_3 as question
question = query_3
for idx in range(4):
    answer = key[idx, :]
    inforamtion_contains = np.dot(question, answer.T)
    score.append(inforamtion_contains)

score = np.array(score)
print(score.shape)


(4, 1)


In [5]:
# score is not scaled 
print(score)

[[-4.8333801 ]
 [15.04554084]
 [-2.4846493 ]
 [15.6106447 ]]


In [6]:
# In order to scale the score we will use softmax function
prob = np.exp(score) / np.sum(np.exp(score), axis=1, keepdims=True)

In [7]:
print(prob)

[[1.]
 [1.]
 [1.]
 [1.]]


In [8]:
# prob[1] represents how much word3 should give attention to word2
print(prob[1].item())

1.0


In [9]:
# final step
# here we will aggregate the probs with reward
A3 = np.zeros((1, 20))
for idx in range(4):
    reward = value[idx, :]
    A3 += prob[idx] * reward

print(A3.shape)
print(A3)

# This is A3 is the single head attention value for the word "X3"

(1, 20)
[[-3.0517202  -1.55142381  0.25731669  3.14041936  3.64158751  1.95962641
   2.45629022 -1.18237319 -0.85627119  0.5164287  -1.58407735 -0.03085708
   0.78139896 -2.83374374 -0.66413263 -2.08166427  3.41359276 -1.59055425
  -4.02994175  1.19368728]]


In [10]:
# Now the above is only done for x3 which is attention 
# lets do it for all others

x = np.random.randn(4, 20) # as 4 is the vocab size and 20 is embedding size
w_query = np.random.rand(20, 20)
w_key = np.random.rand(20, 20)
w_value = np.random.randn(20, 20)

In [11]:
# first we will create an inefficient implementation of self attention

A = [None for _ in range(4)] # a list which will contain all attentions


key = np.dot(x, w_key) # 4 is the vocab size here i.e. len(['what', 'is', 'self', 'attention'])
value = np.dot(x, w_value)

vocab_size = 4 
for idx in range(vocab_size):
    score = []
    for j in range(vocab_size):
        query = np.dot(x[idx, :].reshape(1, -1), w_query)
        answer = key[idx, :]
        inforamtion_contains = np.dot(query, answer)
        score.append(inforamtion_contains)
    score = np.array(score)
    prob = np.exp(score) / np.sum(np.exp(score), axis=1, keepdims=True)


    # final step
    # here we will aggregate the probs with reward
    attention = np.zeros((1, 20))
    for k in range(vocab_size):
        reward = value[k, :]
        attention += prob[k] * reward
    A[idx] = attention

A = np.array(A).reshape(vocab_size, -1)
print(A.shape)

# Here A is containing all attention representaions of all the words in vocab
    

(4, 20)


In [12]:
# Now lets do all the above in a vectorized implementation
x = np.random.rand(4, 20)

w_query = np.random.rand(20, 20)
w_key = np.random.rand(20, 20)
w_value = np.random.randn(20, 20)


query = np.dot(x, w_query)
key = np.dot(x, w_key)
value = np.dot(x, w_value)

# Now instead of for loop, we will use np.dot for every next operations
score = np.dot(query, key.T) 
probs = np.exp(score) / np.sum(np.exp(score), axis=1, keepdims=True)

A = np.dot(probs, value)
print(A.shape)

(4, 20)


In [16]:
# Now for multi head attention 
# we just need to create multiple heads like this

# lets say we want to create 8 heads inplace of 1 head 
# now we will add two more variables 
# 1. embeddig dimendion
# 2. head dimension which is basically the dim of query, key, values


multi_heads= []
num_heads = 8    #this number is taken from the paper
embed_dim = 24
seq_len = 4      #as we have 4 words in the sentence
x = np.random.rand(seq_len, embed_dim)
head_dim = embed_dim // num_heads



for head in range(num_heads):
    
    w_query = np.random.rand(embed_dim, head_dim)
    w_key = np.random.rand(embed_dim, head_dim)
    w_value = np.random.randn(embed_dim, head_dim)


    query = np.dot(x, w_query)
    key = np.dot(x, w_key)
    value = np.dot(x, w_value)

    score = np.dot(query, key.T) 
    probs = np.exp(score) / np.sum(np.exp(score), axis=1, keepdims=True)

    A = np.dot(probs, value)
    multi_heads.append(A)

multi_heads = np.array(multi_heads)

# Now we can see, instead of one head we have 8 heads with much rich representation
print(multi_heads.shape)


(8, 4, 3)
