# Single Head Attention Mechanisms

In [1]:
import numpy as np
import math

In [2]:
# Toy dimensions for a single attention head:
#   L   - length of the input sequence
#   d_k - size of each query/key vector
#   d_v - size of each value vector
L, d_k, d_v = 4, 8, 8

# Query matrix (one row per position): "what am I looking for?"
q = np.random.randn(L, d_k)

# Key matrix (one row per position): "what can I offer?"
k = np.random.randn(L, d_k)

# Value matrix (one row per position): "what do I actually offer?"
# Its width is d_v, which is allowed to differ from d_k.
v = np.random.randn(L, d_v)

In [3]:
# Dump the three random input matrices, each preceded by a blank line and its label.
for heading, matrix in (("\nQ: ", q), ("\nK: ", k), ("\nV: ", v)):
    print(heading, matrix)


Q:  [[ 0.17860127 -0.12679924 -0.70385102 -0.1321929   1.66246326 -0.13830833
  -0.91713395  0.38146683]
 [-0.6867237  -0.36840547  0.34886887  0.60999383  0.49675138  0.05169754
   1.73291754 -1.44874149]
 [-0.5206792   0.88486143  1.55438362  0.45791464  0.76019937  0.63370558
  -0.50140007  1.23242755]
 [-0.5230864  -1.14149062 -0.7578606  -0.59142653  0.76552817 -0.81982243
  -0.09392885 -0.11942583]]

K:  [[-0.25348382  0.65407831 -1.01298103 -1.2455882   0.81397054 -0.20435126
   0.78180332  1.37437123]
 [-0.01597132 -1.51435872 -0.28574857  1.09904496  0.84577145 -0.34560973
  -0.67450277  0.38056754]
 [ 1.1796839   0.55844447 -0.49982861 -0.43669645  0.95549426 -0.8175062
   1.24984591 -1.24680455]
 [-0.43477699 -0.20702378 -0.86591103 -0.02795983  1.36668061  0.07201489
   1.6862868  -1.11639335]]

V:  [[ 0.04876692  1.62111108 -1.54808634 -0.31269417 -0.33835223  0.9820561
   0.37012513 -0.19601234]
 [-1.93868023  1.88238666  1.34964456 -0.13590576 -0.02591865  0.13059406
  

# Self Attention

In [4]:
# Raw (unscaled) scores: entry (i, j) is the dot product of query row i with key row j.
q @ k.T

array([[ 1.93815475,  2.46265347,  0.62906707,  0.85144531],
       [-1.42262276, -0.17834291,  2.94794973,  5.27788042],
       [ 0.3569163 , -0.04141805, -3.05194722, -2.45236144],
       [ 1.44342181,  2.25224006,  0.81571032,  2.09864599]])

In [5]:
# Variance of the queries, of the keys, and of their raw dot products.
(q.var(), k.var(), (q @ k.T).var())

(0.6515184420509901, 0.7805136644846842, 4.068995490985916)

In [6]:
# Dividing the scores by sqrt(d_k) shrinks their variance by a factor of d_k.
scaled = (q @ k.T) / math.sqrt(d_k)
(q.var(), k.var(), scaled.var())

(0.6515184420509901, 0.7805136644846842, 0.5086244363732394)

# Masking

Masking is required specifically in the decoder part of the transformer, so that we don't look at future words when generating the context for the current word.

In [7]:
# L x L lower-triangular matrix of ones: row i has ones in columns 0..i and zeros after.
mask = np.tri(L)
mask

array([[1., 0., 0., 0.],
       [1., 1., 0., 0.],
       [1., 1., 1., 0.],
       [1., 1., 1., 1.]])

In [8]:
# Turn the 0/1 lower-triangular pattern into an additive mask:
# 0 where attention is allowed (on/below the diagonal), -inf where it is blocked.
# The mask is rebuilt from L rather than rewritten in place, so re-running this
# cell always gives the same result (the in-place version turned every entry
# into -inf on a second run). np.inf is used because the np.infty alias was
# removed in NumPy 2.0.
mask = np.where(np.tril(np.ones((L, L))) == 1, 0.0, -np.inf)

In [9]:
# Display the additive mask: 0 where attention is allowed, -inf where it is blocked.
mask

array([[  0., -inf, -inf, -inf],
       [  0.,   0., -inf, -inf],
       [  0.,   0.,   0., -inf],
       [  0.,   0.,   0.,   0.]])

In [10]:
# Display the scaled scores before any mask is added.
scaled

array([[ 0.68524118,  0.87067948,  0.2224088 ,  0.30103138],
       [-0.5029731 , -0.06305374,  1.04225762,  1.86601252],
       [ 0.12618897, -0.01464349, -1.07902629, -0.8670407 ],
       [ 0.51032667,  0.79628711,  0.28839715,  0.74198341]])

In [11]:
# Adding the mask leaves allowed scores unchanged (+0) and pushes blocked ones to -inf.
np.add(scaled, mask)

array([[ 0.68524118,        -inf,        -inf,        -inf],
       [-0.5029731 , -0.06305374,        -inf,        -inf],
       [ 0.12618897, -0.01464349, -1.07902629,        -inf],
       [ 0.51032667,  0.79628711,  0.28839715,  0.74198341]])

# Softmax

Softmax is the operation used to convert a vector of scores into a probability distribution: $\mathrm{softmax}(x)_i = e^{x_i} / \sum_j e^{x_j}$.

In [12]:
def softmax(x):
    """Turn scores into a probability distribution along the last axis.

    Args:
        x: Array-like of scores with at least one dimension. Entries equal to
            -inf receive probability 0.

    Returns:
        An array with the same shape as ``x`` whose entries along the last
        axis are non-negative and sum to 1. A slice that is entirely -inf
        yields NaN, because there is nothing left to normalise.
    """
    x = np.asarray(x)
    if x.size == 0:
        # Nothing to normalise; np.exp just returns an empty float array of the same shape.
        return np.exp(x)
    # Subtracting the per-row maximum does not change the result mathematically
    # but keeps np.exp from overflowing to inf (and inf/inf -> NaN) on large scores.
    shifted = x - np.max(x, axis=-1, keepdims=True)
    exps = np.exp(shifted)
    # keepdims lets the division broadcast for any number of leading dimensions.
    return exps / np.sum(exps, axis=-1, keepdims=True)

def scaled_dot_product_attention(q, k, v, mask=None):
    """Compute scaled dot-product attention for one head.

    Args:
        q: Query array; the size of its last axis is used as the scaling dimension.
        k: Key array; it is transposed with ``.T`` before being multiplied by ``q``.
        v: Value array that the attention weights are applied to.
        mask: Optional array added to the scaled scores before the softmax
            (e.g. 0 for allowed positions and -inf for blocked ones).

    Returns:
        A ``(out, attention)`` pair: the attention-weighted values and the
        attention weights themselves.
    """
    key_dim = q.shape[-1]
    scores = (q @ k.T) / math.sqrt(key_dim)
    # The mask is purely additive; skipping it leaves the scores untouched.
    scores = scores if mask is None else scores + mask
    weights = softmax(scores)
    return weights @ v, weights

In [13]:
# Run masked attention, then print the inputs next to the results.
values, attention = scaled_dot_product_attention(q, k, v, mask=mask)
for heading, matrix in (
    ("Q:\n", q),
    ("K:\n", k),
    ("V:\n", v),
    ("Values:\n", values),
    ("Attention:\n", attention),
):
    print(heading, matrix)


Q:
 [[ 0.17860127 -0.12679924 -0.70385102 -0.1321929   1.66246326 -0.13830833
  -0.91713395  0.38146683]
 [-0.6867237  -0.36840547  0.34886887  0.60999383  0.49675138  0.05169754
   1.73291754 -1.44874149]
 [-0.5206792   0.88486143  1.55438362  0.45791464  0.76019937  0.63370558
  -0.50140007  1.23242755]
 [-0.5230864  -1.14149062 -0.7578606  -0.59142653  0.76552817 -0.81982243
  -0.09392885 -0.11942583]]
K:
 [[-0.25348382  0.65407831 -1.01298103 -1.2455882   0.81397054 -0.20435126
   0.78180332  1.37437123]
 [-0.01597132 -1.51435872 -0.28574857  1.09904496  0.84577145 -0.34560973
  -0.67450277  0.38056754]
 [ 1.1796839   0.55844447 -0.49982861 -0.43669645  0.95549426 -0.8175062
   1.24984591 -1.24680455]
 [-0.43477699 -0.20702378 -0.86591103 -0.02795983  1.36668061  0.07201489
   1.6862868  -1.11639335]]
V:
 [[ 0.04876692  1.62111108 -1.54808634 -0.31269417 -0.33835223  0.9820561
   0.37012513 -0.19601234]
 [-1.93868023  1.88238666  1.34964456 -0.13590576 -0.02591865  0.13059406
   0.

In [14]:
# Display the attention weights returned by scaled_dot_product_attention.
attention

array([[1.        , 0.        , 0.        , 0.        ],
       [0.39176018, 0.60823982, 0.        , 0.        ],
       [0.46119881, 0.40061335, 0.13818784, 0.        ],
       [0.2276505 , 0.30301187, 0.18234163, 0.286996  ]])

In [15]:
# Multiply the attention matrix by the value matrix: each output row is a
# weighted combination of the rows of v.
new_v = attention @ v
new_v

array([[ 0.04876692,  1.62111108, -1.54808634, -0.31269417, -0.33835223,
         0.9820561 ,  0.37012513, -0.19601234],
       [-1.16007757,  1.78002929,  0.21442897, -0.20516442, -0.14831768,
         0.46416299,  0.42365114, -0.05150958],
       [-0.81492943,  1.30753267,  0.02450691, -0.33891924, -0.19493184,
         0.11431611,  0.68322802,  0.03479829],
       [-0.42394254,  0.51095907,  0.19910752, -0.33040761, -0.09153464,
         0.11046663,  0.21557015, -0.13514845]])

In [16]:
# Display the original value matrix for comparison with new_v.
v

array([[ 0.04876692,  1.62111108, -1.54808634, -0.31269417, -0.33835223,
         0.9820561 ,  0.37012513, -0.19601234],
       [-1.93868023,  1.88238666,  1.34964456, -0.13590576, -0.02591865,
         0.13059406,  0.45812663,  0.04156297],
       [-0.43968773, -1.40555815,  1.43136222, -1.01499087, -0.20624707,
        -2.82893707,  2.38078187,  0.78551241],
       [ 0.81036722, -0.59994359, -0.41263675, -0.11486738,  0.10784994,
         1.26539226, -1.5387757 , -0.85838071]])