In [2]:
import numpy as np
import math

In [3]:
# Seed NumPy's global generator so that Restart & Run All regenerates the
# same q, k and v every time (the draws below were previously unseeded).
np.random.seed(0)

# Each matrix has L rows; q and k have d_k columns, v has d_v columns.
L, d_k, d_v = 4, 8, 8
q = np.random.randn(L, d_k)
k = np.random.randn(L, d_k)
v = np.random.randn(L, d_v)

Q: What am I looking for?<br>
K: What can I offer?<br>
V: What do I actually offer?

In [4]:
# Print each matrix underneath its one-letter heading.
for heading, matrix in (("Q\n", q), ("K\n", k), ("V\n", v)):
  print(heading, matrix)

Q
 [[-0.12910698  0.73864953  0.33297499 -0.75307611 -1.13002255 -0.68198012
   1.31837293 -0.79915987]
 [-1.03591611 -1.67535719 -0.82257477 -0.78754226 -0.02720394 -0.68176828
   0.56084084  0.73459063]
 [ 0.17686493  0.40980959  0.20704985  1.53571987  0.92087811  0.1201056
  -0.25029803  0.22677105]
 [-0.34363031  0.18486712  1.41522444 -0.91354706  1.40524207 -1.30139658
   1.03811423 -0.20485255]]
K
 [[-0.96095415  0.33023474 -0.29941719  1.12783158  1.33238693  2.09740203
   1.1006317   0.85645483]
 [ 0.19013953 -0.05582271  0.33228049  0.23907953  1.49909388  0.72421307
   0.21041058  0.23239837]
 [-1.4991445   1.65444269  0.00455148 -0.51335059 -0.87699558  0.01436565
   0.15952534 -0.69890054]
 [-0.31638517 -1.16131167 -0.41439652 -1.18110866 -0.78148025  1.88167268
   1.56191176 -0.55287531]]
V
 [[ 0.10291552 -0.51908749  1.98221311  0.92311307 -1.10624456  0.34501014
   0.87294412 -0.23038233]
 [-0.28364959 -0.71470854 -0.053116    1.12903809 -1.48177544  0.53803537
   0.84

## Self Attention

$$ \text{self attention} = \text{softmax}\bigg(\frac{Q \cdot K^T}{\sqrt{d_k}} + M\bigg) $$

$$ \text{new V} = \text{self attention} \cdot V $$


In [7]:
# Raw (unscaled) scores: the dot product of every row of q with every row of k.
q @ k.T

array([[-2.75046287, -2.2314184 ,  3.55378578,  2.03537112],
       [-0.41948071, -0.81085865, -1.22812695,  2.75264693],
       [ 3.03302225,  1.9142108 , -1.37885985, -3.44149504],
       [-0.95289843,  1.51110776,  0.35410544, -1.42570512]])

In [9]:
# Why divide by sqrt(d_k)? Compare the variance of q and of k with the
# variance of the raw scores q @ k.T.
np.var(q), np.var(k), np.var(q @ k.T)

(0.7252784236163596, 0.9005697391102687, 4.448544014186415)

In [10]:
# Raw scores divided by sqrt(d_k).
scaled = np.divide(q @ k.T, np.sqrt(d_k))

In [11]:
scaled

array([[-0.97243547, -0.78892554,  1.25645301,  0.71961236],
       [-0.14830883, -0.28668182, -0.43420845,  0.97320766],
       [ 1.0723353 ,  0.67677572, -0.48750057, -1.21675224],
       [-0.33690047,  0.53425727,  0.12519518, -0.50406288]])

What is M?

## Masking

- This is to ensure words don't get context from words generated in the future
- Not required in the encoders, but required in the decoders

In [14]:
# L x L matrix with ones on and below the main diagonal, zeros above it.
mask = np.tri(L)

In [16]:
# Build the additive mask from scratch rather than rewriting `mask` in place:
# the in-place version is not idempotent, because on a second run the freshly
# written zeros match `mask == 0` and every entry becomes -inf.
# `np.inf` is used because the `np.infty` alias was removed in NumPy 2.0.
# Result: 0 on and below the diagonal, -inf above it.
mask = np.where(np.tril(np.ones((L, L))) == 1, 0.0, -np.inf)

In [17]:
mask

array([[  0., -inf, -inf, -inf],
       [  0.,   0., -inf, -inf],
       [  0.,   0.,   0., -inf],
       [  0.,   0.,   0.,   0.]])

In [18]:
# Add the mask to the scaled scores; positions holding -inf in the mask stay -inf.
np.add(scaled, mask)

array([[-0.97243547,        -inf,        -inf,        -inf],
       [-0.14830883, -0.28668182,        -inf,        -inf],
       [ 1.0723353 ,  0.67677572, -0.48750057,        -inf],
       [-0.33690047,  0.53425727,  0.12519518, -0.50406288]])

## Softmax

$$ \text{softmax}(x)_i = \frac{e^{x_i}}{\sum_j e^{x_j}} $$


In [20]:
def softmax(x):
  """Return the softmax of `x` taken along its last axis.

  Args:
    x: array-like of scores. Entries equal to -inf receive weight 0.

  Returns:
    ndarray with the same shape as `x`; values along the last axis are
    non-negative and sum to 1.
  """
  x = np.asarray(x)
  if x.size == 0:
    # Nothing to normalise; also avoids taking the max of an empty axis.
    return np.exp(x)
  # Subtract the maximum of each row before exponentiating. The result is
  # mathematically unchanged, but large scores no longer overflow to inf
  # (which would produce inf / inf = nan).
  shifted = x - np.max(x, axis=-1, keepdims=True)
  exps = np.exp(shifted)
  # keepdims makes the division broadcast for any rank, not only 1-D / 2-D.
  return exps / np.sum(exps, axis=-1, keepdims=True)

In [21]:
# Normalise the masked scores into weights with softmax.
attention = softmax(mask + scaled)

In [23]:
# Each row of new_v is the attention-weighted combination of the rows of v.
new_v = attention @ v

In [24]:
new_v

array([[ 0.10291552, -0.51908749,  1.98221311,  0.92311307, -1.10624456,
         0.34501014,  0.87294412, -0.23038233],
       [-0.07701579, -0.61014162,  1.03484507,  1.01896331, -1.28103986,
         0.43485602,  0.8578006 ,  0.02223168],
       [-0.02183852, -0.55533426,  1.12530082,  1.06730469, -1.3759062 ,
         0.34278144,  0.7100872 , -0.05154183],
       [ 0.09025698, -0.47270692,  0.44615161,  1.17404607, -1.48666858,
         0.2262938 ,  0.23330115,  0.22085707]])

In [25]:
v

array([[ 0.10291552, -0.51908749,  1.98221311,  0.92311307, -1.10624456,
         0.34501014,  0.87294412, -0.23038233],
       [-0.28364959, -0.71470854, -0.053116  ,  1.12903809, -1.48177544,
         0.53803537,  0.84040972,  0.31233458],
       [ 0.2233156 , -0.2172259 ,  0.82325802,  1.55560483, -2.31980406,
        -0.29333901, -0.48229426, -0.36632722],
       [ 0.88173558, -0.21369979, -0.66675872,  0.88187155, -0.38698604,
         0.18041773, -0.89486837,  1.59750819]])

In [26]:
def scaled_dot_product_attention(q,k,v,mask=None):
  """Compute softmax(q @ k^T / sqrt(d_k) + mask) @ v.

  Args:
    q: ndarray of queries; its last axis has length d_k.
    k: ndarray of keys with the same last-axis length as `q`.
    v: ndarray of values, one row per key.
    mask: optional array added to the scaled scores before the softmax
      (for example 0 for allowed positions and -inf for blocked ones).
      When None, no mask is applied.

  Returns:
    A tuple `(out, attention)`: `attention` is the softmax of the (masked)
    scaled scores and `out` is `attention @ v`.

  Inputs with leading batch dimensions are accepted here; they additionally
  require the module's `softmax` to normalise along the last axis for arrays
  of that rank.
  """
  d_k = q.shape[-1]
  # For batched keys swap only the last two axes; `k.T` would reverse every
  # axis and mix the batch dimension into the product. 1-D / 2-D keys keep
  # the plain transpose.
  k_t = np.swapaxes(k, -1, -2) if k.ndim > 2 else k.T
  scaled = np.matmul(q, k_t) / math.sqrt(d_k)
  if mask is not None:
    scaled = scaled + mask
  attention = softmax(scaled)
  out = np.matmul(attention, v)
  return out, attention