In [2]:
# Importing Libraries
import numpy as np
import math

In [3]:
L, d_k, d_v = 4, 8, 8

q = np.random.randn(L, d_k)
k = np.random.randn(L, d_k)
v = np.random.randn(L, d_v)

print("Q\n", q)
print("K\n", k)
print("V\n", v)

Q
 [[-0.27766923  0.30586403  0.01057463  0.36409474 -0.41489623  1.15277481
   0.63715732 -1.05318505]
 [-0.6823625   0.72444694 -1.4419607  -0.50095249 -0.04408113  0.68822858
   1.05971024 -0.10210544]
 [-0.74758765  0.58988275  0.2740787   0.33826729  1.34593399 -0.1037819
   0.87886222 -0.59395793]
 [ 0.45122425 -0.41196376  1.01399581  0.53620411 -0.80208738  1.57122979
  -0.97703709 -1.46250139]]
K
 [[-0.4708874   1.64177464 -1.38780232  1.71701544  1.01333479  0.62804628
  -0.54720871  0.94657378]
 [ 0.27079892  0.3888408   1.06496911  0.07948627 -1.4330064   0.13770432
   0.31164302 -0.14493827]
 [ 1.11200785  1.39342085  0.45975428 -1.09075613  0.65111229 -0.76908524
   0.17459603  1.11260402]
 [ 1.95817072  0.52294909  0.84582026 -0.61544628 -0.35686193 -0.79044713
  -0.46193641 -1.23709805]]
V
 [[ 0.89056773 -0.87976587 -0.44480487  0.96568199  1.63254008  1.2195493
   0.71989661  0.77845321]
 [-0.90588885 -0.07982795 -1.01866175  1.84510543  0.13146088 -1.32809874
  -0.400

### Self Attention
$$
\text{self attention} = \text{softmax} \left( \frac{Q K^T}{\sqrt{d_k}} + M \right)
$$

$$
\text{new } V = \text{self attention} \cdot V
$$

In [4]:
np.matmul(q, k.T)

array([[ 0.20138332,  1.18844554, -2.49210879, -0.35348884],
       [ 2.36274522, -0.97556009, -0.35245351, -2.76014537],
       [ 1.77648308, -1.23734722,  0.19645048, -1.20126238],
       [-2.05108275,  2.3577449 , -3.71938331,  2.5006384 ]])

In [5]:
q.var(), k.var(), np.matmul(q, k.T).var()

(0.6255806827997046, 0.8418219704856419, 3.549242551815472)

##### Without the scaling factor, the self-attention mechanism would struggle to learn effectively because the gradients would either vanish or explode. By normalizing the dot products, the model can maintain a consistent gradient flow, making the training process more efficient and stable.

##### The softmax function is sensitive to the scale of its input values. If the variance of the dot product is too high, the softmax function can become too steep, which means it will produce very small gradients for most inputs.

In [6]:
scaled = np.matmul(q, k.T) / np.sqrt(d_k)
q.var(), k.var(), scaled.var()

(0.6255806827997046, 0.8418219704856419, 0.443655318976934)

### Masking

##### To prevent getting context from the future words

In [9]:
mask = np.tril(np.ones((L,L)))
mask

array([[1., 0., 0., 0.],
       [1., 1., 0., 0.],
       [1., 1., 1., 0.],
       [1., 1., 1., 1.]])

In [10]:
mask[mask == 0] = -np.Infinity
mask[mask == 1] = 0
mask

array([[  0., -inf, -inf, -inf],
       [  0.,   0., -inf, -inf],
       [  0.,   0.,   0., -inf],
       [  0.,   0.,   0.,   0.]])

In [11]:
scaled + mask

array([[ 0.07119975,        -inf,        -inf,        -inf],
       [ 0.83535658, -0.34491258,        -inf,        -inf],
       [ 0.62808162, -0.4374683 ,  0.06945573,        -inf],
       [-0.72516726,  0.8335887 , -1.31500058,  0.88410919]])

### Applying softmax

In [13]:
def softmax(x):
    return (np.exp(x).T / np.sum(np.exp(x), axis=-1)).T

attention = softmax(scaled + mask)
attention

array([[1.        , 0.        , 0.        , 0.        ],
       [0.7649962 , 0.2350038 , 0.        , 0.        ],
       [0.52177556, 0.17977168, 0.29845276, 0.        ],
       [0.08844457, 0.4203686 , 0.04903541, 0.44215143]])

In [14]:
new_v = np.matmul(attention, v)
new_v

array([[ 0.89056773, -0.87976587, -0.44480487,  0.96568199,  1.63254008,
         1.2195493 ,  0.71989661,  0.77845321],
       [ 0.4683936 , -0.69177741, -0.57966342,  1.17234985,  1.27978076,
         0.62084232,  0.45667637,  0.84881258],
       [ 0.41588814, -1.27355992, -0.10225977,  1.0447626 ,  0.50216696,
         0.42574844, -0.31473101,  0.23590865],
       [ 0.24209591,  0.22858748, -0.94258661,  1.65344097,  0.02765179,
        -0.01720608,  0.1636889 , -0.10017723]])

In [15]:
v

array([[ 0.89056773, -0.87976587, -0.44480487,  0.96568199,  1.63254008,
         1.2195493 ,  0.71989661,  0.77845321],
       [-0.90588885, -0.07982795, -1.01866175,  1.84510543,  0.13146088,
        -1.32809874, -0.40017139,  1.07784997],
       [ 0.38218721, -2.68105677,  1.04859167,  0.70093381, -1.25073563,
         0.09439338, -2.07207328, -1.2197445 ],
       [ 1.18827287,  1.06620003, -1.19065712,  1.71442776, -0.25029707,
         0.96933714,  0.83646095, -1.27176068]])

### Function

In [16]:
def softmax(x):
    return (np.exp(x).T / np.sum(np.exp(x), axis=-1)).T

def scaled_dot_product_attention(q, k, v, mask = None):
    d_k = q.shape[-1]
    scaled = np.matmul(q, k.T) / np.sqrt(d_k)
    if mask is not None:
        scaled = scaled + mask
    attention = softmax(scaled)
    out = np.matmul(attention, v)
    return out, attention

In [17]:
scaled_dot_product_attention(q, k, v, mask)

(array([[ 0.89056773, -0.87976587, -0.44480487,  0.96568199,  1.63254008,
          1.2195493 ,  0.71989661,  0.77845321],
        [ 0.4683936 , -0.69177741, -0.57966342,  1.17234985,  1.27978076,
          0.62084232,  0.45667637,  0.84881258],
        [ 0.41588814, -1.27355992, -0.10225977,  1.0447626 ,  0.50216696,
          0.42574844, -0.31473101,  0.23590865],
        [ 0.24209591,  0.22858748, -0.94258661,  1.65344097,  0.02765179,
         -0.01720608,  0.1636889 , -0.10017723]]),
 array([[1.        , 0.        , 0.        , 0.        ],
        [0.7649962 , 0.2350038 , 0.        , 0.        ],
        [0.52177556, 0.17977168, 0.29845276, 0.        ],
        [0.08844457, 0.4203686 , 0.04903541, 0.44215143]]))