In [1]:
# Transformer-Self-Attention-Implementation

In [2]:
# Generating Data

import numpy as np
import math

# Toy problem size: sequence length L, query/key width d_k, value width d_v.
L, d_k, d_v = 4, 8, 8

# Draw Q, K, V from a seeded generator so that Restart & Run All reproduces
# the same matrices; the unseeded global np.random.randn gave new data each run.
SEED = 0
rng = np.random.default_rng(SEED)
q = rng.standard_normal((L, d_k))
k = rng.standard_normal((L, d_k))
v = rng.standard_normal((L, d_v))

print("Q\n", q)
print("K\n", k)
print("V\n", v)

Q
 [[-0.2223086  -0.31633868 -0.09647073 -0.54942748  1.09366307  0.18765609
   0.93622365  2.13806364]
 [-0.11915798 -1.72812429 -0.6525256  -3.31598736 -0.86431895  0.54983089
   0.38302276  0.57265537]
 [-0.38238403 -0.5688977   0.09216993  2.18935556  1.21742175  1.00742316
   0.04253056 -0.33943216]
 [ 1.62386815 -0.35051662  0.33866687 -0.16296746  1.79872054  0.77132772
  -1.39786049 -0.69209865]]
K
 [[ 0.08519891 -0.16788269  0.80788457  1.22150825 -0.66576594  1.2190729
   0.36792822 -0.64887914]
 [ 0.21002585 -1.89670433 -0.31777078 -0.14812503  0.52837649  1.41851944
   0.74911016  1.71469443]
 [ 0.11823098  0.04111385 -0.4514374  -0.43314656  2.33862035  1.09578609
  -1.03272727 -0.03324421]
 [-0.47049314  0.42894361  0.67529536  0.21552885  0.85375428  1.105311
   0.56902524 -0.37556222]]
V
 [[ 0.64576293 -0.05969949 -0.02437854  0.3714553   0.04953169  0.36178736
   1.28640854 -0.4881781 ]
 [-0.19955745 -0.8352465  -0.01323093 -0.88872444  1.81330185  0.93644836
  -0.3170

## Self Attention

$$
\text{self attention} = \text{softmax}\bigg(\frac{QK^T}{\sqrt{d_k}}+M\bigg)
$$

$$
\text{new V} = \text{self attention} \cdot V
$$

In [3]:
# Raw attention scores: entry (i, j) is the dot product of query i with key j.
q @ k.T

array([[-2.25713905,  5.87687007,  1.96759519,  0.65623634],
       [-3.28264242,  5.54336407, -0.18766741, -1.96784328],
       [ 3.46521088,  2.16727693,  2.85984927,  2.77456903],
       [-0.05071099,  0.73305762,  6.61363391,  1.13193588]])

In [4]:
# Why sqrt(d_k) belongs in the denominator: every raw score is a sum of d_k
# products, so the scores have a much larger variance than Q or K themselves.
q.var(), k.var(), (q @ k.T).var()

(1.2443537270952494, 0.7215835889093534, 7.81862040434553)

In [5]:
# Dividing the raw scores by sqrt(d_k) brings their variance back to the same
# order of magnitude as the variance of Q and K.
scaled = (q @ k.T) / math.sqrt(d_k)
q.var(), k.var(), scaled.var()

(1.2443537270952494, 0.7215835889093534, 0.977327550543191)

In [6]:
# Scaled scores: the query-key dot products divided by sqrt(d_k).
scaled

array([[-0.79801916,  2.07778734,  0.69564995,  0.23201458],
       [-1.16058936,  1.95987516, -0.06635045, -0.69573766],
       [ 1.22513706,  0.76624811,  1.01110941,  0.98095829],
       [-0.01792904,  0.25917501,  2.33827269,  0.40019977]])

In [7]:
### Masking

# The mask ensures a word cannot take context from words generated in the future.
# It is not required in the encoders, but it is required in the decoders.

# Lower-triangular matrix of ones: row i has ones in columns 0..i, zeros after.
mask = np.tri(L)
mask

array([[1., 0., 0., 0.],
       [1., 1., 0., 0.],
       [1., 1., 1., 0.],
       [1., 1., 1., 1.]])

In [8]:
# Additive form of the causal mask: 0 where attention is allowed (on and below
# the diagonal) and -inf where it is not, so adding it to the scores removes
# the future positions once softmax is applied.
#
# The mask is rebuilt with np.where instead of being rewritten in place, so the
# cell gives the same result however many times it is run (the in-place version
# turned every entry into -inf on a second run). np.inf is used because the
# np.infty alias no longer exists in NumPy 2.0.
mask = np.where(np.tril(np.ones((L, L))) == 1, 0.0, -np.inf)

mask

array([[  0., -inf, -inf, -inf],
       [  0.,   0., -inf, -inf],
       [  0.,   0.,   0., -inf],
       [  0.,   0.,   0.,   0.]])

In [9]:
# Adding the mask leaves allowed positions unchanged (+0) and sets the
# positions above the diagonal to -inf.
scaled + mask

array([[-0.79801916,        -inf,        -inf,        -inf],
       [-1.16058936,  1.95987516,        -inf,        -inf],
       [ 1.22513706,  0.76624811,  1.01110941,        -inf],
       [-0.01792904,  0.25917501,  2.33827269,  0.40019977]])

## Softmax

$$
\text{softmax}(x)_i = \frac{e^{x_i}}{\sum_j e^{x_j}}
$$

In [10]:
def softmax(x):
  """Softmax over the last axis of ``x``.

  The maximum of each row is subtracted before exponentiating. This leaves the
  result mathematically unchanged but keeps ``np.exp`` from overflowing to
  ``inf`` (and the output from becoming NaN) when scores are large.

  Args:
    x: array-like of scores with at least one dimension. Entries equal to
      ``-inf`` (masked positions) receive a weight of exactly 0.

  Returns:
    Array with the same shape as ``x`` whose entries along the last axis are
    non-negative and sum to 1. A row made up entirely of ``-inf`` yields NaN,
    because no finite normalisation exists for it.
  """
  x = np.asarray(x)
  if x.size == 0:
    # Nothing to normalise; np.max would raise on an empty reduction.
    return np.exp(x)
  # keepdims=True makes the row statistics broadcast against x for any number
  # of leading axes, not only for 2-D input.
  shifted = x - np.max(x, axis=-1, keepdims=True)
  exps = np.exp(shifted)
  return exps / np.sum(exps, axis=-1, keepdims=True)

In [11]:
# Softmax of the masked scores along each row. exp(-inf) is 0, so the masked
# (future) positions end up with zero attention weight.
attention = softmax(scaled + mask)
attention

array([[1.        , 0.        , 0.        , 0.        ],
       [0.04227096, 0.95772904, 0.        , 0.        ],
       [0.40995174, 0.25908353, 0.33096473, 0.        ],
       [0.06949647, 0.09168694, 0.73324337, 0.10557322]])

In [12]:
# Each row of new_v is a weighted average of the rows of v, using that
# position's attention weights.
new_v = attention @ v
new_v

array([[ 0.64576293, -0.05969949, -0.02437854,  0.3714553 ,  0.04953169,
         0.36178736,  1.28640854, -0.4881781 ],
       [-0.16382495, -0.80246338, -0.01370215, -0.83545543,  1.73874559,
         0.91215688, -0.24930866,  0.69912851],
       [-0.04450358,  0.14303776, -0.11179004, -0.05060385,  0.77134527,
         0.78636111,  0.45086109,  0.10726344],
       [-0.62222602,  0.83562054, -0.18981257, -0.18137128,  0.66247482,
         1.06857428,  0.01097954,  0.26557895]])

In [13]:
# Original values, shown for comparison with new_v. The first row is identical
# in both because position 0 can only attend to itself.
v

array([[ 0.64576293, -0.05969949, -0.02437854,  0.3714553 ,  0.04953169,
         0.36178736,  1.28640854, -0.4881781 ],
       [-0.19955745, -0.8352465 , -0.01323093, -0.88872444,  1.81330185,
         0.93644836, -0.31709009,  0.75153226],
       [-0.77812876,  1.15997341, -0.29721625,  0.08270146,  0.84976736,
         1.19477205,  0.01706673,  0.34046915],
       [-0.7411889 ,  0.62333833,  0.29388705, -1.76504969, -1.2343077 ,
         0.77209509, -0.58596641, -0.18041351]])

In [14]:
# Attention: the steps above wrapped into reusable functions

In [15]:
def softmax(x):
  """Softmax over the last axis of ``x``.

  The maximum of each row is subtracted before exponentiating. This leaves the
  result mathematically unchanged but keeps ``np.exp`` from overflowing to
  ``inf`` (and the output from becoming NaN) when scores are large.

  Args:
    x: array-like of scores with at least one dimension. Entries equal to
      ``-inf`` (masked positions) receive a weight of exactly 0.

  Returns:
    Array with the same shape as ``x`` whose entries along the last axis are
    non-negative and sum to 1. A row made up entirely of ``-inf`` yields NaN,
    because no finite normalisation exists for it.
  """
  x = np.asarray(x)
  if x.size == 0:
    # Nothing to normalise; np.max would raise on an empty reduction.
    return np.exp(x)
  # keepdims=True makes the row statistics broadcast against x for any number
  # of leading axes, not only for 2-D input.
  shifted = x - np.max(x, axis=-1, keepdims=True)
  exps = np.exp(shifted)
  return exps / np.sum(exps, axis=-1, keepdims=True)

def scaled_dot_product_attention(q, k, v, mask=None):
  """Compute softmax(q k^T / sqrt(d_k) + mask) and apply it to ``v``.

  Args:
    q: queries, shape (..., L_q, d_k).
    k: keys, shape (..., L_k, d_k).
    v: values, shape (..., L_k, d_v).
    mask: optional additive mask that broadcasts against the (..., L_q, L_k)
      score matrix; 0 keeps a position and ``-inf`` removes it. ``None``
      applies no mask.

  Returns:
    Tuple ``(out, attention)``: ``out`` has shape (..., L_q, d_v) and
    ``attention`` holds the weights, shape (..., L_q, L_k), each row summing
    to 1.
  """
  d_k = q.shape[-1]
  # Transpose only the last two axes so that leading batch/head axes are kept;
  # for 2-D keys this is exactly k.T.
  scaled = np.matmul(q, np.swapaxes(k, -1, -2)) / math.sqrt(d_k)
  if mask is not None:
    scaled = scaled + mask
  attention = softmax(scaled)
  out = np.matmul(attention, v)
  return out, attention

In [16]:
# Run masked attention on the toy Q, K, V and print the inputs next to the results.
values, attention = scaled_dot_product_attention(q, k, v, mask=mask)
for heading, array in (
    ("Q\n", q),
    ("K\n", k),
    ("V\n", v),
    ("New V\n", values),
    ("Attention\n", attention),
):
  print(heading, array)

Q
 [[-0.2223086  -0.31633868 -0.09647073 -0.54942748  1.09366307  0.18765609
   0.93622365  2.13806364]
 [-0.11915798 -1.72812429 -0.6525256  -3.31598736 -0.86431895  0.54983089
   0.38302276  0.57265537]
 [-0.38238403 -0.5688977   0.09216993  2.18935556  1.21742175  1.00742316
   0.04253056 -0.33943216]
 [ 1.62386815 -0.35051662  0.33866687 -0.16296746  1.79872054  0.77132772
  -1.39786049 -0.69209865]]
K
 [[ 0.08519891 -0.16788269  0.80788457  1.22150825 -0.66576594  1.2190729
   0.36792822 -0.64887914]
 [ 0.21002585 -1.89670433 -0.31777078 -0.14812503  0.52837649  1.41851944
   0.74911016  1.71469443]
 [ 0.11823098  0.04111385 -0.4514374  -0.43314656  2.33862035  1.09578609
  -1.03272727 -0.03324421]
 [-0.47049314  0.42894361  0.67529536  0.21552885  0.85375428  1.105311
   0.56902524 -0.37556222]]
V
 [[ 0.64576293 -0.05969949 -0.02437854  0.3714553   0.04953169  0.36178736
   1.28640854 -0.4881781 ]
 [-0.19955745 -0.8352465  -0.01323093 -0.88872444  1.81330185  0.93644836
  -0.3170