<a href="https://colab.research.google.com/github/arpeggi-15/Transformers-neural-network/blob/main/Self_Attention_in_Transformers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Coding Self Attention in Transformers

In [1]:
## Importing Libraries
import numpy as np
import math

In [2]:
## Generating Data
# Seeded generator so that Restart & Run All reproduces the same Q, K and V.
SEED = 42
rng = np.random.default_rng(SEED)

# L: number of rows in Q, K and V; d_k: width of Q and K; d_v: width of V.
L, d_k, d_v = 4, 8, 8
q = rng.standard_normal((L, d_k))
k = rng.standard_normal((L, d_k))
v = rng.standard_normal((L, d_v))

In [3]:
# Show the generated query, key and value matrices.
print("Q\n", q)
print("K\n", k)
print("V\n", v)

Q
 [[ 3.34278457  0.39612788 -0.98819355 -1.1077455  -0.38255537 -0.10379994
  -0.51146918  0.09293298]
 [ 1.52237815 -0.2174949   1.03665892 -0.21065444 -0.40634944  0.77296857
  -0.47011219  1.62302878]
 [-1.19135761  0.48766303  0.45636425  0.21676984  1.42303043  0.22596744
  -0.3678861  -0.73940932]
 [-0.09375137 -0.04474917 -0.48749194  0.52051747  1.88337721 -0.41709959
  -0.26235022  0.24116219]]
K
 [[-1.44353036  0.37960281 -1.29376998  0.31693309  2.07464438  0.91732314
  -0.33753667 -0.07122909]
 [ 1.25943966  0.10988663  0.95702566 -0.50777681  0.4789222   0.65097021
  -0.79660986 -0.68303974]
 [-1.72847953 -0.62896366 -2.33455326  0.28707914  0.35321812 -0.02203805
   0.22116167 -0.48156707]
 [ 0.02940219 -1.36951664 -0.48449898 -0.03127682 -0.72518655  0.0488824
  -1.33205182  1.43489268]]
V
 [[-1.72955746 -0.62463916 -0.6793932   0.65046206  1.19017355  0.03047917
  -0.84386192  1.00766872]
 [-1.04429763 -1.15478099  0.17701764 -0.88352442  1.16597075  0.18716261
   1.41

### Self Attention
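$\text{self attention} = \text{softmax}\bigg(\frac{QK^T}{\sqrt{d_k}}+M\bigg)$

$\text{new V} = \text{self attention} \cdot V$

Here $M$ is the additive mask: $0$ where attention is allowed and $-\infty$ where it is blocked.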

In [4]:
# Raw (unscaled) attention scores: dot product of every query row with every key row.
q @ k.T

array([[-4.47049017,  3.96350506, -4.32881368,  1.15620933],
       [-3.7790177 ,  2.56699085, -6.02134575,  3.13450216],
       [ 4.71956006,  0.50655282,  1.52171149, -2.52262541],
       [ 4.51011558, -0.17910135,  1.97797596, -0.41224523]])

In [5]:
# Compare the variance of q and k with the variance of their unscaled product.
(q.var(), k.var(), (q @ k.T).var())

(0.9026687922231827, 0.9025904900814988, 11.127789095706268)

In [6]:
# Dividing the raw scores by sqrt(d_k) brings the variance of the product
# back down towards that of q and k.
scaled = (q @ k.T) / math.sqrt(d_k)
(q.var(), k.var(), scaled.var())

(0.9026687922231827, 0.9025904900814988, 1.3909736369632832)

In [7]:
# Display the scaled scores, i.e. q @ k.T divided by sqrt(d_k).
scaled

array([[-1.58055696,  1.40131065, -1.53046675,  0.40878173],
       [-1.33608452,  0.90756832, -2.12886721,  1.10821387],
       [ 1.66861646,  0.17909347,  0.53800626, -0.89188277],
       [ 1.59456665, -0.06332189,  0.69932011, -0.1457507 ]])

### Masking
1. Masking ensures that a word cannot take context from words that come later in the sequence (words that have not been generated yet).
2. It is not required in the encoder, but it is required in the decoder.

In [11]:
# Lower-triangular matrix of ones: entry (i, j) is 1 when j <= i and 0 otherwise,
# so row i marks the columns it is allowed to attend to.
mask = np.tril(np.ones((L,L)))
mask

array([[1., 0., 0., 0.],
       [1., 1., 0., 0.],
       [1., 1., 1., 0.],
       [1., 1., 1., 1.]])

In [12]:
# Rebuild the lower-triangular pattern in this cell so that re-running it always
# produces the same additive mask. Applying the two in-place assignments below to
# an already converted mask would turn every entry into -inf.
mask = np.tril(np.ones((L, L)))
# Blocked positions become -inf. np.inf is used because the np.infty alias
# was removed in NumPy 2.0.
mask[mask == 0] = -np.inf
# Allowed positions become 0, so adding the mask leaves those scores unchanged.
mask[mask == 1] = 0
mask

array([[  0., -inf, -inf, -inf],
       [  0.,   0., -inf, -inf],
       [  0.,   0.,   0., -inf],
       [  0.,   0.,   0.,   0.]])

In [13]:
# Adding the mask keeps allowed scores as they are (+0) and sets blocked ones to -inf.
scaled + mask

array([[-1.58055696,        -inf,        -inf,        -inf],
       [-1.33608452,  0.90756832,        -inf,        -inf],
       [ 1.66861646,  0.17909347,  0.53800626,        -inf],
       [ 1.59456665, -0.06332189,  0.69932011, -0.1457507 ]])

### Softmax
$\text{softmax}(x)_i = \frac{e^{x_i}}{\sum_j e^{x_j}}$

In [14]:
def softmax(x):
  """Numerically stable softmax along the last axis of ``x``.

  Args:
    x: array-like of scores with at least one dimension. Entries may be
      ``-inf`` (e.g. masked positions); they receive a weight of 0.

  Returns:
    An array with the same shape as ``x`` whose slices along the last axis
    are non-negative and sum to 1. A slice made up entirely of ``-inf``
    yields NaN, because it has no finite score to normalise against.
  """
  x = np.asarray(x)
  # Subtracting the per-row maximum does not change the result mathematically,
  # but keeps np.exp from overflowing to inf (and the ratio from becoming NaN).
  shifted = x - np.max(x, axis=-1, keepdims=True)
  exps = np.exp(shifted)
  # keepdims lets the normaliser broadcast for inputs of any rank, not just 2-D.
  return exps / np.sum(exps, axis=-1, keepdims=True)

In [15]:
# Softmax over the masked scores; exp(-inf) is 0, so blocked positions get zero weight.
attention = softmax(scaled + mask)
attention

array([[1.        , 0.        , 0.        , 0.        ],
       [0.09589837, 0.90410163, 0.        , 0.        ],
       [0.64586283, 0.14562927, 0.2085079 , 0.        ],
       [0.56353507, 0.10737647, 0.23020795, 0.09888052]])

In [16]:
# Each row of new_v is the attention-weighted combination of the rows of v.
new_v = attention @ v
new_v

array([[-1.72955746, -0.62463916, -0.6793932 ,  0.65046206,  1.19017355,
         0.03047917, -0.84386192,  1.00766872],
       [-1.11001293, -1.10394125,  0.09488924, -0.73641762,  1.16829176,
         0.17213693,  1.19401648,  0.99233394],
       [-1.2133942 , -0.27871829, -0.43936537,  0.29868192,  0.74299119,
         0.3147987 , -0.2606584 ,  1.15657305],
       [-1.19951232, -0.18908316, -0.46262617,  0.41756329,  0.67214849,
         0.26048005, -0.1907387 ,  1.03349746]])

### Scaled dot-product attention

In [17]:
def scaled_dot_product_attention(q, k, v, mask=None):
  """Compute scaled dot-product attention for ``q``, ``k`` and ``v``.

  The scores ``q @ k.T`` are divided by ``sqrt(q.shape[-1])``; if ``mask`` is
  given it is added to the scores before the notebook's ``softmax`` is applied.

  Args:
    q: query array; its last dimension is used as the scaling width.
    k: key array; ``k.T`` is used, which reverses all axes, so this is
      written with 2-D inputs in mind.
    v: value array that the attention weights are multiplied with.
    mask: optional array added to the scaled scores (e.g. 0 / -inf entries).

  Returns:
    A tuple ``(output, attention)``: the weighted values and the weights.
  """
  scale = math.sqrt(q.shape[-1])
  logits = np.matmul(q, k.T) / scale
  if mask is not None:
    logits = logits + mask
  weights = softmax(logits)
  return np.matmul(weights, v), weights

In [18]:
# Run the whole pipeline through the helper, using the additive mask built earlier.
# Note: this rebinds the notebook-level name `attention`.
values, attention = scaled_dot_product_attention(q, k, v, mask=mask)
print("Q\n", q)
print("K\n", k)
print("V\n", v)
print("New V\n", values)
print("Attention\n", attention)

Q
 [[ 3.34278457  0.39612788 -0.98819355 -1.1077455  -0.38255537 -0.10379994
  -0.51146918  0.09293298]
 [ 1.52237815 -0.2174949   1.03665892 -0.21065444 -0.40634944  0.77296857
  -0.47011219  1.62302878]
 [-1.19135761  0.48766303  0.45636425  0.21676984  1.42303043  0.22596744
  -0.3678861  -0.73940932]
 [-0.09375137 -0.04474917 -0.48749194  0.52051747  1.88337721 -0.41709959
  -0.26235022  0.24116219]]
K
 [[-1.44353036  0.37960281 -1.29376998  0.31693309  2.07464438  0.91732314
  -0.33753667 -0.07122909]
 [ 1.25943966  0.10988663  0.95702566 -0.50777681  0.4789222   0.65097021
  -0.79660986 -0.68303974]
 [-1.72847953 -0.62896366 -2.33455326  0.28707914  0.35321812 -0.02203805
   0.22116167 -0.48156707]
 [ 0.02940219 -1.36951664 -0.48449898 -0.03127682 -0.72518655  0.0488824
  -1.33205182  1.43489268]]
V
 [[-1.72955746 -0.62463916 -0.6793932   0.65046206  1.19017355  0.03047917
  -0.84386192  1.00766872]
 [-1.04429763 -1.15478099  0.17701764 -0.88352442  1.16597075  0.18716261
   1.41