In [1]:
import os

os.environ['XLA_FLAGS'] = f"--xla_gpu_cuda_data_dir={os.environ['CONDA_PREFIX']}/lib/"

# Transformer Components

## 1 Scaled Dot-Product Attention

$$
\text{Attention}(Q, K, V) = \text{softmax} \left( \frac{QK^T}{\sqrt{d_k}} \right) V
$$

In [2]:
import numpy as np
from scipy.special import softmax

In [3]:
np.random.seed(1337)
q_sequence = np.random.random(size=(5, 10))
k_sequence = np.random.random(size=(5, 10))
v_sequence = k_sequence.copy()

### For-loop Implementation

In [4]:
output = np.zeros(shape=v_sequence.shape)

for i, q_vector in enumerate(q_sequence):
    scores = np.zeros(shape=(len(q_sequence),))

    for j, k_vector in enumerate(k_sequence):
        scores[j] = np.dot(q_vector, k_vector.T)

    scores /= np.sqrt(len(k_vector))
    scores = softmax(scores)

    output_vector = np.zeros(shape=q_vector.shape)

    for j, v_vector in enumerate(v_sequence):
        output_vector += v_vector * scores[j]

    output[i] = output_vector

In [5]:
output

array([[0.57284906, 0.6241619 , 0.42520517, 0.77054331, 0.51498201,
        0.37227166, 0.39858822, 0.63204179, 0.69069591, 0.53930717],
       [0.56543131, 0.63987227, 0.39927305, 0.77201221, 0.52879869,
        0.39989248, 0.4143486 , 0.62758946, 0.68079163, 0.56012285],
       [0.5715203 , 0.62739219, 0.41876807, 0.77218655, 0.5223185 ,
        0.37794863, 0.40369935, 0.63315834, 0.68463259, 0.54588121],
       [0.55985979, 0.62539716, 0.40915768, 0.76689235, 0.52276334,
        0.36085987, 0.41196783, 0.64331435, 0.66314229, 0.55337357],
       [0.56299325, 0.64480461, 0.38976731, 0.77313509, 0.51665272,
        0.39105676, 0.41212249, 0.63581084, 0.68213553, 0.56622954]])

### Vector Implementation

In [6]:
softmax((q_sequence @ k_sequence.T) / np.sqrt(len(k_vector)), axis=1) @ v_sequence

array([[0.57284906, 0.6241619 , 0.42520517, 0.77054331, 0.51498201,
        0.37227166, 0.39858822, 0.63204179, 0.69069591, 0.53930717],
       [0.56543131, 0.63987227, 0.39927305, 0.77201221, 0.52879869,
        0.39989248, 0.4143486 , 0.62758946, 0.68079163, 0.56012285],
       [0.5715203 , 0.62739219, 0.41876807, 0.77218655, 0.5223185 ,
        0.37794863, 0.40369935, 0.63315834, 0.68463259, 0.54588121],
       [0.55985979, 0.62539716, 0.40915768, 0.76689235, 0.52276334,
        0.36085987, 0.41196783, 0.64331435, 0.66314229, 0.55337357],
       [0.56299325, 0.64480461, 0.38976731, 0.77313509, 0.51665272,
        0.39105676, 0.41212249, 0.63581084, 0.68213553, 0.56622954]])

## 2 Multi-head Attention

In [7]:
from keras import layers

In [8]:
layers.MultiHeadAttention?

[0;31mInit signature:[0m
[0mlayers[0m[0;34m.[0m[0mMultiHeadAttention[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mnum_heads[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mkey_dim[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mvalue_dim[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdropout[0m[0;34m=[0m[0;36m0.0[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0muse_bias[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0moutput_shape[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mattention_axes[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mkernel_initializer[0m[0;34m=[0m[0;34m'glorot_uniform'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mbias_initializer[0m[0;34m=[0m[0;34m'zeros'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mkernel_regularizer[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mbias_regularizer[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m

In [9]:
num_heads = 4
embed_dim = 10
mha_layer = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)

In [10]:
mha_layer(q_sequence[np.newaxis, :, :],
          k_sequence[np.newaxis, :, :],
          v_sequence[np.newaxis, :, :])

<tf.Tensor: shape=(1, 5, 10), dtype=float32, numpy=
array([[[ 0.05663091, -0.10765006,  0.48948398, -0.07543378,
         -0.04875398,  0.45872042, -0.23137562, -0.19092551,
         -0.18513665, -0.08372168],
        [ 0.05681177, -0.10743542,  0.48938474, -0.07487963,
         -0.04845609,  0.45961136, -0.23126273, -0.19116779,
         -0.18536365, -0.08369228],
        [ 0.05653821, -0.10762081,  0.48883337, -0.07477252,
         -0.04845449,  0.45951137, -0.23147833, -0.1916148 ,
         -0.18542819, -0.08358492],
        [ 0.05608054, -0.10683779,  0.4884538 , -0.07536796,
         -0.04806905,  0.45730373, -0.23021281, -0.19133647,
         -0.185938  , -0.08244988],
        [ 0.05744241, -0.10784833,  0.4895334 , -0.07521044,
         -0.04874349,  0.460122  , -0.23151511, -0.19172466,
         -0.1851113 , -0.08364887]]], dtype=float32)>

## 3 Normalization Techniques

### Layer Normalization

In [11]:
def layer_normalization(batch_of_sequences):
    # Batch of sequences has shape (batch size, sequence length, embedding dimension)
    mean = np.mean(batch_of_sequences, keepdims=True, axis=-1)
    variance = np.var(batch_of_sequences, keepdims=True, axis=-1)

    return (batch_of_sequences - mean) / variance

In [12]:
s1 = np.arange(0, 20).reshape((4, 5))
s1

array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14],
       [15, 16, 17, 18, 19]])

In [13]:
s2 = 2*s1
s2

array([[ 0,  2,  4,  6,  8],
       [10, 12, 14, 16, 18],
       [20, 22, 24, 26, 28],
       [30, 32, 34, 36, 38]])

In [14]:
batch_of_sequences = np.concatenate((s1[np.newaxis, :], s2[np.newaxis, :]))
batch_of_sequences

array([[[ 0,  1,  2,  3,  4],
        [ 5,  6,  7,  8,  9],
        [10, 11, 12, 13, 14],
        [15, 16, 17, 18, 19]],

       [[ 0,  2,  4,  6,  8],
        [10, 12, 14, 16, 18],
        [20, 22, 24, 26, 28],
        [30, 32, 34, 36, 38]]])

In [15]:
mean = np.mean(batch_of_sequences, keepdims=True, axis=-1)
mean

array([[[ 2.],
        [ 7.],
        [12.],
        [17.]],

       [[ 4.],
        [14.],
        [24.],
        [34.]]])

In [16]:
mean.shape

(2, 4, 1)

### Batch Normalization

In [17]:
def layer_normalization(batch_of_images):
    # Batch of images has shape (batch size, height, width, channels)
    mean = np.mean(batch_of_images, keepdims=True, axis=(0, 1, 2))
    variance = np.var(batch_of_images, keepdims=True, axis=(0, 1, 2))

    return (batch_of_images - mean) / variance

In [18]:
img_1 = np.arange(0, 3*3*3).reshape((3, 3, 3))
img_1

array([[[ 0,  1,  2],
        [ 3,  4,  5],
        [ 6,  7,  8]],

       [[ 9, 10, 11],
        [12, 13, 14],
        [15, 16, 17]],

       [[18, 19, 20],
        [21, 22, 23],
        [24, 25, 26]]])

In [19]:
img_2 = img_1 + 1
img_2

array([[[ 1,  2,  3],
        [ 4,  5,  6],
        [ 7,  8,  9]],

       [[10, 11, 12],
        [13, 14, 15],
        [16, 17, 18]],

       [[19, 20, 21],
        [22, 23, 24],
        [25, 26, 27]]])

In [20]:
batch_of_images = np.concatenate((img_1[np.newaxis, :], img_2[np.newaxis, :]))

In [21]:
batch_of_images

array([[[[ 0,  1,  2],
         [ 3,  4,  5],
         [ 6,  7,  8]],

        [[ 9, 10, 11],
         [12, 13, 14],
         [15, 16, 17]],

        [[18, 19, 20],
         [21, 22, 23],
         [24, 25, 26]]],


       [[[ 1,  2,  3],
         [ 4,  5,  6],
         [ 7,  8,  9]],

        [[10, 11, 12],
         [13, 14, 15],
         [16, 17, 18]],

        [[19, 20, 21],
         [22, 23, 24],
         [25, 26, 27]]]])

In [22]:
batch_of_images.shape

(2, 3, 3, 3)

In [23]:
mean = np.mean(batch_of_images, keepdims=True, axis=(0, 1, 2))

In [24]:
mean

array([[[[12.5, 13.5, 14.5]]]])

In [25]:
mean.shape

(1, 1, 1, 3)

## 4 Causal Mask

In [29]:
import tensorflow as tf

In [30]:
batch_of_sequences.shape

(2, 4, 5)

In [34]:
input_shape = batch_of_sequences.shape
batch_size, sequence_length = input_shape[0], input_shape[1]

i = tf.range(sequence_length)[:, tf.newaxis]
j = tf.range(sequence_length)

mask = tf.cast(i > j, dtype="int32")

In [35]:
mask

<tf.Tensor: shape=(4, 4), dtype=int32, numpy=
array([[0, 0, 0, 0],
       [1, 0, 0, 0],
       [1, 1, 0, 0],
       [1, 1, 1, 0]], dtype=int32)>

In [36]:
mask = tf.reshape(mask, (1, sequence_length, sequence_length))
mask

<tf.Tensor: shape=(1, 4, 4), dtype=int32, numpy=
array([[[0, 0, 0, 0],
        [1, 0, 0, 0],
        [1, 1, 0, 0],
        [1, 1, 1, 0]]], dtype=int32)>

In [37]:
mult = tf.concat(
    [tf.expand_dims(batch_size, -1),
     tf.constant([1, 1], dtype=tf.int32)], axis=0)

In [38]:
mult

<tf.Tensor: shape=(3,), dtype=int32, numpy=array([2, 1, 1], dtype=int32)>

In [48]:
tf.tile(mask, mult)

<tf.Tensor: shape=(2, 4, 4), dtype=int32, numpy=
array([[[0, 0, 0, 0],
        [1, 0, 0, 0],
        [1, 1, 0, 0],
        [1, 1, 1, 0]],

       [[0, 0, 0, 0],
        [1, 0, 0, 0],
        [1, 1, 0, 0],
        [1, 1, 1, 0]]], dtype=int32)>