In [1]:
!pip install BPEmb

import math
import numpy as np
import tensorflow as tf

from bpemb import BPEmb

Collecting BPEmb
  Downloading bpemb-0.3.5-py3-none-any.whl (19 kB)
Installing collected packages: BPEmb
Successfully installed BPEmb-0.3.5


### Transformers

There are many parts to Transformers, we wil start with Multi-Head Self-Attention

$$Attention(Q, K, V) = softmax(\frac{QK^T)}{\sqrt{d_k}})V$$

Inside every attention head there is scaled dot product with uses query, key and value to calculate attention weights, we will implement that first

In [2]:
def scaled_dot_product_attention(query, key, value, mask=None):
  key_dim = tf.cast(tf.shape(key)[-1], tf.float32)
  scaled_scores = tf.matmul(query, key, transpose_b=True) / np.sqrt(key_dim)

  if mask is not None:
    scaled_score = tf.where(mask==0, -np.inf, scaled_scores)

  softmax = tf.keras.layers.Softmax()
  weights = softmax(scaled_scores)

  return tf.matmul(weights, value), weights


In [3]:
# Suppose our queries, keys, and values are each a length of 3 with a dimension of 4.
seq_len = 3
embed_dim = 4

queries = np.random.rand(seq_len, embed_dim)
keys = np.random.rand(seq_len, embed_dim)
values = np.random.rand(seq_len, embed_dim)

print("Queries:\n",queries)

Queries:
 [[0.11237853 0.48299194 0.62938949 0.6126515 ]
 [0.93497659 0.57699119 0.56262699 0.54204357]
 [0.19426278 0.80967871 0.43821061 0.78351492]]


In [4]:
output, attn_weights = scaled_dot_product_attention(queries, keys, values)

print("Output\n", output, "\n")
print("Weights\n", attn_weights)

Output
 tf.Tensor(
[[0.6199776  0.20539664 0.588922   0.22266714]
 [0.65892184 0.2145942  0.64545256 0.2213564 ]
 [0.63038975 0.20718662 0.6035171  0.2216724 ]], shape=(3, 4), dtype=float32) 

Weights
 tf.Tensor(
[[0.27230927 0.30554375 0.422147  ]
 [0.30628258 0.24092954 0.45278782]
 [0.27987507 0.28848448 0.43164048]], shape=(3, 3), dtype=float32)


![](https://drive.google.com/uc?export=view&id=1SLWkHQgy4nQPFvvjG5_V8UTtpSAJ2zrr)

In [5]:
batch_size = 1
seq_len = 3
embed_dim = 12
num_heads = 3
head_dim = embed_dim // num_heads

print(f"Dimension of each head: {head_dim}")

Dimension of each head: 4


Using separate weight matrices per head

In [6]:
# Suppose these are our input embeddings
x = np.random.rand(batch_size, seq_len, embed_dim).round(1)
print("Input shape: ", x.shape, "\n")
print("Input:\n", x)

Input shape:  (1, 3, 12) 

Input:
 [[[0.7 0.3 0.5 0.1 1.  0.6 0.6 0.5 0.1 0.7 0.9 0.6]
  [0.2 0.4 0.9 0.8 0.1 0.1 0.8 0.1 0.  0.9 0.4 0.3]
  [0.8 0.4 0.1 0.  0.8 0.5 0.6 0.3 0.8 0.4 0.5 0.7]]]


Remember each weight matrix should have a dimension of $\text{d}\ \text{x}\ \text{d/h}$.

In [7]:
# Declaring query, key and value

# query
wq0 = np.random.rand(embed_dim, head_dim).round(1)
wq1 = np.random.rand(embed_dim, head_dim).round(1)
wq2 = np.random.rand(embed_dim, head_dim).round(1)

# key
wk0 = np.random.rand(embed_dim, head_dim).round(1)
wk1 = np.random.rand(embed_dim, head_dim).round(1)
wk2 = np.random.rand(embed_dim, head_dim).round(1)

# value
wv0 = np.random.rand(embed_dim, head_dim).round(1)
wv1 = np.random.rand(embed_dim, head_dim).round(1)
wv2 = np.random.rand(embed_dim, head_dim).round(1)

In [8]:
print("The three sets of query weights (one for each head):")
print("wq0:\n", wq0)
print("wq1:\n", wq1)
print("wq2:\n", wq1)

The three sets of query weights (one for each head):
wq0:
 [[0.8 0.3 0.9 0.2]
 [0.5 0.6 0.7 0.8]
 [0.5 0.2 0.2 0.8]
 [0.7 0.3 0.3 0.7]
 [0.8 0.9 0.7 0.6]
 [0.3 0.3 1.  0. ]
 [0.9 0.2 0.6 0.7]
 [0.1 0.  0.3 1. ]
 [1.  0.4 0.  0.1]
 [0.4 0.7 0.8 0. ]
 [0.1 0.8 0.3 0.9]
 [0.6 0.8 0.6 0.9]]
wq1:
 [[0.4 0.1 0.5 0.5]
 [0.2 0.2 0.6 0.2]
 [0.8 0.9 0.7 0.4]
 [0.3 0.9 0.3 0.4]
 [0.6 0.6 0.4 0.5]
 [0.9 0.4 0.4 0.8]
 [0.7 0.2 0.9 0.1]
 [0.6 0.7 0.8 0.8]
 [0.3 0.6 0.9 0. ]
 [0.1 0.3 0.6 1. ]
 [0.9 0.7 0.9 0.2]
 [0.3 0.9 0.3 0.4]]
wq2:
 [[0.4 0.1 0.5 0.5]
 [0.2 0.2 0.6 0.2]
 [0.8 0.9 0.7 0.4]
 [0.3 0.9 0.3 0.4]
 [0.6 0.6 0.4 0.5]
 [0.9 0.4 0.4 0.8]
 [0.7 0.2 0.9 0.1]
 [0.6 0.7 0.8 0.8]
 [0.3 0.6 0.9 0. ]
 [0.1 0.3 0.6 1. ]
 [0.9 0.7 0.9 0.2]
 [0.3 0.9 0.3 0.4]]


In [9]:
# Generate query, key and value for first head
q0 = np.dot(x, wq0)
k0 = np.dot(x, wk0)
v0 = np.dot(x, wv0)

# Generate query, key and value for second head
q1 = np.dot(x, wq1)
k1 = np.dot(x, wk1)
v1 = np.dot(x, wv1)

# Generate query, key and value for third head
q2 = np.dot(x, wq2)
k2 = np.dot(x, wk2)
v2 = np.dot(x, wv2)

In [10]:
print("Q, K, and V for first head:\n")

print(f"q0 {q0.shape}:\n", q0, "\n")
print(f"k0 {k0.shape}:\n", k0, "\n")
print(f"v0 {v0.shape}:\n", v0)

Q, K, and V for first head:

q0 (1, 3, 4):
 [[[3.43 3.45 3.97 3.73]
  [2.79 2.19 2.58 2.99]
  [3.68 3.05 3.42 2.92]]] 

k0 (1, 3, 4):
 [[[4.18 4.63 3.26 3.6 ]
  [3.11 3.43 1.69 2.58]
  [3.42 3.97 2.96 3.15]]] 

v0 (1, 3, 4):
 [[[3.74 4.47 2.83 3.44]
  [3.1  2.87 1.45 2.35]
  [3.37 3.97 2.53 3.27]]]


Now we pass query, key and value to self-attention operation

In [11]:
out0, attn_weights0 = scaled_dot_product_attention(q0, k0, v0)

print("Output from first attention head: ", out0, "\n")
print("Attention weights from first head: ", attn_weights0)

Output from first attention head:  tf.Tensor(
[[[3.7324066 4.459643  2.823731  3.4364076]
  [3.7186127 4.439895  2.8112497 3.428871 ]
  [3.7297044 4.4558406 2.8213637 3.4350028]]], shape=(1, 3, 4), dtype=float32) 

Attention weights from first head:  tf.Tensor(
[[[9.7957271e-01 1.3046073e-04 2.0296868e-02]
  [9.4339097e-01 1.6366168e-03 5.4972406e-02]
  [9.7241926e-01 3.3523850e-04 2.7245473e-02]]], shape=(1, 3, 3), dtype=float32)


In [12]:
out1, _ = scaled_dot_product_attention(q1, k1, v1)
out2, _ = scaled_dot_product_attention(q2, k2, v2)

print("Output from second attention head: ", out1, "\n")
print("Output from third attention head: ", out2,)

Output from second attention head:  tf.Tensor(
[[[3.3065586 3.4400454 3.1833153 3.4144921]
  [3.2400954 3.4136553 3.1471026 3.3772635]
  [3.2602277 3.4184432 3.157498  3.3891182]]], shape=(1, 3, 4), dtype=float32) 

Output from third attention head:  tf.Tensor(
[[[4.1941414 4.333278  3.3307385 2.625926 ]
  [4.1312056 4.2749963 3.316106  2.6021001]
  [4.1585345 4.3017426 3.3267047 2.6109216]]], shape=(1, 3, 4), dtype=float32)


We will combine the outputs from individual head into one and then pass to linear layer

In [13]:
combined_out_a = np.concatenate((out0, out1, out2), axis=-1)
print(f"Combined output from all heads {combined_out_a.shape}:")
print(combined_out_a)

Combined output from all heads (1, 3, 12):
[[[3.7324066 4.459643  2.823731  3.4364076 3.3065586 3.4400454 3.1833153
   3.4144921 4.1941414 4.333278  3.3307385 2.625926 ]
  [3.7186127 4.439895  2.8112497 3.428871  3.2400954 3.4136553 3.1471026
   3.3772635 4.1312056 4.2749963 3.316106  2.6021001]
  [3.7297044 4.4558406 2.8213637 3.4350028 3.2602277 3.4184432 3.157498
   3.3891182 4.1585345 4.3017426 3.3267047 2.6109216]]]


Using a single matrix for all head

In [15]:
# For single head this was these were the separate query matrix
print("Query weights for first head: \n", wq0, "\n")
print("Query weights for second head: \n", wq1, "\n")
print("Query weights for third head: \n", wq2)

Query weights for first head: 
 [[0.8 0.3 0.9 0.2]
 [0.5 0.6 0.7 0.8]
 [0.5 0.2 0.2 0.8]
 [0.7 0.3 0.3 0.7]
 [0.8 0.9 0.7 0.6]
 [0.3 0.3 1.  0. ]
 [0.9 0.2 0.6 0.7]
 [0.1 0.  0.3 1. ]
 [1.  0.4 0.  0.1]
 [0.4 0.7 0.8 0. ]
 [0.1 0.8 0.3 0.9]
 [0.6 0.8 0.6 0.9]] 

Query weights for second head: 
 [[0.4 0.1 0.5 0.5]
 [0.2 0.2 0.6 0.2]
 [0.8 0.9 0.7 0.4]
 [0.3 0.9 0.3 0.4]
 [0.6 0.6 0.4 0.5]
 [0.9 0.4 0.4 0.8]
 [0.7 0.2 0.9 0.1]
 [0.6 0.7 0.8 0.8]
 [0.3 0.6 0.9 0. ]
 [0.1 0.3 0.6 1. ]
 [0.9 0.7 0.9 0.2]
 [0.3 0.9 0.3 0.4]] 

Query weights for third head: 
 [[1.  0.4 0.  0.8]
 [1.  0.  0.9 0.9]
 [0.6 0.7 0.1 0.8]
 [0.  0.6 0.8 0.2]
 [0.8 0.8 0.5 0.1]
 [0.1 0.1 0.2 0.5]
 [0.7 0.1 0.  0.4]
 [0.8 0.8 0.7 0.7]
 [0.7 0.7 0.3 0.3]
 [0.3 0.1 0.8 1. ]
 [0.5 0.6 0.7 0. ]
 [0.6 0.  0.5 0.1]]


In [16]:
# using a single matrix instead
wq = np.concatenate((wq0, wq1, wq2), axis=1)
print(f"Single query weight matrix {wq.shape}: \n", wq)

Single query weight matrix (12, 12): 
 [[0.8 0.3 0.9 0.2 0.4 0.1 0.5 0.5 1.  0.4 0.  0.8]
 [0.5 0.6 0.7 0.8 0.2 0.2 0.6 0.2 1.  0.  0.9 0.9]
 [0.5 0.2 0.2 0.8 0.8 0.9 0.7 0.4 0.6 0.7 0.1 0.8]
 [0.7 0.3 0.3 0.7 0.3 0.9 0.3 0.4 0.  0.6 0.8 0.2]
 [0.8 0.9 0.7 0.6 0.6 0.6 0.4 0.5 0.8 0.8 0.5 0.1]
 [0.3 0.3 1.  0.  0.9 0.4 0.4 0.8 0.1 0.1 0.2 0.5]
 [0.9 0.2 0.6 0.7 0.7 0.2 0.9 0.1 0.7 0.1 0.  0.4]
 [0.1 0.  0.3 1.  0.6 0.7 0.8 0.8 0.8 0.8 0.7 0.7]
 [1.  0.4 0.  0.1 0.3 0.6 0.9 0.  0.7 0.7 0.3 0.3]
 [0.4 0.7 0.8 0.  0.1 0.3 0.6 1.  0.3 0.1 0.8 1. ]
 [0.1 0.8 0.3 0.9 0.9 0.7 0.9 0.2 0.5 0.6 0.7 0. ]
 [0.6 0.8 0.6 0.9 0.3 0.9 0.3 0.4 0.6 0.  0.5 0.1]]


In [17]:
wk = np.concatenate((wk0, wk1, wk2), axis=1)
wv = np.concatenate((wv0, wv1, wv2), axis=1)

print(f"Single key weight matrix {wk.shape}:\n", wk, "\n")
print(f"Single value weight matrix {wv.shape}:\n", wv)

Single key weight matrix (12, 12):
 [[0.9 0.9 0.6 0.8 0.2 0.4 0.  0.8 0.6 0.9 0.7 0.6]
 [0.9 0.5 0.  0.2 0.7 0.3 0.9 0.8 0.5 0.8 0.  0.9]
 [0.2 1.  0.  0.7 0.9 0.8 0.5 0.9 0.5 0.5 0.6 0.4]
 [1.  0.3 0.1 0.5 0.9 0.2 0.9 0.1 0.3 0.  0.9 0.6]
 [0.8 0.1 0.8 0.6 0.2 0.2 0.  0.1 0.8 0.7 0.4 0.6]
 [0.1 1.  0.9 1.  0.2 0.6 0.8 0.4 0.6 0.5 0.  0.2]
 [0.3 0.3 0.9 0.3 0.3 0.1 0.2 0.2 0.5 0.3 0.3 0.7]
 [0.7 0.5 0.9 0.4 1.  0.  0.9 0.8 0.  0.  0.7 0.7]
 [0.  0.6 0.3 0.3 0.6 1.  0.8 0.  0.5 0.6 0.2 0.3]
 [0.7 0.9 0.5 0.7 0.6 0.3 0.  0.6 0.5 0.2 0.4 0.1]
 [0.8 1.  0.  0.  0.8 0.5 0.5 0.4 0.2 0.9 0.2 0.5]
 [0.8 1.  0.2 0.8 0.3 0.7 0.6 0.2 0.7 0.2 0.5 0.8]] 

Single value weight matrix (12, 12):
 [[0.5 1.  0.9 0.4 0.2 1.  0.2 0.4 0.9 0.8 0.4 0.4]
 [0.3 0.3 0.6 0.8 0.7 0.7 0.1 0.4 0.1 0.  0.2 0.3]
 [0.6 0.2 0.  0.7 0.2 0.  0.3 0.5 1.  0.7 0.1 0.1]
 [0.6 0.4 0.4 0.2 0.1 0.3 1.  1.  0.3 0.2 0.  0.9]
 [0.2 0.9 0.9 0.3 0.8 0.4 0.5 0.5 0.5 0.9 0.7 0.2]
 [0.8 0.5 0.4 1.  0.3 0.  0.8 0.8 0.8 0.7 0.4 0.1]
 [0.9

In [18]:
# Now we can perform a single operation to calculate queries, keys and values across multiple heads
q_s = np.dot(x, wq)
k_s = np.dot(x, wk)
v_s = np.dot(x, wv)

In [19]:
print(f"Query vectors using a single weight matrix {q_s.shape}:\n", q_s)

Query vectors using a single weight matrix (1, 3, 12):
 [[[3.43 3.45 3.97 3.73 3.72 3.42 3.99 3.21 4.07 2.69 2.89 3.03]
  [2.79 2.19 2.58 2.99 2.43 2.78 3.08 2.25 2.52 1.77 2.38 2.78]
  [3.68 3.05 3.42 2.92 2.95 2.84 3.63 2.4  3.96 2.28 2.34 2.57]]]


In [20]:
print(q0, "\n")
print(q1, "\n")
print(q2)

[[[3.43 3.45 3.97 3.73]
  [2.79 2.19 2.58 2.99]
  [3.68 3.05 3.42 2.92]]] 

[[[3.72 3.42 3.99 3.21]
  [2.43 2.78 3.08 2.25]
  [2.95 2.84 3.63 2.4 ]]] 

[[[4.07 2.69 2.89 3.03]
  [2.52 1.77 2.38 2.78]
  [3.96 2.28 2.34 2.57]]]


Although we have the concatenated vector, we need to separate the heads in some way so that separate self-attention operation can be performed

The first step is to *reshape* our combined queries from a shape of:<br>
(batch_size, seq_len, embed_dim)<br>

into a shape of<br>
 (batch_size, seq_len, num_heads, head_dim).
 <br>

In [21]:
q_s_reshaped = tf.reshape(q_s, (batch_size, seq_len, num_heads, head_dim))
print(f"Combined queries: {q_s.shape}\n", q_s, "\n")
print(f"Reshaped into separate heads: {q_s_reshaped.shape}\n", q_s_reshaped)

Combined queries: (1, 3, 12)
 [[[3.43 3.45 3.97 3.73 3.72 3.42 3.99 3.21 4.07 2.69 2.89 3.03]
  [2.79 2.19 2.58 2.99 2.43 2.78 3.08 2.25 2.52 1.77 2.38 2.78]
  [3.68 3.05 3.42 2.92 2.95 2.84 3.63 2.4  3.96 2.28 2.34 2.57]]] 

Reshaped into separate heads: (1, 3, 3, 4)
 tf.Tensor(
[[[[3.43 3.45 3.97 3.73]
   [3.72 3.42 3.99 3.21]
   [4.07 2.69 2.89 3.03]]

  [[2.79 2.19 2.58 2.99]
   [2.43 2.78 3.08 2.25]
   [2.52 1.77 2.38 2.78]]

  [[3.68 3.05 3.42 2.92]
   [2.95 2.84 3.63 2.4 ]
   [3.96 2.28 2.34 2.57]]]], shape=(1, 3, 3, 4), dtype=float64)


By transposing, our matrix dimensions become:
(batch_size, num_heads, seq_len, head_dim)

In [22]:
q_s_transposed = tf.transpose(q_s_reshaped, perm=[0, 2, 1, 3]).numpy()
print(f"Queries transposed into \"separate\" heads {q_s_transposed.shape}:\n",
      q_s_transposed)

Queries transposed into "separate" heads (1, 3, 3, 4):
 [[[[3.43 3.45 3.97 3.73]
   [2.79 2.19 2.58 2.99]
   [3.68 3.05 3.42 2.92]]

  [[3.72 3.42 3.99 3.21]
   [2.43 2.78 3.08 2.25]
   [2.95 2.84 3.63 2.4 ]]

  [[4.07 2.69 2.89 3.03]
   [2.52 1.77 2.38 2.78]
   [3.96 2.28 2.34 2.57]]]]


In [23]:
print("The separate per-head query matrices from before: ")
print(q0, "\n")
print(q1, "\n")
print(q2)

The separate per-head query matrices from before: 
[[[3.43 3.45 3.97 3.73]
  [2.79 2.19 2.58 2.99]
  [3.68 3.05 3.42 2.92]]] 

[[[3.72 3.42 3.99 3.21]
  [2.43 2.78 3.08 2.25]
  [2.95 2.84 3.63 2.4 ]]] 

[[[4.07 2.69 2.89 3.03]
  [2.52 1.77 2.38 2.78]
  [3.96 2.28 2.34 2.57]]]


In [24]:
k_s_transposed = tf.transpose(tf.reshape(k_s, (batch_size, -1, num_heads, head_dim)), perm=[0, 2, 1, 3]).numpy()
v_s_transposed = tf.transpose(tf.reshape(v_s, (batch_size, -1, num_heads, head_dim)), perm=[0, 2, 1, 3]).numpy()

print(f"Keys for all heads in a single matrix {k_s.shape}: \n", k_s_transposed, "\n")
print(f"Values for all heads in a single matrix {v_s.shape}: \n", v_s_transposed)

Keys for all heads in a single matrix (1, 3, 12): 
 [[[[4.18 4.63 3.26 3.6 ]
   [3.11 3.43 1.69 2.58]
   [3.42 3.97 2.96 3.15]]

  [[3.27 2.59 2.55 3.02]
   [3.18 1.92 2.24 2.42]
   [2.6  2.7  2.51 2.27]]

  [[3.31 3.43 2.59 3.47]
   [2.29 1.91 2.34 2.56]
   [3.16 3.23 2.1  3.18]]]] 

Values for all heads in a single matrix (1, 3, 12): 
 [[[[3.74 4.47 2.83 3.44]
   [3.1  2.87 1.45 2.35]
   [3.37 3.97 2.53 3.27]]

  [[3.39 3.48 3.23 3.46]
   [2.14 2.04 2.38 2.93]
   [3.11 3.83 3.16 3.22]]

  [[4.34 4.46 3.34 2.69]
   [2.83 2.73 2.01 2.47]
   [3.4  3.65 3.3  2.27]]]]


In [25]:
all_heads_output, all_attn_weights = scaled_dot_product_attention(q_s_transposed,
                                                                  k_s_transposed,
                                                                  v_s_transposed)
print("Self attention output:\n", all_heads_output)

Self attention output:
 tf.Tensor(
[[[[3.7324066 4.4596424 2.823731  3.4364076]
   [3.7186124 4.4398947 2.8112495 3.4288704]
   [3.7297041 4.45584   2.8213634 3.4350026]]

  [[3.3065588 3.4400456 3.1833153 3.4144924]
   [3.2400954 3.4136553 3.1471026 3.3772635]
   [3.2602274 3.418443  3.1574974 3.389118 ]]

  [[4.1941414 4.333278  3.3307385 2.625926 ]
   [4.131205  4.274996  3.316106  2.6021   ]
   [4.1585345 4.3017426 3.3267047 2.6109216]]]], shape=(1, 3, 3, 4), dtype=float32)


In [26]:
print("Per head outputs from using separate sets of weights per head:")
print(out0, "\n")
print(out1, "\n")
print(out2)

Per head outputs from using separate sets of weights per head:
tf.Tensor(
[[[3.7324066 4.459643  2.823731  3.4364076]
  [3.7186127 4.439895  2.8112497 3.428871 ]
  [3.7297044 4.4558406 2.8213637 3.4350028]]], shape=(1, 3, 4), dtype=float32) 

tf.Tensor(
[[[3.3065586 3.4400454 3.1833153 3.4144921]
  [3.2400954 3.4136553 3.1471026 3.3772635]
  [3.2602277 3.4184432 3.157498  3.3891182]]], shape=(1, 3, 4), dtype=float32) 

tf.Tensor(
[[[4.1941414 4.333278  3.3307385 2.625926 ]
  [4.1312056 4.2749963 3.316106  2.6021001]
  [4.1585345 4.3017426 3.3267047 2.6109216]]], shape=(1, 3, 4), dtype=float32)


To get the final concatenated result, we need to reverse our **reshape** and **transpose** operation, starting with the **transpose** this time.

In [27]:
combined_out_b = tf.reshape(tf.transpose(all_heads_output, perm=[0, 2, 1, 3]),
                            shape=(batch_size, seq_len, embed_dim))
print("Final output from using single query, key, value matrices:\n",
      combined_out_b, "\n")
print("Final output from using separate query, key, value matrices per head:\n",
      combined_out_a)

Final output from using single query, key, value matrices:
 tf.Tensor(
[[[3.7324066 4.4596424 2.823731  3.4364076 3.3065588 3.4400456 3.1833153
   3.4144924 4.1941414 4.333278  3.3307385 2.625926 ]
  [3.7186124 4.4398947 2.8112495 3.4288704 3.2400954 3.4136553 3.1471026
   3.3772635 4.131205  4.274996  3.316106  2.6021   ]
  [3.7297041 4.45584   2.8213634 3.4350026 3.2602274 3.418443  3.1574974
   3.389118  4.1585345 4.3017426 3.3267047 2.6109216]]], shape=(1, 3, 12), dtype=float32) 

Final output from using separate query, key, value matrices per head:
 [[[3.7324066 4.459643  2.823731  3.4364076 3.3065586 3.4400454 3.1833153
   3.4144921 4.1941414 4.333278  3.3307385 2.625926 ]
  [3.7186127 4.439895  2.8112497 3.428871  3.2400954 3.4136553 3.1471026
   3.3772635 4.1312056 4.2749963 3.316106  2.6021001]
  [3.7297044 4.4558406 2.8213637 3.4350028 3.2602277 3.4184432 3.157498
   3.3891182 4.1585345 4.3017426 3.3267047 2.6109216]]]


Encapsulating everything in a class

In [28]:
class MultiHeadSelfAttention(tf.keras.layers.Layer):
  def __init__(self, d_model, num_heads):
    super(MultiHeadSelfAttention, self).__init__()
    self.d_model = d_model
    self.num_heads = num_heads

    self.d_head = self.d_model // self.num_heads

    self.wq = tf.keras.layers.Dense(self.d_model)
    self.wk = tf.keras.layers.Dense(self.d_model)
    self.wv = tf.keras.layers.Dense(self.d_model)

    # Linear layer to generate final output
    self.dense = tf.keras.layers.Dense(self.d_model)

  def split_heads(self, x):
    batch_size = x.shape[0]

    split_inputs = tf.reshape(x, (batch_size, -1, self.num_heads, self.d_head))
    return tf.transpose(split_inputs, perm=[0, 2, 1, 3])

  def merge_heads(self, x):
    batch_size = x.shape[0]

    merged_inputs = tf.transpose(x, perm=[0, 2, 1, 3])
    return tf.reshape(merged_inputs, (batch_size, -1, self.d_model))

  def call(self, q, k, v, mask):
    qs = self.wq(q)
    ks = self.wk(k)
    vs = self.wv(v)

    qs = self.split_heads(qs)
    ks = self.split_heads(ks)
    vs = self.split_heads(vs)

    output, attn_weights = scaled_dot_product_attention(qs, ks, vs, mask)
    output = self.merge_heads(output)

    return self.dense(output), attn_weights

In [29]:
mhsa = MultiHeadSelfAttention(12, 3)

output, attn_weights = mhsa(x, x, x, None)
print(f"MHSA output{output.shape}:")
print(output)

MHSA output(1, 3, 12):
tf.Tensor(
[[[-0.5003724  -0.11505236  0.5572443   0.6003801   0.0584025
    0.6416788  -0.24842958  0.3024751   1.2644374  -0.44027513
   -0.09465392  0.25612828]
  [-0.5384439  -0.10926842  0.56433237  0.61952585  0.01193128
    0.67714345 -0.25823063  0.26884145  1.2858046  -0.44244006
   -0.12061815  0.27682045]
  [-0.44456115 -0.09979047  0.5613558   0.60058564  0.0818125
    0.64586514 -0.23325025  0.28775805  1.2517008  -0.45000398
   -0.05773485  0.27416545]]], shape=(1, 3, 12), dtype=float32)


###  Encoder block

We can now build our **Encoder Block**. In addition to the **Multi-Head Self Attention** layer, the **Encoder Block** also has **skip connections**, **layer normalization steps**, and a **two-layer feed-forward neural network**. The original **Attention Is All You Need** paper also included some **dropout** applied to the self-attention output which isn't shown in the illustration below

<div>
<img src="https://drive.google.com/uc?export=view&id=1D8sLDyQMqqhCjHWOn-I7rZKHugWxFyLy" width="500"/>
</div>

In [30]:
def feed_forward_network(d_model, hidden_dim):
  return tf.keras.Sequential([
      tf.keras.layers.Dense(hidden_dim, activation="relu"),
      tf.keras.layers.Dense(d_model)
  ])

In [31]:
class EncoderBlock(tf.keras.layers.Layer):
  def __init__(self, d_model, num_heads, hidden_dim, dropout_rate=0.1):
    super(EncoderBlock, self).__init__()

    self.mhsa = MultiHeadSelfAttention(d_model, num_heads)
    self.ffn = feed_forward_network(d_model, hidden_dim)

    self.dropout1 = tf.keras.layers.Dropout(dropout_rate)
    self.dropout2 = tf.keras.layers.Dropout(dropout_rate)

    self.layernorm1 = tf.keras.layers.LayerNormalization()
    self.layernorm2 = tf.keras.layers.LayerNormalization()

  def call(self, x, training, mask):
    mhsa_output, attn_weights = self.mhsa(x, x, x, mask)
    msha_output = self.dropout1(mhsa_output, training=training)
    msha_output = self.layernorm1(x + mhsa_output)

    ffn_output = self.ffn(mhsa_output)
    ffn_output = self.dropout2(ffn_output, training=training)
    output = self.layernorm2(mhsa_output + ffn_output)

    return output, attn_weights

Suppose we have an embedding dimension of 12, and we want 3 attention heads and a feed forward network with a hidden dimension of 48 (4x the embedding dimension). We would declare and use a single encoder block like so:

In [33]:
encoder_block = EncoderBlock(12, 3, 48)

block_output,  _ = encoder_block(x, True, None)
print(f"Output from single encoder block {block_output.shape}:")
print(block_output)

Output from single encoder block (1, 3, 12):
tf.Tensor(
[[[ 0.39541626  0.43086132 -1.9710463   0.41380224 -1.7375672
    1.7203096   0.88450885  0.18918006 -0.16157828 -0.74061376
    0.21856396  0.35816276]
  [ 0.32906473  1.195967   -1.8941271   0.31242618 -1.6850877
    1.493637    0.79473627  0.10767619 -0.19744128 -0.9286014
    0.19759953  0.2741505 ]
  [ 0.31259722  1.1597978  -1.9302864   0.31455603 -1.7048259
    1.5573195   0.7670412   0.10800369 -0.22547927 -0.7777343
    0.14544605  0.27356404]]], shape=(1, 3, 12), dtype=float32)


### Word and positional embeddings

Now we will be dealing with the input to the encoder block, the inputs will be positional word embeddings. <br>
We will start will a subword tokenizer called Byte-Pair Encoding

In [34]:
bpemb_en = BPEmb(lang="en")

downloading https://nlp.h-its.org/bpemb/en/en.wiki.bpe.vs10000.model


100%|██████████| 400869/400869 [00:00<00:00, 577309.46B/s]


downloading https://nlp.h-its.org/bpemb/en/en.wiki.bpe.vs10000.d100.w2v.bin.tar.gz


100%|██████████| 3784656/3784656 [00:01<00:00, 2113733.61B/s]


In [35]:
bpemb_vocab_size, bpemb_embed_size = bpemb_en.vectors.shape
print("Vocabulary size:", bpemb_vocab_size)
print("Embedding size:", bpemb_embed_size)

Vocabulary size: 10000
Embedding size: 100


In [36]:
bpemb_en.vectors[bpemb_en.words.index('car')]

array([-0.305548, -0.325598, -0.134716, -0.078735, -0.660545,  0.076211,
       -0.735487,  0.124533, -0.294402,  0.459688,  0.030137,  0.174041,
       -0.224223,  0.486189, -0.504649, -0.459699,  0.315747,  0.477885,
        0.091398,  0.427867,  0.016524, -0.076833, -0.899727,  0.493158,
       -0.022309, -0.422785, -0.154148,  0.204981,  0.379834,  0.070588,
        0.196073, -0.368222,  0.473406,  0.007409,  0.004303, -0.007823,
       -0.19103 , -0.202509,  0.109878, -0.224521, -0.35741 , -0.611633,
        0.329958, -0.212956, -0.497499, -0.393839, -0.130101, -0.216903,
       -0.105595, -0.076007, -0.483942, -0.139704, -0.161647,  0.136985,
        0.415363, -0.360143,  0.038601, -0.078804, -0.030421,  0.324129,
        0.223378, -0.523636, -0.048317, -0.032248, -0.117367,  0.470519,
        0.225816, -0.222065, -0.225007, -0.165904, -0.334389, -0.20157 ,
        0.572352, -0.268794,  0.301929, -0.005563,  0.387491,  0.261031,
       -0.11613 ,  0.074982, -0.008433,  0.259987, 

Although BPEmb provides embeddings we are not interested in it since we will be having our own embedding, we just need the tokenizer from BPEmb

BPEmb places underscores in front of any tokens which are whole words or intended to begin words.

In [37]:
sample_sentence = "Where can I find a pizzeria?"
tokens = bpemb_en.encode(sample_sentence)
print(tokens)

['▁where', '▁can', '▁i', '▁find', '▁a', '▁p', 'iz', 'zer', 'ia', '?']


In [38]:
token_seq = np.array(bpemb_en.encode_ids("Where can I find a pizzeria?"))
print(token_seq)

[ 571  280  386 1934    4   24  248 4339  177 9967]


Now that we have a way to tokenize and vectorize sentences, we can declare and use an embedding layer with the same vocabulary size as **BPEmb** and a desired embedding size.

In [39]:
token_embed = tf.keras.layers.Embedding(bpemb_vocab_size, embed_dim)
token_embeddings = token_embed(token_seq)

Next, we need to add *positional* information to each token embedding. As we covered in the slides, the original paper used sinusoidals but it's more common these days to just use another set of embeddings. We'll do the latter here.<br>

Here, we're declaring an embedding layer with rows equalling a maximum sequence length and columns equalling our token embedding size. We then generate a vector of position ids.

In [40]:
max_seq_len = 256
pos_embed = tf.keras.layers.Embedding(max_seq_len, embed_dim)

pos_idx = tf.range(len(token_seq))
print(pos_idx)

tf.Tensor([0 1 2 3 4 5 6 7 8 9], shape=(10,), dtype=int32)


In [41]:
position_embeddings = pos_embed(pos_idx)
print("Position embeddings for the input sequence\n", position_embeddings)

Position embeddings for the input sequence
 tf.Tensor(
[[ 1.82219483e-02 -1.69747695e-02  4.09597270e-02 -4.34763320e-02
   4.14819755e-02  4.56087925e-02 -2.87097339e-02 -1.60238370e-02
  -4.13637049e-02  3.21919434e-02 -1.00786611e-03  4.47556116e-02]
 [-2.40402818e-02 -1.27885714e-02  2.11089589e-02  4.15032171e-02
   1.66279562e-02 -3.20802554e-02  1.69699453e-02  4.52707745e-02
  -3.91988270e-02  3.49709131e-02 -4.06403057e-02  3.55405733e-03]
 [-2.60837562e-02  1.10583380e-03 -1.49048194e-02 -9.47207212e-03
   4.72656749e-02 -3.03499401e-05 -4.26038876e-02  4.37751301e-02
  -3.36046964e-02 -2.46626381e-02  2.73570679e-02 -2.06011776e-02]
 [-4.21421416e-02 -4.58909161e-02 -4.85099666e-02 -3.23428512e-02
  -2.08168030e-02 -3.84754315e-02  3.21880318e-02 -3.21805105e-02
  -1.01907961e-02 -4.71827276e-02 -2.22054962e-02  4.05672528e-02]
 [-4.70605604e-02  1.55666731e-02 -3.24656256e-02  8.93970579e-03
  -1.66302100e-02 -2.18254924e-02 -4.13849726e-02 -1.79235339e-02
   4.77838404e-02

In [42]:
input = token_embeddings + position_embeddings
print("Input to the initial encoder block:\n", input)

Input to the initial encoder block:
 tf.Tensor(
[[-0.0302702   0.01715881  0.05518873 -0.01663244 -0.00744355  0.01922347
  -0.07159269  0.02735343 -0.00614367  0.05000506  0.00195571  0.01436017]
 [ 0.01254373 -0.04264393  0.03776     0.00156671 -0.02343643  0.01124756
   0.01557662  0.05048685 -0.00756609  0.04741701  0.00406291 -0.04500313]
 [-0.00614567  0.00240604 -0.00406704  0.03552122 -0.0004468  -0.02634587
  -0.07185796  0.03045517 -0.02015697 -0.03462595 -0.00482651 -0.03510531]
 [-0.02954583 -0.08577025 -0.07290675  0.01363734  0.0220466  -0.07063078
   0.07918534  0.01435894 -0.04572603 -0.0679063  -0.02251847  0.00950702]
 [-0.09181182  0.0006953   0.00866111 -0.00483091 -0.02001427 -0.03301191
  -0.02965256 -0.02992774  0.02700661 -0.04919893  0.05357183  0.01765364]
 [ 0.00092864  0.02677102  0.04778886 -0.00727808  0.05796351 -0.05613257
   0.08918362 -0.03874144  0.01840472  0.06211065 -0.03454055 -0.00422086]
 [ 0.02542398  0.03531877 -0.02805072 -0.06887159 -0.03211

### Encoder

In [43]:
class Encoder(tf.keras.layers.Layer):
  def __init__(self, num_blocks, d_model, num_heads, hidden_dim,
               src_vocab_size, max_seq_len, dropout_rate=0.1):
    super(Encoder, self).__init__()

    self.d_model = d_model
    self.max_seq_len = max_seq_len

    self.token_embed = tf.keras.layers.Embedding(src_vocab_size, self.d_model)
    self.pos_embed = tf.keras.layers.Embedding(max_seq_len, self.d_model)

    self.dropout = tf.keras.layers.Dropout(dropout_rate)

    self.blocks = [EncoderBlock(self.d_model, num_heads, hidden_dim, dropout_rate)
    for _ in range (num_blocks)]

  def call(self, input, training, mask):
    token_embeds = self.token_embed(input)

    num_pos = input.shape[0] * self.max_seq_len
    pos_idx = np.resize(np.arange(self.max_seq_len), num_pos)
    pos_idx = np.reshape(pos_idx, input.shape)
    pos_embeds = self.pos_embed(pos_idx)

    x = self.dropout(token_embeds + pos_embeds, training=True)

    for block in self.blocks:
      x, weights = block(x, training, mask)

    return x, weights

Let's look at how positional encoding is working

In [45]:
# Batch of 3 sequences, each of length 10 (10 is also the
# maximum sequence length in this case).
seqs = np.random.randint(0, 10000, size=(3, 10))
print(seqs.shape)
print(seqs)

(3, 10)
[[3600 5099 1095  178 4667 7867 2778 8834 9393 9274]
 [1847 4139 3060 5670 7052 6481 8522 4947 2123 8864]
 [3606 2478 6616 4787 9641 5450 2893 2018 9268 9878]]


In [46]:
pos_ids = np.resize(np.arange(seqs.shape[1]), seqs.shape[0] * seqs.shape[1])
print(pos_ids)

[0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9]


In [47]:
pos_ids = np.reshape(pos_ids, (3, 10))
print(pos_ids.shape)
print(pos_ids)

(3, 10)
[[0 1 2 3 4 5 6 7 8 9]
 [0 1 2 3 4 5 6 7 8 9]
 [0 1 2 3 4 5 6 7 8 9]]


In [48]:
pos_embed(pos_ids)

<tf.Tensor: shape=(3, 10, 12), dtype=float32, numpy=
array([[[ 1.82219483e-02, -1.69747695e-02,  4.09597270e-02,
         -4.34763320e-02,  4.14819755e-02,  4.56087925e-02,
         -2.87097339e-02, -1.60238370e-02, -4.13637049e-02,
          3.21919434e-02, -1.00786611e-03,  4.47556116e-02],
        [-2.40402818e-02, -1.27885714e-02,  2.11089589e-02,
          4.15032171e-02,  1.66279562e-02, -3.20802554e-02,
          1.69699453e-02,  4.52707745e-02, -3.91988270e-02,
          3.49709131e-02, -4.06403057e-02,  3.55405733e-03],
        [-2.60837562e-02,  1.10583380e-03, -1.49048194e-02,
         -9.47207212e-03,  4.72656749e-02, -3.03499401e-05,
         -4.26038876e-02,  4.37751301e-02, -3.36046964e-02,
         -2.46626381e-02,  2.73570679e-02, -2.06011776e-02],
        [-4.21421416e-02, -4.58909161e-02, -4.85099666e-02,
         -3.23428512e-02, -2.08168030e-02, -3.84754315e-02,
          3.21880318e-02, -3.21805105e-02, -1.01907961e-02,
         -4.71827276e-02, -2.22054962e-02,  

Let's try our encoder on a batch of sentences.

In [49]:
input_batch = [
    "Where can I find a pizzeria?",
    "Mass hysteria over listeria.",
    "I ain't no circle back girl."
]

bpemb_en.encode(input_batch)

[['▁where', '▁can', '▁i', '▁find', '▁a', '▁p', 'iz', 'zer', 'ia', '?'],
 ['▁mass', '▁hy', 'ster', 'ia', '▁over', '▁l', 'ister', 'ia', '.'],
 ['▁i', '▁a', 'in', "'", 't', '▁no', '▁circle', '▁back', '▁girl', '.']]

In [50]:
input_seqs = bpemb_en.encode_ids(input_batch)
print("Vectorized inputs:")
input_seqs

Vectorized inputs:


[[571, 280, 386, 1934, 4, 24, 248, 4339, 177, 9967],
 [1535, 1354, 1238, 177, 380, 43, 871, 177, 9935],
 [386, 4, 6, 9937, 9915, 467, 5410, 810, 3692, 9935]]

the input sequences aren't the same length in this batch. In this case, we need to pad them out so that they are

In [51]:
padded_input_seqs = tf.keras.preprocessing.sequence.pad_sequences(input_seqs, padding="post")
print("Input to the encoder:")
print(padded_input_seqs.shape)
print(padded_input_seqs)

Input to the encoder:
(3, 10)
[[ 571  280  386 1934    4   24  248 4339  177 9967]
 [1535 1354 1238  177  380   43  871  177 9935    0]
 [ 386    4    6 9937 9915  467 5410  810 3692 9935]]


Since our input now has padding, now's a good time to cover **masking**.
<br>

In [52]:
enc_mask = tf.cast(tf.math.not_equal(padded_input_seqs, 0), tf.float32)
print("Input:")
print(padded_input_seqs, '\n')
print("Encoder mask:")
print(enc_mask)

Input:
[[ 571  280  386 1934    4   24  248 4339  177 9967]
 [1535 1354 1238  177  380   43  871  177 9935    0]
 [ 386    4    6 9937 9915  467 5410  810 3692 9935]] 

Encoder mask:
tf.Tensor(
[[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1. 1. 0.]
 [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]], shape=(3, 10), dtype=float32)


Keep in mind that the dimension of the attention matrix (for this example) is going to be:<br>
*(batch size, number of heads, query size, key size)*<br>
(3, 3, 10, 10)

So we need to expand the mask dimensions like so:

In [54]:
enc_mask = enc_mask[:, tf.newaxis, tf.newaxis, :]
enc_mask

<tf.Tensor: shape=(3, 1, 1, 1, 1, 10), dtype=float32, numpy=
array([[[[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]]]],




       [[[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 0.]]]]],




       [[[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]]]]], dtype=float32)>

In [56]:
# We will call the encoder block like so

num_encoder_blocks = 6

# d_model is the embedding dimension used throughout.
d_model = 12

num_heads = 3

# Feed-forward network hidden dimension width.
ffn_hidden_dim = 48

src_vocab_size = bpemb_vocab_size
max_input_seq_len = padded_input_seqs.shape[1]

encoder = Encoder(
    num_encoder_blocks,
    d_model,
    num_heads,
    ffn_hidden_dim,
    src_vocab_size,
    max_input_seq_len)

In [57]:
encoder_output, attn_weights = encoder(padded_input_seqs, training=True,
                                       mask=enc_mask)
print(f"Encoder output {encoder_output.shape}:")
print(encoder_output)

Encoder output (3, 10, 12):
tf.Tensor(
[[[ 1.61519781e-01  6.69817746e-01 -5.12308061e-01  9.01943386e-01
   -1.80824292e+00 -4.99264985e-01 -1.19605660e+00 -1.06476843e+00
    5.68557143e-01  1.88536930e+00  6.48204505e-01  2.45229602e-01]
  [ 2.68859446e-01  6.58463836e-01 -5.21348119e-01  8.90243828e-01
   -1.81402874e+00 -5.08156538e-01 -1.20310354e+00 -1.07240808e+00
    5.57800114e-01  1.87181807e+00  6.36754513e-01  2.35104814e-01]
  [ 2.08389789e-01  2.37865616e-02 -4.73079890e-01  9.56770718e-01
   -1.78249192e+00 -4.59719777e-01 -1.16366160e+00 -1.03127134e+00
    6.20023072e-01  1.98964894e+00  7.00004756e-01  4.11600649e-01]
  [ 1.57458305e-01  6.62578166e-01 -5.12770653e-01  8.93421054e-01
   -1.80035233e+00 -4.99668121e-01 -1.19188237e+00 -1.06150782e+00
    5.62080443e-01  1.90904951e+00  6.40803397e-01  2.40790635e-01]
  [ 1.14336133e-01  6.25231743e-01 -5.63532710e-01  8.58763576e-01
   -1.86602509e+00 -1.15231602e-02 -1.25046790e+00 -1.11877429e+00
    5.23798883e-01 

### Decoder block

Let's build the **Decoder Block**. Everything we did to create the **encoder** block applies here. The major differences are that the **Decoder Block** has:
1. a **Multi-Head Cross-Attention** layer which uses the encoder's outputs as the keys and values.

2. an extra skip/residual connection along with an extra layer normalization step.

<div>
<img src="https://drive.google.com/uc?export=view&id=1WVT4SX49bnta4uscOTF4xrsxFI4PbPER" width="500"/>
</div>

In [58]:
class DecoderBlock(tf.keras.layers.Layer):
  def __init__(self, d_model, num_heads, hidden_dim, dropout_rate=0.2):
    super(DecoderBlock, self).__init__()

    self.mhsa1 = MultiHeadSelfAttention(d_model, num_heads)
    self.mhsa2 = MultiHeadSelfAttention(d_model, num_heads)

    self.ffn = feed_forward_network(d_model, hidden_dim)

    self.dropout1 = tf.keras.layers.Dropout(dropout_rate)
    self.dropout2 = tf.keras.layers.Dropout(dropout_rate)
    self.dropout3 = tf.keras.layers.Dropout(dropout_rate)

    self.layernorm1 = tf.keras.layers.LayerNormalization()
    self.layernorm2 = tf.keras.layers.LayerNormalization()
    self.layernorm3 = tf.keras.layers.LayerNormalization()

  def call(self, encoder_output, target, training, decoder_mask, memory_mask):
    mhsa_output1, attn_weights = self.mhsa1(target, target, target, decoder_mask)
    mhsa_output1 = self.dropout1(mhsa_output1, training=training)
    mhsa_output1 = self.layernorm1(mhsa_output1 + target)

    mhsa_output2, attn_weights = self.mhsa2(mhsa_output1, encoder_output,
                                            encoder_output,
                                            memory_mask)
    mhsa_output2 = self.dropout2(mhsa_output2, training=training)
    mhsa_output2 = self.layernorm2(mhsa_output2 + mhsa_output1)

    ffn_output = self.ffn(mhsa_output2)
    ffn_output = self.dropout3(ffn_output, training=training)
    output = self.layernorm3(ffn_output + mhsa_output2)

    return output, attn_weights

### Decoder

In [60]:
class Decoder(tf.keras.layers.Layer):
  def __init__(self, num_blocks, d_model, num_heads, hidden_dim, target_vocab_size,
               max_seq_len, dropout_rate=0.1):
    super(Decoder, self).__init__()

    self.d_model = d_model
    self.max_seq_len = max_seq_len

    self.token_embed = tf.keras.layers.Embedding(target_vocab_size, self.d_model)
    self.pos_embed = tf.keras.layers.Embedding(max_seq_len, self.d_model)

    self.dropout = tf.keras.layers.Dropout(dropout_rate)

    self.blocks = [DecoderBlock(self.d_model, num_heads, hidden_dim, dropout_rate) for _ in range(num_blocks)]

  def call(self, encoder_outputs, target, training, decoder_mask, memory_mask):
    token_embeds = self.token_embed(target)

    # Generate position indices.
    num_pos = target.shape[0] * self.max_seq_len
    pos_idx = np.resize(np.arange(self.max_seq_len), num_pos)
    pos_idx = np.reshape(pos_idx, target.shape)

    pos_embeds = self.pos_embed(pos_idx)

    x = self.dropout(token_embeds + pos_embeds, training=training)

    for block in self.blocks:
      x, weights = block(encoder_output, x, training, decoder_mask, memory_mask)

    return x, weights

The decoder takes two masks:

The *decoder mask* which is a <u>combination of two masks</u>: one to account for the padding in target sequences, and the look-ahead mask. This mask is used in the decoder's **first** multi-head self-attention layer.

The *memory mask* which is used in the decoder's **second** multi-head self-attention. The keys and values for this layer are going to be the encoder's output, and this mask will ensure the decoder doesn't attend to any encoder output which corresponds to padding.

In [61]:
target_input_seqs = [
    [1, 652, 723, 123, 62],
    [1, 25,  98, 129, 248, 215, 359, 249],
    [1, 2369, 1259, 125, 486],
]

In [62]:
padded_target_input_seqs = tf.keras.preprocessing.sequence.pad_sequences(target_input_seqs, padding="post")
print("Padded target inputs to the decoder:")
print(padded_target_input_seqs.shape)
print(padded_target_input_seqs)

Padded target inputs to the decoder:
(3, 8)
[[   1  652  723  123   62    0    0    0]
 [   1   25   98  129  248  215  359  249]
 [   1 2369 1259  125  486    0    0    0]]


In [63]:
dec_padding_mask = tf.cast(tf.math.not_equal(padded_target_input_seqs, 0), tf.float32)
dec_padding_mask = dec_padding_mask[:, tf.newaxis, tf.newaxis, :]
print(dec_padding_mask)

tf.Tensor(
[[[[1. 1. 1. 1. 1. 0. 0. 0.]]]


 [[[1. 1. 1. 1. 1. 1. 1. 1.]]]


 [[[1. 1. 1. 1. 1. 0. 0. 0.]]]], shape=(3, 1, 1, 8), dtype=float32)


the look-ahead mask is a diagonal where the lower half are 1s and the upper half are zeros. This is easy to create using the *band_part* method

In [64]:
target_input_seq_len = padded_target_input_seqs.shape[1]
look_ahead_mask = tf.linalg.band_part(tf.ones((target_input_seq_len,
                                               target_input_seq_len)), -1, 0)
print(look_ahead_mask)

tf.Tensor(
[[1. 0. 0. 0. 0. 0. 0. 0.]
 [1. 1. 0. 0. 0. 0. 0. 0.]
 [1. 1. 1. 0. 0. 0. 0. 0.]
 [1. 1. 1. 1. 0. 0. 0. 0.]
 [1. 1. 1. 1. 1. 0. 0. 0.]
 [1. 1. 1. 1. 1. 1. 0. 0.]
 [1. 1. 1. 1. 1. 1. 1. 0.]
 [1. 1. 1. 1. 1. 1. 1. 1.]], shape=(8, 8), dtype=float32)


To create the decoder mask, we just need to combine the padding and look-ahead masks. Note how the columns of the resulting decoder mask are all zero for padding positions.

In [65]:
dec_mask = tf.minimum(dec_padding_mask, look_ahead_mask)
print("The decoder mask:")
print(dec_mask)

The decoder mask:
tf.Tensor(
[[[[1. 0. 0. 0. 0. 0. 0. 0.]
   [1. 1. 0. 0. 0. 0. 0. 0.]
   [1. 1. 1. 0. 0. 0. 0. 0.]
   [1. 1. 1. 1. 0. 0. 0. 0.]
   [1. 1. 1. 1. 1. 0. 0. 0.]
   [1. 1. 1. 1. 1. 0. 0. 0.]
   [1. 1. 1. 1. 1. 0. 0. 0.]
   [1. 1. 1. 1. 1. 0. 0. 0.]]]


 [[[1. 0. 0. 0. 0. 0. 0. 0.]
   [1. 1. 0. 0. 0. 0. 0. 0.]
   [1. 1. 1. 0. 0. 0. 0. 0.]
   [1. 1. 1. 1. 0. 0. 0. 0.]
   [1. 1. 1. 1. 1. 0. 0. 0.]
   [1. 1. 1. 1. 1. 1. 0. 0.]
   [1. 1. 1. 1. 1. 1. 1. 0.]
   [1. 1. 1. 1. 1. 1. 1. 1.]]]


 [[[1. 0. 0. 0. 0. 0. 0. 0.]
   [1. 1. 0. 0. 0. 0. 0. 0.]
   [1. 1. 1. 0. 0. 0. 0. 0.]
   [1. 1. 1. 1. 0. 0. 0. 0.]
   [1. 1. 1. 1. 1. 0. 0. 0.]
   [1. 1. 1. 1. 1. 0. 0. 0.]
   [1. 1. 1. 1. 1. 0. 0. 0.]
   [1. 1. 1. 1. 1. 0. 0. 0.]]]], shape=(3, 1, 8, 8), dtype=float32)


We can now declare a decoder and pass it everything it needs. In our case, the memory mask is the same as the encoder mask.

In [66]:
decoder = Decoder(6, 12, 3, 48, 10000, 8)
decoder_output, _ = decoder(encoder_output, padded_target_input_seqs,
                            True, dec_mask, enc_mask)
print(f"Decoder output {decoder_output.shape}:")
print(decoder_output)

Decoder output (3, 8, 12):
tf.Tensor(
[[[-8.8539821e-01  6.0751778e-01 -1.3738725e-01 -8.6763248e-02
    1.6306596e+00 -2.6366785e-01  1.4358404e+00 -1.4373859e+00
   -7.8010112e-01  1.3945577e+00 -1.1337178e+00 -3.4415382e-01]
  [-7.8027517e-01  6.3656628e-01 -6.2655085e-01  3.7751541e-01
    1.2428114e+00 -6.2185472e-01  1.3170148e+00 -1.7722126e+00
    2.6306489e-01  1.4296468e+00 -1.2079386e+00 -2.5778776e-01]
  [-7.5314182e-01  7.9830962e-01 -6.1600941e-01  2.9582545e-01
    1.5980927e+00  1.7473064e-02  1.5998267e+00 -1.6669861e+00
   -2.9680535e-01  8.3317631e-01 -9.7582680e-01 -8.3393466e-01]
  [-7.3061341e-01  8.1737435e-01 -7.7596700e-01  7.3493160e-03
    1.7230958e+00  8.4926619e-04  5.7488269e-01 -1.7727554e+00
   -5.1440674e-01  1.6677879e+00 -8.1850636e-01 -1.7909034e-01]
  [-5.2621698e-01  6.3444841e-01 -8.7892771e-01  1.6074647e-01
    1.2533325e+00  4.0912312e-01  1.1167920e+00 -1.9645927e+00
   -6.2419146e-01  1.5302031e+00 -9.5763403e-01 -1.5308262e-01]
  [-7.592214

### Transformer

In [67]:
class Transformer(tf.keras.Model):
  def __init__(self, num_blocks, d_model, num_heads, hidden_dim, source_vocab_size,
               target_vocab_size, max_input_len, max_target_len, dropout_rate=0.1):
    super(Transformer, self).__init__()

    self.encoder = Encoder(num_blocks, d_model, num_heads, hidden_dim, source_vocab_size,
                           max_input_len, dropout_rate)

    self.decoder = Decoder(num_blocks, d_model, num_heads, hidden_dim, target_vocab_size,
                           max_target_len, dropout_rate)

    self.output_layer = tf.keras.layers.Dense(target_vocab_size)


  def call(self, input_seqs, target_input_seqs, training, encoder_mask,
           decoder_mask, memory_mask):
    encoder_output, encoder_attn_weights = self.encoder(input_seqs,
                                                        training, encoder_mask)

    decoder_output, decoder_attn_weights = self.decoder(encoder_output,
                                                        target_input_seqs, training,
                                                        decoder_mask, memory_mask)

    return self.output_layer(decoder_output), encoder_attn_weights, decoder_attn_weights

In [69]:
transformer = Transformer(
    num_blocks = 6,
    d_model = 12,
    num_heads = 3,
    hidden_dim = 48,
    source_vocab_size = bpemb_vocab_size,
    target_vocab_size = 7000, # made-up target vocab size.
    max_input_len = padded_input_seqs.shape[1],
    max_target_len = padded_target_input_seqs.shape[1])

transformer_output, _, _ = transformer(padded_input_seqs,
                                       padded_target_input_seqs, True,
                                       enc_mask, dec_mask, memory_mask=enc_mask)
print(f"Transformer output {transformer_output.shape}:")
print(transformer_output) # If training, we would use this output to calculate losses.

Transformer output (3, 8, 7000):
tf.Tensor(
[[[ 0.01284234  0.07227436  0.09752458 ...  0.02201883 -0.06278675
    0.03497577]
  [ 0.01606729  0.03789817  0.044289   ... -0.07608898 -0.03115907
   -0.00040297]
  [ 0.01155925  0.10143702  0.06334066 ... -0.01543371 -0.0725012
    0.01715794]
  ...
  [ 0.02296371  0.0597923   0.02989318 ... -0.06274884 -0.04978283
    0.01587684]
  [ 0.02745717  0.10437047  0.05982758 ...  0.00036109 -0.0538918
    0.02103401]
  [ 0.01715882  0.08757053 -0.0079866  ... -0.09258862 -0.02295894
    0.0021074 ]]

 [[-0.02423562  0.06052178  0.00236865 ... -0.03600153 -0.08614506
   -0.02912395]
  [ 0.01155205  0.05440085 -0.03475895 ... -0.00840835 -0.07614617
    0.00731555]
  [ 0.00424316  0.10854127 -0.08500981 ... -0.07976189 -0.06361163
   -0.02422469]
  ...
  [ 0.02735201  0.14400566 -0.12040977 ... -0.00666932 -0.00438626
    0.00946678]
  [-0.00979459  0.10396613 -0.05031697 ... -0.05823579 -0.09110771
   -0.03022914]
  [ 0.04948877  0.11255334 -0.1