In [2]:
import tensorflow as tf
from tensorflow.keras.layers.experimental import preprocessing
import io
import unicodedata
import re
import os
from sklearn.model_selection import train_test_split
import numpy as np
import time
import matplotlib.pyplot as plt

## 1. Data preprocessing

In [3]:
target_file = 'data/divina_syll_textonly_only_tercets.txt'

In [4]:
target_text_raw = open(target_file, 'rb').read().decode(encoding='utf-8')
print('Length of target text: {} characters'.format(len(target_text_raw)))

Length of target text: 886672 characters


In [5]:
target_vocab = sorted(set(target_text_raw))
target_vocab_size = len(target_vocab)

In [6]:
print('Target vocab size: {}'.format(target_vocab_size))

Target vocab size: 81


The *preprocess* function adds the start and end symbols to each line and eliminates the empty ones.

In [7]:
def preprocess(text):
    """
    For each line in the file, add start symbol "^" in the beginning and end symbol "$" in the end
    """

    text_prepr = []

    for line in text.split('\n'):
      if line.strip() != '':
        line_prepr = line.strip()
        text_prepr.append(line_prepr)
    
    text_prepr = np.array(text_prepr)
    text_prepr = text_prepr.reshape(-1,3)
    
    for tercet in range(0, text_prepr.shape[0]):
      text_prepr[tercet][0] = '^' + text_prepr[tercet][0] + '$'
      text_prepr[tercet][1] = text_prepr[tercet][1] + '$'
      text_prepr[tercet][2] = text_prepr[tercet][2] + '£'

    return text_prepr

In [8]:
target_text_prepr = preprocess(target_text_raw)

In [9]:
target_text_prepr = target_text_prepr.reshape(14133)

In [10]:
target_text_prepr = target_text_prepr.tolist()

In [11]:
target_text_prepr = list(map(lambda x: re.sub(' ', '§', x), target_text_prepr))

The tokenizer encodes each line into a tensor of char-indexes and for simplicity fits only on the target's vocabulary.

In [12]:
tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='', char_level=True, lower=False)
tokenizer.fit_on_texts(target_text_prepr)

target_text_lines_enc = tokenizer.texts_to_sequences(target_text_prepr)

In [73]:
target_text = np.array([np.array(x) for x in target_text_lines_enc])

  """Entry point for launching an IPython kernel.


Padding is required in order to have a non-ragged tensor to feed to the neural network.

## 2. The Transformer model


In [14]:
target_text_ = []

for line_number in range(0, len(target_text) - 3):
    
    target_verses = []
    
    for i in range(3):
        target_verses += list(target_text[line_number + i])

    target_text_.append(target_verses)
    
target_text_ = np.array(target_text_)

  if sys.path[0] == '':


In [15]:
target_text_.shape

(14130,)

In [16]:
def pad(x):
    return tf.keras.preprocessing.sequence.pad_sequences(x, padding='post') 

In [17]:
target_text_ = pad(target_text_)

In [18]:
input_train, input_test, target_train, target_test = train_test_split(
    target_text_, target_text_
    )

The dataset is created by grouping the lines in batches and by shuffling them.

Each input's line is in correspondence with its target.

In [19]:
terces_per_batch = 4
terces_len = target_text_.shape[1]
batch_len = terces_per_batch * (terces_len + 1) #perché +1?

In [20]:
BUFFER_SIZE = len(input_train)
BATCH_SIZE = terces_per_batch
steps_per_epoch = len(input_train)//BATCH_SIZE

vocab_size = len(tokenizer.word_index)+1 # the +1 is added to take into account the id 0 of the padding

dataset = tf.data.Dataset.from_tensor_slices((input_train, target_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

In [21]:
dataset

<BatchDataset shapes: ((4, 171), (4, 171)), types: (tf.int32, tf.int32)>

We define the positional encoding to add to the embedding.

This allows to take into account the order of the characters in the input sequence.

In [22]:
def get_angles(pos, i, d_model):
  angle_rates = 1 / np.power(10000, (2 * (i//2)) / np.float32(d_model))
  return pos * angle_rates

In [23]:
def positional_encoding(position, d_model):
  angle_rads = get_angles(np.arange(position)[:, np.newaxis],
                          np.arange(d_model)[np.newaxis, :],
                          d_model)

  # apply sin to even indices in the array; 2i
  angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])

  # apply cos to odd indices in the array; 2i+1
  angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])

  pos_encoding = angle_rads[np.newaxis, ...]

  return tf.cast(pos_encoding, dtype=tf.float32)

We define two masks: 

one is used to mask the padding added to the sequences in the preprocessing step; 

the other one is used to mask the positions following the current one and not predicted yet;

The first mask is used from both the encoder and the decoder, while the last mask is used only in the self-attention of the decoder.

In [24]:
def create_padding_mask(seq):
  seq = tf.cast(tf.math.equal(seq, 0), tf.float32)

  # add extra dimensions to add the padding to the attention logits.
  return seq[:, tf.newaxis, tf.newaxis, :]  # (batch_size, 1, 1, seq_len)

In [25]:
def create_look_ahead_mask(size):
  mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
  return mask  # (seq_len, seq_len)

In [26]:
def create_masks(inp, tar):
  # Encoder padding mask
  enc_padding_mask = create_padding_mask(inp)

  # Used in the 2nd attention block in the decoder.
  # This padding mask is used to mask the encoder outputs.
  dec_padding_mask = create_padding_mask(inp)

  # Used in the 1st attention block in the decoder.
  # It is used to pad and mask future tokens in the input received by
  # the decoder.
  look_ahead_mask = create_look_ahead_mask(tf.shape(tar)[1])
  dec_target_padding_mask = create_padding_mask(tar)
  combined_mask = tf.maximum(dec_target_padding_mask, look_ahead_mask)

  return enc_padding_mask, combined_mask, dec_padding_mask

The *scaled_dot_product_attention* gets the attention weights by applying the softmax to the rescaled dot product between the query matrix and the key matrix, while the output is obtained by multiplying the value matrix for those attention weights.

The query, key and value matrices are built by multiplying the embedding matrix with the query, key and value weight matrices, which initially are randomly initialized.

In [27]:
def scaled_dot_product_attention(q, k, v, mask):
  """
  Calculate the attention weights.
  q, k, v must have matching leading dimensions.
  k, v must have matching penultimate dimension, i.e.: seq_len_k = seq_len_v.
  The mask has different shapes depending on its type(padding or look ahead)
  but it must be broadcastable for addition.

  Args:
    q: query shape == (..., seq_len_q, depth)
    k: key shape == (..., seq_len_k, depth)
    v: value shape == (..., seq_len_v, depth_v)
    mask: Float tensor with shape broadcastable
          to (..., seq_len_q, seq_len_k). Defaults to None.

  Returns:
    output, attention_weights
  """

  matmul_qk = tf.matmul(q, k, transpose_b=True)  # (..., seq_len_q, seq_len_k)

  # scale matmul_qk
  dk = tf.cast(tf.shape(k)[-1], tf.float32)
  scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)

  # add the mask to the scaled tensor.
  if mask is not None:
    scaled_attention_logits += (mask * -1e9)

  # softmax is normalized on the last axis (seq_len_k) so that the scores add up to 1.
  attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)  # (..., seq_len_q, seq_len_k)

  output = tf.matmul(attention_weights, v)  # (..., seq_len_q, depth_v)

  return output, attention_weights

The multi-headed attention allows to improve the performance of the attention mechanism by working with multiple sets of query, key and value weight matrices.

These heads work in parallel and process at the same time all the lines of each batch.

At the end, the results of all the attention heads are concatenated and multiplied by an additional weight matrix, to adjust the dimension before passing through the final *point_wise_feed_forward_network*.

In [28]:
class MultiHeadAttention(tf.keras.layers.Layer):
  def __init__(self, d_model, num_heads):
    super(MultiHeadAttention, self).__init__()
    self.num_heads = num_heads
    self.d_model = d_model

    assert d_model % self.num_heads == 0

    self.depth = d_model // self.num_heads

    self.wq = tf.keras.layers.Dense(d_model)
    self.wk = tf.keras.layers.Dense(d_model)
    self.wv = tf.keras.layers.Dense(d_model)

    self.dense = tf.keras.layers.Dense(d_model)

  def split_heads(self, x, batch_size):
    """
    Split the last dimension into (num_heads, depth).
    Transpose the result such that the shape is (batch_size, num_heads, seq_len, depth)
    """
    x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
    return tf.transpose(x, perm=[0, 2, 1, 3])

  def call(self, v, k, q, mask):
    batch_size = tf.shape(q)[0]

    q = self.wq(q)  # (batch_size, seq_len, d_model)
    k = self.wk(k)  # (batch_size, seq_len, d_model)
    v = self.wv(v)  # (batch_size, seq_len, d_model)

    q = self.split_heads(q, batch_size)  # (batch_size, num_heads, seq_len_q, depth)
    k = self.split_heads(k, batch_size)  # (batch_size, num_heads, seq_len_k, depth)
    v = self.split_heads(v, batch_size)  # (batch_size, num_heads, seq_len_v, depth)

    # scaled_attention.shape == (batch_size, num_heads, seq_len_q, depth)
    # attention_weights.shape == (batch_size, num_heads, seq_len_q, seq_len_k)
    scaled_attention, attention_weights = scaled_dot_product_attention(q, k, v, mask)

    scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])  # (batch_size, seq_len_q, num_heads, depth)

    concat_attention = tf.reshape(scaled_attention,(batch_size, -1, self.d_model))  # (batch_size, seq_len_q, d_model)

    output = self.dense(concat_attention)  # (batch_size, seq_len_q, d_model)

    return output, attention_weights

In [29]:
def point_wise_feed_forward_network(d_model, dff):
  return tf.keras.Sequential([
      tf.keras.layers.Dense(dff, activation='relu'),  # (batch_size, seq_len, dff)
      tf.keras.layers.Dense(d_model)  # (batch_size, seq_len, d_model)
      ])

Each encoder is constituted by a multi-headed self-attention layer and by a final feed forward layer. 

Both sub-layers have a residual connection around them and are followed by a layer-normalization step.

In [30]:
class EncoderLayer(tf.keras.layers.Layer):
  def __init__(self, d_model, num_heads, dff, rate=0.1):
    super(EncoderLayer, self).__init__()

    self.mha = MultiHeadAttention(d_model, num_heads)
    self.ffn = point_wise_feed_forward_network(d_model, dff)

    self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

    self.dropout1 = tf.keras.layers.Dropout(rate)
    self.dropout2 = tf.keras.layers.Dropout(rate)

  def call(self, x, training, mask):

    attn_output, _ = self.mha(x, x, x, mask)  # (batch_size, input_seq_len, d_model)
    attn_output = self.dropout1(attn_output, training=training)
    out1 = self.layernorm1(x + attn_output)  # (batch_size, input_seq_len, d_model)

    ffn_output = self.ffn(out1)  # (batch_size, input_seq_len, d_model)
    ffn_output = self.dropout2(ffn_output, training=training)
    out2 = self.layernorm2(out1 + ffn_output)  # (batch_size, input_seq_len, d_model)

    return out2

The decoder equals the encoder, a part from the fact that it contains a slightly different self-attention layer and an additional attention layer.

Indeed, the decoder is characterized by a self-attention layer which focuses only on earlier positions in its input sequence, not looking at the positions which have not been predicted yet.

What's more the decoder is also characterized by an attention layer which obtains its key and value matrices from the output of the encoder, while the query matrix is obtained from the output of the previous self-attention in the decoder.

The encoder-decoder attention helps the decoder to focus on appropriate positions in the input sequence of the encoder during the translation.

In [31]:
class DecoderLayer(tf.keras.layers.Layer):
  def __init__(self, d_model, num_heads, dff, rate=0.1):
    super(DecoderLayer, self).__init__()

    self.mha1 = MultiHeadAttention(d_model, num_heads)
    self.mha2 = MultiHeadAttention(d_model, num_heads)

    self.ffn = point_wise_feed_forward_network(d_model, dff)

    self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

    self.dropout1 = tf.keras.layers.Dropout(rate)
    self.dropout2 = tf.keras.layers.Dropout(rate)
    self.dropout3 = tf.keras.layers.Dropout(rate)

  def call(self, x, enc_output, training,
           look_ahead_mask, padding_mask):

    attn1, attn_weights_block1 = self.mha1(x, x, x, look_ahead_mask)  # (batch_size, target_seq_len, d_model)
    attn1 = self.dropout1(attn1, training=training)
    out1 = self.layernorm1(attn1 + x)

    # enc_output.shape == (batch_size, input_seq_len, d_model)
    attn2, attn_weights_block2 = self.mha2(enc_output, enc_output, out1, padding_mask)  # (batch_size, target_seq_len, d_model)
    attn2 = self.dropout2(attn2, training=training)
    out2 = self.layernorm2(attn2 + out1)  # (batch_size, target_seq_len, d_model)

    ffn_output = self.ffn(out2)  # (batch_size, target_seq_len, d_model)
    ffn_output = self.dropout3(ffn_output, training=training)
    out3 = self.layernorm3(ffn_output + out2)  # (batch_size, target_seq_len, d_model)

    return out3, attn_weights_block1, attn_weights_block2

The encoding component is a stack of encoders and the decoding component is a stack of decoders of the same number.

At the beginning, in the encoding, each input character is turned into a vector using an embedding algorithm and adding the positional encoding to it.

This happens only in the bottom-most encoder, while the following encoders take the output of the encoder which is directly below.

The same for the decoding.


In [32]:
class Encoder(tf.keras.layers.Layer):
  def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size,
               maximum_position_encoding, rate=0.1):
    super(Encoder, self).__init__()

    self.d_model = d_model
    self.num_layers = num_layers

    self.embedding = tf.keras.layers.Embedding(input_vocab_size, d_model)
    self.pos_encoding = positional_encoding(maximum_position_encoding,
                                            self.d_model)

    self.enc_layers = [EncoderLayer(d_model, num_heads, dff, rate)
                       for _ in range(num_layers)]

    self.dropout = tf.keras.layers.Dropout(rate)

  def call(self, x, training, mask):

    seq_len = tf.shape(x)[1]

    # adding embedding and position encoding.
    x = self.embedding(x)  # (batch_size, input_seq_len, d_model)
    x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    x += self.pos_encoding[:, :seq_len, :]

    x = self.dropout(x, training=training)

    for i in range(self.num_layers):
      x = self.enc_layers[i](x, training, mask)

    return x  # (batch_size, input_seq_len, d_model)

In [33]:
class Decoder(tf.keras.layers.Layer):
  def __init__(self, num_layers, d_model, num_heads, dff, target_vocab_size,
               maximum_position_encoding, rate=0.1):
    super(Decoder, self).__init__()

    self.d_model = d_model
    self.num_layers = num_layers

    self.embedding = tf.keras.layers.Embedding(target_vocab_size, d_model)
    self.pos_encoding = positional_encoding(maximum_position_encoding, d_model)

    self.dec_layers = [DecoderLayer(d_model, num_heads, dff, rate)
                       for _ in range(num_layers)]
    self.dropout = tf.keras.layers.Dropout(rate)

  def call(self, x, enc_output, training,
           look_ahead_mask, padding_mask):

    seq_len = tf.shape(x)[1]
    attention_weights = {}

    x = self.embedding(x)  # (batch_size, target_seq_len, d_model)
    x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    x += self.pos_encoding[:, :seq_len, :]

    x = self.dropout(x, training=training)

    for i in range(self.num_layers):
      x, block1, block2 = self.dec_layers[i](x, enc_output, training,
                                             look_ahead_mask, padding_mask)

      attention_weights[f'decoder_layer{i+1}_block1'] = block1
      attention_weights[f'decoder_layer{i+1}_block2'] = block2

    # x.shape == (batch_size, target_seq_len, d_model)
    return x, attention_weights

In the transformer, the output of the encoding is passed to the stack of decoders and the output of the decoding is projected by a feed forward network into a vector of logits of dimension equal to the one of the target's vocabulary.

Obviously this is done for each character of each line of each batch.

In [34]:
class Transformer(tf.keras.Model):
  def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size,
               target_vocab_size, pe_input, pe_target, rate=0.1):
    super(Transformer, self).__init__()

    self.encoder = Encoder(num_layers, d_model, num_heads, dff,
                             input_vocab_size, pe_input, rate)

    self.decoder = Decoder(num_layers, d_model, num_heads, dff,
                           target_vocab_size, pe_target, rate)

    self.final_layer = tf.keras.layers.Dense(target_vocab_size)

  def call(self, inp, tar, training, enc_padding_mask,
           look_ahead_mask, dec_padding_mask):

    enc_output = self.encoder(inp, training, enc_padding_mask)  # (batch_size, inp_seq_len, d_model)

    # dec_output.shape == (batch_size, tar_seq_len, d_model)
    dec_output, attention_weights = self.decoder(tar, enc_output, training, look_ahead_mask, dec_padding_mask)

    final_output = self.final_layer(dec_output)  # (batch_size, tar_seq_len, target_vocab_size)

    return final_output, attention_weights

## 3. Training

In [35]:
num_layers = 4
d_model = 256
dff = 1024
num_heads = 8
dropout_rate = 0.1

In [36]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
  def __init__(self, d_model, warmup_steps=4000):
    super(CustomSchedule, self).__init__()

    self.d_model = d_model
    self.d_model = tf.cast(self.d_model, tf.float32)

    self.warmup_steps = warmup_steps

  def __call__(self, step):
    arg1 = tf.math.rsqrt(step)
    arg2 = step * (self.warmup_steps ** -1.5)

    return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

In [37]:
learning_rate = CustomSchedule(d_model)

The loss is calculated using Sparse Categorical Crossentropy and the loss of the padding is masked.

The same is done for the accuracy.

In [38]:
optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_function(real, pred):
  mask = tf.math.logical_not(tf.math.equal(real, 0))
  loss_ = loss_object(real, pred)

  mask = tf.cast(mask, dtype=loss_.dtype)
  loss_ *= mask

  return tf.reduce_sum(loss_)/tf.reduce_sum(mask)

In [39]:
def accuracy_function(real, pred):
  accuracies = tf.equal(real, tf.cast(tf.argmax(pred, axis=2), dtype=tf.int32))

  mask = tf.math.logical_not(tf.math.equal(real, 0))
  accuracies = tf.math.logical_and(mask, accuracies)

  accuracies = tf.cast(accuracies, dtype=tf.float32)
  mask = tf.cast(mask, dtype=tf.float32)
  return tf.reduce_sum(accuracies)/tf.reduce_sum(mask)

In [40]:
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.Mean(name='train_accuracy')

In [41]:
transformer = Transformer(
    num_layers=num_layers,
    d_model=d_model,
    num_heads=num_heads,
    dff=dff,
    input_vocab_size=vocab_size,
    target_vocab_size=vocab_size,
    pe_input=batch_len,
    pe_target=batch_len,
    rate=dropout_rate)

In [42]:
checkpoint_path = "./checkpoints/test"

ckpt = tf.train.Checkpoint(transformer=transformer,
                           optimizer=optimizer)

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

# if a checkpoint exists, restore the latest checkpoint.
if ckpt_manager.latest_checkpoint:
  ckpt.restore(ckpt_manager.latest_checkpoint)
  print('Latest checkpoint restored!!')

To train the decoder we use teacher forcing, calculating the loss between the predicted logits and the real id of the character.

In [43]:
@tf.function
def train_step(inp, tar):
  tar_inp = tar[:, :-1]
  tar_real = tar[:, 1:]

  enc_padding_mask, combined_mask, dec_padding_mask = create_masks(inp, tar_inp)

  with tf.GradientTape() as tape:
    predictions, _ = transformer(inp, tar_inp,
                                 True,
                                 enc_padding_mask,
                                 combined_mask,
                                 dec_padding_mask)
    loss = loss_function(tar_real, predictions)

  gradients = tape.gradient(loss, transformer.trainable_variables)
  optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))

  train_loss(loss)
  train_accuracy(accuracy_function(tar_real, predictions))

In [44]:
EPOCHS=20
for epoch in range(EPOCHS):
  start = time.time()

  train_loss.reset_states()
  train_accuracy.reset_states()

  for (batch, (inp, tar)) in enumerate(dataset):
    train_step(inp, tar)

    if batch % 50 == 0:
      print(f'Epoch {epoch + 1} Batch {batch} Loss {train_loss.result():.4f} Accuracy {train_accuracy.result():.4f}')

  if (epoch + 1) % 5 == 0:
    ckpt_save_path = ckpt_manager.save()
    print(f'Saving checkpoint for epoch {epoch+1} at {ckpt_save_path}')

  print(f'Epoch {epoch + 1} Loss {train_loss.result():.4f} Accuracy {train_accuracy.result():.4f}')

  print(f'Time taken for 1 epoch: {time.time() - start:.2f} secs\n')

Epoch 1 Batch 0 Loss 5.6833 Accuracy 0.0000
Epoch 1 Batch 50 Loss 4.3460 Accuracy 0.1239
Epoch 1 Batch 100 Loss 3.7302 Accuracy 0.1648
Epoch 1 Batch 150 Loss 3.4774 Accuracy 0.1831
Epoch 1 Batch 200 Loss 3.2785 Accuracy 0.2044
Epoch 1 Batch 250 Loss 3.0962 Accuracy 0.2260
Epoch 1 Batch 300 Loss 2.9539 Accuracy 0.2432
Epoch 1 Batch 350 Loss 2.8356 Accuracy 0.2585
Epoch 1 Batch 400 Loss 2.7424 Accuracy 0.2707
Epoch 1 Batch 450 Loss 2.6680 Accuracy 0.2798
Epoch 1 Batch 500 Loss 2.6047 Accuracy 0.2875
Epoch 1 Batch 550 Loss 2.5513 Accuracy 0.2941
Epoch 1 Batch 600 Loss 2.5050 Accuracy 0.2998
Epoch 1 Batch 650 Loss 2.4644 Accuracy 0.3051
Epoch 1 Batch 700 Loss 2.4288 Accuracy 0.3098
Epoch 1 Batch 750 Loss 2.3965 Accuracy 0.3142
Epoch 1 Batch 800 Loss 2.3684 Accuracy 0.3181
Epoch 1 Batch 850 Loss 2.3424 Accuracy 0.3220
Epoch 1 Batch 900 Loss 2.3180 Accuracy 0.3259
Epoch 1 Batch 950 Loss 2.2941 Accuracy 0.3298
Epoch 1 Batch 1000 Loss 2.2717 Accuracy 0.3336
Epoch 1 Batch 1050 Loss 2.2506 Accur

KeyboardInterrupt: ignored

## 4. Translation

We define the *evaluate* function to preprocess the sentence in input to the encoder and to get the predicted ids of the translation.

The ids of the translation are obtained by applying *argmax* to the predicted logits of the decoder.

We begin feeding the decoder with the id of the start symbol and, at each new step, we pass to the decoder the sequence it has just thrown out.

The translation stops when the end symbol is reached.

In [None]:
def evaluate_greedy(encoder_input, decoder_input):
  
  encoder_input = tf.convert_to_tensor(encoder_input)
  encoder_input = tf.expand_dims(encoder_input, 0)

  decoder_input = tf.convert_to_tensor(decoder_input)
  output = tf.expand_dims(decoder_input, 0)
  result = ''

  terces = 0
  for i in range(600):
    enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
        encoder_input, output)

    # predictions.shape == (batch_size, seq_len, vocab_size)
    predictions, attention_weights = transformer(encoder_input,
                                                 output,
                                                 False,
                                                 enc_padding_mask,
                                                 combined_mask,
                                                 dec_padding_mask)

    # select the last character from the seq_len dimension
    predictions = predictions[:, -1:, :]  # (batch_size, 1, vocab_size)

    predicted_id = tf.argmax(predictions, axis=-1)

    # concatenate the predicted_id to the output which is given to the decoder as its input.
    output = tf.concat([tf.cast(output, dtype=tf.int32), tf.cast(predicted_id, dtype=tf.int32)], axis=-1)
    result += tokenizer.index_word[predicted_id.numpy()[0][0]] + ' '

    # return the result if the predicted_id is equal to the end token
    if predicted_id == tokenizer.word_index["§"]:
      terces += 1
      if terces == 3:
        return result

  # output.shape (1, tokens)

  return result

In [None]:
encoder_input = [0]
decoder_input = target_text_[0]

generated_text = evaluate_greedy(encoder_input, decoder_input)
print(generated_text)

| l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l | l 

In [115]:
def evaluate_topk(encoder_input, decoder_input, k=5, temperature=0.5):
  
  encoder_input = tf.expand_dims(encoder_input, 0)

  output = tf.expand_dims(decoder_input, 0)

  terces = 0
  for i in range(600):
    enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
        encoder_input, output)

    # predictions.shape == (batch_size, seq_len, vocab_size)
    predictions, attention_weights = transformer(encoder_input,
                                                 output,
                                                 False,
                                                 enc_padding_mask,
                                                 combined_mask,
                                                 dec_padding_mask)

    # select the last character from the seq_len dimension
    predictions = predictions[:, -1:, :]  # (batch_size, 1, vocab_size)
    predictions, indices = tf.math.top_k(predictions,k=k)

    predictions /= temperature
    predictions = np.squeeze(predictions, axis=0)
    indices = np.squeeze(indices, axis=0)
    indices = np.squeeze(indices, axis=0)
    predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()
    predicted_id = indices[predicted_id]

    # return the result if the predicted_id is equal to the end token
    if predicted_id == tokenizer.word_index["§"]:
      terces += 1
      if terces == terces_per_batch-1:
        return tf.squeeze(output, axis=0)

  # output.shape (1, tokens)

    predicted_id = tf.expand_dims(predicted_id, 0)
    predicted_id = tf.expand_dims(predicted_id, 0)
    output = tf.concat([output, predicted_id], axis=-1)

    return tf.squeeze(output, axis=0)

In [116]:
out_list = target_text_[0:4].reshape(-1)
offset = terces_len
out_list = out_list[-offset:]
out_list = out_list[out_list != 0]
txt_gen = [tokenizer.index_word[i] for i in out_list]
k = 1
t = 1
for i in range(32//(terces_per_batch-1)): # 30 terces = cantica
  out = evaluate_topk([0], out_list, k, t)
  out_list = out.numpy().tolist()
  out_list = out_list[offset:]
  out_str = [tokenizer.index_word[i] for i in out_list]
  txt_gen += out_str

print(txt_gen)

ValueError: ignored