In [1]:
from transformer import Transformer
import tensorflow as tf
import tensorflow_datasets as tfds

import time
import numpy as np
import matplotlib.pyplot as plt
import os

In [2]:
examples, metadata  = tfds.load('ted_hrlr_translate/pt_to_en', with_info=True,
                               as_supervised=True)

In [3]:
train_examples, val_examples = examples['train'], examples['validation']

In [4]:
tokenizer_en = tfds.features.text.SubwordTextEncoder.build_from_corpus(
    (en.numpy() for pt, en in train_examples), target_vocab_size=2**13)

tokenizer_pt = tfds.features.text.SubwordTextEncoder.build_from_corpus(
    (pt.numpy() for pt, en in train_examples), target_vocab_size=2**13)

In [5]:
sample_string = 'Transformer is awesome.'

tokenized_string = tokenizer_en.encode(sample_string)
print ('Tokenized string is {}'.format(tokenized_string))

original_string = tokenizer_en.decode(tokenized_string)
print ('The original string: {}'.format(original_string))

assert original_string == sample_string

Tokenized string is [7915, 1248, 7946, 7194, 13, 2799, 7877]
The original string: Transformer is awesome.


In [6]:
for ts in tokenized_string:
    print ('{} ----> {}'.format(ts, tokenizer_en.decode([ts])))

7915 ----> T
1248 ----> ran
7946 ----> s
7194 ----> former 
13 ----> is 
2799 ----> awesome
7877 ----> .


In [7]:
BUFFER_SIZE = 20000
BATCH_SIZE = 64
def encode(lang1, lang2):
    lang1 = [tokenizer_pt.vocab_size] + tokenizer_pt.encode(
      lang1.numpy()) + [tokenizer_pt.vocab_size+1]

    lang2 = [tokenizer_en.vocab_size] + tokenizer_en.encode(
      lang2.numpy()) + [tokenizer_en.vocab_size+1]

    return lang1, lang2

In [8]:
def filter_max_length(x, y, max_length=40):
    return tf.logical_and(tf.size(x) <= max_length,
                        tf.size(y) <= max_length)

In [9]:
def tf_encode(pt, en):
    return tf.py_function(encode, [pt, en], [tf.int64, tf.int64])

In [10]:
train_dataset = train_examples.map(tf_encode)
train_dataset = train_dataset.filter(filter_max_length)
# cache the dataset to memory to get a speedup while reading from it.
train_dataset = train_dataset.cache()
train_dataset = train_dataset.shuffle(BUFFER_SIZE).padded_batch(
    BATCH_SIZE, padded_shapes=([-1], [-1]))
train_dataset = train_dataset.prefetch(tf.data.experimental.AUTOTUNE)

val_dataset = val_examples.map(tf_encode)
val_dataset = val_dataset.filter(filter_max_length).padded_batch(
    BATCH_SIZE, padded_shapes=([-1], [-1]))

In [11]:
pt_batch, en_batch = next(iter(val_dataset))


In [12]:
print(pt_batch[0, :10])
print(en_batch[0, :10])

tf.Tensor([8214 1259    5   63 5284   50  277    2 8215    0], shape=(10,), dtype=int64)
tf.Tensor([8087   18   12  631   15   31  272    2 8088    0], shape=(10,), dtype=int64)


In [13]:
n_layers = 4
d_model = 128
d_ff = 512
n_heads = 8

input_vocab_size = tokenizer_pt.vocab_size + 2
output_vocab_size = tokenizer_en.vocab_size + 2
dropout_rate = 0.1

In [14]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000):
        super(CustomSchedule, self).__init__()

        self.d_model = d_model
        self.d_model = tf.cast(self.d_model, tf.float32)

        self.warmup_steps = warmup_steps

    def __call__(self, step):
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)

        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

In [15]:
learning_rate = CustomSchedule(d_model)

optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, 
                                     epsilon=1e-9)

In [16]:
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

In [17]:
def loss_function(real, pred):
    """
    sparse categorical crossentropy, masking out padded words
    """
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    
    loss_ = loss_object(real, pred)
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_mean(loss_)

In [18]:
loss_function(tf.convert_to_tensor([1, 1, 2]), 
              tf.convert_to_tensor([
                  [0.0, 99.0, 0.0],
                  [0.0, 99.0, 0.0],
                  [0.0, 0.0, 99.0]
              ]))

<tf.Tensor: id=207723, shape=(), dtype=float32, numpy=0.0>

In [19]:
loss_function(tf.convert_to_tensor([1, 1, 0]), 
              tf.convert_to_tensor([
                  [0.0, 0.0, 0.0],
                  [0.0, 0.0, 0.0],
                  [99.0, 0.0, 0.0]
              ]))

<tf.Tensor: id=207747, shape=(), dtype=float32, numpy=0.7324082>

In [20]:
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
    name='train_accuracy')

In [27]:
transformer = Transformer(input_vocab_size, output_vocab_size, d_model, n_layers, n_heads, d_ff, dropout_rate)

In [28]:
checkpoint_path = "checkpoints"

ckpt = tf.train.Checkpoint(transformer=transformer,
                           optimizer=optimizer)

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=1)

# if a checkpoint exists, restore the latest checkpoint.
if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print ('Latest checkpoint restored!!')

In [29]:
EPOCHS = 5

In [30]:
def create_masks(inp, tar):
    # Encoder padding mask
    enc_padding_mask = create_padding_mask(inp)

    # Used in the 2nd attention block in the decoder.
    # This padding mask is used to mask the encoder outputs.
    dec_padding_mask = create_padding_mask(inp)

    # Used in the 1st attention block in the decoder.
    # It is used to pad and mask future tokens in the input received by 
    # the decoder.
    look_ahead_mask = create_look_ahead_mask(tf.shape(tar)[1])
    dec_target_padding_mask = create_padding_mask(tar)
    combined_mask = tf.maximum(dec_target_padding_mask, look_ahead_mask)

    return enc_padding_mask, combined_mask, dec_padding_mask

def create_look_ahead_mask(sequence_length):
    """
    Args:
        sequence_length: the length of input sequence
    Returns:
        look_ahead_mask: shape (sequence_length, sequence_length)
        
    e.g
    sequence_lenght = 3
    look_ahead_mask = [
        [0, 1, 1], on predicting the 1st word, only the 0th word (the <START/> token) will be used
        [0, 0, 1], on predicting the 2nd word, only the 1st word and the 0th word can be used
        [0, 0, 0]  and so on
    ]
    """
    look_ahead_mask = 1 - tf.linalg.band_part(tf.ones((sequence_length, sequence_length)), -1, 0)
    return look_ahead_mask

def create_padding_mask(sparse_input_sequence):
    """
    Args:
        sparse_input_sequence: shape(batch_size, sequence_length) e.g [0, 1, 3, 0, 5, 13]
    Returns:
        padding_mask: boolean mask where 1s indicates padding. e.g [1, 0, 0, 1, 0, 0] for the example input
    """
    mask = tf.cast(tf.math.equal(sparse_input_sequence, 0), tf.float32)
    return mask[:, tf.newaxis, tf.newaxis, :]
    

In [31]:
# The @tf.function trace-compiles train_step into a TF graph for faster
# execution. The function specializes to the precise shape of the argument
# tensors. To avoid re-tracing due to the variable sequence lengths or variable
# batch sizes (the last batch is smaller), use input_signature to specify
# more generic shapes.

train_step_signature = [
    tf.TensorSpec(shape=(None, None), dtype=tf.int64),
    tf.TensorSpec(shape=(None, None), dtype=tf.int64),
]

@tf.function(input_signature=train_step_signature)
def train_step(inp, tar):
    tar_inp = tar[:, :-1]
    tar_real = tar[:, 1:]

    enc_padding_mask, combined_mask, dec_padding_mask = create_masks(inp, tar_inp)

    with tf.GradientTape() as tape:
        predictions = transformer(inp, tar_inp, 
                                     True, 
                                     enc_padding_mask, 
                                     combined_mask, 
                                     dec_padding_mask)
        loss = loss_function(tar_real, predictions)

    gradients = tape.gradient(loss, transformer.trainable_variables)    
    optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))

    train_loss(loss)
    train_accuracy(tar_real, predictions)

In [32]:
for epoch in range(EPOCHS):
    start = time.time()

    train_loss.reset_states()
    train_accuracy.reset_states()

    # inp -> portuguese, tar -> english
    for (batch, (inp, tar)) in enumerate(train_dataset):
        train_step(inp, tar)

        if batch % 50 == 0:
            print ('Epoch {} Batch {} Loss {:.4f} Accuracy {:.4f}'.format(
              epoch + 1, batch, train_loss.result(), train_accuracy.result()))

    if (epoch + 1) % 3 == 0:
        save_path = ckpt_manager.save()
        print("Saved checkpoint for step: {}".format(save_path))

    print ('Epoch {} Loss {:.4f} Accuracy {:.4f}'.format(epoch + 1, 
                                                train_loss.result(), 
                                                train_accuracy.result()))

    print ('Time taken for 1 epoch: {} secs\n'.format(time.time() - start))

Epoch 1 Batch 0 Loss 4.4134 Accuracy 0.0000
Epoch 1 Batch 50 Loss 3.4729 Accuracy 0.0239
Epoch 1 Batch 100 Loss 3.2400 Accuracy 0.0344
Epoch 1 Batch 150 Loss 3.1261 Accuracy 0.0444
Epoch 1 Batch 200 Loss 3.0323 Accuracy 0.0537
Epoch 1 Batch 250 Loss 2.9512 Accuracy 0.0622
Epoch 1 Batch 300 Loss 2.8769 Accuracy 0.0693
Epoch 1 Batch 350 Loss 2.8184 Accuracy 0.0754
Epoch 1 Batch 400 Loss 2.7647 Accuracy 0.0803
Epoch 1 Batch 450 Loss 2.7194 Accuracy 0.0846
Epoch 1 Batch 500 Loss 2.6828 Accuracy 0.0885
Epoch 1 Batch 550 Loss 2.6508 Accuracy 0.0920
Epoch 1 Batch 600 Loss 2.6213 Accuracy 0.0951
Epoch 1 Batch 650 Loss 2.5933 Accuracy 0.0981
Epoch 1 Batch 700 Loss 2.5690 Accuracy 0.1005
Epoch 1 Loss 2.5676 Accuracy 0.1006
Time taken for 1 epoch: 135.47014904022217 secs

Epoch 2 Batch 0 Loss 2.0592 Accuracy 0.1367
Epoch 2 Batch 50 Loss 2.2168 Accuracy 0.1366
Epoch 2 Batch 100 Loss 2.2062 Accuracy 0.1376
Epoch 2 Batch 150 Loss 2.2007 Accuracy 0.1385
Epoch 2 Batch 200 Loss 2.1904 Accuracy 0.1389
E

In [33]:
def evaluate(inp_sentence):
    start_token = [tokenizer_pt.vocab_size]
    end_token = [tokenizer_pt.vocab_size + 1]

    # inp sentence is portuguese, hence adding the start and end token
    inp_sentence = start_token + tokenizer_pt.encode(inp_sentence) + end_token
    encoder_input = tf.expand_dims(inp_sentence, 0)

    # as the target is english, the first word to the transformer should be the
    # english start token.
    decoder_input = [tokenizer_en.vocab_size]
    output = tf.expand_dims(decoder_input, 0)

    for i in range(40):
        enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
            encoder_input, output)

        # predictions.shape == (batch_size, seq_len, vocab_size)
        predictions = transformer(encoder_input, 
                                                     output,
                                                     False,
                                                     enc_padding_mask,
                                                     combined_mask,
                                                     dec_padding_mask)

        # select the last word from the seq_len dimension
        predictions = predictions[: ,-1:, :]  # (batch_size, 1, vocab_size)

        predicted_id = tf.cast(tf.argmax(predictions, axis=-1), tf.int32)

        # return the result if the predicted_id is equal to the end token
        if predicted_id == tokenizer_en.vocab_size+1:
            return tf.squeeze(output, axis=0)

        # concatentate the predicted_id to the output which is given to the decoder
        # as its input.
        output = tf.concat([output, predicted_id], axis=-1)

    return tf.squeeze(output, axis=0)

In [37]:
def translate(sentence):
    result = evaluate(sentence)

    predicted_sentence = tokenizer_en.decode([i for i in result 
                                            if i < tokenizer_en.vocab_size])  

    print('Input: {}'.format(sentence))
    print('Predicted translation: {}'.format(predicted_sentence))


In [41]:
translate("este é um problema que temos que resolver.")
print ("Real translation: this is a problem we have to solve .")

Input: este é um problema que temos que resolver.
Predicted translation: this is a problem that we have to do in the universe .
Real translation: this is a problem we have to solve .


In [36]:
def create_position_encoding(sequence_length, d_embedding):
        """
        Args:
            sequence_length: the length of input sequence. 
            d_embedding: the dimension of the embedding space. i.e 
        Returns:
            positional_encoding: shape (1, sequence_length, d_embedding)
            Note: the output is unsequeeze at dimension 1 to enable easier broadcasting
        """

        # position[i] = the i-th index the sequence
        positions = tf.range(sequence_length, dtype=tf.float32)[:, tf.newaxis]
        # embedding_indices[i] = the i-th index in the embedding 
        # Note: embedding_indices has length d_embedding/2, because sin and cos share the same input
        # e.g [0, 2, 4, 6] for d_embedding = 8
        embedding_indices = tf.range(d_embedding, delta=2, dtype=tf.float32)[tf.newaxis:]

        # inner = pos/10000^(2i/EmbeddingDimension), i.e the input to sin and cos
        inner = positions / 10000**(embedding_indices/tf.cast(d_embedding, tf.float32))
        sin_position_encodings = tf.math.sin(inner)
        cos_position_encodings = tf.math.cos(inner)
        # to create alternating sin and cos encodings, we use a hack: we expand dim at the last axis and concatenate the resulting tensors
        sin_position_encodings = tf.expand_dims(sin_position_encodings, axis=-1)
        cos_position_encodings = tf.expand_dims(cos_position_encodings, axis=-1)
        position_encodings = tf.concat([sin_position_encodings, cos_position_encodings], axis=-1)
        position_encodings = tf.reshape(position_encodings, (1, sequence_length, d_embedding))
        return position_encodings