In [1]:
from transformer import Transformer
from utils import create_masks

import tensorflow as tf
import tensorflow_datasets as tfds

import time
import numpy as np
import matplotlib.pyplot as plt

In [2]:
examples, metadata  = tfds.load('ted_hrlr_translate/pt_to_en', with_info=True,
                               as_supervised=True)

In [3]:
train_examples, val_examples = examples['train'], examples['validation']

In [4]:
tokenizer_en = tfds.features.text.SubwordTextEncoder.build_from_corpus(
    (en.numpy() for pt, en in train_examples), target_vocab_size=2**13)

tokenizer_pt = tfds.features.text.SubwordTextEncoder.build_from_corpus(
    (pt.numpy() for pt, en in train_examples), target_vocab_size=2**13)

In [5]:
sample_string = 'Transformer is awesome.'

tokenized_string = tokenizer_en.encode(sample_string)
print ('Tokenized string is {}'.format(tokenized_string))

original_string = tokenizer_en.decode(tokenized_string)
print ('The original string: {}'.format(original_string))

assert original_string == sample_string

Tokenized string is [7915, 1248, 7946, 7194, 13, 2799, 7877]
The original string: Transformer is awesome.


In [6]:
for ts in tokenized_string:
    print ('{} ----> {}'.format(ts, tokenizer_en.decode([ts])))

7915 ----> T
1248 ----> ran
7946 ----> s
7194 ----> former 
13 ----> is 
2799 ----> awesome
7877 ----> .


In [7]:
BUFFER_SIZE = 20000
BATCH_SIZE = 64
def encode(lang1, lang2):
    lang1 = [tokenizer_pt.vocab_size] + tokenizer_pt.encode(
      lang1.numpy()) + [tokenizer_pt.vocab_size+1]

    lang2 = [tokenizer_en.vocab_size] + tokenizer_en.encode(
      lang2.numpy()) + [tokenizer_en.vocab_size+1]

    return lang1, lang2

In [8]:
def filter_max_length(x, y, max_length=40):
    return tf.logical_and(tf.size(x) <= max_length,
                        tf.size(y) <= max_length)

In [9]:
def tf_encode(pt, en):
    return tf.py_function(encode, [pt, en], [tf.int64, tf.int64])

In [10]:
train_dataset = train_examples.map(tf_encode)
train_dataset = train_dataset.filter(filter_max_length)
# cache the dataset to memory to get a speedup while reading from it.
train_dataset = train_dataset.cache()
train_dataset = train_dataset.shuffle(BUFFER_SIZE).padded_batch(
    BATCH_SIZE, padded_shapes=([-1], [-1]))
train_dataset = train_dataset.prefetch(tf.data.experimental.AUTOTUNE)


val_dataset = val_examples.map(tf_encode)
val_dataset = val_dataset.filter(filter_max_length).padded_batch(
    BATCH_SIZE, padded_shapes=([-1], [-1]))

In [11]:
pt_batch, en_batch = next(iter(val_dataset))
pt_batch, en_batch

(<tf.Tensor: id=207688, shape=(64, 40), dtype=int64, numpy=
 array([[8214, 1259,    5, ...,    0,    0,    0],
        [8214,  299,   13, ...,    0,    0,    0],
        [8214,   59,    8, ...,    0,    0,    0],
        ...,
        [8214,   95,    3, ...,    0,    0,    0],
        [8214, 5157,    1, ...,    0,    0,    0],
        [8214, 4479, 7990, ...,    0,    0,    0]], dtype=int64)>,
 <tf.Tensor: id=207689, shape=(64, 40), dtype=int64, numpy=
 array([[8087,   18,   12, ...,    0,    0,    0],
        [8087,  634,   30, ...,    0,    0,    0],
        [8087,   16,   13, ...,    0,    0,    0],
        ...,
        [8087,   12,   20, ...,    0,    0,    0],
        [8087,   17, 4981, ...,    0,    0,    0],
        [8087,   12, 5453, ...,    0,    0,    0]], dtype=int64)>)

In [12]:
n_layers = 4
d_model = 128
d_ff = 512
n_heads = 8

input_vocab_size = tokenizer_pt.vocab_size + 2
output_vocab_size = tokenizer_en.vocab_size + 2
dropout_rate = 0.1

In [13]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000):
        super(CustomSchedule, self).__init__()

        self.d_model = d_model
        self.d_model = tf.cast(self.d_model, tf.float32)

        self.warmup_steps = warmup_steps

    def __call__(self, step):
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)

        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

In [14]:
learning_rate = CustomSchedule(d_model)

optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, 
                                     epsilon=1e-9)

In [15]:
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

In [16]:
def loss_function(real, pred):
    """
    sparse categorical crossentropy, masking out padded words
    """
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    
    loss_ = loss_object(real, pred)
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_mean(loss_)

In [17]:
loss_function(tf.convert_to_tensor([1, 1, 2]), 
              tf.convert_to_tensor([
                  [0.0, 99.0, 0.0],
                  [0.0, 99.0, 0.0],
                  [0.0, 0.0, 99.0]
              ]))

<tf.Tensor: id=207715, shape=(), dtype=float32, numpy=0.0>

In [18]:
loss_function(tf.convert_to_tensor([1, 1, 0]), 
              tf.convert_to_tensor([
                  [0.0, 0.0, 0.0],
                  [0.0, 0.0, 0.0],
                  [99.0, 0.0, 0.0]
              ]))

<tf.Tensor: id=207739, shape=(), dtype=float32, numpy=0.7324082>

In [19]:
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
    name='train_accuracy')

In [20]:
transformer = Transformer(input_vocab_size, output_vocab_size, d_model, n_layers, n_heads, d_ff, dropout_rate)

In [21]:
checkpoint_path = "./checkpoints/train"

ckpt = tf.train.Checkpoint(transformer=transformer,
                           optimizer=optimizer)

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

# if a checkpoint exists, restore the latest checkpoint.
if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print ('Latest checkpoint restored!!')

In [22]:
EPOCHS = 20

In [28]:
def create_masks(inp, tar):
    # Encoder padding mask
    enc_padding_mask = create_padding_mask(inp)

    # Used in the 2nd attention block in the decoder.
    # This padding mask is used to mask the encoder outputs.
    dec_padding_mask = create_padding_mask(inp)

    # Used in the 1st attention block in the decoder.
    # It is used to pad and mask future tokens in the input received by 
    # the decoder.
    look_ahead_mask = create_look_ahead_mask(tf.shape(tar)[1])
    dec_target_padding_mask = create_padding_mask(tar)
    combined_mask = tf.maximum(dec_target_padding_mask, look_ahead_mask)

    return enc_padding_mask, combined_mask, dec_padding_mask

def create_look_ahead_mask(sequence_length):
    """
    Args:
        sequence_length: the length of input sequence
    Returns:
        look_ahead_mask: shape (sequence_length, sequence_length)
        
    e.g
    sequence_lenght = 3
    look_ahead_mask = [
        [0, 1, 1], on predicting the 1st word, only the 0th word (the <START/> token) will be used
        [0, 0, 1], on predicting the 2nd word, only the 1st word and the 0th word can be used
        [0, 0, 0]  and so on
    ]
    """
    look_ahead_mask = 1 - tf.linalg.band_part(tf.ones((sequence_length, sequence_length)), -1, 0)
    return look_ahead_mask

def create_padding_mask(sparse_input_sequence):
    """
    Args:
        sparse_input_sequence: shape(batch_size, sequence_length) e.g [0, 1, 3, 0, 5, 13]
    Returns:
        padding_mask: boolean mask where 1s indicates padding. e.g [1, 0, 0, 1, 0, 0] for the example input
    """
    mask = tf.cast(tf.math.equal(sparse_input_sequence, 0), tf.float32)
    return mask[:, tf.newaxis, tf.newaxis, :]
    

In [33]:
# The @tf.function trace-compiles train_step into a TF graph for faster
# execution. The function specializes to the precise shape of the argument
# tensors. To avoid re-tracing due to the variable sequence lengths or variable
# batch sizes (the last batch is smaller), use input_signature to specify
# more generic shapes.

train_step_signature = [
    tf.TensorSpec(shape=(None, None), dtype=tf.int64),
    tf.TensorSpec(shape=(None, None), dtype=tf.int64),
]

@tf.function(input_signature=train_step_signature)
def train_step(inp, tar):
    tar_inp = tar[:, :-1]
    tar_real = tar[:, 1:]

    enc_padding_mask, combined_mask, dec_padding_mask = create_masks(inp, tar_inp)

    with tf.GradientTape() as tape:
        predictions = transformer(inp, tar_inp, 
                                     True, 
                                     enc_padding_mask, 
                                     combined_mask, 
                                     dec_padding_mask)
        loss = loss_function(tar_real, predictions)
        print(loss)

    gradients = tape.gradient(loss, transformer.trainable_variables)    
    optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))

    train_loss(loss)
    train_accuracy(tar_real, predictions)

In [34]:
for epoch in range(EPOCHS):
    start = time.time()

    train_loss.reset_states()
    train_accuracy.reset_states()

    # inp -> portuguese, tar -> english
    for (batch, (inp, tar)) in enumerate(train_dataset):
        train_step(inp, tar)

    if batch % 50 == 0:
        print ('Epoch {} Batch {} Loss {:.4f} Accuracy {:.4f}'.format(
          epoch + 1, batch, train_loss.result(), train_accuracy.result()))

    if (epoch + 1) % 5 == 0:
        ckpt_save_path = ckpt_manager.save()
        print ('Saving checkpoint for epoch {} at {}'.format(epoch+1,
                                                             ckpt_save_path))

    print ('Epoch {} Loss {:.4f} Accuracy {:.4f}'.format(epoch + 1, 
                                                train_loss.result(), 
                                                train_accuracy.result()))

    print ('Time taken for 1 epoch: {} secs\n'.format(time.time() - start))

tf.Tensor(0.97533166, shape=(), dtype=float32)
tf.Tensor(1.1959298, shape=(), dtype=float32)
tf.Tensor(1.3234972, shape=(), dtype=float32)
tf.Tensor(1.2935189, shape=(), dtype=float32)
tf.Tensor(1.198511, shape=(), dtype=float32)
tf.Tensor(1.2025603, shape=(), dtype=float32)
tf.Tensor(1.2521956, shape=(), dtype=float32)
tf.Tensor(1.0709819, shape=(), dtype=float32)
tf.Tensor(1.1045295, shape=(), dtype=float32)
tf.Tensor(1.2931602, shape=(), dtype=float32)
tf.Tensor(1.2769495, shape=(), dtype=float32)
tf.Tensor(1.2119265, shape=(), dtype=float32)
tf.Tensor(1.2470858, shape=(), dtype=float32)
tf.Tensor(1.2664611, shape=(), dtype=float32)
tf.Tensor(1.2775202, shape=(), dtype=float32)
tf.Tensor(1.1450373, shape=(), dtype=float32)
tf.Tensor(1.2294505, shape=(), dtype=float32)


KeyboardInterrupt: 