In [1]:
import tensorflow as tf
import tensorflow_datasets as tfds

# Fetch the 'ted_hrlr_translate/pt_to_en' corpus; as_supervised=True makes
# every example a (pt, en) pair, and with_info=True also returns metadata.
examples, metadata = tfds.load(
    'ted_hrlr_translate/pt_to_en',
    with_info=True,
    as_supervised=True,
)
train_examples = examples['train']
val_examples = examples['validation']


2023-12-17 10:41:03.459593: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Only the first SUBSET_SIZE training pairs are used to fit the vocabularies,
# which keeps tokenizer construction quick.
SUBSET_SIZE = 500  # for example

# Lazy streams of raw sentences, one per language.
small_dataset_en = (en.numpy() for _, en in train_examples.take(SUBSET_SIZE))
small_dataset_pt = (pt.numpy() for pt, _ in train_examples.take(SUBSET_SIZE))

# Fit one subword tokenizer per language with a target vocabulary of 2**13.
tokenizer_en = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
    small_dataset_en, target_vocab_size=2**13
)
tokenizer_pt = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
    small_dataset_pt, target_vocab_size=2**13
)

2023-12-17 10:41:08.587866: W tensorflow/core/kernels/data/cache_dataset_ops.cc:858] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.
2023-12-17 10:41:10.357340: W tensorflow/core/kernels/data/cache_dataset_ops.cc:858] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


In [3]:
# Size of the shuffle buffer handed to Dataset.shuffle() when building train_dataset.
BUFFER_SIZE = 20000
# Number of sentence pairs per batch handed to Dataset.padded_batch().
BATCH_SIZE = 64


def encode(lang1, lang2):
    """Tokenize one (Portuguese, English) sentence pair and add boundary ids.

    Each sentence is encoded with its language's tokenizer, then wrapped with
    a start id (``vocab_size``) and an end id (``vocab_size + 1``), both of
    which lie just past the ids the tokenizer itself produces.

    Args:
        lang1: Object exposing ``.numpy()`` that yields the Portuguese text.
        lang2: Object exposing ``.numpy()`` that yields the English text.

    Returns:
        Tuple ``(pt_ids, en_ids)`` of Python lists of token ids.
    """
    pt_start = tokenizer_pt.vocab_size
    pt_ids = [pt_start, *tokenizer_pt.encode(lang1.numpy()), pt_start + 1]

    en_start = tokenizer_en.vocab_size
    en_ids = [en_start, *tokenizer_en.encode(lang2.numpy()), en_start + 1]

    return pt_ids, en_ids


def tf_encode(pt, en):
    """Run ``encode`` from inside a ``tf.data`` pipeline.

    ``encode`` calls ``.numpy()`` on its inputs, so it is wrapped in
    ``tf.py_function``. The resulting tensors are then marked as rank-1 with
    unknown length via ``set_shape([None])``.

    Args:
        pt: Portuguese sentence tensor.
        en: English sentence tensor.

    Returns:
        Tuple ``(result_pt, result_en)`` of int64 token-id tensors.
    """
    outputs = tf.py_function(func=encode, inp=[pt, en], Tout=[tf.int64, tf.int64])
    for tensor in outputs:
        tensor.set_shape([None])
    result_pt, result_en = outputs
    return result_pt, result_en


# Tokenize -> cache the tokenized pairs -> shuffle -> pad each batch to its
# longest sequence -> prefetch.
train_dataset = (
    train_examples
    .map(tf_encode)
    .cache()
    .shuffle(BUFFER_SIZE)
    .padded_batch(BATCH_SIZE)
    .prefetch(tf.data.experimental.AUTOTUNE)
)

In [4]:
from classes import Transformer

# Hyperparameters forwarded to the Transformer constructor below.
# NOTE(review): their exact meaning is defined in classes.Transformer, which is
# not visible here; presumably layer count, model width, feed-forward width and
# attention head count — confirm.
num_layers = 4
d_model = 128
dff = 512
num_heads = 8

# +2 makes room for the start/end ids that encode() adds
# (vocab_size and vocab_size + 1).
input_vocab_size = tokenizer_pt.vocab_size + 2
target_vocab_size = tokenizer_en.vocab_size + 2
dropout_rate = 0.1

# NOTE(review): pe_input / pe_target look like positional-encoding lengths, yet
# they are set to the vocabulary sizes rather than a maximum sequence length —
# confirm this is intended.
transformer = Transformer(
    num_layers,
    d_model,
    num_heads,
    dff,
    input_vocab_size,
    target_vocab_size,
    pe_input=input_vocab_size,
    pe_target=target_vocab_size,
    rate=dropout_rate,
)

In [5]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Warmup-then-decay learning-rate schedule.

    The rate at a given step is
    ``rsqrt(d_model) * min(rsqrt(step), step * warmup_steps ** -1.5)``:
    it grows linearly with ``step`` until ``step == warmup_steps`` and decays
    with the inverse square root of ``step`` afterwards.

    Args:
        d_model: Value whose inverse square root scales the rate; stored as a
            float32 tensor in ``self.d_model``.
        warmup_steps: Step at which the schedule switches from ramp-up to decay.
    """

    def __init__(self, d_model, warmup_steps=4000):
        super().__init__()

        # Keep the value as passed so get_config() can return it;
        # __call__ works with the float32 tensor.
        self._d_model_config = d_model
        self.d_model = tf.cast(d_model, tf.float32)
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        """Return the learning rate for optimizer step ``step``."""
        # The optimizer may pass an integer step; rsqrt needs a float.
        step = tf.cast(step, tf.float32)
        decay = tf.math.rsqrt(step)
        warmup = step * (self.warmup_steps**-1.5)

        return tf.math.rsqrt(self.d_model) * tf.math.minimum(decay, warmup)

    def get_config(self):
        """Return the constructor arguments so the schedule can be serialized.

        The base class raises ``NotImplementedError`` here, which would break
        serializing this schedule or an optimizer that uses it.
        """
        return {
            "d_model": self._d_model_config,
            "warmup_steps": self.warmup_steps,
        }

# Example usage
learning_rate = CustomSchedule(d_model)
optimizer = tf.keras.optimizers.Adam(
    learning_rate=learning_rate,
    beta_1=0.9,
    beta_2=0.98,
    epsilon=1e-9,
)

# reduction="none" leaves the loss unreduced; loss_function applies the
# padding mask and does the reduction itself.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    reduction="none",
    from_logits=True,
)


def loss_function(real, pred):
    """Mean cross-entropy over the non-padding positions of ``real``.

    Args:
        real: Integer target ids; id 0 is treated as padding and ignored.
        pred: Logits scored against ``real`` by ``loss_object``.

    Returns:
        Scalar tensor: the summed loss of the non-padding positions divided by
        their count, or 0 when ``real`` contains no non-padding position.
    """
    per_token_loss = loss_object(real, pred)

    # 1 where the target is a real token, 0 where it is padding.
    mask = tf.cast(tf.math.not_equal(real, 0), dtype=per_token_loss.dtype)
    per_token_loss *= mask

    # divide_no_nan yields 0 instead of NaN when every target is padding, so an
    # all-padding batch cannot poison the weights.
    return tf.math.divide_no_nan(
        tf.reduce_sum(per_token_loss), tf.reduce_sum(mask)
    )

In [6]:
EPOCHS = 20


def _padding_mask(token_ids):
    """Return a (batch, 1, 1, seq_len) float mask that is 1.0 where token_ids == 0."""
    is_pad = tf.cast(tf.math.equal(token_ids, 0), tf.float32)
    return is_pad[:, tf.newaxis, tf.newaxis, :]


def _look_ahead_mask(seq_len):
    """Return a (seq_len, seq_len) float mask that is 1.0 strictly above the diagonal."""
    return 1.0 - tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)


for epoch in range(EPOCHS):
    for batch, (inp, tar) in enumerate(train_dataset):
        # Teacher forcing: the decoder reads tokens [0, n-1) and is trained to
        # predict tokens [1, n). Feeding and scoring the same `tar` lets the
        # model copy the answer from its own input.
        tar_inp = tar[:, :-1]
        tar_real = tar[:, 1:]

        # NOTE(review): assumes classes.Transformer follows the TensorFlow
        # tutorial mask convention (1.0 marks a position to hide, mask is
        # broadcast against (batch, heads, seq_q, seq_k) attention logits) —
        # confirm against classes.py.
        enc_padding_mask = _padding_mask(inp)
        # The decoder's cross-attention also looks at the encoder output, so it
        # hides the same source padding.
        dec_padding_mask = _padding_mask(inp)
        # Hide both future target tokens and target padding in self-attention.
        combined_mask = tf.maximum(
            _padding_mask(tar_inp),
            _look_ahead_mask(tf.shape(tar_inp)[1]),
        )

        with tf.GradientTape() as tape:
            predictions = transformer(
                inp,
                tar_inp,
                True,
                enc_padding_mask,
                combined_mask,
                dec_padding_mask,
            )
            loss = loss_function(tar_real, predictions)

        gradients = tape.gradient(loss, transformer.trainable_variables)
        optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))

        if batch % 50 == 0:
            print(f'Epoch {epoch} Batch {batch} Loss {loss.numpy()}')


2023-12-17 10:41:22.847556: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:422] ShuffleDatasetV3:38: Filling up shuffle buffer (this may take a while): 16587 of 20000
2023-12-17 10:41:24.433952: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:452] Shuffle buffer filled.


Shape of x after embedding: (64, 174, 128)
Shape of x after scaling: (64, 174, 128)
Shape of x after adding positional encoding: (64, 174, 128)
After splitting heads: (64, 8, 174, 16) (64, 8, 174, 16) (64, 8, 174, 16)
After scaled dot product attention: (64, 8, 174, 16)
After concatenating heads: (64, 174, 128)
EncoderLayer output: (64, 174, 128)
After splitting heads: (64, 8, 174, 16) (64, 8, 174, 16) (64, 8, 174, 16)
After scaled dot product attention: (64, 8, 174, 16)
After concatenating heads: (64, 174, 128)
EncoderLayer output: (64, 174, 128)
After splitting heads: (64, 8, 174, 16) (64, 8, 174, 16) (64, 8, 174, 16)
After scaled dot product attention: (64, 8, 174, 16)
After concatenating heads: (64, 174, 128)
EncoderLayer output: (64, 174, 128)
After splitting heads: (64, 8, 174, 16) (64, 8, 174, 16) (64, 8, 174, 16)
After scaled dot product attention: (64, 8, 174, 16)
After concatenating heads: (64, 174, 128)
EncoderLayer output: (64, 174, 128)
Decoder After adding positional enco







Epoch 0 Batch 0 Loss 7.875532150268555
Shape of x after embedding: (64, 427, 128)
Shape of x after scaling: (64, 427, 128)
Shape of x after adding positional encoding: (64, 427, 128)
After splitting heads: (64, 8, 427, 16) (64, 8, 427, 16) (64, 8, 427, 16)
After scaled dot product attention: (64, 8, 427, 16)
After concatenating heads: (64, 427, 128)
EncoderLayer output: (64, 427, 128)
After splitting heads: (64, 8, 427, 16) (64, 8, 427, 16) (64, 8, 427, 16)
After scaled dot product attention: (64, 8, 427, 16)
After concatenating heads: (64, 427, 128)
EncoderLayer output: (64, 427, 128)
After splitting heads: (64, 8, 427, 16) (64, 8, 427, 16) (64, 8, 427, 16)
After scaled dot product attention: (64, 8, 427, 16)
After concatenating heads: (64, 427, 128)
EncoderLayer output: (64, 427, 128)
After splitting heads: (64, 8, 427, 16) (64, 8, 427, 16) (64, 8, 427, 16)
After scaled dot product attention: (64, 8, 427, 16)
After concatenating heads: (64, 427, 128)
EncoderLayer output: (64, 427, 12







Shape of x after embedding: (64, 152, 128)
Shape of x after scaling: (64, 152, 128)
Shape of x after adding positional encoding: (64, 152, 128)
After splitting heads: (64, 8, 152, 16) (64, 8, 152, 16) (64, 8, 152, 16)
After scaled dot product attention: (64, 8, 152, 16)
After concatenating heads: (64, 152, 128)
EncoderLayer output: (64, 152, 128)
After splitting heads: (64, 8, 152, 16) (64, 8, 152, 16) (64, 8, 152, 16)
After scaled dot product attention: (64, 8, 152, 16)
After concatenating heads: (64, 152, 128)
EncoderLayer output: (64, 152, 128)
After splitting heads: (64, 8, 152, 16) (64, 8, 152, 16) (64, 8, 152, 16)
After scaled dot product attention: (64, 8, 152, 16)
After concatenating heads: (64, 152, 128)
EncoderLayer output: (64, 152, 128)
After splitting heads: (64, 8, 152, 16) (64, 8, 152, 16) (64, 8, 152, 16)
After scaled dot product attention: (64, 8, 152, 16)
After concatenating heads: (64, 152, 128)
EncoderLayer output: (64, 152, 128)
Decoder After adding positional enco







Shape of x after embedding: (64, 110, 128)
Shape of x after scaling: (64, 110, 128)
Shape of x after adding positional encoding: (64, 110, 128)
After splitting heads: (64, 8, 110, 16) (64, 8, 110, 16) (64, 8, 110, 16)
After scaled dot product attention: (64, 8, 110, 16)
After concatenating heads: (64, 110, 128)
EncoderLayer output: (64, 110, 128)
After splitting heads: (64, 8, 110, 16) (64, 8, 110, 16) (64, 8, 110, 16)
After scaled dot product attention: (64, 8, 110, 16)
After concatenating heads: (64, 110, 128)
EncoderLayer output: (64, 110, 128)
After splitting heads: (64, 8, 110, 16) (64, 8, 110, 16) (64, 8, 110, 16)
After scaled dot product attention: (64, 8, 110, 16)
After concatenating heads: (64, 110, 128)
EncoderLayer output: (64, 110, 128)
After splitting heads: (64, 8, 110, 16) (64, 8, 110, 16) (64, 8, 110, 16)
After scaled dot product attention: (64, 8, 110, 16)
After concatenating heads: (64, 110, 128)
EncoderLayer output: (64, 110, 128)
Decoder After adding positional enco







Shape of x after embedding: (64, 106, 128)
Shape of x after scaling: (64, 106, 128)
Shape of x after adding positional encoding: (64, 106, 128)
After splitting heads: (64, 8, 106, 16) (64, 8, 106, 16) (64, 8, 106, 16)
After scaled dot product attention: (64, 8, 106, 16)
After concatenating heads: (64, 106, 128)
EncoderLayer output: (64, 106, 128)
After splitting heads: (64, 8, 106, 16) (64, 8, 106, 16) (64, 8, 106, 16)
After scaled dot product attention: (64, 8, 106, 16)
After concatenating heads: (64, 106, 128)
EncoderLayer output: (64, 106, 128)
After splitting heads: (64, 8, 106, 16) (64, 8, 106, 16) (64, 8, 106, 16)
After scaled dot product attention: (64, 8, 106, 16)
After concatenating heads: (64, 106, 128)
EncoderLayer output: (64, 106, 128)
After splitting heads: (64, 8, 106, 16) (64, 8, 106, 16) (64, 8, 106, 16)
After scaled dot product attention: (64, 8, 106, 16)
After concatenating heads: (64, 106, 128)
EncoderLayer output: (64, 106, 128)
Decoder After adding positional enco







Shape of x after embedding: (64, 129, 128)
Shape of x after scaling: (64, 129, 128)
Shape of x after adding positional encoding: (64, 129, 128)
After splitting heads: (64, 8, 129, 16) (64, 8, 129, 16) (64, 8, 129, 16)
After scaled dot product attention: (64, 8, 129, 16)
After concatenating heads: (64, 129, 128)
EncoderLayer output: (64, 129, 128)
After splitting heads: (64, 8, 129, 16) (64, 8, 129, 16) (64, 8, 129, 16)
After scaled dot product attention: (64, 8, 129, 16)
After concatenating heads: (64, 129, 128)
EncoderLayer output: (64, 129, 128)
After splitting heads: (64, 8, 129, 16) (64, 8, 129, 16) (64, 8, 129, 16)
After scaled dot product attention: (64, 8, 129, 16)
After concatenating heads: (64, 129, 128)
EncoderLayer output: (64, 129, 128)
After splitting heads: (64, 8, 129, 16) (64, 8, 129, 16) (64, 8, 129, 16)
After scaled dot product attention: (64, 8, 129, 16)
After concatenating heads: (64, 129, 128)
EncoderLayer output: (64, 129, 128)
Decoder After adding positional enco







Shape of x after embedding: (64, 159, 128)
Shape of x after scaling: (64, 159, 128)
Shape of x after adding positional encoding: (64, 159, 128)
After splitting heads: (64, 8, 159, 16) (64, 8, 159, 16) (64, 8, 159, 16)
After scaled dot product attention: (64, 8, 159, 16)
After concatenating heads: (64, 159, 128)
EncoderLayer output: (64, 159, 128)
After splitting heads: (64, 8, 159, 16) (64, 8, 159, 16) (64, 8, 159, 16)
After scaled dot product attention: (64, 8, 159, 16)
After concatenating heads: (64, 159, 128)
EncoderLayer output: (64, 159, 128)
After splitting heads: (64, 8, 159, 16) (64, 8, 159, 16) (64, 8, 159, 16)
After scaled dot product attention: (64, 8, 159, 16)
After concatenating heads: (64, 159, 128)
EncoderLayer output: (64, 159, 128)
After splitting heads: (64, 8, 159, 16) (64, 8, 159, 16) (64, 8, 159, 16)
After scaled dot product attention: (64, 8, 159, 16)
After concatenating heads: (64, 159, 128)
EncoderLayer output: (64, 159, 128)
Decoder After adding positional enco







Shape of x after embedding: (64, 225, 128)
Shape of x after scaling: (64, 225, 128)
Shape of x after adding positional encoding: (64, 225, 128)
After splitting heads: (64, 8, 225, 16) (64, 8, 225, 16) (64, 8, 225, 16)
After scaled dot product attention: (64, 8, 225, 16)
After concatenating heads: (64, 225, 128)
EncoderLayer output: (64, 225, 128)
After splitting heads: (64, 8, 225, 16) (64, 8, 225, 16) (64, 8, 225, 16)
After scaled dot product attention: (64, 8, 225, 16)
After concatenating heads: (64, 225, 128)
EncoderLayer output: (64, 225, 128)
After splitting heads: (64, 8, 225, 16) (64, 8, 225, 16) (64, 8, 225, 16)
After scaled dot product attention: (64, 8, 225, 16)
After concatenating heads: (64, 225, 128)
EncoderLayer output: (64, 225, 128)
After splitting heads: (64, 8, 225, 16) (64, 8, 225, 16) (64, 8, 225, 16)
After scaled dot product attention: (64, 8, 225, 16)
After concatenating heads: (64, 225, 128)
EncoderLayer output: (64, 225, 128)
Decoder After adding positional enco

2023-12-17 10:42:51.487734: W tensorflow/core/kernels/data/cache_dataset_ops.cc:858] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.
2023-12-17 10:42:51.517752: W tensorflow/core/kernels/data/cache_dataset_ops.cc:858] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


KeyboardInterrupt: 