In [8]:
import tensorflow as tf
import numpy as np
import tensorflow_datasets as tfds

from pathlib import Path


In [10]:
# Get the data

# Each line of the file is an English sentence and its Spanish translation, separated by a tab

url = "https://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip"
path = tf.keras.utils.get_file("spa-eng.zip", origin=url, cache_dir="datasets", extract=True)

# Decode explicitly as UTF-8: the default encoding of read_text() is platform-dependent,
# and the Spanish text contains non-ASCII characters (accents, "¡", "¿").
text = (Path(path).with_name("spa-eng") / "spa.txt").read_text(encoding="utf-8")

# Clean and separate the text into English and Spanish sets

# Remove the Spanish opening punctuation marks
text = text.replace("¡", "").replace("¿", "")
pairs = [line.split("\t") for line in text.splitlines()]

# Seed the shuffle so the train/validation split further down is the same on every run
np.random.seed(42)
np.random.shuffle(pairs)

sentences_en, sentences_es = zip(*pairs)

# Vectorizing the text

# One vectorization layer per language. The vocabulary is capped at `vocab_size` tokens and
# every vectorized sequence is padded or cropped to exactly `max_length` tokens.
vocab_size = 1000
max_length = 50

# English vectorizer, fitted on the raw source sentences
text_vec_layer_en = tf.keras.layers.TextVectorization(
    max_tokens=vocab_size,
    output_sequence_length=max_length,
)
text_vec_layer_en.adapt(sentences_en)

# Spanish vectorizer, fitted on sentences wrapped in the startofseq/endofseq markers
# so that the layer sees both markers while building its vocabulary
text_vec_layer_es = tf.keras.layers.TextVectorization(
    max_tokens=vocab_size,
    output_sequence_length=max_length,
)
text_vec_layer_es.adapt(["startofseq " + s + " endofseq" for s in sentences_es])

# Splitting to train/valid sets

# Number of sentence pairs used for training; everything after that index is validation data.
# Kept in one place so the six slices below cannot drift apart.
n_train = 100_000

# Encoder inputs: the raw English sentences
X_train = tf.constant(sentences_en[:n_train])
X_valid = tf.constant(sentences_en[n_train:])

# Decoder inputs: the Spanish sentences prefixed with the "startofseq" token
X_train_dec = tf.constant([f"startofseq {s}" for s in sentences_es[:n_train]])
X_valid_dec = tf.constant([f"startofseq {s}" for s in sentences_es[n_train:]])

# Labels: the same Spanish sentences suffixed with the "endofseq" token, already vectorized
Y_train = text_vec_layer_es([f"{s} endofseq" for s in sentences_es[:n_train]])
Y_valid = text_vec_layer_es([f"{s} endofseq" for s in sentences_es[n_train:]])

# Input layers: each example fed to the model is a single scalar string (one raw sentence)
encoder_inputs = tf.keras.layers.Input(shape=(), dtype=tf.string)
decoder_inputs = tf.keras.layers.Input(shape=(), dtype=tf.string)

In [13]:
# Vectorizing and embedding 

# Same value that was passed as output_sequence_length to the TextVectorization layers
max_length = 50
# Width of the token and positional embedding vectors
embed_size = 128

# Raw input strings -> integer token ids, one vectorizer per language
encoder_input_ids = text_vec_layer_en(encoder_inputs)
decoder_input_ids = text_vec_layer_es(decoder_inputs)

# Separate token-embedding tables per language; mask_zero=True treats id 0 as padding to be masked
encoder_embedding_layer = tf.keras.layers.Embedding(vocab_size, embed_size, mask_zero=True)
decoder_embedding_layer = tf.keras.layers.Embedding(vocab_size, embed_size, mask_zero=True)

encoder_embeddings = encoder_embedding_layer(encoder_input_ids)
decoder_embeddings = decoder_embedding_layer(decoder_input_ids)

# Trainable positional embeddings: one Embedding layer, looked up by position index and shared
# by the encoder and the decoder; its output is added to the token embeddings.
# NOTE: pos_embed_layer, encoder_in and decoder_in are reassigned by the PositionalEncoding cell
# further down; batch_max_len_dec is reused later to build the decoder's causal mask.
pos_embed_layer = tf.keras.layers.Embedding(max_length, embed_size)
batch_max_len_enc = tf.shape(encoder_embeddings)[1]
encoder_in = encoder_embeddings + pos_embed_layer(tf.range(batch_max_len_enc))
batch_max_len_dec = tf.shape(decoder_embeddings)[1]
decoder_in = decoder_embeddings + pos_embed_layer(tf.range(batch_max_len_dec))

In [12]:
# Positional encoding 

# Implementation of the positional encoding layer as described in the attention article.
# Uses sine and cosine wave functions to get the positional vectors.

class PositionalEncoding(tf.keras.layers.Layer):
    """Adds fixed (non-trainable) sinusoidal positional encodings to its inputs.

    A table of shape (1, max_length, embed_size) is precomputed once: even
    feature indices hold sines and odd feature indices hold cosines of
    ``position / 10_000 ** (2k / embed_size)``.

    Args:
        max_length: number of positions to precompute encodings for.
        embed_size: size of the last input dimension; must be even.
        dtype: dtype of the layer and of the precomputed table.
        **kwargs: forwarded to ``tf.keras.layers.Layer``.
    """

    def __init__(self, max_length, embed_size, dtype=tf.float32, **kwargs):
        super().__init__(dtype=dtype, **kwargs)
        assert embed_size % 2 == 0, "embed_size must be even"
        # Kept so that get_config() can describe how to rebuild this layer
        self.max_length = max_length
        self.embed_size = embed_size
        p, i = np.meshgrid(np.arange(max_length),
                           2 * np.arange(embed_size // 2))
        # Argument shared by the sine and cosine terms, computed once;
        # shape is (embed_size // 2, max_length), hence the transposes below
        angles = p / 10_000 ** (i / embed_size)
        pos_emb = np.empty((1, max_length, embed_size))
        pos_emb[0, :, ::2] = np.sin(angles).T
        pos_emb[0, :, 1::2] = np.cos(angles).T
        self.pos_encodings = tf.constant(pos_emb.astype(self.dtype))
        # Let an incoming Keras mask pass through this layer unchanged
        self.supports_masking = True

    def call(self, inputs):
        """Return ``inputs`` plus the encodings for the first ``inputs.shape[1]`` positions."""
        batch_max_length = tf.shape(inputs)[1]
        return inputs + self.pos_encodings[:, :batch_max_length]

    def get_config(self):
        """Return the constructor arguments so the layer can be serialized and re-created."""
        config = super().get_config()
        config.update({"max_length": self.max_length, "embed_size": self.embed_size})
        return config


# Replace the earlier pos_embed_layer with the sinusoidal encoding; the same layer instance
# is applied to the encoder embeddings first and then to the decoder embeddings.
pos_embed_layer = PositionalEncoding(max_length=max_length, embed_size=embed_size)
encoder_in = pos_embed_layer(encoder_embeddings)
decoder_in = pos_embed_layer(decoder_embeddings)

In [None]:
# Encoder - Multihead attention 

# Hyperparameters; N, num_heads, dropout_rate and n_units are reused by the decoder cell.
N = 2 # number of stacked blocks (the same N is used for the decoder loop)
num_heads = 8
dropout_rate = 0.1
n_units = 128 # for the first dense layer in each feedforward block 

# Ignoring pad tokens: True wherever the token id is not 0.
# The extra axis is inserted right after the batch axis before the mask is handed to attention.
encoder_pad_mask = tf.math.not_equal(encoder_input_ids, 0)[:, tf.newaxis] 
Z = encoder_in

# Each iteration creates a new set of layers (no weight sharing between blocks)
for _ in range(N): 
  # Keep the block input for the residual connection
  skip=Z

  # Self-attention: Z is used as query and value (key defaults to value when not given)
  attn_layer = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_size, dropout=dropout_rate)
  Z = attn_layer(Z, value=Z, attention_mask=encoder_pad_mask)
  # Residual connection followed by layer normalization
  Z = tf.keras.layers.LayerNormalization()(tf.keras.layers.Add()([Z, skip])) 
  
  skip=Z

  # Feed-forward sub-block: widen/transform with ReLU, project back to embed_size, then dropout
  Z = tf.keras.layers.Dense(n_units, activation="relu")(Z)
  Z = tf.keras.layers.Dense(embed_size)(Z)
  Z = tf.keras.layers.Dropout(dropout_rate)(Z)

  # Residual connection
  Z = tf.keras.layers.LayerNormalization()(tf.keras.layers.Add()([Z, skip]))

In [None]:
# Decoder

# Masking pad AND masking future values (decoder only looks at past values to get context)
# True wherever the decoder token id is not 0; an axis is inserted right after the batch axis.
decoder_pad_mask = tf.math.not_equal(decoder_input_ids, 0)[:, tf.newaxis]
# Creates a lower triangular matrix which we'll use to ignore future tokens
# (band_part(..., -1, 0) keeps the diagonal and everything below it)
causal_mask = tf.linalg.band_part(tf.ones((batch_max_len_dec, batch_max_len_dec), tf.bool), -1, 0)

encoder_outputs = Z  # let's save the encoder's final outputs
Z = decoder_in  # the decoder starts with its own inputs
# Each iteration creates a new set of layers (no weight sharing between blocks)
for _ in range(N):
    # First attention is for self-attention (decoder to decoder attention)
    # A position is attended to only if it is not padding AND not in the future.
    skip = Z
    attn_layer = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_size, dropout=dropout_rate)
    Z = attn_layer(Z, value=Z, attention_mask=causal_mask & decoder_pad_mask)
    Z = tf.keras.layers.LayerNormalization()(tf.keras.layers.Add()([Z, skip]))

    # Second attention is for attention between encoder outputs and decoder inputs
    # Queries come from the decoder; values come from the encoder, masked by the encoder padding mask.
    skip = Z
    attn_layer = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_size, dropout=dropout_rate)
    Z = attn_layer(Z, value=encoder_outputs, attention_mask=encoder_pad_mask)
    Z = tf.keras.layers.LayerNormalization()(tf.keras.layers.Add()([Z, skip]))

    # Feed-forward sub-block followed by residual connection and layer normalization.
    # NOTE(review): the encoder's feed-forward sub-block applies Dropout(dropout_rate) after the
    # second Dense layer, but this one does not — confirm whether the difference is intentional.
    skip = Z
    Z = tf.keras.layers.Dense(n_units, activation="relu")(Z)
    Z = tf.keras.layers.Dense(embed_size)(Z)
    Z = tf.keras.layers.LayerNormalization()(tf.keras.layers.Add()([Z, skip]))

In [None]:
# Compile and fit

# Output head: for every decoder position, a softmax distribution over the Spanish vocabulary
Y_proba = tf.keras.layers.Dense(vocab_size, activation="softmax")(Z)

# Assemble the functional model from the two string inputs to the token probabilities
model = tf.keras.Model(inputs=[encoder_inputs, decoder_inputs], outputs=[Y_proba])

# Labels are integer token ids, hence the sparse variant of the cross-entropy loss
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer="nadam",
    metrics=["accuracy"],
)

# Only keep the checkpoint with the best validation accuracy seen so far
model_ckpt = tf.keras.callbacks.ModelCheckpoint(
    "translation-transformer",
    monitor="val_accuracy",
    save_best_only=True,
)
model.fit(
    (X_train, X_train_dec),
    Y_train,
    epochs=10,
    validation_data=((X_valid, X_valid_dec), Y_valid),
    callbacks=[model_ckpt],
)

Training was run on Kaggle; the trained model is loaded from disk in the next cell.

In [14]:
# Load a previously saved copy of the model; this rebinds `model`, replacing the one built above.
# NOTE(review): this path ("models/transformer-translation") differs from the ModelCheckpoint path
# used in the training cell ("translation-transformer") — presumably the model trained elsewhere
# was copied here manually; confirm.
model = tf.keras.models.load_model("models/transformer-translation")



In [15]:
# Translation
def translate(sentence_en):
  """Greedily translate one English sentence into Spanish, one token at a time.

  Relies on the notebook globals `model`, `text_vec_layer_es` and `max_length`.
  At step k the decoder is fed "startofseq" plus the words predicted so far, and
  the most probable token at output position k is appended to the translation.
  Decoding stops when "endofseq" is predicted or after `max_length` steps.

  Args:
    sentence_en: the English sentence to translate (a single string).

  Returns:
    The predicted Spanish words joined by single spaces, without the
    "startofseq"/"endofseq" markers.
  """
  # Silence TensorFlow logging while predicting, remembering the caller's level
  # so it can be restored afterwards (instead of forcing it to INFO).
  logger = tf.get_logger()
  previous_level = logger.level
  logger.setLevel('ERROR')

  # Neither the vocabulary nor the encoder input changes between steps: build them once.
  vocabulary = text_vec_layer_es.get_vocabulary()
  X = np.array([sentence_en])

  translation = ""
  try:
    for word_idx in range(max_length):
      X_dec = np.array(["startofseq " + translation])
      # verbose=0 keeps predict() from printing a progress bar on every decoding step
      y_proba = model.predict((X, X_dec), verbose=0)[0, word_idx]

      predicted_word = vocabulary[np.argmax(y_proba)]
      if predicted_word == "endofseq":
        break

      translation += " " + predicted_word
  finally:
    # Restore the logging level even if prediction raised
    logger.setLevel(previous_level)

  return translation.strip()

In [22]:
# Sample translations; the trailing comments are the author's own quality ratings
for sentence_en in (
    "I like soccer",  # good
    "I like soccer and I like to go to the beach",  # good
    "Who saw the movie? I want to know if it's good",  # not bad
):
  print(translate(sentence_en))


me gusta el fútbol
me gusta el fútbol y me gusta ir a la playa
quién vio la película que quiero saber si es bueno
