In [5]:
import os
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_hub as hub

from pathlib import Path
from pprint import pprint

In [2]:
# Get the data

# Each line of the text file is an English sentence and its Spanish translation, separated by a tab

url = "https://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip"
path = tf.keras.utils.get_file("spa-eng.zip", origin=url, cache_dir="datasets", extract=True)

# Decode explicitly as UTF-8: the file contains accented characters and "¡"/"¿",
# and read_text() would otherwise fall back to the platform's locale encoding.
text = (Path(path).with_name("spa-eng") / "spa.txt").read_text(encoding="utf-8")

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip


In [6]:
# Clean and separate the text into English and Spanish sets

# Remove the Spanish inverted punctuation marks
text = text.replace("¡", "").replace("¿", "")

# One [english, spanish] pair per line (the two sentences are tab-separated)
pairs = [line.split("\t") for line in text.splitlines()]

# Shuffle with a fixed seed so that any split made from these pairs is the same on
# every run. A local generator is used so NumPy's global random state is untouched.
shuffle_seed = 42
np.random.default_rng(shuffle_seed).shuffle(pairs)

sentences_en, sentences_es = zip(*pairs)

In [8]:
# Review the data: print the first three English => Spanish pairs

for pair_idx in range(3):
    print(f"{sentences_en[pair_idx]} => {sentences_es[pair_idx]}")

Are you happy right now? => Eres feliz ahora mismo?
This photo was taken in Boston three years ago. => Esta foto fue tomada en Boston hace tres años.
I am planting beans in my garden. => Estoy plantando judías en mi jardín.


In [9]:
# Vectorizing the text

# Each language gets its own TextVectorization layer. Every vectorized sequence is
# exactly max_length tokens long: shorter sentences are padded, longer ones are cropped.
vocab_size = 1000
max_length = 50

# English layer: vocabulary learned from the raw English sentences
text_vec_layer_en = tf.keras.layers.TextVectorization(
    max_tokens=vocab_size,
    output_sequence_length=max_length,
)
text_vec_layer_en.adapt(sentences_en)

# Spanish layer: every sentence is wrapped in startofseq / endofseq before adapting,
# so both markers become part of the Spanish vocabulary
text_vec_layer_es = tf.keras.layers.TextVectorization(
    max_tokens=vocab_size,
    output_sequence_length=max_length,
)
sentences_es_marked = [f"startofseq {s} endofseq" for s in sentences_es]
text_vec_layer_es.adapt(sentences_es_marked)

In [12]:
# Inspecting the vocabulary: first 10 entries of each language (English first, then Spanish)

for vec_layer in (text_vec_layer_en, text_vec_layer_es):
    print(vec_layer.get_vocabulary()[:10])

['', '[UNK]', 'the', 'i', 'to', 'you', 'tom', 'a', 'is', 'he']
['', '[UNK]', 'startofseq', 'endofseq', 'de', 'que', 'a', 'no', 'tom', 'la']


In [29]:
# Splitting to train/valid sets

# The first train_size pairs are used for training, the remainder for validation
train_size = 100_000
train_en, valid_en = sentences_en[:train_size], sentences_en[train_size:]
train_es, valid_es = sentences_es[:train_size], sentences_es[train_size:]

# Encoder inputs: the raw English sentences
X_train = tf.constant(train_en)
X_valid = tf.constant(valid_en)

# Decoder inputs: the Spanish sentences prefixed with the "startofseq" token
X_train_dec = tf.constant([f"startofseq {s}" for s in train_es])
X_valid_dec = tf.constant([f"startofseq {s}" for s in valid_es])

# Labels: the same Spanish sentences followed by the "endofseq" token, as token ids
Y_train = text_vec_layer_es([f"{s} endofseq" for s in train_es])
Y_valid = text_vec_layer_es([f"{s} endofseq" for s in valid_es])

In [30]:
# Building the encoder-decoder

# The model has two inputs, so it is built with the functional API rather than Sequential

# String inputs: one for the encoder (English) and one for the decoder (Spanish)
encoder_inputs = tf.keras.layers.Input(shape=(), dtype=tf.string)
decoder_inputs = tf.keras.layers.Input(shape=(), dtype=tf.string)

# Sentences -> token ids -> embeddings (id 0 is the padding token and is masked)
embed_size = 128
encoder_input_ids = text_vec_layer_en(encoder_inputs)
decoder_input_ids = text_vec_layer_es(decoder_inputs)

encoder_embedding_layer = tf.keras.layers.Embedding(
    input_dim=vocab_size, output_dim=embed_size, mask_zero=True
)
decoder_embedding_layer = tf.keras.layers.Embedding(
    input_dim=vocab_size, output_dim=embed_size, mask_zero=True
)

encoder_embeddings = encoder_embedding_layer(encoder_input_ids)
decoder_embeddings = decoder_embedding_layer(decoder_input_ids)

# Encoder: returns its output sequence plus the final short-term (h) and long-term (c) states
encoder = tf.keras.layers.LSTM(units=512, return_sequences=True, return_state=True)
encoder_outputs, encoder_state_h, encoder_state_c = encoder(encoder_embeddings)
encoder_state = [encoder_state_h, encoder_state_c]

# Decoder: starts from the encoder's final states
decoder = tf.keras.layers.LSTM(units=512, return_sequences=True)
decoder_outputs = decoder(decoder_embeddings, initial_state=encoder_state)

# Per-position probability distribution over the Spanish vocabulary
output_layer = tf.keras.layers.Dense(units=vocab_size, activation="softmax")
Y_proba = output_layer(decoder_outputs)

# Creating the model
model = tf.keras.Model(inputs=[encoder_inputs, decoder_inputs], outputs=[Y_proba])
model.compile(
    optimizer="nadam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)


In [None]:
# Training: the model takes (encoder input, decoder input) pairs and predicts the label token ids
train_inputs = (X_train, X_train_dec)
valid_inputs = (X_valid, X_valid_dec)
model.fit(train_inputs, Y_train, epochs=10, validation_data=(valid_inputs, Y_valid))

Training was done on Kaggle; the saved model is loaded in the next cell.

In [35]:
# Loading the saved model; this rebinds `model`, replacing the untrained one built above
# NOTE(review): presumably this is the model trained on Kaggle and saved to this path - confirm
saved_model_path = "models/enc-dec-translation"
model = tf.keras.models.load_model(saved_model_path)

2023-11-09 08:42:39.216970: W tensorflow/core/common_runtime/graph_constructor.cc:839] Node 'cond/while' has 13 outputs but the _output_shapes attribute specifies shapes for 46 outputs. Output shapes may be inaccurate.
2023-11-09 08:42:40.232138: W tensorflow/core/common_runtime/graph_constructor.cc:839] Node 'cond/while' has 13 outputs but the _output_shapes attribute specifies shapes for 46 outputs. Output shapes may be inaccurate.
2023-11-09 08:42:40.706297: W tensorflow/core/common_runtime/graph_constructor.cc:839] Node 'cond/while' has 13 outputs but the _output_shapes attribute specifies shapes for 46 outputs. Output shapes may be inaccurate.
2023-11-09 08:42:41.099340: W tensorflow/core/common_runtime/graph_constructor.cc:839] Node 'cond/while' has 13 outputs but the _output_shapes attribute specifies shapes for 46 outputs. Output shapes may be inaccurate.
2023-11-09 08:42:41.112764: W tensorflow/core/common_runtime/graph_constructor.cc:839] Node 'cond' has 5 outputs but the _ou

In [45]:
# Translation
def translate(sentence_en):
    """Greedily translate an English sentence into Spanish, one word at a time.

    Relies on the notebook globals `model`, `text_vec_layer_es` and `max_length`.
    At step `word_idx` the decoder is fed "startofseq" plus the words produced so
    far, and the most probable token at position `word_idx` is appended to the
    translation. Decoding stops when "endofseq" is predicted or after
    `max_length` words.

    TensorFlow's logger is set to ERROR while decoding and is restored to its
    previous level afterwards, even if prediction raises.

    Args:
        sentence_en: The English sentence to translate.

    Returns:
        The predicted Spanish words joined by single spaces, without the
        start/end markers.
    """
    logger = tf.get_logger()
    previous_level = logger.level
    logger.setLevel('ERROR')

    # Loop invariants: the id -> word table and the encoder input never change
    vocabulary = text_vec_layer_es.get_vocabulary()
    X = np.array([sentence_en])

    translation = ""
    try:
        for word_idx in range(max_length):
            X_dec = np.array(["startofseq " + translation])
            # Probabilities for the token at the position currently being decoded
            y_proba = model.predict((X, X_dec))[0, word_idx]

            predicted_word = vocabulary[np.argmax(y_proba)]
            if predicted_word == "endofseq":
                break

            translation += " " + predicted_word
    finally:
        # Restore whatever level was active before, rather than forcing INFO
        logger.setLevel(previous_level)

    return translation.strip()

# Try the basic encoder-decoder on a short and a longer sentence
demo_sentences = [
    "I like soccer",                                # good
    "I like soccer and I like to go to the beach",  # bad...
]
for demo_sentence in demo_sentences:
    print(translate(demo_sentence))


me gusta el fútbol
me gusta la playa pero prefiero a la playa


## Bidirectional RNNs

Improving the translation model by letting the encoder read each sentence in both directions, so that the representation of every word also reflects the words that come after it.

In [None]:
# Building the encoder-decoder

# The model has two inputs, so it is built with the functional API rather than Sequential

# String inputs: one for the encoder (English) and one for the decoder (Spanish)
encoder_inputs = tf.keras.layers.Input(shape=(), dtype=tf.string)
decoder_inputs = tf.keras.layers.Input(shape=(), dtype=tf.string)

# Sentences -> token ids -> embeddings (id 0 is the padding token and is masked)
embed_size = 128
encoder_input_ids = text_vec_layer_en(encoder_inputs)
decoder_input_ids = text_vec_layer_es(decoder_inputs)

encoder_embedding_layer = tf.keras.layers.Embedding(
    input_dim=vocab_size, output_dim=embed_size, mask_zero=True
)
decoder_embedding_layer = tf.keras.layers.Embedding(
    input_dim=vocab_size, output_dim=embed_size, mask_zero=True
)

encoder_embeddings = encoder_embedding_layer(encoder_input_ids)
decoder_embeddings = decoder_embedding_layer(decoder_input_ids)

# Encoder: an LSTM wrapped in Bidirectional, so it reads the sentence forwards and backwards
encoder = tf.keras.layers.Bidirectional(
    tf.keras.layers.LSTM(units=256, return_sequences=True, return_state=True)
)

# Besides the output sequence, the bidirectional encoder returns four states, in this order:
# forward short-term, forward long-term, reverse short-term, reverse long-term
(
    encoder_outputs,
    forward_state_h,
    forward_state_c,
    reverse_state_h,
    reverse_state_c,
) = encoder(encoder_embeddings)

# Concatenate the forward and reverse halves (2 x 256) to match the decoder's 512 units
encoder_state = [
    tf.concat([forward_state_h, reverse_state_h], axis=-1),  # short-term
    tf.concat([forward_state_c, reverse_state_c], axis=-1),  # long-term
]

# Decoder: starts from the concatenated encoder states
decoder = tf.keras.layers.LSTM(units=512, return_sequences=True)
decoder_outputs = decoder(decoder_embeddings, initial_state=encoder_state)

# Per-position probability distribution over the Spanish vocabulary
output_layer = tf.keras.layers.Dense(units=vocab_size, activation="softmax")
Y_proba = output_layer(decoder_outputs)

# Creating the model
model = tf.keras.Model(inputs=[encoder_inputs, decoder_inputs], outputs=[Y_proba])
model.compile(
    optimizer="nadam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)


Training was done on Kaggle; the saved model is loaded in the next cell.

In [51]:
# Load the saved bidirectional model; this rebinds `model`, which translate() reads as a global
saved_model_path = "models/enc-dec-translation-bi"
model = tf.keras.models.load_model(saved_model_path)

2023-11-09 12:56:33.180656: W tensorflow/core/common_runtime/graph_constructor.cc:839] Node 'cond/while' has 14 outputs but the _output_shapes attribute specifies shapes for 48 outputs. Output shapes may be inaccurate.
2023-11-09 12:56:33.228569: W tensorflow/core/common_runtime/graph_constructor.cc:839] Node 'cond' has 5 outputs but the _output_shapes attribute specifies shapes for 48 outputs. Output shapes may be inaccurate.
2023-11-09 12:56:37.531740: W tensorflow/core/common_runtime/graph_constructor.cc:839] Node 'cond/while' has 13 outputs but the _output_shapes attribute specifies shapes for 46 outputs. Output shapes may be inaccurate.
2023-11-09 12:56:37.580934: W tensorflow/core/common_runtime/graph_constructor.cc:839] Node 'cond' has 5 outputs but the _output_shapes attribute specifies shapes for 46 outputs. Output shapes may be inaccurate.
2023-11-09 12:56:41.971827: W tensorflow/core/common_runtime/graph_constructor.cc:839] Node 'cond/while' has 14 outputs but the _output_sh

In [53]:
# Try the bidirectional model on sentences of increasing length
demo_sentences = [
    "I like soccer",                                # good
    "I like soccer and I like to go to the beach",  # also good...
    # real bad...
    "I like soccer and I like to go to the beach. I also enjoy basketball and eating cheese",
]
for demo_sentence in demo_sentences:
    print(translate(demo_sentence))

me gusta el fútbol
me gusta el fútbol y me gusta ir a la playa
me gusta el almuerzo y yo me gusta ir a la escuela y no [UNK] a comer a la [UNK] y [UNK]


## Attention

Making the encoder-decoder model more robust to longer sequences by adding an attention layer, which lets the decoder look at all of the encoder's outputs at every step. This builds on the bidirectional
RNN model.

In [48]:
# Building the encoder-decoder

# The model has two inputs, so it is built with the functional API rather than Sequential

# String inputs: one for the encoder (English) and one for the decoder (Spanish)
encoder_inputs = tf.keras.layers.Input(shape=(), dtype=tf.string)
decoder_inputs = tf.keras.layers.Input(shape=(), dtype=tf.string)

# Sentences -> token ids -> embeddings (id 0 is the padding token and is masked)
embed_size = 128
encoder_input_ids = text_vec_layer_en(encoder_inputs)
decoder_input_ids = text_vec_layer_es(decoder_inputs)

encoder_embedding_layer = tf.keras.layers.Embedding(
    input_dim=vocab_size, output_dim=embed_size, mask_zero=True
)
decoder_embedding_layer = tf.keras.layers.Embedding(
    input_dim=vocab_size, output_dim=embed_size, mask_zero=True
)

encoder_embeddings = encoder_embedding_layer(encoder_input_ids)
decoder_embeddings = decoder_embedding_layer(decoder_input_ids)

# Encoder: an LSTM wrapped in Bidirectional, so it reads the sentence forwards and backwards
encoder = tf.keras.layers.Bidirectional(
    tf.keras.layers.LSTM(units=256, return_sequences=True, return_state=True)
)

# Besides the output sequence, the bidirectional encoder returns four states, in this order:
# forward short-term, forward long-term, reverse short-term, reverse long-term
(
    encoder_outputs,
    forward_state_h,
    forward_state_c,
    reverse_state_h,
    reverse_state_c,
) = encoder(encoder_embeddings)

# Concatenate the forward and reverse halves (2 x 256) to match the decoder's 512 units
encoder_state = [
    tf.concat([forward_state_h, reverse_state_h], axis=-1),  # short-term
    tf.concat([forward_state_c, reverse_state_c], axis=-1),  # long-term
]

# Decoder: starts from the concatenated encoder states
decoder = tf.keras.layers.LSTM(units=512, return_sequences=True)
decoder_outputs = decoder(decoder_embeddings, initial_state=encoder_state)

# Attention: the decoder outputs are the queries, the encoder outputs are the values
attention_layer = tf.keras.layers.Attention()
attention_outputs = attention_layer([decoder_outputs, encoder_outputs])

# Per-position probability distribution over the Spanish vocabulary, computed from the attention outputs
output_layer = tf.keras.layers.Dense(units=vocab_size, activation="softmax")
Y_proba = output_layer(attention_outputs)

# Creating the model
model = tf.keras.Model(inputs=[encoder_inputs, decoder_inputs], outputs=[Y_proba])
model.compile(
    optimizer="nadam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)


Training was done on Kaggle; the saved model is loaded in the next cell.

In [54]:
# Load the saved attention model; this rebinds `model`, which translate() reads as a global
saved_model_path = "models/enc-dec-translation-attention"
model = tf.keras.models.load_model(saved_model_path)

2023-11-09 13:20:18.410287: W tensorflow/core/common_runtime/graph_constructor.cc:839] Node 'cond/while' has 13 outputs but the _output_shapes attribute specifies shapes for 46 outputs. Output shapes may be inaccurate.
2023-11-09 13:20:19.053898: W tensorflow/core/common_runtime/graph_constructor.cc:839] Node 'cond/while' has 14 outputs but the _output_shapes attribute specifies shapes for 48 outputs. Output shapes may be inaccurate.
2023-11-09 13:20:19.278868: W tensorflow/core/common_runtime/graph_constructor.cc:839] Node 'cond/while' has 14 outputs but the _output_shapes attribute specifies shapes for 48 outputs. Output shapes may be inaccurate.
2023-11-09 13:20:25.648214: W tensorflow/core/common_runtime/graph_constructor.cc:839] Node 'cond/while' has 13 outputs but the _output_shapes attribute specifies shapes for 46 outputs. Output shapes may be inaccurate.
2023-11-09 13:20:25.693215: W tensorflow/core/common_runtime/graph_constructor.cc:839] Node 'cond' has 5 outputs but the _ou

In [58]:
# Try the attention model on sentences of increasing length
demo_sentences = [
    "I like soccer",                                # good
    "I like soccer and I like to go to the beach",  # also good...
    # real bad still...
    "I like soccer and I like to go to the beach. I also enjoy basketball and eating cheese",
]
for demo_sentence in demo_sentences:
    print(translate(demo_sentence))

me gusta el fútbol
me gusta [UNK] y me gusta ir a la playa
me gusta [UNK] y a la playa me gusta ir me gusta la playa [UNK] como [UNK]
