# Machine Translation

**Recurrent Neural Network that accepts English text as input and returns the French translation**

**Natural Language Processing**

This notebook is based on the Natural Language Processing [capstone project](https://github.com/udacity/aind2-nlp-capstone) of [Udacity's Artificial Intelligence Nanodegree](https://www.udacity.com/course/artificial-intelligence-nanodegree--nd889).

The dataset is a reduced sentence set taken from [WMT](http://www.statmt.org/). The `small_vocab_en` file contains English sentences, and the `small_vocab_fr` file contains their French translations. Punctuation has already been delimited with spaces, and all text has been converted to lowercase.

In [None]:
import os, sys
import numpy as np

# Lower TensorFlow's C++ log verbosity; this is set before TensorFlow is
# imported below so that it takes effect during the import.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
# Add the parent directory to the import path (presumably where the local
# `ds_boost` helper package lives — confirm against the repository layout).
sys.path.append("../")
import ds_boost
from tensorflow import keras

log = ds_boost.logger.init(level="DEBUG", save_log=False)

# NOTE(review): presumably changes the working directory so that the relative
# "data/..." paths used later resolve — confirm against ds_boost.
ds_boost.set_parent_execution_path()
ds_boost.info_system()
# Seed NumPy's global random generator.
np.random.seed(9)

# IPython magics: inline matplotlib figures and automatic module reloading.
%matplotlib inline
%load_ext autoreload
%autoreload 2

## 1. Load and prepare the data

In [None]:
# Load the parallel corpus: entry i of the English list is paired with entry i
# of the French list. The encoding is given explicitly because the French text
# contains accented characters and the platform default encoding is not
# guaranteed to be UTF-8.
# NOTE: split("\n") keeps a trailing empty string when a file ends with a newline.
with open("data/small_vocab_en", "r", encoding="utf-8") as f:
    english_sentences = f.read().split("\n")
with open("data/small_vocab_fr", "r", encoding="utf-8") as f:
    french_sentences = f.read().split("\n")

print(f"Number of sentences: {len(english_sentences)}\n")
# Show the first two sentence pairs as a sanity check.
for i in range(2):
    print(f"sample {i}:")
    print(f"{english_sentences[i]}  \n{french_sentences[i]} \n")

In [None]:
import collections

# Flatten each corpus into one list of whitespace-separated tokens, keyed by
# language name.
words = {}
for language, sentences in (("English", english_sentences), ("French", french_sentences)):
    words[language] = [token for line in sentences for token in line.split()]

# Report the total and the distinct token counts per language.
for language, tokens in words.items():
    unique_count = len(collections.Counter(tokens))
    print(f"{language}: {len(tokens)} words, {unique_count} unique words")

### Tokenize
Convert each word to a numerical word id (a low-complexity representation of the text).

In [None]:
from keras.preprocessing.text import Tokenizer


def tokenize(x):
    """
    Fit a fresh Keras Tokenizer on x and convert x to sequences of word ids.
    :param x: List of sentences/strings to be tokenized
    :return: Tuple of (tokenized x data, tokenizer used to tokenize x)
    """
    fitted_tokenizer = Tokenizer()
    # The vocabulary is learned from the same texts that are then encoded.
    fitted_tokenizer.fit_on_texts(x)

    return fitted_tokenizer.texts_to_sequences(x), fitted_tokenizer

### Padding
When batching sequences of word ids together, every sequence must have the same length. Since sentences vary in length, we pad the end of each sequence so that all of them have the same length.

In [None]:
from tensorflow.keras.utils import pad_sequences


def pad(x, length=None):
    """
    Pad every sequence in x at its end so that all share one length.
    :param x: List of sequences.
    :param length: Length to pad the sequence to. If None, longest sequence length in x.
    :return: Padded numpy array of sequences
    """
    # padding="post" appends the padding after the real word ids.
    padded = pad_sequences(x, maxlen=length, padding="post")
    return padded

### Preprocess pipeline

In [None]:
def preprocess(x, y, length=None):
    """
    Tokenize and pad the feature and label sentences.
    :param x: Feature List of sentences
    :param y: Label List of sentences
    :param length: Length passed to pad() for both x and y. If None, each side
                   is padded to its own longest sequence.
    :return: Tuple of (Preprocessed x, Preprocessed y, x tokenizer, y tokenizer)
    """
    # Features and labels each get their own tokenizer.
    x_ids, x_tk = tokenize(x)
    y_ids, y_tk = tokenize(y)

    padded_x = pad(x_ids, length)
    padded_y = pad(y_ids, length)

    # Keras's sparse_categorical_crossentropy function requires the labels to
    # be in 3 dims, so append a trailing axis of size 1.
    labels = padded_y.reshape(padded_y.shape + (1,))

    return padded_x, labels, x_tk, y_tk


# `length` is left at its default (None), so features and labels are each
# padded to their own longest sentence; pass e.g. length=150 to force a fixed
# size instead.
x, y, x_tk, y_tk = preprocess(x=english_sentences, y=french_sentences)
print("Data Preprocessed")

### Split the data into training and test sets

In [None]:
# Hold out the tail of the corpus for prediction. Everything before the last
# 10 entries is used for training; of those last 10 entries the final one is
# dropped, leaving 9 test samples.
# NOTE(review): presumably the final entry is the empty string produced by a
# trailing newline in the data files — confirm against the data.
x_train, y_train = x[:-10], y[:-10]
x_test, y_test = x[-10:-1], y[-10:-1]  # last sentence removed
# Slice the raw sentences with the same bounds so they stay aligned one-to-one
# with x_test / y_test (9 entries each).
test_english_sentences, test_french_sentences = (
    english_sentences[-10:-1],
    french_sentences[-10:-1],
)

### Ids Back to Text
The function `logits_to_text` bridges the gap between the logits produced by the neural network and the French translation.

In [None]:
def logits_to_text(logits, tokenizer, show_pad=True):
    """
    Decode per-position logits into a space-separated string of words.
    :param logits: Logits from a neural network; the highest-scoring id along
                   axis 1 is taken for each row
    :param tokenizer: Keras Tokenizer fit on the labels
    :param show_pad: If True, id 0 is rendered as "<PAD>"; otherwise as an
                     empty string (the separating spaces are still emitted)
    :return: String that represents the text of the logits
    """
    word_index = tokenizer.word_index
    # Invert the word -> id mapping of the tokenizer.
    id_to_word = dict(zip(word_index.values(), word_index.keys()))
    # Id 0 is never assigned to a word by the mapping above; it stands for padding.
    if show_pad:
        id_to_word[0] = "<PAD>"
    else:
        id_to_word[0] = ""

    predicted_ids = np.argmax(logits, 1)
    return " ".join(id_to_word[word_id] for word_id in predicted_ids)

## 2. Recurrent neural network
Model that incorporates encoder-decoder, embedding and bidirectional RNNs: 
- An embedding is a vector representation of the word that is close to similar words in $n$-dimensional space, where the $n$ represents the size of the embedding vectors 
- The encoder creates a matrix representation of the sentence
- The decoder takes this matrix as input and predicts the translation as output

In [None]:
from keras.models import Sequential
from keras.layers import GRU, Dense, TimeDistributed, LSTM, Bidirectional, RepeatVector
from tensorflow.keras.layers import Embedding
from keras.layers import Dropout
from keras.losses import sparse_categorical_crossentropy


def rnn_model(
    input_shape,
    output_sequence_length,
    english_vocab_size,
    french_vocab_size,
    learning_rate=0.01,
):
    """
    Build a model with embedding, encoder-decoder, and bidirectional RNN

    Layers, in order: Embedding -> Bidirectional GRU (encoder, final state
    only) -> Dense(128, relu) -> RepeatVector(output_sequence_length) ->
    Bidirectional GRU (decoder, full sequence) -> TimeDistributed softmax over
    the French vocabulary.

    :param input_shape: Tuple of input shape; the first (batch) dimension is ignored
    :param output_sequence_length: Length of output sequence
    :param english_vocab_size: Number of unique English words in the dataset
    :param french_vocab_size: Number of unique French words in the dataset
    :param learning_rate: Learning rate of the Adam optimizer
    :return: Keras model built and compiled, but not trained
    """
    model = Sequential()

    # Embedding width scales with the vocabulary; keep it at least 1 so that
    # vocabularies with fewer than 10 words do not produce a zero-width embedding.
    vector_size = max(1, english_vocab_size // 10)

    # The +1 on both vocabulary sizes accounts for id 0, which is used for padding.
    model.add(
        Embedding(
            english_vocab_size + 1,
            vector_size,
            input_shape=input_shape[1:],
            mask_zero=False,
        )
    )
    # Encoder: only the final state is returned (no return_sequences).
    model.add(Bidirectional(GRU(output_sequence_length)))
    model.add(Dense(128, activation="relu"))

    # Decoder: feed the encoded vector once per output position.
    model.add(RepeatVector(output_sequence_length))
    model.add(Bidirectional(GRU(128, return_sequences=True)))
    model.add(TimeDistributed(Dense(french_vocab_size + 1, activation="softmax")))
    # summary() prints the table itself and returns None, so it is not wrapped
    # in print() (which would emit an extra "None" line).
    model.summary()

    model.compile(
        loss=sparse_categorical_crossentropy,
        optimizer=keras.optimizers.Adam(learning_rate),
        metrics=["accuracy"],
    )

    return model


# Vocabulary sizes come from the fitted tokenizers; the output length is the
# padded length of the label sequences.
model = rnn_model(
    input_shape=x_train.shape,
    output_sequence_length=y_train.shape[1],
    english_vocab_size=len(x_tk.word_index),
    french_vocab_size=len(y_tk.word_index),
)

### Train the model

In [None]:
print("Training...")
# Stop once validation accuracy has not improved for 3 consecutive epochs.
callbacks = [
    keras.callbacks.EarlyStopping(monitor="val_accuracy", patience=3, verbose=1),
]
# 20% of the training data is held out for validation.
history = model.fit(
    x_train,
    y_train,
    batch_size=1024,
    epochs=50,
    verbose=0,
    validation_split=0.2,
    callbacks=callbacks,
)
ds_boost.show_training(history)

### Evaluate the model

In [None]:
# The model is compiled with one loss and the "accuracy" metric, so index 1 of
# the evaluation result is the accuracy.
score = model.evaluate(x_test, y_test, verbose=0)
print(f"Test Accuracy: {score[1]:.2f}\n")

# NOTE: this rebinds the global `y` (previously the preprocessed labels) to the
# model predictions; rerun the preprocessing cell before re-splitting the data.
y = model.predict(x_test)

# Print each held-out sentence next to its reference and predicted translation.
for sample_idx, sample_logits in enumerate(y):
    print(f"Sample: {test_english_sentences[sample_idx]}")
    print(f"Actual: {test_french_sentences[sample_idx]}")
    print(f"Predicted: {logits_to_text(sample_logits, y_tk, show_pad=False)}\n")