In [1]:
import tensorflow as tf

# Turn on memory growth for every visible GPU so memory is allocated on demand
# instead of being reserved up front.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)

    except RuntimeError as e:
        # Best effort: the error is only printed, so the notebook keeps running
        # (presumably raised when the GPUs were already initialized — confirm).
        print(e)

from helpers import load_config
from helpers.logging import print_status_bar
from helpers.evaluation import compute_bleu

from sklearn.model_selection import train_test_split
import string
import numpy as np
import io
import os
from time import time

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

In [2]:
# Load the run settings; later cells read the "dataset" and "model" sections.
config = load_config("config.json")

# Download Dataset

In [3]:
dataset_params = config["dataset"]
# Download the archive from the configured URL into ./datasets (relative to the
# current working directory) and unpack it.
path_to_zip = tf.keras.utils.get_file(os.path.join(os.getcwd(), "datasets", "fra-eng.zip"), origin=dataset_params["url"], extract=True)
# The sentence-pair file is looked up next to the downloaded archive
# (assumes the archive contains "fra.txt" — TODO confirm for the configured URL).
path_to_file = os.path.join(os.path.dirname(path_to_zip), "fra.txt")

# Preprocessing

In [4]:
def clean_sentence(sentence):
    """Pad every ASCII punctuation character with spaces and trim the ends.

    Each character of ``string.punctuation`` is replaced by itself surrounded
    by one space on each side. Only leading/trailing whitespace is removed
    afterwards, so runs of several spaces inside the sentence are kept.
    """
    # build the translation table: punctuation mark -> " mark "
    spaced = {}
    for mark in string.punctuation:
        spaced[mark] = " {0} ".format(mark)
    table = str.maketrans(spaced)

    # strip only the outer whitespace
    return sentence.translate(table).strip()

def preprocess_a_sentence(sentence):
    """Clean ``sentence`` and wrap it in the ``<sos>`` / ``<eos>`` markers."""
    template = '<sos> {} <eos>'
    # the cleaned text goes between the start and end of sequence markers
    return template.format(clean_sentence(sentence))

def load_dataset(path, num_examples=None):
    """Read a tab-separated parallel corpus and return its preprocessed columns.

    Every line is expected to look like ``first<TAB>second``. Only the first
    two tab-separated fields are used, so files that carry additional columns
    (for example an attribution column) still produce exactly two sequences
    and the caller's two-way unpacking keeps working.

    Args:
        path: path to the UTF-8 encoded text file.
        num_examples: maximum number of lines to use; ``None`` uses all lines.

    Returns:
        A ``zip`` iterator yielding one tuple of preprocessed sentences per
        column, in file column order.
    """
    with open(path, encoding='utf-8') as f:
        lines = f.read().strip().split("\n")
    # list of [first_column, second_column] pairs, each wrapped in <sos>/<eos>
    sentence_pairs = [
        [preprocess_a_sentence(sen) for sen in line.split('\t')[:2]]
        for line in lines[:num_examples]
    ]
    # transpose: list of pairs -> (all first columns, all second columns)
    return zip(*sentence_pairs)

def create_shifted_target(y):
    """Split ``y`` into two views offset by one position along axis 1.

    Returns a tuple ``(y without its last column, y without its first column)``.
    The caller uses the first as decoder input and the second as the target.
    """
    without_last = y[:, :-1]
    without_first = y[:, 1:]
    return without_last, without_first

In [5]:
def get_tokenizer(lang, top_k=None):
    """Fit a Keras ``Tokenizer`` on ``lang`` and encode ``lang`` with it.

    Args:
        lang: iterable of sentences (strings).
        top_k: forwarded as ``num_words`` to the tokenizer; ``None`` keeps all words.

    Returns:
        ``(sequences, tokenizer)`` where ``sequences`` is the post-padded id
        array for ``lang`` and ``tokenizer`` is the fitted tokenizer.
    """
    # Characters in this list are removed by the tokenizer. "'", "<" and ">"
    # are not listed, so apostrophes and the <sos>/<eos> markers survive.
    filtered_chars = '’,?!"#$%&()*+-/:;=.@[\\]^_`{|}~\t\n'
    tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=top_k, filters=filtered_chars)
    tokenizer.fit_on_texts(lang)

    # encode, then pad at the end of every sequence
    encoded = tokenizer.texts_to_sequences(lang)
    padded = tf.keras.preprocessing.sequence.pad_sequences(encoded, padding="post")
    return padded, tokenizer

def create_dataset(X, y, batch_size=None, buffer=False, prefetch=tf.data.experimental.AUTOTUNE):
    """Build an endlessly repeating ``tf.data`` pipeline of ``(X, y[0])`` examples.

    Args:
        X: tensors sliced along their first axis to form the model inputs.
        y: indexable container; only ``y[0]`` is used as the label tensor.
        batch_size: when not ``None``, batch with ``drop_remainder=True``.
        buffer: when truthy, shuffle with a buffer of ``X[0].shape[0]`` elements.
        prefetch: buffer size passed to ``prefetch``.

    Returns:
        The zipped dataset after optional shuffle/batch, then ``repeat()`` and
        ``prefetch``.
    """
    inputs = tf.data.Dataset.from_tensor_slices(X)
    labels = tf.data.Dataset.from_tensor_slices(y[0],)
    pipeline = tf.data.Dataset.zip((inputs, labels))

    if buffer:
        # shuffle buffer covers as many elements as X[0] has rows
        pipeline = pipeline.shuffle(X[0].shape[0])
    if batch_size is not None:
        pipeline = pipeline.batch(batch_size, drop_remainder=True)

    # repeat() makes the dataset infinite; callers must bound it with step counts
    return pipeline.repeat().prefetch(prefetch)

def padded_transform(X, tokenizer, X_max):
    """Encode the sentences ``X`` with ``tokenizer`` and post-pad them to ``X_max`` ids."""
    token_ids = tokenizer.texts_to_sequences(X)
    # maxlen fixes the width of the returned array; padding goes at the end
    return tf.keras.preprocessing.sequence.pad_sequences(token_ids, maxlen=X_max, padding="post")

def dataset_padded_transform(X, y, X_tokenizer, y_tokenizer, X_max, y_max):
    """Encode and pad ``X`` and ``y``, each with its own tokenizer and length.

    Returns the tuple ``(padded X, padded y)``.
    """
    padded_inputs = padded_transform(X, X_tokenizer, X_max)
    padded_targets = padded_transform(y, y_tokenizer, y_max)
    return padded_inputs, padded_targets

In [6]:
def dataset(input_lang, target_lang, batch_size, prefetch=tf.data.experimental.AUTOTUNE,
            valid_size=0.15, top_k=None, random_state=None):
    """Split sentence pairs into training/validation ``tf.data`` pipelines.

    The tokenizers are fitted on the training split only; the validation split
    is encoded with them and padded to the training lengths.

    Args:
        input_lang: source sentences.
        target_lang: target sentences, aligned with ``input_lang``.
        batch_size: batch size of both pipelines.
        prefetch: prefetch buffer size forwarded to ``create_dataset``.
        valid_size: ``test_size`` passed to ``train_test_split``.
        top_k: vocabulary cap forwarded to ``get_tokenizer``.
        random_state: seed forwarded to ``train_test_split``; the default
            ``None`` keeps the previous, unseeded behaviour.

    Returns:
        ``(train_set, valid_set, info, input_tokenizer, target_tokenizer)``
        where ``info`` holds the split sizes and the padded training lengths.
    """
    encoder_train, encoder_valid, target_train, target_valid = train_test_split(
        input_lang, target_lang, test_size=valid_size, random_state=random_state)

    # build the tokenizers on the training split only
    encoder_train, input_tokenizer = get_tokenizer(encoder_train, top_k=top_k)
    target_train, target_tokenizer = get_tokenizer(target_train, top_k=top_k)

    # encode the validation split and pad it to the training widths
    encoder_valid, target_valid = dataset_padded_transform(encoder_valid, target_valid,
                                                           input_tokenizer, target_tokenizer,
                                                           encoder_train.shape[1], target_train.shape[1])

    # decoder input = target without its last id, label = target without its first id
    decoder_train, y_train = create_shifted_target(target_train)
    # placeholder second output; create_dataset only consumes the first label entry
    train_attention_weights = np.zeros((len(decoder_train)), dtype=np.float32)

    decoder_valid, y_valid = create_shifted_target(target_valid)
    valid_attention_weights = np.zeros((len(decoder_valid)), dtype=np.float32)

    # only the training pipeline is shuffled
    train_set = create_dataset((encoder_train, decoder_train),
                               (y_train, train_attention_weights),
                               batch_size=batch_size, buffer=True,
                               prefetch=prefetch)

    valid_set = create_dataset((encoder_valid, decoder_valid),
                               (y_valid, valid_attention_weights),
                               batch_size=batch_size, prefetch=prefetch)

    # information about the splits, used later for step counts and padding
    info = dict(
        train_size=encoder_train.shape[0],
        train_input_max_pad=encoder_train.shape[1],
        train_target_max_pad=target_train.shape[1],
        valid_size=encoder_valid.shape[0],
    )
    return train_set, valid_set, info, input_tokenizer, target_tokenizer

In [7]:
# Load configuration
# Dataset settings from the "dataset" section of the config.
batch_size = dataset_params["batch_size"]
num_examples = dataset_params["num_examples"]
top_k = dataset_params['top_common_words']

# The first file column becomes the target language and the second the input
# language (the sample output below shows French input and English target).
target_lang, input_lang = load_dataset(path_to_file, num_examples=num_examples)
# Hold out 20% of the pairs as a test set.
# NOTE(review): no random_state is passed, so the split differs between runs.
encoder_train, encoder_test, target_train, target_test = train_test_split(input_lang, 
                                                                          target_lang, 
                                                                          test_size=0.2)
# Split the remaining pairs into training and validation pipelines.
train_set, valid_set, info, input_tokenizer, target_tokenizer = dataset(encoder_train, target_train, batch_size, 
                                                                        top_k=top_k)
print(info)

{'train_size': 680, 'train_input_max_pad': 13, 'train_target_max_pad': 7, 'valid_size': 120}


In [8]:
# Show the split sizes and padded lengths again (same dict as printed by the previous cell).
print(info)

{'train_size': 680, 'train_input_max_pad': 13, 'train_target_max_pad': 7, 'valid_size': 120}


In [9]:
# Sanity check: take one training batch and decode a single example back to text.
for x, y, in train_set.take(1):
    i = 10  # position of the example inside the batch; must be smaller than batch_size
    enc_x, dec_x = x  # encoder input ids and decoder input ids
    # y, att = y
    print(input_tokenizer.sequences_to_texts([enc_x[i].numpy()]))
    print(target_tokenizer.sequences_to_texts([dec_x[i].numpy()]))
    # label sequence: the decoder input shifted left by one token
    print(target_tokenizer.sequences_to_texts([y[i].numpy()]))
    # print(att.shape)

['<sos> je suis juste <eos>']
["<sos> i ' m fair <eos>"]
["i ' m fair <eos>"]


# Model

## Setting Hyperparameters

In [10]:
# Hyperparameters from the "model" section of the config; their exact meaning
# is defined by TransformerV2 in the local `model` module.
model_config = config['model']
N = model_config['N']  # presumably the number of stacked layers — confirm in model.py
model_depth = model_config['model_depth']
num_heads = model_config['num_heads']
dff = model_config['dff']
dropout_rate = model_config['dropout_rate']
epochs = model_config['epochs']


# Both datasets repeat indefinitely, so Keras needs explicit step counts.
steps_per_epoch = info['train_size'] // batch_size
validation_steps = info['valid_size'] // batch_size
# +1 accounts for id 0, which is used for padding and is not in index_word.
max_input_vocab = len(input_tokenizer.index_word) + 1
max_target_vocab = len(target_tokenizer.index_word) + 1
# NOTE(review): the positional-encoding limits are set to the vocabulary sizes
# rather than to the maximum sequence lengths — confirm this is intended.
input_max_positional_encoding = max_input_vocab
target_max_positional_encoding = max_target_vocab

## Build

In [14]:
# TransformerV2 and CustomSchedule come from the project's local `model` module.
from model import TransformerV2, CustomSchedule
# Build the model from the hyperparameters, vocabulary sizes and
# positional-encoding limits defined in the previous cell.
transformer = TransformerV2(N, model_depth, num_heads, dff, 
                            max_input_vocab, max_target_vocab, 
                            input_max_positional_encoding, target_max_positional_encoding, 
                            rate=dropout_rate)

## Compile

We are going to use the Adam optimizer with a custom learning-rate schedule:

$$l_{\text{rate}} = d_{\text{model}}^{-0.5} * \text{min}(\text{step_num}^{-0.5}, \text{step_num} * \text{warmup_steps}^{-1.5})$$

In [15]:
# Learning-rate schedule built from the model depth (CustomSchedule is defined in model.py).
learning_rate = CustomSchedule(model_depth)

# Adam driven by the schedule above instead of a fixed learning rate.
adam = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, 
                                     epsilon=1e-9)

def neglected_loss(y_true, y_pred):
    """Loss that ignores both arguments and always evaluates to a float32 zero."""
    zero = tf.constant(0, dtype=tf.float32)
    return zero

# Two-output variant (token predictions plus a second output scored with
# neglected_loss at weight 0), kept for reference but currently disabled:
# transformer.compile(optimizer=adam, metrics=[['accuracy'], []], loss_weights=[1., 0.],
#                     loss=["sparse_categorical_crossentropy", neglected_loss])
# Active setup: a single output trained with sparse categorical cross-entropy.
transformer.compile(optimizer=adam, metrics=['accuracy'], loss='sparse_categorical_crossentropy')

## Training

In [50]:
# Train for the number of epochs set in the model config (previously hard-coded
# to 10, which silently ignored the configured value). Both datasets repeat
# indefinitely, so the step counts bound each epoch.
history = transformer.fit(train_set, steps_per_epoch=steps_per_epoch, epochs=epochs,
                          validation_data=valid_set, validation_steps=validation_steps)

Train for 10 steps, validate for 1 steps
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# Evaluation

The following steps are used for evaluation:

- Encode the input sentence with the input-language tokenizer; the `<sos>` and `<eos>` tokens are already added during preprocessing.
- Create a decoder input and set the first value to start token
- Calculate the padding masks and the look ahead masks.
- The decoder outputs the predictions by looking at the encoder's output and its own output.
- Select the last word and calculate the argmax of that.
- Concatenate the predicted word to the decoder input and feed it back into the decoder until the end token is predicted.

In [21]:
def predict(enc_inputs, target_tokenizer, sos_token="<sos>"):
    """Greedily decode one target sequence per row of ``enc_inputs``.

    Relies on the notebook-level ``transformer`` model and ``info`` dict.
    Decoding always runs for ``info['train_target_max_pad']`` steps; it does
    not stop early when the end token is produced.

    Args:
        enc_inputs: padded encoder id array, one row per sentence.
        target_tokenizer: fitted target tokenizer, used to look up ``sos_token``.
        sos_token: start-of-sequence token that seeds the decoder.

    Returns:
        int32 tensor with one row per input and ``info['train_target_max_pad']``
        predicted ids per row (the seed token is removed).
    """
    max_steps = info['train_target_max_pad']
    sos_id = target_tokenizer.word_index[sos_token]
    y_preds = tf.fill(dims=(len(enc_inputs), 1), value=sos_id)
    for i in range(max_steps):
        # Pad the partial output to a fixed width of max_steps. The width was
        # previously the target vocabulary size, which is unrelated to the
        # sequence length and yields a negative pad when the vocabulary is
        # smaller than the decode length.
        # Assumes the decoder masks future positions, so trailing padding does
        # not influence the prediction at step i — confirm in model.py.
        pad_size = max_steps - y_preds.shape[1]
        dec_input = tf.pad(y_preds, [[0, 0], [0, pad_size]])
        y_probs_next = transformer.predict((enc_inputs, dec_input))
        y_probs_next = y_probs_next[:, i:i+1]  # we only care about the current step
        y_pred_next = tf.argmax(y_probs_next, axis=-1, output_type=tf.int32)
        y_preds = tf.concat([y_preds, y_pred_next], axis=1)
    return y_preds[:, 1:]  # remove the <sos> token from the results

In [51]:
# Quick check on the first ten *training* sentences: encode them and decode greedily.
enc_inputs = padded_transform(encoder_train[:10], input_tokenizer, info['train_input_max_pad'])
predicted_captions = predict(enc_inputs, target_tokenizer)
# Last expression of the cell, so the id tensor is displayed as the output.
predicted_captions

<tf.Tensor: shape=(10, 7), dtype=int32, numpy=
array([[3, 2, 0, 0, 0, 0, 0],
       [3, 2, 0, 0, 0, 0, 0],
       [2, 0, 0, 0, 0, 0, 0],
       [3, 2, 0, 0, 0, 0, 0],
       [3, 2, 0, 0, 0, 0, 0],
       [2, 0, 0, 0, 0, 0, 0],
       [3, 2, 0, 0, 0, 0, 0],
       [3, 2, 0, 0, 0, 0, 0],
       [3, 2, 0, 0, 0, 0, 0],
       [3, 2, 0, 0, 0, 0, 0]])>

In [53]:
def evaluate(sentences, translations, input_tokenizer, target_tokenizer, info, sos_token="<sos>"):
    """Translate ``sentences`` and score them against ``translations`` with BLEU.

    Args:
        sentences: preprocessed source sentences.
        translations: preprocessed reference translations, aligned with ``sentences``.
        input_tokenizer: fitted tokenizer of the source language.
        target_tokenizer: fitted tokenizer of the target language.
        info: dict holding ``train_input_max_pad`` and ``train_target_max_pad``.
        sos_token: start token forwarded to ``predict``.

    Returns:
        Whatever ``compute_bleu`` returns (indexed by the caller as
        ``bleu[0]`` and ``bleu[1][k]``).
    """
    # References are target-language text, so they must be encoded with the
    # target tokenizer; the input tokenizer would assign unrelated ids.
    enc_translations = padded_transform(translations, target_tokenizer, info['train_target_max_pad'])

    enc_inputs = padded_transform(sentences, input_tokenizer, info['train_input_max_pad'])
    predicted_captions = predict(enc_inputs, target_tokenizer, sos_token=sos_token)
    # Column index of every predicted <eos>.
    # NOTE(review): rows without an <eos> contribute no entry, so this array
    # can be shorter than the batch — confirm what compute_bleu expects.
    padding_indices = np.argwhere(predicted_captions == target_tokenizer.texts_to_sequences(["<eos>"]))[:, 1]

    # each prediction gets exactly one reference, hence the extra axis
    bleu = compute_bleu(enc_translations[:, np.newaxis, :], predicted_captions.numpy(),
                        padding_indices=padding_indices, max_order=3)
    return bleu

In [56]:
# Score held-out pairs. Sources and references must come from the same split;
# pairing training inputs with test references compares unrelated sentences.
bleu = evaluate(encoder_test[:10], target_test[:10], input_tokenizer, target_tokenizer, info)
print("bleu: {}\tbleu1: {}\tbleu2: {}\tbleu 3: {}".format(bleu[0], bleu[1][0], bleu[1][1], bleu[1][2]))

bleu: 0.0	bleu1: 0.0	bleu2: 0.0	bleu 3: 0.0


In [60]:
def translate(sentence, actual_translation, input_tokenizer, target_tokenizer, info):
    """Translate one preprocessed sentence and print it next to the reference.

    Args:
        sentence: preprocessed source sentence.
        actual_translation: reference translation, printed for comparison.
        input_tokenizer: fitted tokenizer of the source language.
        target_tokenizer: fitted tokenizer of the target language.
        info: dict holding ``train_input_max_pad``.
    """
    # Decode with predict(); evaluate() returns BLEU scores, not token ids, so
    # its result cannot be turned back into text.
    enc_inputs = padded_transform([sentence], input_tokenizer, info['train_input_max_pad'])
    predicted_arr = predict(enc_inputs, target_tokenizer)
    # single-sentence batch -> take the only decoded text
    predicted_sentence = target_tokenizer.sequences_to_texts(predicted_arr.numpy())[0]

    print('Input: {}'.format(sentence))
    print('Actual translation: %s' % (actual_translation))
    print('Predicted translation: {}'.format(predicted_sentence))

In [None]:
# Translate the first five held-out test pairs and print them next to their references.
sample_x, sample_y = encoder_test[:5], target_test[:5]
for X, y in zip(sample_x, sample_y):
    translate(X, y, input_tokenizer, target_tokenizer, info)

# Notes to improve

- Use Beam Search.
- Wrap every raw TensorFlow operation in a `keras.layers.Lambda` layer so that the model can be saved. Then we can load the model and expose the attention layer outputs to visualize the attention weights.

# References and Further readings.

- [Attention is all you need](https://arxiv.org/abs/1706.03762)
- [Tensorflow Transformer tutorial](https://www.tensorflow.org/tutorials/text/transformer)
- [The illustrated Transformer](http://jalammar.github.io/illustrated-transformer/)
- [Hands-on ML with Scikit-learn, keras and Tensorflow](https://github.com/ageron/handson-ml2)
- [Python BLEU Score implementation](https://github.com/tensorflow/nmt/blob/master/nmt/scripts/bleu.py)