In [None]:
# Install JAX/TRAX (quietly upgrading each package to its newest release).
# NOTE(review): versions are not pinned; the outputs saved in this notebook were
# produced with trax 1.4.1 (see the `pip list` output further down), so newer
# releases may behave differently — consider pinning.
!pip install --upgrade -q jax
!pip install --upgrade -q jaxlib
!pip install --upgrade -q trax

In [None]:
# Make sure the Colab Runtime is set to Accelerator: TPU.
# COLAB_TPU_ADDR is read from the environment below; os.environ[...] raises
# KeyError when the variable is not set.
import requests
import os
# Only request the TPU driver version once per kernel: TPU_DRIVER_MODE is
# defined after the first request, so re-running this cell skips the POST.
if 'TPU_DRIVER_MODE' not in globals():
  # Same host as COLAB_TPU_ADDR, but on port 8475.
  url = 'http://' + os.environ['COLAB_TPU_ADDR'].split(':')[0] + ':8475/requestversion/tpu_driver0.1-dev20191206'
  # The response is stored but its status is never checked.
  resp = requests.post(url)
  TPU_DRIVER_MODE = 1

# The following is required to use TPU Driver as JAX's backend.
from jax.config import config
config.FLAGS.jax_xla_backend = "tpu_driver"
config.FLAGS.jax_backend_target = "grpc://" + os.environ['COLAB_TPU_ADDR']
# Echo the backend target so the TPU address in use is visible in the output.
print(config.FLAGS.jax_backend_target)

grpc://10.58.165.234:8470


In [None]:
import trax
from trax.data import inputs
from trax import layers as tl
from trax.supervised import training

import numpy as np

from termcolor import colored
import random

!pip list | grep trax

trax                          1.4.1


In [None]:
# Mount Google Drive at /content/drive. The dataset directory and the
# checkpoint directories used later in this notebook live under this mount.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Build a generator function for the training split of para_crawl/enfr.
# data_dir points at a folder on the mounted Drive.
# NOTE(review): presumably the dataset is downloaded/prepared into data_dir on
# first use and reused afterwards — confirm against the trax.data.TFDS docs.
train_stream_fn = trax.data.TFDS('para_crawl/enfr',
                                 data_dir='/content/drive/MyDrive/Colab Notebooks/data/',
                                 keys=('en', 'fr'),
                                 eval_holdout_size=0.01, # 1% for eval
                                 train=True)

# Get generator function for the eval set (same arguments, train=False).
eval_stream_fn = trax.data.TFDS('para_crawl/enfr',
                                data_dir='/content/drive/MyDrive/Colab Notebooks/data/',
                                keys=('en', 'fr'),
                                eval_holdout_size=0.01, # 1% for eval
                                train=False)

In [None]:
# Calling the generator function creates the actual stream of (en, fr) pairs.
train_stream = train_stream_fn()
# next() consumes the example it prints, so this pair is not seen again by the
# pipeline stages built on top of train_stream.
print(colored('train data (en, fr) tuple:', 'red'), next(train_stream))
print()

# Same for the eval stream: one example is consumed for display.
eval_stream = eval_stream_fn()
print(colored('eval data (en, fr) tuple:', 'red'), next(eval_stream))

[31mtrain data (en, fr) tuple:[0m (b'Dynamic offensive player ... highly skilled ... highly competitive ... great hands ... tremendous speed ... sees the ice incredibly well ... quick feet ... excellent acceleration ... has the will to win ... great confidence ... can play \xe2\x80\x98D\xe2\x80\x99', b'Joueur offensif dynamique\xe2\x80\xa6 tr\xc3\xa8s talentueux\xe2\x80\xa6 tr\xc3\xa8s comp\xc3\xa9titif\xe2\x80\xa6 excellentes mains\xe2\x80\xa6 vitesse exceptionnelle\xe2\x80\xa6 voit tr\xc3\xa8s bien ce qui se passe sur la glace\xe2\x80\xa6 pieds rapides\xe2\x80\xa6 excellente acc\xc3\xa9l\xc3\xa9ration\xe2\x80\xa6 veut gagner\xe2\x80\xa6 tr\xc3\xa8s confiant\xe2\x80\xa6 peut jouer comme d\xc3\xa9fenseur')

[31meval data (en, fr) tuple:[0m (b'The real estate site Como vender una casa offers thousands of real estate ads, for example house or house All regions.', b"Le site immobilier Como vender una casa vous propose des milliers d' annonces immobili\xc3\xa8res, par exemple maison ou

## 1.2  Tokenization and Formatting


**Tokenizing the sentences using subword representations:** we want to represent each sentence as an array of integers instead of strings. For our application, we will use *subword* representations to tokenize our sentences. This is a common technique to avoid out-of-vocabulary words by allowing parts of words to be represented separately. For example, instead of having separate entries in your vocabulary for "fear", "fearless", "fearsome", "some", and "less", you can simply store "fear", "some", and "less" and then allow your tokenizer to combine these subwords when needed. This makes it more flexible, so you won't have to save uncommon words explicitly in your vocabulary (e.g. *stylebender*, *nonce*, etc). Tokenizing is done with the `trax.data.Tokenize()` command. The combined subword vocabulary for English, German and French (i.e. `endefr_32k.subword`) is provided by trax. Feel free to open this file to see what the subwords look like.

In [None]:
# Global variables that state the filename and directory of the vocabulary file.
# VOCAB_DIR is a Google Cloud Storage location (gs:// URL).
VOCAB_FILE = 'endefr_32k.subword'
VOCAB_DIR = 'gs://trax-ml/vocabs/'

# Tokenize the dataset: wrap the raw train/eval streams so that they yield
# token ids instead of byte strings. The same vocabulary is used for both.
tokenized_train_stream = trax.data.Tokenize(vocab_file=VOCAB_FILE, vocab_dir=VOCAB_DIR)(train_stream)
tokenized_eval_stream = trax.data.Tokenize(vocab_file=VOCAB_FILE, vocab_dir=VOCAB_DIR)(eval_stream)

**Append an end-of-sentence token to each sentence:** We will assign a token (i.e. in this case `1`) to mark the end of a sentence. This will be useful in inference/prediction so we'll know that the model has completed the translation.

In [None]:
# Mark the end of every sentence with an explicit EOS token.

# Token id reserved for end-of-sentence (EOS).
EOS = 1


def append_eos(stream):
    """Yield each (inputs, targets) pair of `stream` with EOS appended to both.

    Args:
        stream: iterable of (inputs, targets) pairs, each an iterable of token ids.

    Yields:
        tuple: (inputs, targets) as numpy arrays, each ending with EOS.
    """
    for source_ids, target_ids in stream:
        source_with_eos = np.array([*source_ids, EOS])
        target_with_eos = np.array([*target_ids, EOS])
        yield source_with_eos, target_with_eos

# Append EOS to the train data. The name is rebound to the wrapped generator,
# so re-running this cell would append a second EOS to every sentence.
tokenized_train_stream = append_eos(tokenized_train_stream)

# Append EOS to the eval data (same caveat about re-running the cell).
tokenized_eval_stream = append_eos(tokenized_eval_stream)

In [None]:
# Filter out sentences that are too long, so we do not run out of memory.
# length_keys=[0, 1] means we filter on both the English and French sentences,
# so both must be no longer than 512 tokens for training / 1024 for eval.
filtered_train_stream = trax.data.FilterByLength(
    max_length=512, length_keys=[0, 1])(tokenized_train_stream)
filtered_eval_stream = trax.data.FilterByLength(
    max_length=1024, length_keys=[0, 1])(tokenized_eval_stream)

# Print a sample input-target pair of tokenized sentences.
# next() consumes this pair from the training stream; train_input and
# train_target are reused by the detokenize demo further down.
train_input, train_target = next(filtered_train_stream)
print(colored(f'Single tokenized example input:', 'red' ), train_input)
print(colored(f'Single tokenized example target:', 'red'), train_target)

[31mSingle tokenized example input:[0m [   46  2695  6423     8   194  7235 12073  7108 27233    16   136   993
  1144  7077    33     4   652     8   239  1268  2854     3     1]
[31mSingle tokenized example target:[0m [  119 29825    23  4003     5  1057  4329 23916    16  7235 31309 25455
   306  6145   120  2088    24    31   265  9164     5    18    31   652
     5   197  3208  2854     3     1]


## 1.3  tokenize & detokenize helper functions

- tokenize(): converts a text sentence to its corresponding token list (i.e. list of indices). Also converts words to subwords (parts of words).
- detokenize(): converts a token list to its corresponding sentence (i.e. string).

In [None]:
# Helper functions for tokenizing and detokenizing single sentences
def tokenize(input_str, vocab_file=None, vocab_dir=None):
    """Encode one sentence as a batch of token ids terminated by EOS.

    Args:
        input_str (str): sentence to encode.
        vocab_file (str): filename of the vocabulary.
        vocab_dir (str): path to the vocabulary file.

    Returns:
        numpy.ndarray: array of shape (1, n_tokens + 1); the last id is EOS (1).
    """
    # id used for "end of sentence"
    eos_id = 1
    # trax.data.tokenize maps streams to streams, so wrap the sentence in a
    # one-element iterator and pull the single encoded result back out.
    encoded_stream = trax.data.tokenize(iter([input_str]),
                                        vocab_file=vocab_file, vocab_dir=vocab_dir)
    token_ids = [*next(encoded_stream), eos_id]
    # put the batch dimension in front
    return np.array(token_ids).reshape(1, -1)

def detokenize(integers, vocab_file=None, vocab_dir=None):
    """Decode a sequence of token ids back into a sentence.

    Size-1 dimensions (e.g. a leading batch axis) are removed first, and
    everything from the first EOS (1) onwards is dropped before decoding.

    Args:
        integers: token ids as a list or numpy array, optionally with size-1
            batch dimensions.
        vocab_file (str): filename of the vocabulary.
        vocab_dir (str): path to the vocabulary file.

    Returns:
        The value returned by trax.data.detokenize for the ids before EOS.
    """
    # Remove the dimensions of size 1. np.squeeze turns a single-token input
    # into a 0-d array, which list() cannot iterate, so restore one dimension.
    integers = list(np.atleast_1d(np.squeeze(integers)))
    # Set the encoding of the "end of sentence" as 1
    EOS = 1
    # Remove the EOS (and anything after it) to decode only the original tokens
    if EOS in integers:
        integers = integers[:integers.index(EOS)]
    return trax.data.detokenize(integers, vocab_file=vocab_file, vocab_dir=vocab_dir)

Let's see how we might use these functions:

In [None]:
# Detokenize the input-target pair of tokenized sentences sampled earlier
# (train_input / train_target come from the FilterByLength cell above).
print(colored(f'Single detokenized example input:', 'red'), detokenize(train_input, vocab_file=VOCAB_FILE, vocab_dir=VOCAB_DIR))
print(colored(f'Single detokenized example target:', 'red'), detokenize(train_target, vocab_file=VOCAB_FILE, vocab_dir=VOCAB_DIR))
print()

# Tokenize a word that is not explicitly saved in the vocabulary file.
# See how it is split into the subwords 'hell' and 'o', followed by EOS (1).
print(colored(f"tokenize('hello'): ", 'green'), tokenize('hello', vocab_file=VOCAB_FILE, vocab_dir=VOCAB_DIR))

[31mSingle detokenized example input:[0m The coming lessons of your Ascended Masters can give further examples for the importance of these various actions.
[31mSingle detokenized example target:[0m Les leçons à venir de vos Maîtres Ascensionnés pourront vous donner d’autres exemples de l’importance de ces diverses actions.

[32mtokenize('hello'): [0m [[11068  5505     1]]


## 1.4  Bucketing

Bucketing the tokenized sentences is an important technique used to speed up training in NLP.
Here is a 
[nice article describing it in detail](https://medium.com/@rashmi.margani/how-to-speed-up-the-training-of-the-sequence-model-using-bucketing-techniques-9e302b0fd976)
but the gist is very simple. Our inputs have variable lengths and you want to make these the same when batching groups of sentences together. One way to do that is to pad each sentence to the length of the longest sentence in the dataset. This might lead to some wasted computation though. For example, if there are multiple short sentences with just two tokens, do we want to pad these when the longest sentence is composed of a 100 tokens? Instead of padding with 0s to the maximum length of a sentence each time, we can group our tokenized sentences by length and bucket, as on this image (from the article above):

![alt text](https://miro.medium.com/max/700/1*hcGuja_d5Z_rFcgwe9dPow.png)

We batch sentences of similar length together (e.g. the blue sentences in the image above) and add only minimal padding to make them equal in length (usually up to the nearest power of two). This wastes less computation on padded positions.
In Trax, it is implemented in the [bucket_by_length](https://github.com/google/trax/blob/5fb8aa8c5cb86dabb2338938c745996d5d87d996/trax/supervised/inputs.py#L378) function.

In [None]:

# Bucket boundaries (in tokens) and the batch size to use for each bucket.
# batch_sizes has one more entry than boundaries.
# NOTE(review): presumably the extra entry is for sequences longer than the
# last boundary — confirm against the BucketByLength docs.
boundaries =  [  8,  16,  32,  64, 128, 256]
batch_sizes = [128, 128, 128, 128, 128, 128, 128]
# Notice that all batch sizes are 128. As we are using TPUs, we need the same
# batch size in every bucket to run in parallel.
# You can use different batch sizes if you are using a GPU or CPU.

# Create the generators.
train_batch_stream = trax.data.BucketByLength(
    boundaries, batch_sizes,
    length_keys=[0, 1]  # As before: count inputs and targets to length.
)(filtered_train_stream)

eval_batch_stream = trax.data.BucketByLength(
    boundaries, batch_sizes,
    length_keys=[0, 1]  # As before: count inputs and targets to length.
)(filtered_eval_stream)

# Add masking for the padding (0s). After this step each batch is a triple
# (inputs, targets, mask), as unpacked in the next code cell.
train_batch_stream = trax.data.AddLossWeights(id_to_mask=0)(train_batch_stream)
eval_batch_stream = trax.data.AddLossWeights(id_to_mask=0)(eval_batch_stream)

## 1.5 Exploring the data

We will now display some of our data. You will see that the functions defined above (i.e. `tokenize()` and `detokenize()`) are handy for inspecting it.

In [None]:
# Pull one batch from the training stream. This consumes the batch, so it is
# not seen again during training.
input_batch, target_batch, mask_batch = next(train_batch_stream)

# let's see the data type of a batch
print("input_batch data type: ", type(input_batch))
print("target_batch data type: ", type(target_batch))

# let's see the shape of this particular batch (batch length, sentence length)
print("input_batch shape: ", input_batch.shape)
print("target_batch shape: ", target_batch.shape)

input_batch data type:  <class 'numpy.ndarray'>
target_batch data type:  <class 'numpy.ndarray'>
input_batch shape:  (128, 32)
target_batch shape:  (128, 32)


The `input_batch` and `target_batch` are Numpy arrays consisting of tokenized English sentences and French sentences respectively. These tokens will later be used to produce embedding vectors for each word in the sentence (so the embedding for a sentence will be a matrix).

We can now visually inspect some of the data. You can run the cell below several times to shuffle through the sentences.

In [None]:
# Pick a random index less than the batch size.
# No random seed is set, so a different sentence is shown on every run.
index = random.randrange(len(input_batch))

# Use the index to grab an entry from the input and target batch.
# The raw token rows include the trailing 0 padding; detokenize() stops at EOS.
print(colored('THIS IS THE ENGLISH SENTENCE: \n', 'red'), detokenize(input_batch[index], vocab_file=VOCAB_FILE, vocab_dir=VOCAB_DIR), '\n')
print(colored('THIS IS THE TOKENIZED VERSION OF THE ENGLISH SENTENCE: \n ', 'red'), input_batch[index], '\n')
print(colored('THIS IS THE FRENCH TRANSLATION: \n', 'red'), detokenize(target_batch[index], vocab_file=VOCAB_FILE, vocab_dir=VOCAB_DIR), '\n')
print(colored('THIS IS THE TOKENIZED VERSION OF THE FRENCH TRANSLATION: \n', 'red'), target_batch[index], '\n')

[31mTHIS IS THE ENGLISH SENTENCE: 
[0m This means the need for a higher religious awareness. 

[31mTHIS IS THE TOKENIZED VERSION OF THE ENGLISH SENTENCE: 
 [0m [  163  1229     4   336    33    17  2297 12638  5567     3     1     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0] 

[31mTHIS IS THE FRENCH TRANSLATION: 
[0m C'est dire la nécéssité d'une conscience religieuse supérieure. 

[31mTHIS IS THE TOKENIZED VERSION OF THE FRENCH TRANSLATION: 
[0m [  176     7    37   654    10  8133  9783  8068  1815    24     7    41
 14196 13724 12867   350 10141     3     1     0     0     0     0     0
     0     0     0     0     0     0     0     0] 



# Part (2):  Model

Now that we’ve seen preprocessing, it’s time to move into Modeling itself. Trax allows the use of Predefined Models, such as:
 - Seq2Seq with Attention
 - BERT
 - Transformer
 - Reformer

We will use the Transformer in this notebook because Trax provides a pretrained Transformer NMT model that was trained on an English-to-German dataset. We will now train it further on an English-to-French dataset, aiming for results close to those reported by the Google Brain team.

You can simply change `trax.models.Transformer` in the next cell to `trax.models.Reformer` to use the Reformer model.

```python
# you could check the available pretrained models and vocab files provided by trax by running:
!gsutil ls gs://trax-ml/
```

In [None]:
# Create a Transformer model in training mode.
model = trax.models.Transformer(
    input_vocab_size=33600,
    d_model=512, d_ff=2048, dropout = 0.1,
    n_heads=8, n_encoder_layers=6, n_decoder_layers=6,
    max_len=2048, mode='train')

# Pre-trained Transformer model config in gs://trax-ml/models/translation/ende_wmt32k.gin
# NOTE(review): the hyperparameters above presumably mirror that config so the
# weight shapes match — confirm against the .gin file.
# Initialize the Transformer using the pre-trained (English-German) weights.
model.init_from_file('gs://trax-ml/models/translation/ende_wmt32k.pkl.gz',
                     weights_only=True)

# You could also initialize the model from an output checkpoint:
# simply change 'gs://trax-ml/models/translation/ende_wmt32k.pkl.gz' to 'output_dir/ + last_checkpoint'
# for example:
# model.init_from_file('/content/drive/MyDrive/Colab Notebooks/Transformer_FR_pretrained_336/model.pkl.gz',
#                      weights_only=True)

You could have a peek at the model layers.

In [None]:
# model

# Part (3):  Training
We will now be training our model in this section. Doing supervised training in Trax is pretty straightforward (short example [here](https://trax-ml.readthedocs.io/en/latest/notebooks/trax_intro.html#Supervised-training)). We will be instantiating three classes for this: `TrainTask`, `EvalTask`, and `Loop`. Let's take a closer look at each of these in the sections below.

## 3.1  TrainTask

The [TrainTask](https://trax-ml.readthedocs.io/en/latest/trax.supervised.html#trax.supervised.training.TrainTask) class allows us to define the labeled data to use for training and the feedback mechanisms to compute the loss and update the weights. 

In [None]:
train_task = training.TrainTask(
    # use the train batch stream as labeled data
    labeled_data= train_batch_stream,
    # use the cross entropy loss with LogSoftmax
    loss_layer= tl.CrossEntropyLossWithLogSoftmax(),
    # use the Adafactor optimizer with learning rate of 0.001
    optimizer= trax.optimizers.Adafactor(learning_rate=0.001, epsilon1=1e-30),
    # have 500 warmup steps
    lr_schedule= trax.lr.multifactor(constant=1.0, warmup_steps=500),
    # checkpoint every 10 steps (the training log below also reports the
    # train/eval metrics at this interval)
    n_steps_per_checkpoint= 10,
    # keep a permanent checkpoint every 1000 steps in the output_dir
    n_steps_per_permanent_checkpoint = 1000
)

## 3.2  EvalTask

The [EvalTask](https://trax-ml.readthedocs.io/en/latest/trax.supervised.html#trax.supervised.training.EvalTask) on the other hand allows us to see how the model is doing while training. For our application, we want it to report the cross entropy loss with LogSoftmax and accuracy.

In [None]:
eval_task = training.EvalTask(
    # use the eval batch stream as labeled data
    labeled_data=eval_batch_stream,
    # use the cross entropy loss with LogSoftmax and accuracy as metrics
    metrics=[tl.CrossEntropyLossWithLogSoftmax(), tl.WeightedCategoryAccuracy()],
    # You can set the number of eval batches with n_eval_batches = 64 (or any
    # other number); it is left at the library default here.
    # NOTE(review): the default appears to be n_eval_batches=1 in trax 1.4.x,
    # i.e. one batch per evaluation rather than the whole eval set — confirm.
    # n_eval_batches = 64
)

## 3.3  Loop

The [Loop](https://trax-ml.readthedocs.io/en/latest/trax.supervised.html#trax.supervised.training.Loop) class defines the model we will train as well as the train and eval tasks to execute. Its `run()` method allows us to execute the training for a specified number of steps.

In [None]:
# define the output directory (on the mounted Drive) for checkpoints
output_dir = '/content/drive/MyDrive/Colab Notebooks/Transformer_FR_pretrained_336'

# # Remove the old model if it exists; this restarts training from scratch.
# # Use {output_dir} so the shell receives the variable's value: a bare
# # `output_dir` would delete a literal path named "output_dir" instead.
# !rm -rf "{output_dir}"

# define the training loop: the model to train plus its train and eval tasks
training_loop = training.Loop(model,
                              train_task,
                              eval_tasks=[eval_task],
                              output_dir=output_dir)

In [None]:
# Start Training! Run the loop for 5000 steps; metrics are reported at the
# checkpoint interval configured in the TrainTask.
training_loop.run(5000)


Step      1: Total number of trainable weights: 80370196
Step      1: Ran 1 train steps in 154.88 secs
Step      1: train CrossEntropyLossWithLogSoftmax |  10.05621243
Step      1: eval  CrossEntropyLossWithLogSoftmax |  9.81082535
Step      1: eval        WeightedCategoryAccuracy |  0.07909705

Step     10: Ran 9 train steps in 408.63 secs
Step     10: train CrossEntropyLossWithLogSoftmax |  9.26965141
Step     10: eval  CrossEntropyLossWithLogSoftmax |  8.89805794
Step     10: eval        WeightedCategoryAccuracy |  0.11580883

Step     20: Ran 10 train steps in 23.21 secs
Step     20: train CrossEntropyLossWithLogSoftmax |  8.11123180
Step     20: eval  CrossEntropyLossWithLogSoftmax |  7.71368456
Step     20: eval        WeightedCategoryAccuracy |  0.09313989

Step     30: Ran 10 train steps in 23.40 secs
Step     30: train CrossEntropyLossWithLogSoftmax |  7.44197845
Step     30: eval  CrossEntropyLossWithLogSoftmax |  7.18544197
Step     30: eval        WeightedCategoryAccuracy 

In [None]:
# Re-create the training loop with the same tasks and output directory.
# NOTE(review): the log of the next cell starts above step 5000, so the loop
# presumably resumes from the checkpoint already saved in output_dir — confirm.
training_loop = training.Loop(model,
                              train_task,
                              eval_tasks=[eval_task],
                              output_dir=output_dir)

In [None]:
training_loop.run(900)


Step   5110: Ran 10 train steps in 30.06 secs
Step   5110: train CrossEntropyLossWithLogSoftmax |  1.74502718
Step   5110: eval  CrossEntropyLossWithLogSoftmax |  1.83426988
Step   5110: eval        WeightedCategoryAccuracy |  0.62784290

Step   5120: Ran 10 train steps in 31.10 secs
Step   5120: train CrossEntropyLossWithLogSoftmax |  1.71466386
Step   5120: eval  CrossEntropyLossWithLogSoftmax |  1.76296377
Step   5120: eval        WeightedCategoryAccuracy |  0.66135991

Step   5130: Ran 10 train steps in 30.33 secs
Step   5130: train CrossEntropyLossWithLogSoftmax |  1.74822271
Step   5130: eval  CrossEntropyLossWithLogSoftmax |  1.96706641
Step   5130: eval        WeightedCategoryAccuracy |  0.60536551

Step   5140: Ran 10 train steps in 31.13 secs
Step   5140: train CrossEntropyLossWithLogSoftmax |  1.72740114
Step   5140: eval  CrossEntropyLossWithLogSoftmax |  1.51474619
Step   5140: eval        WeightedCategoryAccuracy |  0.72492111

Step   5150: Ran 10 train steps in 32.65 se

## More Steps (optional)

Because we specified `n_steps_per_permanent_checkpoint` in `training.TrainTask`, a checkpoint is saved in `output_dir` after that number of steps. So, if you face a runtime disconnection, or you want to train the model for more steps to improve the results, you can load the last saved checkpoint using `training_loop.load_checkpoint`.

This step is optional. You could instead use `model.init_from_file` as in the (Part (2): Model) cells, changing 'gs://trax-ml/models/translation/ende_wmt32k.pkl.gz' to 'output_dir/ + last_checkpoint'.

In [None]:
# Same directory as used when creating the training loop (here written with a
# trailing slash). This cell needs the `training_loop` object created above.
output_dir = '/content/drive/MyDrive/Colab Notebooks/Transformer_FR_pretrained_336/'

# This loads a checkpoint:
training_loop.load_checkpoint(directory=output_dir, filename="model.pkl.gz")
# Continue training for another 5000 steps:
training_loop.run(5000)

# Part (4):  Testing

We will now be using the model you just trained to translate English sentences to French. We will implement this with two functions: The first allows you to identify the next symbol (i.e. output token). The second one takes care of combining the entire translated string.


In [None]:
# Rebuild the same architecture as in Part (2), this time in 'eval' mode.
model = trax.models.Transformer(
    input_vocab_size=33600,
    d_model=512, d_ff=2048, dropout = 0.1,
    n_heads=8, n_encoder_layers=6, n_decoder_layers=6,
    max_len=2048, mode='eval')

# Load the trained weights.
# NOTE(review): this path differs from the output_dir used for training above
# ('.../Colab Notebooks/Transformer_FR_pretrained_336') — confirm it points to
# the checkpoint you intend to test.
# NOTE(review): this call is the last expression of the cell, so its return
# value (the weights) is echoed as a very large cell output.
model.init_from_file('/content/drive/MyDrive/Summarizer/French Translation/models/model.pkl.gz',weights_only=True)

(((),
  ((), ((), ((), (), ()))),
  (array([[ 0.10244494,  0.10391301,  0.51431483, ..., -0.30733603,
            0.02270049,  0.16791232],
          [-1.3212025 ,  1.2569402 ,  0.39365283, ..., -0.7830569 ,
           -0.14921941, -1.2095921 ],
          [-0.02032815, -0.04200342,  0.24198914, ...,  1.5189921 ,
            0.0125878 , -0.2884543 ],
          ...,
          [ 0.09797635, -0.05788774, -0.25835893, ...,  0.05283534,
           -0.11368033,  0.16660483],
          [ 0.45425078, -0.5002881 , -1.692131  , ...,  2.9195695 ,
           -0.32482827, -3.3580527 ],
          [ 1.8254347 , -0.5009047 , -0.47748274, ...,  0.68252856,
            1.8195908 , -1.1546456 ]], dtype=float32),
   (),
   array([[[ 1.4844471 ,  1.6264353 , -0.36766744, ...,  1.2089864 ,
            -0.13306352,  0.24316476],
           [ 1.27974   , -1.756567  ,  4.1149864 , ..., -0.62950826,
             0.06156211,  0.22143582],
           [-0.2676652 , -1.9155992 ,  0.70056814, ..., -0.05727421,
      

## 4.1  Decoding

In [None]:
# Helper functions for tokenizing and detokenizing single sentences
def tokenize(input_str, vocab_file=None, vocab_dir=None):
    """Encode one sentence as a batch of token ids terminated by EOS.

    Args:
        input_str (str): sentence to encode.
        vocab_file (str): filename of the vocabulary.
        vocab_dir (str): path to the vocabulary file.

    Returns:
        numpy.ndarray: array of shape (1, n_tokens + 1); the last id is EOS (1).
    """
    # id used for "end of sentence"
    eos_id = 1
    # trax.data.tokenize maps streams to streams, so wrap the sentence in a
    # one-element iterator and pull the single encoded result back out.
    encoded_stream = trax.data.tokenize(iter([input_str]),
                                        vocab_file=vocab_file, vocab_dir=vocab_dir)
    token_ids = [*next(encoded_stream), eos_id]
    # put the batch dimension in front
    return np.array(token_ids).reshape(1, -1)

def detokenize(integers, vocab_file=None, vocab_dir=None):
    """Decode a sequence of token ids back into a sentence.

    Size-1 dimensions (e.g. a leading batch axis) are removed first, and
    everything from the first EOS (1) onwards is dropped before decoding.

    Args:
        integers: token ids as a list or numpy array, optionally with size-1
            batch dimensions.
        vocab_file (str): filename of the vocabulary.
        vocab_dir (str): path to the vocabulary file.

    Returns:
        The value returned by trax.data.detokenize for the ids before EOS.
    """
    # Remove the dimensions of size 1. np.squeeze turns a single-token input
    # into a 0-d array, which list() cannot iterate, so restore one dimension.
    integers = list(np.atleast_1d(np.squeeze(integers)))
    # Set the encoding of the "end of sentence" as 1
    EOS = 1
    # Remove the EOS (and anything after it) to decode only the original tokens
    if EOS in integers:
        integers = integers[:integers.index(EOS)]
    return trax.data.detokenize(integers, vocab_file=vocab_file, vocab_dir=vocab_dir)

In [None]:
def next_symbol(model, input_tokens, cur_output_tokens, temperature):
    """Sample the next output token and return it with its log probability.

    Args:
        model: callable taking an (input_tokens, target_tokens) pair and
            returning a 2-tuple whose first element holds per-position scores
            over the vocabulary, indexed as [batch, position, vocab].
        input_tokens: tokenized input sentence with a leading batch axis.
        cur_output_tokens (list of int): tokens generated so far.
        temperature (float): sampling temperature passed to
            tl.logsoftmax_sample (0.0 always picks the most probable token).

    Returns:
        tuple: (symbol, log_prob)
            symbol (int): id of the sampled token.
            log_prob (float): normalised log probability (<= 0) of that token.
    """
    # number of tokens generated so far
    token_length = len(cur_output_tokens)
    # pad to the smallest power of two strictly greater than token_length, so
    # position `token_length` always exists in the model output
    padded_length = 2 ** int(np.ceil(np.log2(token_length + 1)))
    padded = cur_output_tokens + [0] * (padded_length - token_length)
    # the model expects a batch axis in front: shape (1, padded_length)
    padded_with_batch = np.expand_dims(padded, axis=0)
    # the model prediction
    output, _ = model((input_tokens, padded_with_batch))
    # scores for the position right after the last generated token
    scores = np.asarray(output[0, token_length, :])
    # The saved outputs of this notebook show positive "log probabilities"
    # (e.g. 17.78), so the model evidently emits unnormalised scores.
    # Normalise with a numerically stable log-softmax; this is a no-op when the
    # scores are already log-probabilities and does not change which token is
    # sampled, because it only subtracts a constant from every entry.
    peak = np.max(scores)
    log_probs = scores - (peak + np.log(np.sum(np.exp(scores - peak))))
    # get the next symbol by getting a logsoftmax sample
    symbol = int(tl.logsoftmax_sample(log_probs, temperature))
    return symbol, float(log_probs[symbol])

In [None]:
def sampling_decode(input_sentence, model = None, temperature=0.0, vocab_file=None, vocab_dir=None, max_length=None):
    """Translate a sentence by sampling one output token at a time.

    Args:
        input_sentence (str): sentence to translate.
        model: the NMT model, forwarded to next_symbol().
        temperature (float): sampling temperature forwarded to next_symbol().
        vocab_file (str): filename of the vocabulary.
        vocab_dir (str): path to the vocabulary file.
        max_length (int, optional): maximum number of tokens to generate.
            None (the default) keeps decoding until EOS is produced, which
            never terminates on its own if the model does not emit EOS.

    Returns:
        tuple: (cur_output_tokens, log_prob, sentence)
            cur_output_tokens (list of int): generated tokens (ending with EOS
                unless decoding stopped at max_length).
            log_prob (float): value returned by next_symbol() for the LAST
                generated token, not a score for the whole sentence.
            sentence (str): detokenized translation.

    Raises:
        ValueError: if max_length is given and is smaller than 1.
    """
    if max_length is not None and max_length < 1:
        raise ValueError("max_length must be a positive integer or None")
    # encode the input sentence
    input_tokens = tokenize(input_sentence, vocab_file=vocab_file, vocab_dir=vocab_dir)
    # initialize the list of output tokens
    cur_output_tokens = []
    # initialize an integer that represents the current output index
    cur_output = 0
    # Set the encoding of the "end of sentence" as 1
    EOS = 1
    # keep generating until the end of sentence token is produced
    while cur_output != EOS:
        # update the current output token by getting the index of the next word
        cur_output, log_prob = next_symbol(model, input_tokens, cur_output_tokens, temperature)
        # append the current output token to the list of output tokens
        cur_output_tokens.append(cur_output)
        # optional safety stop for models that never emit EOS
        if max_length is not None and len(cur_output_tokens) >= max_length:
            break
    # detokenize the output tokens
    sentence = detokenize(cur_output_tokens, vocab_file=vocab_file, vocab_dir=vocab_dir)
    return cur_output_tokens, log_prob, sentence

In [None]:
# Vocabulary file name and location; same values as in the tokenization
# section, repeated here (presumably so that Part (4) can be run on its own).
VOCAB_FILE = 'endefr_32k.subword'
VOCAB_DIR = 'gs://trax-ml/vocabs/'

In [None]:
# Test the function above. Try varying the temperature setting with values from 0 to 1.
# Run it several times with each setting and see how often the output changes.
# The result is a tuple: (output tokens, score of the last token, decoded sentence).
sampling_decode("Hello.", model, temperature=0.0, vocab_file=VOCAB_FILE, vocab_dir=VOCAB_DIR)

([7949, 5505, 3, 1], 17.780824661254883, 'Hello.')

In [None]:
def greedy_decode_test(sentence, model=None, vocab_file=None, vocab_dir=None):
    """Translate `sentence` greedily, print source and translation, return the translation.

    Decoding uses sampling_decode() with its default temperature.

    Args:
        sentence (str): a custom string.
        model: the NMT model.
        vocab_file (str): filename of the vocabulary
        vocab_dir (str): path to the vocabulary file
    Returns:
        str: the translated sentence
    """
    # only the decoded text is needed; the tokens and the score are discarded
    _tokens, _score, translation = sampling_decode(
        sentence, model, vocab_file=vocab_file, vocab_dir=vocab_dir)
    for label, text in (("English: ", sentence), ("French: ", translation)):
        print(label, text)
    return translation

In [None]:
# put a custom string here
your_sentence = 'I love languages.'
greedy_decode_test(your_sentence, model, vocab_file=VOCAB_FILE, vocab_dir=VOCAB_DIR);

English:  I love languages.
French:  J'aime les langues.


In [None]:
greedy_decode_test('You are almost done with the assignment!', model, vocab_file=VOCAB_FILE, vocab_dir=VOCAB_DIR);

English:  You are almost done with the assignment!
French:  Vous êtes presque fait avec l'attribution !


In [None]:
def generate_samples(sentence, n_samples, model=None, temperature=0.6, vocab_file=None, vocab_dir=None):
    """Draw several candidate translations with sampling_decode().

    Args:
        sentence (str): sentence to translate.
        n_samples (int): number of samples to generate
        model: the NMT model.
        temperature (float): parameter for sampling ranging from 0.0 to 1.0.
            0.0: same as argmax, always pick the most probable token
            1.0: sampling from the distribution (can sometimes say random things)
        vocab_file (str): filename of the vocabulary
        vocab_dir (str): path to the vocabulary file
    Returns:
        tuple: (list, list)
            list of lists: token list per sample
            list of floats: second value returned by sampling_decode() per sample
    """
    # decode n_samples times, keeping the full result of every call
    decoded = [
        sampling_decode(sentence, model, temperature, vocab_file=vocab_file, vocab_dir=vocab_dir)
        for _ in range(n_samples)
    ]
    # split the results into token lists and scores; the text is dropped
    samples = [tokens for tokens, _, _ in decoded]
    log_probs = [logp for _, logp, _ in decoded]
    return samples, log_probs

In [None]:
# generate 4 samples with the default temperature (0.6)
generate_samples('I love languages.', 4, model, vocab_file=VOCAB_FILE, vocab_dir=VOCAB_DIR)

([[769, 7, 31720, 13, 15267, 3, 1],
  [769, 7, 31720, 21, 15267, 3, 1],
  [769, 7, 31720, 21, 15267, 3, 1],
  [254, 31720, 21, 15267, 3, 1]],
 [21.068099975585938,
  21.208398818969727,
  21.208398818969727,
  22.02057647705078])

In [None]:
detokenize([769, 31, 31720, 21, 15267, 3, 1], VOCAB_FILE, VOCAB_DIR)

'J’aime les langues.'

In [None]:
def jaccard_similarity(candidate, reference):
    """Returns the Jaccard similarity between two token lists

    The score is the number of unique tokens common to both lists divided by
    the number of unique tokens found in either list.

    Args:
        candidate (list of int): tokenized version of the candidate translation
        reference (list of int): tokenized version of the reference translation
    Returns:
        float: overlap between the two token lists, in [0.0, 1.0]; 0.0 when
            both lists are empty (nothing overlaps).
    """
    # convert the lists to sets to get the unique tokens
    can_unigram_set, ref_unigram_set = set(candidate), set(reference)
    # all tokens found in either candidate or reference
    all_elems = can_unigram_set | ref_unigram_set
    # two empty inputs have an empty union: avoid dividing by zero
    if not all_elems:
        return 0.0
    # tokens common to both candidate and reference
    joint_elems = can_unigram_set & ref_unigram_set
    return len(joint_elems) / len(all_elems)

In [None]:
# for making a frequency table easily
from collections import Counter

def rouge1_similarity(system, reference):
    """Returns the ROUGE-1 score between two token lists

    The score is the F1 combination of unigram precision (overlap / number of
    system tokens) and unigram recall (overlap / number of reference tokens),
    where the overlap counts each token at most as often as it appears in
    both lists.

    Args:
        system (list of int): tokenized version of the system translation
        reference (list of int): tokenized version of the reference translation
    Returns:
        float: overlap between the two token lists; 0.0 when either list is
            empty or when they share no token.
    """
    # frequency tables of the system and reference tokens
    sys_counter = Counter(system)
    ref_counter = Counter(reference)
    n_system = sum(sys_counter.values())
    n_reference = sum(ref_counter.values())
    # precision or recall would divide by zero for an empty list
    if n_system == 0 or n_reference == 0:
        return 0.0
    # Counter intersection keeps the smaller of the two counts for each token
    overlap = sum((sys_counter & ref_counter).values())
    # precision: number of overlapping tokens / number of system tokens
    precision = overlap / n_system
    # recall: number of overlapping tokens / number of reference tokens
    recall = overlap / n_reference
    if precision + recall == 0:
        return 0.0
    # f1-score
    return 2 * ((precision * recall) / (precision + recall))

In [None]:
def average_overlap(similarity_fn, samples, *ignore_params):
    """Returns the arithmetic mean overlap of each candidate sentence against the other samples
    Args:
        similarity_fn (function): similarity function used to compute the overlap.
            Called as similarity_fn(candidate, sample).
        samples (list of lists): tokenized version of the translated sentences
        *ignore_params: additional parameters will be ignored
    Returns:
        dict: scores of each sample
            key: index of the sample
            value: score of the sample (0.0 when there is no other sample
                to compare against)
    """
    # initialize dictionary
    scores = {}
    # every candidate is compared against all samples except itself
    n_others = len(samples) - 1
    # run a for loop for each sample
    for index_candidate, candidate in enumerate(samples):
        # initialize overlap to 0.0
        overlap = 0.0
        # run a for loop for each sample
        for index_sample, sample in enumerate(samples):
            # skip if the candidate index is the same as the sample index
            if index_candidate == index_sample:
                continue
            # add the overlap between candidate and sample to the total overlap
            overlap += similarity_fn(candidate, sample)
        # get the score for the candidate by computing the average; with a
        # single sample there is nothing to average, so score it 0.0 instead
        # of dividing by zero
        scores[index_candidate] = overlap / n_others if n_others > 0 else 0.0
    return scores

It is also common to see the weighted mean being used to calculate the overall score instead of just the arithmetic mean.

In [None]:
def weighted_avg_overlap(similarity_fn, samples, log_probs):
    """Returns the weighted mean overlap of each candidate sentence against the other samples
    Args:
        similarity_fn (function): similarity function used to compute the overlap.
            Called as similarity_fn(candidate, sample).
        samples (list of lists): tokenized version of the translated sentences
        log_probs (list of float): log probability of the translated sentences
    Returns:
        dict: scores of each sample
            key: index of the sample
            value: score of the sample (0.0 when there is no other sample
                with a non-zero weight to compare against)
    """
    # initialize dictionary
    scores = {}
    # run a for loop for each sample
    for index_candidate, candidate in enumerate(samples):
        # collect every other sample together with its log probability
        others = [
            (sample, float(logp))
            for index_sample, (sample, logp) in enumerate(zip(samples, log_probs))
            if index_candidate != index_sample
        ]
        # A weighted mean is unchanged when all weights are scaled by the same
        # factor, so subtract the largest log probability before exponentiating.
        # This keeps very negative log probabilities from all underflowing to
        # 0.0, which would make the weight sum zero.
        shift = max((logp for _, logp in others), default=0.0)
        if not np.isfinite(shift):
            shift = 0.0
        # initialize overlap and weighted sum
        overlap, weight_sum = 0.0, 0.0
        for sample, logp in others:
            # convert the (shifted) log probability to linear scale
            sample_p = float(np.exp(logp - shift))
            # update the weighted sum
            weight_sum += sample_p
            # update the overlap with the weighted similarity of candidate and sample
            overlap += sample_p * similarity_fn(candidate, sample)
        # get the score for the candidate; without any weight to normalize by
        # (e.g. a single sample) score it 0.0 instead of dividing by zero
        score = overlap / weight_sum if weight_sum > 0 else 0.0
        # save the score in the dictionary. use index as the key.
        scores[index_candidate] = score
    return scores

### 4.2.4 Putting it all together

We will now put everything together and develop the `mbr_decode()` function.

In [None]:
def mbr_decode(sentence, n_samples=4, score_fn=weighted_avg_overlap, similarity_fn=rouge1_similarity, model=model,
               temperature=0.6, vocab_file=VOCAB_FILE, vocab_dir=VOCAB_DIR):
    """Translates a sentence using Minimum Bayes Risk decoding
    Args:
        sentence (str): sentence to translate.
        n_samples (int): number of samples to generate
        score_fn (function): function that generates the score for each sample.
            Called as score_fn(similarity_fn, samples, log_probs).
        similarity_fn (function): function used to compute the overlap between a
            pair of samples
        model: the NMT model.
        temperature (float): parameter for sampling ranging from 0.0 to 1.0.
            0.0: same as argmax, always pick the most probable token
            1.0: sampling from the distribution (can sometimes say random things)
        vocab_file (str): filename of the vocabulary
        vocab_dir (str): path to the vocabulary file
    Returns:
        tuple: (translated_sentence, max_index, scores)
            translated_sentence: detokenized sample with the highest score
            max_index: key of that sample in scores
            scores: the scores returned by score_fn
    """
    # draw the candidate translations together with their log probabilities
    candidates, candidate_log_probs = generate_samples(
        sentence, n_samples, model, temperature, vocab_file, vocab_dir
    )
    # score every candidate against the others
    candidate_scores = score_fn(similarity_fn, candidates, candidate_log_probs)
    # pick the key whose score is the largest
    best_index = max(candidate_scores, key=lambda index: candidate_scores[index])
    # turn the winning token list back into text
    best_tokens = candidates[best_index]
    best_translation = detokenize(best_tokens, vocab_file, vocab_dir)
    return (best_translation, best_index, candidate_scores)

In [None]:
# sentence passed to mbr_decode() in the next cell; put a custom string here
your_sentence = 'She speaks English, French and German.'

In [None]:
# returns (translated sentence, index of the best sample, scores of all samples)
# NOTE(review): the samples are presumably drawn stochastically (temperature=0.6
# by default), so the output may differ between runs - confirm
mbr_decode(your_sentence)

('Elle parle anglais, français et allemand.',
 2,
 {0: 0.7307046434100283,
  1: 0.7516939246570612,
  2: 0.8509111099700796,
  3: 0.8509111099700796})

In [None]:
# keep only the translated sentence (first element of the returned tuple)
mbr_decode('You have completed the tutorial.')[0]

'Vous avez obtenu le tutoriel.'