# Neural machine translation with basic encoder-decoder architecture

### Some content is taken from [tensorflow-tutorial ](https://www.tensorflow.org/tutorials/text/nmt_with_attention ) and added a *translate_batch()* function to translate a batch and dump outputs into a file

In [2]:
import tensorflow as tf

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split

import unicodedata
import re
import numpy as np
import os
import io
import time
from utils.dataset import NMTDataset

In [3]:
!ls utils

dataset.py  __pycache__


## Download and prepare the dataset

We'll use the same dataset we worked on notebook-1 (text-processing). For our convenience we've created a `utils/dataset.py` file which returns train and validation tf.data.Dataset objects. 

In [4]:
BUFFER_SIZE = 32000
BATCH_SIZE = 64
num_examples = 30000

dataset_creator = NMTDataset('en-spa')
train_dataset, val_dataset, inp_lang, targ_lang = dataset_creator.call(num_examples, BUFFER_SIZE, BATCH_SIZE)

Downloading data from http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip


In [13]:
print("Inpute Vocabulary Size: {}".format(len(inp_lang.word_index)))
print("Target Vocabulary Size: {}".format(len(targ_lang.word_index)))


Inpute Vocabulary Size: 9414
Target Vocabulary Size: 4935


In [14]:
example_input_batch, example_target_batch = next(iter(train_dataset))
example_input_batch.shape, example_target_batch.shape


(TensorShape([64, 16]), TensorShape([64, 11]))

## Write the encoder and decoder model

Implement an encoder-decoder model 

<p align="center">
<img src="https://www.guru99.com/images/1/111318_0848_seq2seqSequ1.png" width="500" alt="basic encoder-decoder">

The input is put through an encoder model which gives us the encoder outputs of shape *(batch_size, max_length, hidden_size)* and the encoder hidden state of shape *(batch_size, hidden_size)*.


In [41]:
# Define some useful parameters for further use

vocab_inp_size = len(inp_lang.word_index)+1
vocab_tar_size = len(targ_lang.word_index)+1
max_length_input = example_input_batch.shape[1]
max_length_output = example_target_batch.shape[1]

embedding_dim = 128
units = 1024
steps_per_epoch = num_examples//BATCH_SIZE



In [21]:
## Encoder has single layer of GRU layer on top of the embedding layer 

class Encoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
    super(Encoder, self).__init__()
    self.batch_sz = batch_sz
    self.enc_units = enc_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(self.enc_units,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')

  def call(self, x):
    x = self.embedding(x)
    output, S = self.gru(x)
    # output.shape = (BATCH_SIZE, max_length_input, enc_units)
    # S.shape = (BATCH_SIZE, enc_units)
    return output, S

In [71]:
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)
# sample input
sample_output, sample_hidden = encoder(example_input_batch)
print ('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print ('Encoder Hidden state shape: (batch size, units) {}'.format(sample_hidden.shape))

Encoder output shape: (batch size, sequence length, units) (64, 16, 1024)
Encoder Hidden state shape: (batch size, units) (64, 1024)


In [72]:
# We see from the architecture image that we only require S from encoders to start decoding. 


class Decoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
    super(Decoder, self).__init__()
    self.batch_sz = batch_sz
    self.dec_units = dec_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(self.dec_units,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')
    self.fc = tf.keras.layers.Dense(vocab_size)

  def call(self, x, S):
    # x = Decoder input, shape = (BATCH_SIZE, 1)
    # S = final hidden state from Encoder, shape = (BATCH_SIZE, units)

    # x shape after passing through embedding == (batch_size, 1, embedding_dim)
    x = self.embedding(x)
    output, state = self.gru(x, initial_state=S)

    # output shape == (batch_size * 1, hidden_size)
    output = tf.reshape(output, (-1, output.shape[2]))

    # output shape == (batch_size, vocab)
    output = self.fc(output)

    return output, state

In [73]:
decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)

sample_decoder_output, hidden_state = decoder(tf.random.uniform((BATCH_SIZE, 1)),
                                      sample_hidden)

print ('Decoder output shape: (batch_size, vocab size) {}'.format(sample_decoder_output.shape))
print ('Decoder hidden state shape : (batch_size, units) {}'.format(hidden_state.shape))

Decoder output shape: (batch_size, vocab size) (64, 4936)
Decoder hidden state shape : (batch_size, units) (64, 1024)


## Define the optimizer and the loss function

In [74]:
# Let's use the default parameters of Adam Optimizer

optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
  mask = tf.math.logical_not(tf.math.equal(real, 0))
  loss_ = loss_object(real, pred)

  mask = tf.cast(mask, dtype=loss_.dtype)
  loss_ *= mask

  return tf.reduce_mean(loss_)

## Checkpoints (Object-based saving)

In [75]:
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

## Training

1. Pass the *input* through the *encoder* which return the *S (encoder hidden state)*.
2. Encoder hidden state and the decoder input (which is the *start token*) is passed to the decoder.
3. The decoder returns the *predictions* and the *decoder hidden state*.
4. The decoder hidden state is then passed back into the model for the next time step and the predictions are used to calculate the loss.
5. Use *teacher forcing* to decide the next input to the decoder.
6. *Teacher forcing* is the technique where the *target word* is passed as the *next input* to the decoder.
7. The final step is to calculate the gradients and apply it to the optimizer and backpropagate.

In [76]:
@tf.function
def train_step(inp, targ):
  loss = 0

  with tf.GradientTape() as tape:
    enc_outputs, S = encoder(inp)

    dec_hidden = S

    # For the inital step of decoder, we pass the start token as input to decoder
    dec_input = tf.expand_dims([targ_lang.word_index['<start>']] * BATCH_SIZE, 1)

    # Teacher forcing - feeding the target as the next input
    for t in range(1, max_length_output):
      # passing enc_output to the decoder
      predictions, dec_hidden = decoder(dec_input, dec_hidden)
      loss += loss_function(targ[:, t], predictions)

      # using teacher forcing
      dec_input = tf.expand_dims(targ[:, t], 1)

  batch_loss = (loss / int(targ.shape[1]))

  variables = encoder.trainable_variables + decoder.trainable_variables

  gradients = tape.gradient(loss, variables)

  optimizer.apply_gradients(zip(gradients, variables))

  return batch_loss

In [77]:
EPOCHS = 10

for epoch in range(EPOCHS):
  start = time.time()

  total_loss = 0

  for (batch, (inp, targ)) in enumerate(train_dataset.take(steps_per_epoch)):
    batch_loss = train_step(inp, targ)
    total_loss += batch_loss

    if batch % 100 == 0:
      print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                   batch,
                                                   batch_loss.numpy()))
  # saving (checkpoint) the model every 2 epochs
  if (epoch + 1) % 2 == 0:
    checkpoint.save(file_prefix = checkpoint_prefix)

  print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                      total_loss / steps_per_epoch))
  print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

Epoch 1 Batch 0 Loss 4.7110
Epoch 1 Batch 100 Loss 2.1290
Epoch 1 Batch 200 Loss 1.9746
Epoch 1 Batch 300 Loss 1.7997
Epoch 1 Loss 1.5881
Time taken for 1 epoch 48.84951186180115 sec

Epoch 2 Batch 0 Loss 1.5871
Epoch 2 Batch 100 Loss 1.5062
Epoch 2 Batch 200 Loss 1.4628
Epoch 2 Batch 300 Loss 1.3579
Epoch 2 Loss 1.1528
Time taken for 1 epoch 36.51072692871094 sec

Epoch 3 Batch 0 Loss 1.2039
Epoch 3 Batch 100 Loss 1.1193
Epoch 3 Batch 200 Loss 1.1510
Epoch 3 Batch 300 Loss 1.0542
Epoch 3 Loss 0.9103
Time taken for 1 epoch 36.272050857543945 sec

Epoch 4 Batch 0 Loss 0.9092
Epoch 4 Batch 100 Loss 0.9387
Epoch 4 Batch 200 Loss 0.8021
Epoch 4 Batch 300 Loss 0.8796
Epoch 4 Loss 0.7179
Time taken for 1 epoch 36.298630475997925 sec

Epoch 5 Batch 0 Loss 0.7306
Epoch 5 Batch 100 Loss 0.7684
Epoch 5 Batch 200 Loss 0.7052
Epoch 5 Batch 300 Loss 0.6842
Epoch 5 Loss 0.5639
Time taken for 1 epoch 35.95931553840637 sec

Epoch 6 Batch 0 Loss 0.4709
Epoch 6 Batch 100 Loss 0.4752
Epoch 6 Batch 200 Lo

We see that training loss has gone down < 0.1, which looks good. let's try to translate some example sentences to see if it's really working good or has the model overfitted. 

## Translate

* The evaluate function is similar to the training loop, except we don't use *teacher forcing* here. The input to the decoder at each time step is its previous predictions along with the hidden state.
* Stop predicting when the model predicts the *end token*.

Note: The encoder output and last encoder hidden state S is calculated only once for one input.

In [78]:
def evaluate(sentence):
  sentence = dataset_creator.preprocess_sentence(sentence)

  inputs = [inp_lang.word_index[i] for i in sentence.split(' ')]
  inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],
                                                         maxlen=max_length_input,
                                                         padding='post')
  inputs = tf.convert_to_tensor(inputs)

  result = ''

  enc_out, enc_hidden = encoder(inputs)

  dec_hidden = enc_hidden
  dec_input = tf.expand_dims([targ_lang.word_index['<start>']], 0)
  print(dec_hidden.shape, dec_input.shape)

  for t in range(max_length_output):
    predictions, dec_hidden = decoder(dec_input, dec_hidden)


    predicted_id = tf.argmax(predictions[0]).numpy()

    result += targ_lang.index_word[predicted_id] + ' '

    if targ_lang.index_word[predicted_id] == '<end>':
      return result, sentence

    # the predicted ID is fed back into the model
    dec_input = tf.expand_dims([predicted_id], 0)

  return result, sentence

In [79]:
def translate(sentence):
  result, sentence = evaluate(sentence)

  print('Input: %s' % (sentence))
  print('Predicted translation: {}'.format(result))

## Restore the latest checkpoint and test

In [None]:
# restoring the latest checkpoint in checkpoint_dir
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

In [80]:
translate(u'hace mucho frio aqui.')

(1, 1024) (1, 1)
Input: <start> hace mucho frio aqui . <end>
Predicted translation: it s very cold here . <end> 


In [81]:
translate(u'esta es mi vida.')

(1, 1024) (1, 1)
Input: <start> esta es mi vida . <end>
Predicted translation: this is my life . <end> 


In [82]:
translate(u'¿todavia estan en casa?')

(1, 1024) (1, 1)
Input: <start> ¿ todavia estan en casa ? <end>
Predicted translation: are you still at home ? <end> 


In [83]:
# wrong translation
translate(u'trata de averiguarlo.')

(1, 1024) (1, 1)
Input: <start> trata de averiguarlo . <end>
Predicted translation: try to figure it out . <end> 


In [86]:
def translate_batch(test_dataset):
  with open('output_text.txt', 'w') as f:
    for (inputs, targets) in test_dataset:
      outputs = np.zeros((BATCH_SIZE, max_length_output),dtype=np.int16)
      hidden_state = tf.zeros((BATCH_SIZE, units))
      enc_output, dec_h = encoder(inputs)
      dec_input = tf.expand_dims([targ_lang.word_index['<start>']] * BATCH_SIZE, 1)
      for t in range(max_length_output):
        preds, dec_h = decoder(dec_input, dec_h)
        predicted_id = tf.argmax(preds, axis=1).numpy()
        outputs[:, t] = predicted_id
        dec_input = tf.expand_dims(predicted_id, 1)
      outputs = targ_lang.sequences_to_texts(outputs)
      for t, item in enumerate(outputs):
        try:
          i = item.index('<end>')
          f.write("%s\n" %item[:i])
        except: 
          f.write("%s \n" % item) # For those translated sequences which didn't correctly translated and have <end> token.

outputs = translate_batch(val_dataset)

In [87]:
!tail output_text.txt
! wc -l output_text.txt

she s taller than me . 
do you get him ? 
her life is in danger . 
she has green eyes . 
crime is no other way . 
i can t budge it . 
did you get married ? 
he went abroad . 
don t be angry . 
are you healthy ? 
5952 output_text.txt
