# Translation with Recurrent Neural Networks

This notebook is adapted from this notebook: [Seq-2-Seq](https://github.com/random-forests/applied-dl/blob/master/examples/8-seq2seq.ipynb), which in turn is based on this TensorFlow [tutorial](https://www.tensorflow.org/alpha/tutorials/sequences/nmt_with_attention) and this helpful [article](https://machinetalk.org/2019/03/29/neural-machine-translation-with-attention-mechanism/).


In [0]:
%tensorflow_version 2.x

In [0]:
import numpy as np
import re
import tensorflow as tf
import time
import unicodedata

The training data is a set of parallel sentences in English and Danish

In [0]:
# Parallel corpus: each entry is an (English, Danish) sentence pair.
sentences = [
    ("Do you want a cup of coffee?", "Vil du have en kop kaffe?"),
    ("I've had coffee already.", "Jeg har fået kaffe allerede."),
    ("Can I get you a coffee?", "Må jeg give dig en kop kaffe?"),
    ("Please give me some coffee.", "Giv mig venligst en kop kaffe."),
    ("Would you like me to make coffee?", "Skal jeg lave kaffe?"),
    ("Two coffees, please.", "To kaffe, tak."),
    ("How about a cup of coffee?", "Hvad med en kop kaffe?"),
    ("I drank two cups of coffee.", "Jeg drak to kopper kaffe."),
    ("Would you like to have a cup of coffee?", "Vil du have en kop kaffe?"),
    ("There'll be coffee and cake at five.", "Der vil være kaffe og kage klokken fem."),
    ("Another coffee, please.", "Endnu en kaffe, tak."),
    ("I made coffee.", "Jeg har lavet kaffe."),
    ("I would like to have a cup of coffee.", "Jeg vil gerne have en kop kaffe."),
    ("Do you want me to make coffee?", "Skal jeg lave kaffe?"),
    ("It is hard to wake up without a strong cup of coffee.", "Det er svært at vågne uden en kop stærk kaffe."),
    ("All I drank was coffee.", "Jeg drak kun kaffe."),
    ("I've drunk way too much coffee today.", "Jeg har drukket for meget kaffe i dag."),
    ("Which do you prefer, tea or coffee?", "Hvad foretrækker du, te eller kaffe?"),
    ("There are many kinds of coffee.", "Der er mange slags kaffe."),
    ("I will make some coffee.", "Jeg laver noget kaffe."),
]

We need to do some preprocessing of data to handle Danish letters and to make sure that we tokenize correctly.

In [0]:
def preprocess(s):
  """Normalise a raw sentence into space-separated ASCII tokens.

  Danish letters are transliterated (ø/æ/å -> oe/ae/aa, in both cases),
  remaining accents are stripped, the punctuation marks ? . ! , are split off
  as separate tokens, every other non-letter character becomes a space, and
  the result is wrapped in '<start>' / '<end>' markers.

  Args:
    s: the raw sentence (str).

  Returns:
    The preprocessed sentence, e.g. '<start> To kaffe , tak . <end>'.
  """
  # for details, see https://www.tensorflow.org/alpha/tutorials/sequences/nmt_with_attention
  # Compose first so that a decomposed "a" + combining ring is seen as "å"
  # by the replacements below instead of silently degrading to a plain "a".
  s = unicodedata.normalize('NFC', s)
  # Upper-case variants are handled too; otherwise Ø/Æ would be deleted by the
  # final regex and Å would be reduced to "A" by the accent stripping.
  for danish, ascii_form in (("ø", "oe"), ("æ", "ae"), ("å", "aa"),
                             ("Ø", "Oe"), ("Æ", "Ae"), ("Å", "Aa")):
    s = s.replace(danish, ascii_form)
  # Decompose and drop combining marks (category Mn) to strip other accents.
  s = ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')
  # Surround punctuation with spaces so each mark becomes its own token.
  s = re.sub(r"([?.!,])", r" \1 ", s)
  # Any run of characters outside letters/punctuation (including spaces and
  # quotes) collapses to a single space.
  s = re.sub(r"[^a-zA-Z?.!,]+", " ", s)
  s = s.strip()
  return '<start> ' + s + ' <end>'

In [0]:
# Preprocess every (English, Danish) pair exactly once. `preprocess` is not
# idempotent (a second pass would turn "<start> ..." into "<start> start ..."),
# so the work is skipped when this cell is re-run on already-preprocessed data.
if not sentences[0][0].startswith("<start> "):
  print("Original:", sentences[0])
  sentences = [(preprocess(source), preprocess(target)) for (source, target) in sentences]
print("Preprocessed:", sentences[0])

We tokenize the sentences into words, create a dictionary so that the words can be replaced with numbers, and add some padding at the end of the sentences to make them all the same length.

In [0]:
# Split the (source, target) pairs into two parallel sequences of sentences.
source_sentences, target_sentences = list(zip(*sentences))
# filters='' overrides the tokenizer's default character filtering
# (presumably so that punctuation tokens and the <start>/<end> markers are
# kept as words -- confirm against the Keras Tokenizer documentation).
source_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
# Build the word -> integer index from the source sentences.
source_tokenizer.fit_on_texts(source_sentences)
# Replace every word by its integer index.
source_data = source_tokenizer.texts_to_sequences(source_sentences)
print("Sequence:", source_data[0])
# padding='post' appends the padding at the end of each sequence so that all
# sequences end up with the same length.
source_data = tf.keras.preprocessing.sequence.pad_sequences(source_data, padding='post')
print("Padded:", source_data[0])

We do the same for the target sentences

In [0]:
# Same pipeline as for the source side: fit a vocabulary on the target
# sentences, map words to integer indices, then pad at the end.
target_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
target_tokenizer.fit_on_texts(target_sentences)
target_data = target_tokenizer.texts_to_sequences(target_sentences)
target_data = tf.keras.preprocessing.sequence.pad_sequences(target_data, padding='post')

Create labels for the decoder by shifting the target sequence one position to the left, so that the label at each time step is the next token of the target sequence.

In [0]:
# Labels are what the decoder should predict at each step: the target sequence
# shifted one position to the left. The last column is left at 0.
# zeros_like keeps the integer dtype of target_data; np.zeros(shape) would
# silently turn the token ids into float64.
target_labels = np.zeros_like(target_data)
target_labels[:, :-1] = target_data[:, 1:]

print("Target sequence", target_data[0])
print("Target label", target_labels[0])

In [0]:
# Vocabulary sizes used to dimension the embedding and output layers.
# The +1 presumably accounts for index 0, which the padded sequences use as
# the padding value and which is not an entry of word_index -- TODO confirm.
source_vocab_size = len(source_tokenizer.word_index) + 1
target_vocab_size = len(target_tokenizer.word_index) + 1

We define a function to be able to decode the numerical sequence back to a sentence

In [0]:
def decode(encoded, tokenizer):
  """Print one "<id> -> <word>" line for every non-zero id in `encoded`.

  Args:
    encoded: iterable of integer token ids.
    tokenizer: object exposing an `index_word` mapping from id to word.
  """
  for token_id in encoded:
    if token_id == 0:
      # zero entries are skipped; they have no word to show
      continue
    word = tokenizer.index_word[token_id]
    print("%d -> %s" % (token_id, word))
      
# Show the id -> word mapping for the first (padded) source sentence.
decode(source_data[0], source_tokenizer)

In [0]:
# Number of sentence pairs per training batch.
batch_size = 5
# Each dataset element is a (source ids, decoder input ids, decoder labels)
# triple; elements are grouped into batches of `batch_size`.
dataset = tf.data.Dataset.from_tensor_slices((source_data, target_data, target_labels)).batch(batch_size)

In [0]:
# Pull one batch out of the dataset to check the tensor shapes.
example_batch = next(iter(dataset))
example_source, example_target, example_labels = example_batch
print("Shapes:", example_source.shape, example_target.shape, example_labels.shape)

In [0]:
# Model hyperparameters shared by the encoder and the decoder.
embedding_size = 32  # output dimension of the Embedding layers
rnn_size = 64        # number of units in the GRU layers (and size of their state)

First we define the Encoder. The embedding layer reduces the dimension of the sparse word vector and the GRU layer is the recurrent layer.

In [0]:
class Encoder(tf.keras.Model):
  """Source-side model: an embedding layer followed by a GRU.

  Layer sizes come from the notebook-level `source_vocab_size`,
  `embedding_size` and `rnn_size`.
  """

  def __init__(self):
    super().__init__()
    self.embedding = tf.keras.layers.Embedding(input_dim=source_vocab_size,
                                               output_dim=embedding_size)
    # The GRU is configured to return both its per-step outputs and its final state.
    self.gru = tf.keras.layers.GRU(units=rnn_size,
                                   return_sequences=True,
                                   return_state=True)

  def call(self, x, hidden):
    """Embed the token ids `x` and run the GRU starting from state `hidden`.

    Returns:
      (output, state) exactly as produced by the GRU layer.
    """
    embedded = self.embedding(x)
    sequence_output, final_state = self.gru(embedded, initial_state=hidden)
    return sequence_output, final_state

  def init_state(self, batch_size):
    """Return an all-zero initial state of shape (batch_size, rnn_size)."""
    state_shape = (batch_size, rnn_size)
    return tf.zeros(state_shape)

Demonstrate calling the encoder.

In [0]:
# Create a batch of one sentence
# expand_dims adds a leading axis, turning one sentence into a batch of size 1.
ex_sentence = tf.expand_dims(source_data[0], axis=0)
ex_translation = tf.expand_dims(target_data[0], axis=0)
ex_labels = tf.expand_dims(target_labels[0], axis=0)
print(ex_sentence.shape)

# `encoder` is the instance used by the rest of the notebook (translation and training).
encoder = Encoder()
hidden_state = encoder.init_state(batch_size=1)
print(hidden_state.shape)

# `hidden_state` is overwritten with the state returned by the encoder.
output, hidden_state = encoder(ex_sentence, hidden_state)
print(output.shape)

Next we define the decoder.

In [0]:
class Decoder(tf.keras.Model):
  """Target-side model: embedding, GRU, and a dense projection to vocabulary logits.

  Layer sizes come from the notebook-level `target_vocab_size`,
  `embedding_size` and `rnn_size`.
  """

  def __init__(self):
    super().__init__()
    self.embedding = tf.keras.layers.Embedding(input_dim=target_vocab_size,
                                               output_dim=embedding_size)
    # The GRU is configured to return both its per-step outputs and its final state.
    self.gru = tf.keras.layers.GRU(units=rnn_size,
                                   return_sequences=True,
                                   return_state=True)
    # One output unit per target-vocabulary entry; no activation, so the
    # layer emits raw logits.
    self.dense = tf.keras.layers.Dense(units=target_vocab_size)

  def call(self, x, hidden):
    """Run the decoder on token ids `x`, starting the GRU from state `hidden`.

    Returns:
      (logits, state): the dense projection of the GRU outputs and the state
      returned by the GRU.
    """
    embedded = self.embedding(x)
    sequence_output, final_state = self.gru(embedded, initial_state=hidden)
    vocab_logits = self.dense(sequence_output)
    return vocab_logits, final_state

Demonstrate calling the decoder.

In [0]:
# `decoder` is the instance used by the rest of the notebook (translation and training).
decoder = Decoder()
# The decoder input is the target sequence itself (as in train_step); the
# shifted labels are only what its output is compared against in the loss.
decoder_output, decoder_state = decoder(ex_translation, hidden_state)
print(decoder_output.shape)

In [0]:
# Cross-entropy over integer class labels; from_logits=True because the
# decoder's dense layer outputs unnormalised scores.
crossentropy = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

def calc_loss(targets, logits):
  """Cross-entropy between `targets` and `logits`, with zero-valued targets weighted 0.

  Args:
    targets: integer label ids; positions equal to 0 get weight 0.
    logits: unnormalised scores for those positions.

  Returns:
    The weighted loss computed by `crossentropy`.
  """
  is_real_token = tf.math.not_equal(targets, 0)
  weights = tf.cast(is_real_token, dtype=tf.int64)
  return crossentropy(targets, logits, sample_weight=weights)

print("Loss", calc_loss(ex_labels, decoder_output))

In [0]:
def translate(idx=None, max_length=20):
  """Greedily translate one sentence of the corpus with the current encoder/decoder.

  Args:
    idx: index of the sentence in `sentences` / `source_data`. When None, a
      random index is drawn.
    max_length: maximum number of output words to generate before stopping.

  Returns:
    A tuple (source sentence, target sentence, translation), where the first
    two come from `sentences[idx]` and the translation is the generated words
    joined by spaces.
  """
  if idx is None:
    idx = np.random.choice(len(sentences))

  # Encode the source sentence as a batch of size 1.
  input_sent = tf.expand_dims(source_data[idx], axis=0)
  hidden_state = encoder.init_state(batch_size=1)
  # Only the encoder's returned state is needed; it seeds the decoder.
  _, decoder_state = encoder(input_sent, hidden_state)

  decoder_input = tf.expand_dims([target_tokenizer.word_index['<start>']], 0)
  out_words = []

  while len(out_words) < max_length:
    decoder_output, decoder_state = decoder(decoder_input, decoder_state)
    # Greedy decoding: the most likely word is fed back in as the next input.
    decoder_input = tf.argmax(decoder_output, -1)
    word_idx = decoder_input.numpy()[0][0]
    # If we've predicted 0 (which is reserved for the padding; usually this
    # will only happen before the decoder is trained), just stop translating
    # and return what we have.
    if word_idx == 0:
      word = '<end>'
    else:
      word = target_tokenizer.index_word[word_idx]
    out_words.append(word)

    if word == '<end>':
      break

  translation = ' '.join(out_words)
  return sentences[idx][0], sentences[idx][1], translation

In [0]:
# Translate a randomly chosen sentence. This cell comes before the training
# cells, so on a fresh run the models are still untrained here.
input_sent, target_sent, translation = translate()
print("Input: %s\nTarget: %s\nTranslation: %s\n" % (input_sent, target_sent, translation))

In [0]:
# Adam with its default settings; used by train_step to update both models.
optimizer = tf.keras.optimizers.Adam()

In [0]:
@tf.function # remove this annotation when debugging
def train_step(source_seq, target_seq, target_labels, initial_state):
  """Run one optimisation step on a single batch and return its loss.

  Args:
    source_seq: batch of source token ids fed to the encoder.
    target_seq: batch of target token ids fed to the decoder.
    target_labels: batch of label ids the decoder logits are scored against.
    initial_state: initial state passed to the encoder.

  Returns:
    The loss computed by `calc_loss` for this batch.
  """
  with tf.GradientTape() as tape:
    # The encoder's returned state initialises the decoder.
    _, encoder_state = encoder(source_seq, initial_state)
    logits, _ = decoder(target_seq, encoder_state)
    loss = calc_loss(target_labels, logits)

  # Both models are updated together from the same loss.
  trainable = encoder.trainable_variables + decoder.trainable_variables
  grads = tape.gradient(loss, trainable)
  optimizer.apply_gradients(zip(grads, trainable))

  return loss

Do the actual training and print an example every 10th epoch.


In [0]:
EPOCHS = 300

for epoch in range(EPOCHS):
  start = time.time()

  # The loop variable is named `label_seq` (not `target_labels`) so that the
  # notebook-level label array is not overwritten by the last batch.
  for source_seq, target_seq, label_seq in dataset:
    # Size the zero state from the batch itself so that a final batch smaller
    # than `batch_size` still works.
    en_initial_states = encoder.init_state(source_seq.shape[0])
    loss = train_step(source_seq, target_seq, label_seq, en_initial_states)

  elapsed = time.time() - start

  # Every 10th epoch: report the loss of the last batch and a sample translation.
  if epoch % 10 == 0:
    print("Epoch #%d, Loss %.4f, Time %.2f sec" % (epoch, loss, elapsed))
    input_sent, target_sent, translation = translate()
    print("Input: %s\nTarget: %s\nTranslation: %s\n" % (input_sent, target_sent, translation))