In [0]:
#@title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# TensorFlow Addons Networks : Sequence-to-Sequence NMT with Attention Mechanism

<table class="tfo-notebook-buttons" align="left">
  <td>
    <a target="_blank" href="https://www.tensorflow.org/addons/tutorials/networks_seq2seq_nmt"><img src="https://www.tensorflow.org/images/tf_logo_32px.png" />View on TensorFlow.org</a>
  </td>
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/tensorflow/addons/blob/master/docs/tutorials/networks_seq2seq_nmt.ipynb"><img src="https://www.tensorflow.org/images/colab_logo_32px.png" />Run in Google Colab</a>
  </td>
  <td>
    <a target="_blank" href="https://github.com/tensorflow/addons/blob/master/docs/tutorials/networks_seq2seq_nmt.ipynb"><img src="https://www.tensorflow.org/images/GitHub-Mark-32px.png" />View source on GitHub</a>
  </td>
      <td>
    <a href="https://storage.googleapis.com/tensorflow_docs/addons/docs/tutorials/networks_seq2seq_nmt.ipynb"><img src="https://www.tensorflow.org/images/download_logo_32px.png" />Download notebook</a>
  </td>
</table>

## Overview
This notebook gives a brief introduction to the ***Sequence to Sequence Model Architecture***.
In this notebook we broadly cover four essential topics necessary for Neural Machine Translation:


* **Data cleaning**
* **Data preparation**
* **Neural Translation Model with Attention**
* **Final Translation**

The basic idea behind such a model, though, is simply the encoder-decoder architecture. These networks are used for a variety of tasks such as text summarization, machine translation, image captioning, etc. This tutorial provides a hands-on understanding of the concept, explaining the technical jargon wherever necessary. We focus on the task of Neural Machine Translation (NMT), which was the very first testbed for seq2seq models.



## Setup

In [0]:
from __future__ import absolute_import, division, print_function, unicode_literals

In [0]:
# Mount Google Drive into the Colab runtime at /content/drive so the corpus
# files stored there can be read; this prompts for an authorization code.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
import sys
sys.path.append("..") # Put the parent directory on the import path (required for the utilities packages)

In [0]:
try:
  %tensorflow_version 2.x
except:
  pass

## Additional Resources:

This is the list of resources you must download in order to run this notebook:


1. [German-English Dataset](http://www.manythings.org/anki/deu-eng.zip)


The dataset should be downloaded before running this notebook. Pretrained embeddings can be used as well; here, however, we carry out our own training.


In [0]:
from pathlib import Path
import csv
import string
import re
import pandas as pd
from pickle import dump
from unicodedata import normalize
from numpy import array
import itertools
from pickle import load
from tensorflow.keras.utils import to_categorical
from keras.utils.vis_utils import plot_model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM
import tensorflow.keras.layers
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Embedding
from pickle import load
from numpy import array
from numpy import argmax
import numpy as np

import tensorflow as tf
from keras.models import load_model
from nltk.translate.bleu_score import corpus_bleu
from sklearn.model_selection import train_test_split
import tensorflow_addons as tfa
import unicodedata


Using TensorFlow backend.


In [0]:
# Folder on the mounted Google Drive that holds the corpus files.
data_path = Path(r"/content/drive/My Drive/Colab Notebooks/IFT 6759/Project 2/Data/")
files = list(data_path.glob("*"))


def _read_text_column(file_path):
    """Read a text file into a one-column ("text") DataFrame, one row per line.

    Trailing whitespace (which includes the newline) is stripped from each line.
    The encoding is given explicitly so accented characters are decoded the same
    way on every platform (assumes the corpus files are UTF-8 -- TODO confirm).
    The `with` block closes the file, so no explicit `close()` is needed.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = [line.rstrip() for line in f]
    return pd.DataFrame(lines, columns=["text"])


# Unaligned (monolingual) corpora.
unaligned_en = _read_text_column(data_path / "unaligned.en")
unaligned_fr = _read_text_column(data_path / "unaligned.fr")

# Parallel training corpus; per the variable names, lang1 is English and
# lang2 is French.
train_lang1_en = _read_text_column(data_path / "train.lang1")
train_lang2_fr = _read_text_column(data_path / "train.lang2")

## Data Cleaning

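Our data set is a German-English translation dataset. It contains 152,820 pairs of English to German phrases, one pair per line, with a tab separating the two languages. Although the dataset is well organized, it needs cleaning before we can work with it; cleaning removes irregularities that would otherwise get in the way during training. (Note: the code cells in this notebook actually load an English-French corpus, `train.lang1` / `train.lang2`, from Google Drive; the same cleaning steps are applied to it.)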

## Saving the Cleaned Dataset

In [0]:
def create_dataset():
    """Preprocess the parallel training corpus.

    Walks the "text" columns of `train_lang2_fr` and `train_lang1_en` in
    lockstep and runs `preprocess_sentence` on each French/English pair.

    Returns:
        (frText, enText): two lists of preprocessed sentences, French first.
    """
    sentence_pairs = [
        (preprocess_sentence(french_raw), preprocess_sentence(english_raw))
        for french_raw, english_raw in zip(train_lang2_fr['text'], train_lang1_en['text'])
    ]
    frText = [french for french, _ in sentence_pairs]
    enText = [english for _, english in sentence_pairs]
    return frText, enText

def unicode_to_ascii(s):
    """Return `s` with its combining marks removed.

    The string is NFD-normalized (base characters and combining marks become
    separate code points) and every code point whose Unicode category is 'Mn'
    is dropped; all other characters are kept in order.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = []
    for ch in decomposed:
        if unicodedata.category(ch) == 'Mn':
            continue
        kept.append(ch)
    return ''.join(kept)

def preprocess_sentence(w):
    """Normalize one raw sentence and wrap it in <start> / <end> markers.

    The sentence is lowercased, trimmed and passed through `unicode_to_ascii`,
    then three regex substitutions are applied in order (see below), the result
    is trimmed again and the markers are added so that the model knows when to
    start and stop predicting.
    """
    cleaned = unicode_to_ascii(w.lower().strip())

    substitutions = (
        # put a space on both sides of ? . ! , ¿  ("he is a boy." => "he is a boy .")
        # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
        (r"([?.!,¿])", r" \1 "),
        # collapse runs of spaces / double quotes into a single space
        (r'[" "]+', " "),
        # replace everything except (a-z, A-Z, ".", "?", "!", ",", "¿") with a space
        (r"[^a-zA-Z?.!,¿]+", " "),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    cleaned = cleaned.strip()
    return '<start> ' + cleaned + ' <end>'

In [0]:
# Preprocessed sentence lists; create_dataset() returns the French list first.
fr, en = create_dataset()

In [0]:
# 80/20 train/validation split without shuffling, so the pairs keep their
# corpus order and the split is the same on every run.
frTrain, frValid, enTrain, enValid  = train_test_split(fr,en,test_size=0.2, shuffle = False)
# Sanity check: first training pair, then first validation pair.
print(enTrain[0], '\n', frTrain[0], '\n', enValid[0],  '\n', frValid[0])

<start> so too does the idea that accommodating religious differences is dangerous <end> 
 <start> l idee de concilier les differences religieuses semble donc dangereuse . <end> 
 <start> my wish is for you to help a strong sustainable movement to educate every child about food to inspire families to cook again and to empower people everywhere to fight obesity <end> 
 <start> mon souhait est que vous souteniez un puissant mouvement durable pour eduquer chaque enfant a l alimentation , pour inspirer les familles a cuisiner a nouveau , et dynamiser les gens partout a lutter contre l obesite . <end>


## Tokenization

In [0]:
# --- English ---
# The vocabulary is fitted on the training sentences only. filters='' disables
# character filtering, and "<unk>" is the out-of-vocabulary token.
en_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='', oov_token="<unk>")
en_tokenizer.fit_on_texts(enTrain)
# Integer-encode, then right-pad ('post') each split.
data_en_train = tf.keras.preprocessing.sequence.pad_sequences(
    en_tokenizer.texts_to_sequences(enTrain), padding='post')
data_en_valid = tf.keras.preprocessing.sequence.pad_sequences(
    en_tokenizer.texts_to_sequences(enValid), padding='post')

# --- French ---
fr_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='', oov_token="<unk>")
fr_tokenizer.fit_on_texts(frTrain)
data_fr_train = tf.keras.preprocessing.sequence.pad_sequences(
    fr_tokenizer.texts_to_sequences(frTrain), padding='post')
data_fr_valid = tf.keras.preprocessing.sequence.pad_sequences(
    fr_tokenizer.texts_to_sequences(frValid), padding='post')

In [0]:
def max_len(tensor):
    """Return the length of the longest element of `tensor`.

    `tensor` is any iterable of sized items (e.g. a padded 2-D array or a list
    of token lists). An empty iterable raises ValueError from `max()`.
    """
    lengths = [len(row) for row in tensor]
    return max(lengths)

## Model Parameters

In [0]:
# Network sizes
embedding_dims = 256
rnn_units = 1024
dense_units = 1024
Dtype = tf.float32   # used to initialize the DecoderCell zero state

# Training data: English is the source (X), French is the target (Y)
X_train = data_en_train
Y_train = data_fr_train

# Batching
BATCH_SIZE = 64
BUFFER_SIZE = len(X_train)                  # shuffle buffer spans the whole training set
steps_per_epoch = BUFFER_SIZE // BATCH_SIZE

## Dataset Preparation

In [0]:
# Number of time steps in the padded source / target training arrays.
Tx = max_len(data_en_train)
Ty = max_len(data_fr_train)

# One more than the number of known words, so that id 0 (the value
# pad_sequences pads with) also fits in the embedding / output layers.
input_vocab_size = len(en_tokenizer.word_index) + 1
output_vocab_size = len(fr_tokenizer.word_index) + 1

# Shuffled, fixed-size batches; the last incomplete batch is dropped.
dataset = (
    tf.data.Dataset.from_tensor_slices((X_train, Y_train))
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

# One example batch, handy for inspecting shapes (example_X.shape, example_Y.shape).
example_X, example_Y = next(iter(dataset))

## Defining NMT Model

In [0]:
#ENCODER
class EncoderNetwork(tf.keras.Model):
    """Container for the encoder layers.

    Holds an Embedding for the source token ids and an LSTM configured to
    return both the full output sequence and its final states. No `call` is
    defined in this class; the training and translation code invoke
    `encoder_embedding` and `encoder_rnnlayer` directly.
    """

    def __init__(self,input_vocab_size,embedding_dims, rnn_units ):
        """Create the encoder layers.

        Args:
            input_vocab_size: `input_dim` of the source embedding.
            embedding_dims: `output_dim` of the source embedding.
            rnn_units: number of LSTM units.
        """
        super().__init__()
        self.encoder_embedding = tf.keras.layers.Embedding(input_dim=input_vocab_size,
                                                           output_dim=embedding_dims)
        # return_sequences=True: per-step outputs (used as attention memory by the caller);
        # return_state=True: the final states are returned as well.
        self.encoder_rnnlayer = tf.keras.layers.LSTM(rnn_units,return_sequences=True, 
                                                     return_state=True )

    
#DECODER
class DecoderNetwork(tf.keras.Model):
    """Container for the decoder layers and the training-time decoder.

    Holds the target embedding, the output Dense layer, an LSTMCell wrapped in
    a tfa AttentionWrapper with Luong attention, and a tfa BasicDecoder driven
    by a TrainingSampler. The constructor and `build_rnn_cell` read the
    module-level names `dense_units`, `BATCH_SIZE` and `Tx`, so those must be
    defined before the class is instantiated.
    """

    def __init__(self,output_vocab_size, embedding_dims, rnn_units):
        """Create the decoder layers.

        Args:
            output_vocab_size: `input_dim` of the target embedding and number
                of units of the output Dense layer.
            embedding_dims: `output_dim` of the target embedding.
            rnn_units: number of units of the decoder LSTMCell.
        """
        super().__init__()
        self.decoder_embedding = tf.keras.layers.Embedding(input_dim=output_vocab_size,
                                                           output_dim=embedding_dims) 
        self.dense_layer = tf.keras.layers.Dense(output_vocab_size)
        self.decoder_rnncell = tf.keras.layers.LSTMCell(rnn_units)
        # Sampler used by the training decoder
        self.sampler = tfa.seq2seq.sampler.TrainingSampler()
        # Create the attention mechanism with memory = None; callers attach the
        # encoder outputs later through attention_mechanism.setup_memory(...)
        self.attention_mechanism = self.build_attention_mechanism(dense_units,None,BATCH_SIZE*[Tx])
        self.rnn_cell =  self.build_rnn_cell(BATCH_SIZE)
        self.decoder = tfa.seq2seq.BasicDecoder(self.rnn_cell, sampler= self.sampler,
                                                output_layer=self.dense_layer)

    def build_attention_mechanism(self, units,memory, memory_sequence_length):
        """Return a LuongAttention built from the given units, memory and memory lengths."""
        return tfa.seq2seq.LuongAttention(units, memory = memory, 
                                          memory_sequence_length=memory_sequence_length)
        # Alternative mechanism (disabled):
        #return tfa.seq2seq.BahdanauAttention(units, memory = memory, memory_sequence_length=memory_sequence_length)

    # Wrap the decoder LSTMCell with the attention mechanism
    def build_rnn_cell(self, batch_size ):
        """Return an AttentionWrapper around `decoder_rnncell`.

        `batch_size` is accepted but not used by this method; the attention
        layer size comes from the module-level `dense_units`.
        """
        rnn_cell = tfa.seq2seq.AttentionWrapper(self.decoder_rnncell, self.attention_mechanism,
                                                attention_layer_size=dense_units)
        return rnn_cell
    
    def build_decoder_initial_state(self, batch_size, encoder_state,Dtype):
        """Return the wrapper's initial state with its cell state taken from the encoder.

        Args:
            batch_size: batch size passed to `get_initial_state`.
            encoder_state: value stored as `cell_state` of the returned state
                (callers pass the encoder's two final state tensors).
            Dtype: dtype passed to `get_initial_state`.
        """
        decoder_initial_state = self.rnn_cell.get_initial_state(batch_size = batch_size, 
                                                                dtype = Dtype)
        # Keep the rest of the initial state, replace only cell_state
        decoder_initial_state = decoder_initial_state.clone(cell_state=encoder_state) 
        return decoder_initial_state

## Initializing Training functions

In [0]:
def loss_function(y_pred, y):
    """Padding-masked sparse categorical cross-entropy.

    Args:
        y_pred: logits, documented by the original author as
            [batch_size, Ty, output_vocab_size].
        y: integer target ids, documented as [batch_size, Ty]; id 0 is
            treated as padding.

    Returns:
        The mean over all positions of the per-position loss, where positions
        with y == 0 contribute zero (they still count in the mean).
    """
    per_position_loss = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction='none')(y_true=y, y_pred=y_pred)

    # 1 where the target is a real token, 0 where it is padding
    is_token = tf.logical_not(tf.math.equal(y, 0))
    keep = tf.cast(is_token, dtype=per_position_loss.dtype)

    return tf.reduce_mean(keep * per_position_loss)


def train_step(input_batch, output_batch,encoder_initial_cell_state):
    """Run one optimization step on a batch and return its loss.

    Uses the module-level `encoderNetwork`, `decoderNetwork`, `optimizer`,
    `BATCH_SIZE` and `Ty`.

    Args:
        input_batch: batch of source token ids.
        output_batch: batch of target token ids.
        encoder_initial_cell_state: initial state handed to the encoder LSTM.

    Returns:
        The loss computed by `loss_function` for this batch.
    """
    with tf.GradientTape() as tape:
        # Encoder: embed the source ids and run the LSTM; its final hidden and
        # memory states seed the decoder.
        source_embedded = encoderNetwork.encoder_embedding(input_batch)
        encoder_outputs, final_h, final_c = encoderNetwork.encoder_rnnlayer(
            source_embedded, initial_state=encoder_initial_cell_state)

        # The decoder is fed the target without its last time step and is
        # scored against the target shifted left by one (without <start>).
        decoder_input = output_batch[:, :-1]
        decoder_target = output_batch[:, 1:]
        target_embedded = decoderNetwork.decoder_embedding(decoder_input)

        # Attention memory comes from the encoder outputs; the wrapper's zero
        # state gets the encoder's final states as its cell state.
        decoderNetwork.attention_mechanism.setup_memory(encoder_outputs)
        decoder_initial_state = decoderNetwork.build_decoder_initial_state(
            BATCH_SIZE, encoder_state=[final_h, final_c], Dtype=tf.float32)

        # BasicDecoderOutput
        outputs, _, _ = decoderNetwork.decoder(
            target_embedded,
            initial_state=decoder_initial_state,
            sequence_length=BATCH_SIZE*[Ty-1])

        loss = loss_function(outputs.rnn_output, decoder_target)

    # All trainable weights of both networks, differentiated together
    variables = encoderNetwork.trainable_variables + decoderNetwork.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))
    return loss

In [0]:
# Zero initial state (hidden state and memory state) for the encoder LSTM
def initialize_initial_state():
    """Return a list of two zero tensors, each of shape (BATCH_SIZE, rnn_units)."""
    state_shape = (BATCH_SIZE, rnn_units)
    return [tf.zeros(state_shape) for _ in range(2)]

## Training

In [0]:
def train(epochs):
    """Train the networks for `epochs` passes over `dataset`.

    Each epoch draws `steps_per_epoch` batches from the module-level `dataset`
    and calls `train_step` on every batch, starting from a fresh zero encoder
    state. Progress is printed every 5 batches, and the mean batch loss is
    printed at the end of each epoch.

    Args:
        epochs: number of epochs to run (epochs are numbered from 1 in the logs).
    """
    for i in range(1, epochs + 1):
        encoder_initial_cell_state = initialize_initial_state()
        total_loss = 0.0
        num_batches = 0

        for batch, (input_batch, output_batch) in enumerate(dataset.take(steps_per_epoch)):
            batch_loss = train_step(input_batch, output_batch, encoder_initial_cell_state)
            total_loss += batch_loss
            num_batches += 1
            if (batch + 1) % 5 == 0:
                # The value shown is this batch's loss, so it is labelled as such
                # (it used to be labelled "total loss").
                print("batch loss: {} epoch {} batch {} ".format(batch_loss.numpy(), i, batch + 1))

        # Report the accumulated loss, which was previously computed but never
        # shown; skipped when the epoch produced no batches.
        if num_batches:
            print("epoch {} mean loss: {}".format(i, float(total_loss / num_batches)))


In [0]:
# Build the two networks and the optimizer; train_step() and
# translateSentence() use these module-level objects by name.
encoderNetwork = EncoderNetwork(input_vocab_size,embedding_dims, rnn_units)
decoderNetwork = DecoderNetwork(output_vocab_size,embedding_dims, rnn_units)
# Adam with its default settings
optimizer = tf.keras.optimizers.Adam()

In [0]:
# Train for 15 epochs.
train(15)

# The trained weights are not saved. The two disabled lines below are leftovers:
# the first has no target path, the second is a PyTorch-style call that does
# not apply to these Keras models.
#encoderNetwork.save_weights()
#torch.save(decoderNetwork.state_dict(), 'decoder.dict')

total loss: 1.9634898900985718 epoch 1 batch 5 
total loss: 1.7855565547943115 epoch 1 batch 10 
total loss: 1.4182482957839966 epoch 1 batch 15 
total loss: 1.4669640064239502 epoch 1 batch 20 
total loss: 1.5078994035720825 epoch 1 batch 25 
total loss: 1.5681970119476318 epoch 1 batch 30 
total loss: 1.3845863342285156 epoch 1 batch 35 
total loss: 1.5007833242416382 epoch 1 batch 40 
total loss: 1.3609955310821533 epoch 1 batch 50 
total loss: 1.5118592977523804 epoch 1 batch 55 
total loss: 1.610312581062317 epoch 1 batch 60 
total loss: 1.4810110330581665 epoch 1 batch 65 
total loss: 1.3774493932724 epoch 1 batch 70 
total loss: 1.4326605796813965 epoch 1 batch 75 
total loss: 1.3058146238327026 epoch 1 batch 80 
total loss: 1.2256760597229004 epoch 1 batch 85 
total loss: 1.2180213928222656 epoch 1 batch 90 
total loss: 1.36214280128479 epoch 1 batch 95 
total loss: 1.245031476020813 epoch 1 batch 100 
total loss: 1.4819265604019165 epoch 1 batch 105 
total loss: 1.478101253509

## Evaluation

In [0]:
def cleanDataset(myDataset):
  """Return a new list with the sentence markers removed from each element.

  Every occurrence of '<start> ' and then of ' <end>' is deleted from each
  sentence; the input list itself is left untouched.
  """
  return [
      re.sub(' <end>', '', re.sub('<start> ', '', marked_sentence))
      for marked_sentence in myDataset
  ]

# Marker-free copies of the four splits (validation first, then training).
cleanEnValid, cleanFrValid, cleanEnTrain, cleanFrTrain = (
    cleanDataset(marked_split) for marked_split in (enValid, frValid, enTrain, frTrain)
)

In [0]:
def translateSentence(input_raw):
  """Translate one English sentence into French with greedy decoding.

  Uses the module-level tokenizers, `encoderNetwork`, `decoderNetwork`, `Tx`
  and `rnn_units`.

  Args:
    input_raw: English sentence without the <start>/<end> markers, expected to
      be normalized like the training text (e.g. an element of the output of
      `cleanDataset`).

  Returns:
    The decoded French sentence as a string in which every word is followed by
    one space; decoding stops at the <end> token, which is not included.
  """
  # Encode the source with the fitted tokenizer, exactly as the training data
  # was encoded: words unseen during training map to <unk> instead of raising
  # KeyError, and empty tokens from repeated spaces are ignored.
  # NOTE(review): the training inputs also ended with ' <end>', which is not
  # appended here -- confirm whether that is intended.
  input_lines = ['<start> '+input_raw+'']
  input_sequences = en_tokenizer.texts_to_sequences(input_lines)
  input_sequences = tf.keras.preprocessing.sequence.pad_sequences(input_sequences,
                                                                  maxlen=Tx, padding='post')
  inp = tf.convert_to_tensor(input_sequences)
  inference_batch_size = input_sequences.shape[0]

  # Encoder pass from a zero initial state
  encoder_initial_cell_state = [tf.zeros((inference_batch_size, rnn_units)),
                                tf.zeros((inference_batch_size, rnn_units))]
  encoder_emb_inp = encoderNetwork.encoder_embedding(inp)
  a, a_tx, c_tx = encoderNetwork.encoder_rnnlayer(encoder_emb_inp,
                                                  initial_state=encoder_initial_cell_state)

  start_token = fr_tokenizer.word_index['<start>']
  end_token = fr_tokenizer.word_index['<end>']
  start_tokens = tf.fill([inference_batch_size], start_token)

  # Kept from the original flow: the embedding layer is called once here,
  # presumably so that it is built before `.variables[0]` is read below.
  decoderNetwork.decoder_embedding(tf.expand_dims(start_tokens, 1))

  # Inference decoder: same cell and output layer as in training, greedy sampler
  greedy_sampler = tfa.seq2seq.GreedyEmbeddingSampler()
  decoder_instance = tfa.seq2seq.BasicDecoder(cell=decoderNetwork.rnn_cell, sampler=greedy_sampler,
                                              output_layer=decoderNetwork.dense_layer)
  decoderNetwork.attention_mechanism.setup_memory(a)
  decoder_initial_state = decoderNetwork.build_decoder_initial_state(inference_batch_size,
                                                                    encoder_state=[a_tx, c_tx],
                                                                    Dtype=tf.float32)

  # Since we do not know the target sequence lengths in advance, the number of
  # decoding steps is capped. One heuristic is to decode up to two times the
  # source sentence length. A plain int is used so `range()` gets a Python value.
  maximum_iterations = 2 * int(Tx)

  # Initialize the inference decoder
  decoder_embedding_matrix = decoderNetwork.decoder_embedding.variables[0]
  (_, first_inputs, first_state) = decoder_instance.initialize(decoder_embedding_matrix,
                                                               start_tokens=start_tokens,
                                                               end_token=end_token,
                                                               initial_state=decoder_initial_state)

  inputs = first_inputs
  state = first_state
  step_ids = [np.empty((inference_batch_size, 0), dtype=np.int32)]
  # Tracks which sequences have produced <end> at any step so far
  all_done = np.zeros(inference_batch_size, dtype=bool)
  for j in range(maximum_iterations):
      outputs, state, inputs, finished = decoder_instance.step(j, inputs, state)
      step_ids.append(np.expand_dims(outputs.sample_id, axis=-1))
      all_done |= np.asarray(finished)
      if all_done.all():
          # Everything after <end> is discarded below, so stop decoding early
          break
  predictions = np.concatenate(step_ids, axis=-1)

  sentence = ""
  for line in predictions:
    # Cut at the actual <end> id (previously a hard-coded id 2, which is not
    # guaranteed to be <end> in the fitted vocabulary).
    token_ids = itertools.takewhile(lambda index: index != end_token, line)
    # Ids without a vocabulary entry (e.g. the padding id 0) are skipped
    words = [fr_tokenizer.index_word[int(w)] for w in token_ids
             if int(w) in fr_tokenizer.index_word]
    sentence = "".join(word + " " for word in words)
  return sentence

## Final Translation

In [0]:
translatedSentences = []
# Translate the first 100 marker-free English training sentences.
trainTranslate = [translateSentence(sentence) for sentence in cleanEnTrain[0:100]]

In [0]:
pip install sacrebleu

Collecting sacrebleu
[?25l  Downloading https://files.pythonhosted.org/packages/f5/58/5c6cc352ea6271125325950715cf8b59b77abe5e93cf29f6e60b491a31d9/sacrebleu-1.4.6-py3-none-any.whl (59kB)
[K     |█████▌                          | 10kB 25.8MB/s eta 0:00:01[K     |███████████                     | 20kB 3.1MB/s eta 0:00:01[K     |████████████████▋               | 30kB 4.5MB/s eta 0:00:01[K     |██████████████████████▏         | 40kB 2.9MB/s eta 0:00:01[K     |███████████████████████████▋    | 51kB 3.6MB/s eta 0:00:01[K     |████████████████████████████████| 61kB 3.2MB/s 
Collecting portalocker
  Downloading https://files.pythonhosted.org/packages/64/03/9abfb3374d67838daf24f1a388528714bec1debb1d13749f0abd7fb07cfb/portalocker-1.6.0-py2.py3-none-any.whl
Collecting mecab-python3
[?25l  Downloading https://files.pythonhosted.org/packages/18/49/b55a839a77189042960bf96490640c44816073f917d489acbc5d79fa5cc3/mecab_python3-0.996.5-cp36-cp36m-manylinux2010_x86_64.whl (17.1MB)
[K     |██

In [0]:
import sacrebleu

# One BLEU score per translated sentence, each computed against the French
# reference at the same position; the cell displays their mean.
bleu_scores = [
    sacrebleu.corpus_bleu(hypothesis, cleanFrTrain[position]).score
    for position, hypothesis in enumerate(trainTranslate)
]

np.mean(bleu_scores)

['donc , cela ne peut pas etre en energie ', 'monsieur le president , monsieur le commissaire , mesdames et messieurs , je voudrais remercier le rapporteur pour sa premiere cooperation ', 'la commission presentera un role proche de la publicite , mais je souhaite , en principe de compte , une fois de soutien publique et concrete , est un pourcentage d emissions d assistance et de lutte contre le terrorisme ', 'il est imperatif que ce rapport est raisonnable ', 'la commission de l environnement et de la sante publique et de la politique de l union europeenne est que l europe est une faible de la dette ', 'en outre , la commission a publie une proposition visant a mettre en place une politique coherente et constante d aide a l encontre des criteres de lutte utilisee et evaluer la corruption internationale a l egard de la communaute mondiale ', 'mais , monsieur le commissaire , mesdames et messieurs , je voudrais vous remercier de votre attention ', 'ils ont egalise les conditions de crai

2.6747765385227456

### The accuracy can be improved by implementing:
* Beam Search or Lexicon Search
* Bi-directional encoder-decoder model 