# Translation by RNN with Attention

This note book shows a sequence to sequence (seq2seq) implementaion using tensorflow and keras. Basically I followed [this notebook](https://colab.research.google.com/github/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb#scrollTo=CiwtNgENbx2g).

In [1]:
import numpy as np
import tensorflow as tf
import keras

Using TensorFlow backend.


## Prepare data

### Get data

In [2]:
import sys
import os
import urllib
import zipfile
import unicodedata
import re

In [3]:
url = 'http://download.tensorflow.org/data/spa-eng.zip'

In [4]:
def fetch_data(filename):
    data_dir = os.path.join(os.getcwd(), 'data')
    if not os.path.exists(data_dir):
        os.mkdir(data_dir)
    filepath = os.path.join(data_dir, filename)
    if not os.path.exists(filepath):
        filepath, _ = urllib.request.urlretrieve(url, filepath)
    return filepath

In [5]:
filepath = fetch_data('spa-eng.zip')

### Preprocess

In [6]:
def unicode_to_ascii(s):
    '''Convert the unicode file to ascii'''
    return ''.join(c for c in unicodedata.normalize('NFD', s)
                  if unicodedata.category(c) != 'Mn')

In [7]:
def preprocess_sentence(w):
    '''Preprocess sentence.
    Replace everything with space except for some specific punctuations.
    '''
    w = unicode_to_ascii(w.lower().strip())
    
    # create a space between a word and the punctuation
    w = re.sub(r'([?.!,¿])', r' \1 ', w)
    w = re.sub(r'[" "]+', " ", w)
    w = re.sub(r'[^a-zA-Z?.!,¿]+', ' ', w)
    w = w.rstrip().strip()
    
    # adding a start and and end token to the setence
    w = '<SOS> '+w+' <EOS>'
    return w

In [8]:
def read_data(filepath, num_samples):
    with zipfile.ZipFile(filepath) as f:
        lines = tf.compat.as_str(f.read(f.namelist()[-1])).split('\n')
    word_pairs = [[preprocess_sentence(w) 
                   for w in l.split('\t')] for l in lines[:num_samples]]
    return word_pairs

In [9]:
data = read_data(filepath, 30000)

In [10]:
data[:10]

[['<SOS> go . <EOS>', '<SOS> ve . <EOS>'],
 ['<SOS> go . <EOS>', '<SOS> vete . <EOS>'],
 ['<SOS> go . <EOS>', '<SOS> vaya . <EOS>'],
 ['<SOS> go . <EOS>', '<SOS> vayase . <EOS>'],
 ['<SOS> hi . <EOS>', '<SOS> hola . <EOS>'],
 ['<SOS> run ! <EOS>', '<SOS> corre ! <EOS>'],
 ['<SOS> run . <EOS>', '<SOS> corred . <EOS>'],
 ['<SOS> who ? <EOS>', '<SOS> ¿ quien ? <EOS>'],
 ['<SOS> fire ! <EOS>', '<SOS> fuego ! <EOS>'],
 ['<SOS> fire ! <EOS>', '<SOS> incendio ! <EOS>']]

In [11]:
data[-10:]

[['<SOS> we will have much fun . <EOS>',
  '<SOS> vamos a divertirnos mucho . <EOS>'],
 ['<SOS> we will not surrender . <EOS>', '<SOS> no nos rendiremos . <EOS>'],
 ['<SOS> we wish you well , tom . <EOS>',
  '<SOS> te deseamos lo mejor , tom . <EOS>'],
 ['<SOS> we won t let you down . <EOS>',
  '<SOS> no te decepcionaremos . <EOS>'],
 ['<SOS> we work to earn money . <EOS>',
  '<SOS> trabajamos para ganar dinero . <EOS>'],
 ['<SOS> we ll go after we eat . <EOS>',
  '<SOS> iremos despues de comer . <EOS>'],
 ['<SOS> we ll live like kings . <EOS>', '<SOS> viviremos como reyes . <EOS>'],
 ['<SOS> we ll lose everything . <EOS>', '<SOS> lo perderemos todo . <EOS>'],
 ['<SOS> we ll meet right here . <EOS>',
  '<SOS> nos encontraremos aqui mismo . <EOS>'],
 ['<SOS> we ll see you at . <EOS>',
  '<SOS> te veremos a las dos y media . <EOS>']]

### Build dictionaries

Since we need look up tables for both spanish and english, we gonna build a class.

In [12]:
class WordIndex(object):
    def __init__(self, sentence_iter):
        '''
        sentence_iter: an iterable object that gives an sentence at each iteration
        '''
        self.sentence_iter = sentence_iter
        # do not include 'UNK' for know
        # vocab = ['<pad>', '<UNK>']
        self.vocab = ['<pad>']
        self.build_dict()
        
    def build_dict(self):
        seen = set()
        for setence in self.sentence_iter:
            seen.update(setence.split())
        self.vocab.extend(sorted(seen))
        
        self.word2ind = {word: i for i, word in enumerate(self.vocab)}
        self.ind2word = {i: word for i, word in enumerate(self.vocab)}
        
    def __len__(self):
        return len(self.word2ind)

In [13]:
input_ind = WordIndex(en for en, sp in data)
target_ind = WordIndex(sp for en, sp in data)

### Convert words to indexes

In [14]:
input_tensor = [[input_ind.word2ind[word] for word in en.split()] for en, sp in data]
target_tensor = [[target_ind.word2ind[word] for word in sp.split()] for en, sp in data]

In [15]:
max_len_input = max([len(item) for item in input_tensor])
max_len_target = max([len(item) for item in target_tensor])

In [16]:
max_len_input

11

In [17]:
max_len_target

16

### Pad the tensor to have the same length

In [18]:
input_tensor = keras.preprocessing.sequence.pad_sequences(input_tensor,
                                                         maxlen=max_len_input,
                                                         padding='post')
target_tensor = keras.preprocessing.sequence.pad_sequences(target_tensor,
                                                          maxlen=max_len_target,
                                                          padding='post')

### Create mini-batch

Note that this won't work under eager mode (tensorflow 1.5 version).

In [19]:
def get_batch(train, target, batch_size=32, epochs=1, shuffle=True):
    dataset = tf.data.Dataset.from_tensor_slices((train, target))
    if shuffle:
        dataset = dataset.shuffle(len(train))
    dataset = dataset.batch(batch_size).repeat(epochs)
    train_batch, target_batch = dataset.make_one_shot_iterator().get_next()
    return train_batch, target_batch

## Build Model

We gonna build encoder and decoder based on keras.Model class. The RNN we gonna use is GRU.

In [20]:
class Encoder(keras.Model):
    def __init__(self, vocab_size, embedding_dim, hidden_size):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = keras.layers.GRU(self.hidden_size, return_sequences=True,
                                   return_state=True, recurrent_activation='sigmoid',
                                   recurrent_initializer='glorot_uniform')
    
    def __call__(self, x, hidden_state):
        x = self.embedding(x)
        output, state = self.gru(x, initial_state=hidden_state)
        return output, state
    
    def initialize_hidden_state(self, batch_size):
        return tf.zeros((batch_size, self.hidden_size))

Implemented attention model when decode. I am following Bahdanau's additive style, where the score is the product of a weighting matrix and the activation of encoder output and hidden states. Summing up encoder output weighted by the attention score, we obtain the context vector. And by concating this context vector with embedding output for decoder, we send the merged vector to GRU.

In [21]:
class Decoder(keras.Model):
    def __init__(self, vocab_size, embedding_dim, hidden_size):
        super(Decoder, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = keras.layers.GRU(self.hidden_size, return_sequences=True,
                                   return_state=True, recurrent_activation='sigmoid',
                                   recurrent_initializer='glorot_uniform')
        self.fc = keras.layers.Dense(vocab_size)
        
        # attention
        self.W1 = keras.layers.Dense(self.hidden_size)
        self.W2 = keras.layers.Dense(self.hidden_size)
        self.V = keras.layers.Dense(1)
        
    def __call__(self, x, hidden_state, encoder_output):
        '''encoder_output shape == (batch_size, max_length, hidden_size)
        hidden_state shape == (batch_size, hidden_size)
        hidden_with_time_axis shape = (batch_size, 1, hidden_size)
        perform addition to calculate the attention score.
        '''
        hidden_with_time_axis = tf.expand_dims(hidden_state, 1)
        # score shape == (batch_size, max_length, 1)
        score = self.V(tf.nn.tanh(self.W1(encoder_output)+self.W2(hidden_with_time_axis)))
        
        # attention_weights shape == (batch_size, max_length, 1)
        attention_weights = tf.nn.softmax(score, axis=1)
        
        # context_vector shape == (batch_size, hidden_size)
        context_vector = tf.reduce_sum(attention_weights*encoder_output, axis=1)
        
        # x shape == (batch_size, 1, embedding_dim)
        x = self.embedding(x)
        # concat input embedding with summed attention
        x = tf.concat([tf.expand_dims(context_vector, axis=1), x], axis = -1)
        
        output, state = self.gru(x)#, initial_state=hidden_state)
        output = tf.reshape(output, (-1, output.shape[-1]))
        
        x = self.fc(output)
        return x, state, attention_weights
    
    def inilialize_hidden_state(self, batch_size):
        return tf.zeros((batch_size, self.hidden_size))

In [22]:
def loss_function(real, logits):
    '''mask padded words'''
    mask = 1-np.equal(real, 0)
    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=real, logits=logits)*mask
    return tf.reduce_mean(loss)

In [23]:
embedding_dim = 256
hidden_size = 512
batch_size = 64

### Build graph

In [None]:
target_ind.word2ind['<SOS>']

5

In [25]:
tf.reset_default_graph()

inputs = tf.placeholder(tf.int32, shape=[None, max_len_input])
labels = tf.placeholder(tf.int32, shape=[None, max_len_target])

encoder = Encoder(len(input_ind), embedding_dim, hidden_size)
decoder = Decoder(len(target_ind), embedding_dim, hidden_size)

hidden_state = encoder.initialize_hidden_state(tf.shape(inputs)[0])
enc_output, enc_hidden = encoder(inputs, hidden_state)
dec_hidden = enc_hidden

dec_input = tf.fill((tf.shape(inputs)[0], 1), target_ind.word2ind['<SOS>'])
loss = tf.get_variable('marginalLoss', [], dtype=tf.float32, 
                       initializer=tf.constant_initializer(0), trainable=False)

# training loss
for t in range(1, max_len_target):
    logits, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
    loss = loss+loss_function(labels[:, t], logits)
    dec_input = tf.expand_dims(labels[:, t], 1)
    
# prediction
dec_input = tf.fill((tf.shape(inputs)[0], 1), target_ind.word2ind['<SOS>'])
dec_hidden = enc_hidden
# results = tf.zeros((tf.shape(inputs)[0], max_len_target), tf.int32)
out_list = []
for t in range(max_len_target):
    predictions, dec_hidden, attention_weights = decoder(dec_input, dec_hidden, enc_output)
    predicted_id = tf.argmax(predictions, axis=1)
    out_list.append(predicted_id)
    dec_input = tf.expand_dims(predicted_id, 1)
predicts = tf.transpose(tf.stack(out_list))

In [26]:
# x_batch, y_batch = get_batch(input_tensor, target_tensor, batch_size=batch_size, epochs=1)
# with tf.Session() as sess:
#     sess.run(tf.global_variables_initializer())
#     x, y = sess.run([x_batch, y_batch])
#     feed_dict = {inputs: x, labels: y}
#     temp = sess.run(predicts, feed_dict)

## Training

In [27]:
epochs = 10
max_iter = 500000

optimizer = tf.train.AdamOptimizer().minimize(loss)

x_batch, y_batch = get_batch(input_tensor, target_tensor, batch_size=batch_size, epochs=epochs)

saver = tf.train.Saver()

In [28]:
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    n_iter = 0
    try:
        while n_iter < max_iter:
            x, y = sess.run([x_batch, y_batch])
            feed_dict = {inputs: x, labels: y}
            _, _loss = sess.run([optimizer, loss], feed_dict)
            if n_iter %100 == 0:
                print('Iteration %d, loss: %f' %(n_iter, _loss))
            n_iter += 1
    except tf.errors.OutOfRangeError:
        print('--- Read all epochs ---')
    print('--- Training Ends ---')
    saver.save(sess, 'checkpoints/attention.ckpt')

Iteration 0, loss: 137.240677
Iteration 100, loss: 29.680830
Iteration 200, loss: 26.125179
Iteration 300, loss: 26.297970
Iteration 400, loss: 23.713600
Iteration 500, loss: 23.031403
Iteration 600, loss: 24.013575
Iteration 700, loss: 23.377451
Iteration 800, loss: 21.455414
Iteration 900, loss: 20.955591
Iteration 1000, loss: 20.509119
Iteration 1100, loss: 19.184927
Iteration 1200, loss: 19.717228
Iteration 1300, loss: 18.034071
Iteration 1400, loss: 18.910034
Iteration 1500, loss: 15.334837
Iteration 1600, loss: 18.521866
Iteration 1700, loss: 18.076632
Iteration 1800, loss: 17.313713
Iteration 1900, loss: 14.627246
Iteration 2000, loss: 16.352417
Iteration 2100, loss: 14.876765
Iteration 2200, loss: 13.496007
Iteration 2300, loss: 13.763338
Iteration 2400, loss: 13.385315
Iteration 2500, loss: 11.982315
Iteration 2600, loss: 13.110905
Iteration 2700, loss: 14.675159
Iteration 2800, loss: 12.254469
Iteration 2900, loss: 11.638268
Iteration 3000, loss: 9.245835
Iteration 3100, loss

## Prediction

In [29]:
X_batch, _ = get_batch(input_tensor, target_tensor, batch_size=batch_size, shuffle=False)
with tf.Session() as sess:
    saver.restore(sess, tf.train.latest_checkpoint('checkpoints'))
    x = sess.run(X_batch)
    feed_dict = {inputs: x}
    out = sess.run(predicts, feed_dict)

INFO:tensorflow:Restoring parameters from checkpoints/attention.ckpt


In [30]:
out

array([[9090,    3,    4, ...,    0,    0,    0],
       [9090,    3,    4, ...,    0,    0,    0],
       [9090,    3,    4, ...,    0,    0,    0],
       ..., 
       [6211, 5498, 9071, ...,    0,    0,    0],
       [6211, 5498, 9071, ...,    0,    0,    0],
       [6211, 5498, 9071, ...,    0,    0,    0]])

In [32]:
[[' '.join(target_ind.ind2word[item] for item in temp if item != 0) for temp in out]]

[['ve . <EOS>',
  've . <EOS>',
  've . <EOS>',
  've . <EOS>',
  'hola . <EOS>',
  'corre ! <EOS>',
  'corre . <EOS>',
  '¿ quien es ? <EOS>',
  'fuego ! <EOS>',
  'fuego ! <EOS>',
  'fuego ! <EOS>',
  'ayuda ! <EOS>',
  'ayuda ! <EOS>',
  'ayuda ! <EOS>',
  'coge ! <EOS>',
  'coge . <EOS>',
  'dejen de nuevo . <EOS>',
  'dejen de nuevo . <EOS>',
  'dejen de nuevo . <EOS>',
  'espera ! <EOS>',
  'espera . <EOS>',
  'entra . <EOS>',
  'entra . <EOS>',
  'hola ! <EOS>',
  'me quede . <EOS>',
  'me quede . <EOS>',
  'yo lo entiendo . <EOS>',
  'yo mismo ! <EOS>',
  'oh , no ! <EOS>',
  'tomatelo con soda . <EOS>',
  'sonrie . <EOS>',
  'se quedaron ! <EOS>',
  'se quedaron ! <EOS>',
  'sal de otro lado . <EOS>',
  'vete ahora mismo . <EOS>',
  'simplifica ! <EOS>',
  '¿ entendiste ? <EOS>',
  '¿ entendiste ? <EOS>',
  'el se cayo . <EOS>',
  'subete a tiempo . <EOS>',
  'abrazame . <EOS>',
  'me quede . <EOS>',
  'se que se que se . <EOS>',
  'me dejaron . <EOS>',
  'me acuerdo de menos 

In [34]:
[[' '.join(target_ind.ind2word[item] for item in temp if item != 0) 
                      for temp in target_tensor[:batch_size]]]

[['<SOS> ve . <EOS>',
  '<SOS> vete . <EOS>',
  '<SOS> vaya . <EOS>',
  '<SOS> vayase . <EOS>',
  '<SOS> hola . <EOS>',
  '<SOS> corre ! <EOS>',
  '<SOS> corred . <EOS>',
  '<SOS> ¿ quien ? <EOS>',
  '<SOS> fuego ! <EOS>',
  '<SOS> incendio ! <EOS>',
  '<SOS> disparad ! <EOS>',
  '<SOS> ayuda ! <EOS>',
  '<SOS> socorro ! auxilio ! <EOS>',
  '<SOS> auxilio ! <EOS>',
  '<SOS> salta ! <EOS>',
  '<SOS> salte . <EOS>',
  '<SOS> parad ! <EOS>',
  '<SOS> para ! <EOS>',
  '<SOS> pare ! <EOS>',
  '<SOS> espera ! <EOS>',
  '<SOS> esperen . <EOS>',
  '<SOS> continua . <EOS>',
  '<SOS> continue . <EOS>',
  '<SOS> hola . <EOS>',
  '<SOS> corri . <EOS>',
  '<SOS> corria . <EOS>',
  '<SOS> lo intento . <EOS>',
  '<SOS> he ganado ! <EOS>',
  '<SOS> oh , no ! <EOS>',
  '<SOS> tomatelo con soda . <EOS>',
  '<SOS> sonrie . <EOS>',
  '<SOS> al ataque ! <EOS>',
  '<SOS> atacad ! <EOS>',
  '<SOS> levanta . <EOS>',
  '<SOS> ve ahora mismo . <EOS>',
  '<SOS> lo tengo ! <EOS>',
  '<SOS> ¿ lo pillas ? <EOS>',
 

## Attention Weights

In [35]:
X_batch, _ = get_batch(input_tensor, target_tensor, batch_size=batch_size, shuffle=False)
with tf.Session() as sess:
    saver.restore(sess, tf.train.latest_checkpoint('checkpoints'))
    x = sess.run(X_batch)
    feed_dict = {inputs: x}
    temp = sess.run(attention_weights, feed_dict)

INFO:tensorflow:Restoring parameters from checkpoints/attention.ckpt


Attention of input words (11 words) for the last word

In [41]:
temp[0]

array([[ 0.71082669],
       [ 0.0059496 ],
       [ 0.2104039 ],
       [ 0.04182935],
       [ 0.01627783],
       [ 0.0026149 ],
       [ 0.00209989],
       [ 0.00236544],
       [ 0.00249651],
       [ 0.00255246],
       [ 0.00258339]], dtype=float32)