## Language Translation using a Sequence-to-Sequence encoder-decoder Architecture

### Grab the data populated using the Data_Provider.ipynb notebook

In [1]:
import helper
import tensorflow as tf

In [60]:
(source_int_text, target_int_text), (source_vocab_to_int, target_vocab_to_int), (source_int_to_vocab, target_int_to_vocab) = helper.load_preprocess()

### Prepare the inputs

In [3]:
batch_size = 32
max_source_seq_len = max([len(sentence) for sentence in source_int_text])
max_target_seq_len = max([len(sentence) for sentence in target_int_text])
print("source seq len: ", max_source_seq_len)
print("target seq len: ", max_target_seq_len)

source seq len:  17
target seq len:  22


In [4]:
source_seq_len_ = tf.placeholder_with_default(max_source_seq_len, None)
target_seq_len_ = tf.placeholder_with_default(max_target_seq_len, None)

In [5]:
inputs_ = tf.placeholder(tf.int32, [None, None], name="input")
targets_ = tf.placeholder(tf.int32, [None, None])

In [6]:
lr_ = tf.placeholder(tf.float32)
dropout_ = tf.placeholder(tf.float32, name="dropout")

In [7]:
print(len(source_int_text))
print(len(target_int_text))

4999
4999


### Processing target input

For some reason, unknown at the moment, we transform the target by
- removing the last word from each sentence
- placing the `<GO>` ID to the beginning of each sentence

In [8]:
def process_decoding_input(target_data, target_vocab_to_int, batch_size):
    """
    Preprocess target data for dencoding
    :param target_data: Target Placehoder
    :param target_vocab_to_int: Dictionary to go from the target words to an id
    :param batch_size: Batch Size
    :return: Preprocessed target data
    """
    begin_indice = [0, 0]
    end_indice = [batch_size, -1]
    stride = [1, 1]
    strided_slices = tf.strided_slice(target_data, begin_indice, end_indice, stride)
    print("strided_slices: ", strided_slices)    
    
    target_tensor_rank = [batch_size, 1]
    value_to_fill_with = target_vocab_to_int['<GO>']
    dummy_filled_tensor = tf.fill(target_tensor_rank, value_to_fill_with)
    
    dec_input = tf.concat( [dummy_filled_tensor, strided_slices], 1)
    print("dec_input: ",dec_input)    

    return dec_input


In [9]:
# #My own little experiment to understand the process_decoding_input method:

def test_process_decoding_input(process_decoding_input):
    ''' 
    Method is used to modify the input to the decoder.
    The input to the decoder is the target translated language
    '''

    batch_size = 5
    seq_length = 9
    target_vocab_to_int = {'<GO>': -99}
    with tf.Graph().as_default():
        target_data = tf.placeholder(tf.int32, [None, None])
        dec_input = process_decoding_input(target_data, target_vocab_to_int, batch_size)

        test_target_data = [[10, 20, 30, 99], [40, 18, 23,23],[12,14,15,33],[33,44,55,87],[11,22,33,78]]
        with tf.Session() as sess:
            test_dec_input = sess.run(dec_input, {target_data: test_target_data})
       
        print("dec_input_got: \n",test_dec_input)
        print ("done with my little experiment")

test_process_decoding_input(process_decoding_input)

strided_slices:  Tensor("StridedSlice:0", shape=(?, ?), dtype=int32)
dec_input:  Tensor("concat:0", shape=(5, ?), dtype=int32)
dec_input_got: 
 [[-99  10  20  30]
 [-99  40  18  23]
 [-99  12  14  15]
 [-99  33  44  55]
 [-99  11  22  33]]
done with my little experiment


### Apply embedding to input data

In [10]:
source_vocab_size = len(source_vocab_to_int)
print("source vocab size", source_vocab_size)

source vocab size 231


In [11]:
enc_embed_size = 27

In [12]:
# shape: (batch_size, seq_length, embedding_size)
enc_embed_input = tf.contrib.layers.embed_sequence(inputs_, source_vocab_size, enc_embed_size)

In [13]:
print("enc embed input: ", enc_embed_input.shape.as_list())

enc embed input:  [None, None, 27]


### Implement the "encoder" layer of the encoder-decoder architecture, using the embeddings from above.
This is just going to be a regular LSTM cell with some dropout

In [14]:
lstm_size = 64
num_layers = 1
lstm = tf.contrib.rnn.BasicLSTMCell(lstm_size)
drop = tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=0.5)
enc_cell = tf.contrib.rnn.MultiRNNCell([drop] * num_layers)

In [15]:
# shape of enc_state is [batch_size, cell.state_size (=64 in this case)]
_ , enc_cell_state = tf.nn.dynamic_rnn(enc_cell, enc_embed_input, dtype=tf.float32)

In [16]:
print("enc cell state: ", enc_cell_state)

enc cell state:  (LSTMStateTuple(c=<tf.Tensor 'rnn/while/Exit_2:0' shape=(?, 64) dtype=float32>, h=<tf.Tensor 'rnn/while/Exit_3:0' shape=(?, 64) dtype=float32>),)


### Get the input to the decoder by running it through the decoding_input_processor

In [17]:
dec_input = process_decoding_input(target_data=targets_, target_vocab_to_int=target_vocab_to_int, batch_size=batch_size)

strided_slices:  Tensor("StridedSlice:0", shape=(?, ?), dtype=int32)
dec_input:  Tensor("concat:0", shape=(32, ?), dtype=int32)


In [18]:
# shape: [batch_size, seq_length]
print(dec_input.shape.as_list())

[32, None]


### Embed the input to the decoding sequence (just like the input to the encoder)

In [19]:
target_vocab_size = len(target_vocab_to_int)
dec_embed_size = 30

In [20]:
print("target vocab size: ",target_vocab_size)

target vocab size:  358


In [21]:
dec_embeddings = tf.Variable(tf.random_uniform([target_vocab_size, dec_embed_size]))

In [22]:
dec_embed_input = tf.nn.embedding_lookup(dec_embeddings, dec_input)

In [23]:
print("dec embed input: ",dec_embed_input.shape.as_list())

dec embed input:  [32, None, 30]


### Decoder RNN Cell

In [24]:
lstm_size_dec = 56

In [25]:
lstm_dec = tf.contrib.rnn.BasicLSTMCell(lstm_size_dec)
drop_dec = tf.contrib.rnn.DropoutWrapper(lstm, dropout_)
dec_cell = tf.contrib.rnn.MultiRNNCell([drop_dec] * num_layers)

### Decoder training 

In [26]:
train_decoder_fn = tf.contrib.seq2seq.simple_decoder_fn_train(encoder_state=enc_cell_state)

### dynamic_rnn_decoder
This method is listed as being comparable to the 'dynamic_rnn' method that I've used several times before for the dynamic unrolling of RNNs. Doc at: https://www.tensorflow.org/versions/r1.0/api_docs/python/tf/contrib/seq2seq/dynamic_rnn_decoder. Method seems
to require a 'decoder_fn', which is provided by the invocation of the tf.contrib.seq2seq.simple_decoder_fn_train(..) method call.

The most important output at the end of this is the var: **train_logits**

In [27]:
output_fn = lambda x: tf.contrib.layers.fully_connected(x, target_vocab_size, None, scope=decoding_scope)

In [28]:
with tf.variable_scope("decoding") as decoding_scope:
    train_pred, _ , _ = tf.contrib.seq2seq.dynamic_rnn_decoder(cell=dec_cell, decoder_fn=train_decoder_fn, inputs=dec_embed_input, sequence_length=target_seq_len_, scope=decoding_scope)
    train_logits = output_fn(train_pred)

In [29]:
print(train_logits)

Tensor("decoding/Reshape_1:0", shape=(32, ?, 358), dtype=float32)


In [30]:
print(train_logits.shape.as_list())

[32, None, 358]


### Decoding inference
The most important value at the end of this setup is: **infer_logits**. The infer_logits is the node that will be used to do validation (i.e compute accuracy, form translation etc.)

In [31]:
start_sequence_id = target_vocab_to_int['<GO>']

In [32]:
end_sequence_id = target_vocab_to_int['<EOS>']

In [33]:
print (start_sequence_id, end_sequence_id)

3 1


In [34]:
infer_decoder_fn = tf.contrib.seq2seq.simple_decoder_fn_inference(output_fn, enc_cell_state, dec_embeddings, start_sequence_id, end_sequence_id, target_seq_len_, target_vocab_size)


In [35]:
print(infer_decoder_fn)

<function simple_decoder_fn_inference.<locals>.decoder_fn at 0x1211a8268>


In [36]:
with tf.variable_scope("decoding", reuse=True) as decoding_scope:
    infer_logits, _, _= tf.contrib.seq2seq.dynamic_rnn_decoder(cell=dec_cell, decoder_fn=infer_decoder_fn, inputs=None, sequence_length=target_seq_len_,scope=decoding_scope)

In [37]:
print(infer_logits.shape.as_list())

[None, None, 358]


## Specify the loss and optimizer methods

In [38]:
# just give the tensor a name.. After the model is saved and loaded back, we can then get this tensor by its name
# i.e. 'logits:0' which can be used to form translations for us. 
tf.identity(infer_logits, 'logits') 

<tf.Tensor 'logits:0' shape=(?, ?, 358) dtype=float32>

In [39]:
with tf.name_scope("optimization"):
    cost = tf.contrib.seq2seq.sequence_loss(train_logits, targets_, tf.ones([batch_size, target_seq_len_]))
    optimizer = tf.train.AdamOptimizer(lr_)
    gradients = optimizer.compute_gradients(cost)    
    capped_gradients = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in gradients if grad is not None]
    train_op = optimizer.apply_gradients(capped_gradients)

### Note
Train_logits is conditioned on the **encoder state** (which accepts input from the _source_ language), and also accepts input also from the **targets** (i.e. the _target_ language). Thus it's reasonable to assume that tf.contrib.seq2seq.sequence_loss legitimately evaluates the cost.

Also, from the documentation of tf.contrib.seq2seq.sequence_loss, the last parameter (weights) constitutes the weighting of each prediction in the sequence. I am not sure I understand this completely...

In [40]:
print("train_logits: ",train_logits.shape.as_list())
print("targets: ", targets_.shape.as_list())
print("weights: ", [batch_size, target_seq_len_]) 

train_logits:  [32, None, 358]
targets:  [None, None]
weights:  [32, <tf.Tensor 'PlaceholderWithDefault_1:0' shape=<unknown> dtype=int32>]


## Train

In [41]:
batch_data = helper.batch_data(source_int_text, target_int_text, batch_size)

In [42]:
num_batch_per_epoch = len(source_int_text) // batch_size

In [43]:
print(num_batch_per_epoch)

156


In [44]:
print(num_batch_per_epoch)

156


In [45]:
num_epochs = 2

In [46]:
saver = tf.train.Saver()

In [47]:
import time
import numpy as np
def get_accuracy(target, logits):
    """
    Calculate accuracy
    """
    max_seq = max(target.shape[1], logits.shape[1])
    if max_seq - target.shape[1]:
        target = np.pad(
            target,
            [(0,0),(0,max_seq - target.shape[1])],
            'constant')
    if max_seq - logits.shape[1]:
        logits = np.pad(
            logits,
            [(0,0),(0,max_seq - logits.shape[1]), (0,0)],
            'constant')

    return np.mean(np.equal(target, np.argmax(logits, 2)))

In [48]:
train_source = source_int_text[batch_size:]
train_target = target_int_text[batch_size:]

valid_source = helper.pad_sentence_batch(source_int_text[:batch_size])
valid_target = helper.pad_sentence_batch(target_int_text[:batch_size])

In [49]:
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    
    for epoch_i in range(num_epochs):
        batch_data = helper.batch_data(train_source, target_int_text, batch_size)
        for batch_i, (source_batch, target_batch) in enumerate(batch_data):
            if(batch_i < batch_size):
                # ------- compute loss -----------------
                _ , loss = sess.run([train_op, cost], {
                    inputs_ : source_batch,
                    targets_ : target_batch,
                    source_seq_len_ : len(source_batch[batch_i]),
                    target_seq_len_ : len(target_batch[batch_i]),
                    lr_ : 0.001,
                    dropout_ : 0.5
                })

            batch_train_logits = sess.run(
                infer_logits,
                {inputs_: source_batch, dropout_: 1.0})
            batch_valid_logits = sess.run(
                infer_logits,
                {inputs_: valid_source, dropout_: 1.0})
                
            train_acc = get_accuracy(target_batch, batch_train_logits)
            valid_acc = get_accuracy(np.array(valid_target), batch_valid_logits)
                
            # ------- print stats ------------------            
            print('Epoch {:>3} Train Acc: {:>6.3f} Valid Acc: {:>6.3f} Loss: {:>6.3f}'.format(epoch_i, train_acc, valid_acc, loss))
    
    # ------- save the model from the last epoch --------- 
    saver.save(sess,"./saved_models/models")

Epoch   0 Train Acc:  0.001 Valid Acc:  0.004 Loss:  5.900
Epoch   0 Train Acc:  0.057 Valid Acc:  0.058 Loss:  5.850
Epoch   0 Train Acc:  0.268 Valid Acc:  0.277 Loss:  5.828
Epoch   0 Train Acc:  0.318 Valid Acc:  0.300 Loss:  5.789
Epoch   0 Train Acc:  0.270 Valid Acc:  0.300 Loss:  5.731
Epoch   0 Train Acc:  0.355 Valid Acc:  0.325 Loss:  5.701
Epoch   0 Train Acc:  0.357 Valid Acc:  0.323 Loss:  5.650
Epoch   0 Train Acc:  0.365 Valid Acc:  0.323 Loss:  5.630
Epoch   0 Train Acc:  0.322 Valid Acc:  0.323 Loss:  5.622
Epoch   0 Train Acc:  0.332 Valid Acc:  0.323 Loss:  5.556
Epoch   0 Train Acc:  0.307 Valid Acc:  0.323 Loss:  5.538
Epoch   0 Train Acc:  0.332 Valid Acc:  0.323 Loss:  5.427
Epoch   0 Train Acc:  0.344 Valid Acc:  0.323 Loss:  5.386
Epoch   0 Train Acc:  0.353 Valid Acc:  0.323 Loss:  5.319
Epoch   0 Train Acc:  0.344 Valid Acc:  0.323 Loss:  5.278
Epoch   0 Train Acc:  0.338 Valid Acc:  0.323 Loss:  5.266
Epoch   0 Train Acc:  0.361 Valid Acc:  0.323 Loss:  5.1

Epoch   0 Train Acc:  0.371 Valid Acc:  0.342 Loss:  4.179
Epoch   0 Train Acc:  0.304 Valid Acc:  0.342 Loss:  4.179
Epoch   0 Train Acc:  0.394 Valid Acc:  0.342 Loss:  4.179
Epoch   0 Train Acc:  0.370 Valid Acc:  0.342 Loss:  4.179
Epoch   0 Train Acc:  0.367 Valid Acc:  0.342 Loss:  4.179
Epoch   0 Train Acc:  0.370 Valid Acc:  0.342 Loss:  4.179
Epoch   0 Train Acc:  0.398 Valid Acc:  0.342 Loss:  4.179
Epoch   0 Train Acc:  0.349 Valid Acc:  0.342 Loss:  4.179
Epoch   0 Train Acc:  0.345 Valid Acc:  0.342 Loss:  4.179
Epoch   0 Train Acc:  0.372 Valid Acc:  0.342 Loss:  4.179
Epoch   1 Train Acc:  0.352 Valid Acc:  0.351 Loss:  4.083
Epoch   1 Train Acc:  0.340 Valid Acc:  0.352 Loss:  3.802
Epoch   1 Train Acc:  0.404 Valid Acc:  0.352 Loss:  3.856
Epoch   1 Train Acc:  0.360 Valid Acc:  0.352 Loss:  3.839
Epoch   1 Train Acc:  0.376 Valid Acc:  0.352 Loss:  3.653
Epoch   1 Train Acc:  0.378 Valid Acc:  0.352 Loss:  3.734
Epoch   1 Train Acc:  0.386 Valid Acc:  0.352 Loss:  3.6

Epoch   1 Train Acc:  0.361 Valid Acc:  0.356 Loss:  3.530
Epoch   1 Train Acc:  0.387 Valid Acc:  0.356 Loss:  3.530
Epoch   1 Train Acc:  0.352 Valid Acc:  0.356 Loss:  3.530
Epoch   1 Train Acc:  0.367 Valid Acc:  0.356 Loss:  3.530
Epoch   1 Train Acc:  0.398 Valid Acc:  0.356 Loss:  3.530
Epoch   1 Train Acc:  0.389 Valid Acc:  0.356 Loss:  3.530
Epoch   1 Train Acc:  0.404 Valid Acc:  0.356 Loss:  3.530
Epoch   1 Train Acc:  0.356 Valid Acc:  0.356 Loss:  3.530
Epoch   1 Train Acc:  0.356 Valid Acc:  0.356 Loss:  3.530
Epoch   1 Train Acc:  0.356 Valid Acc:  0.356 Loss:  3.530
Epoch   1 Train Acc:  0.394 Valid Acc:  0.356 Loss:  3.530
Epoch   1 Train Acc:  0.351 Valid Acc:  0.356 Loss:  3.530
Epoch   1 Train Acc:  0.356 Valid Acc:  0.356 Loss:  3.530
Epoch   1 Train Acc:  0.372 Valid Acc:  0.356 Loss:  3.530
Epoch   1 Train Acc:  0.361 Valid Acc:  0.356 Loss:  3.530
Epoch   1 Train Acc:  0.372 Valid Acc:  0.356 Loss:  3.530
Epoch   1 Train Acc:  0.380 Valid Acc:  0.356 Loss:  3.5

## Language Translation

In [50]:
def sentence_to_seq(sentence, vocab_to_int):
    """
    Convert a sentence to a sequence of ids
    :param sentence: String
    :param vocab_to_int: Dictionary to go from the words to an id
    :return: List of word ids
    """
    # TODO: Implement Function
    ids = []
    for word in sentence.split(" "):
        word_id = -99
        
        if word in vocab_to_int:
            word_id = vocab_to_int[word]
        else:
            word_id = vocab_to_int['<UNK>']

        ids.append(word_id)
    
    return ids

In [51]:
translate_sentence = 'he saw a old yellow truck .'

In [52]:
translate_sentence = sentence_to_seq(translate_sentence, source_vocab_to_int)

In [54]:
load_path = "./saved_models/models"

In [59]:
loaded_graph = tf.Graph()
with tf.Session(graph=loaded_graph) as sess:
    # Load saved model
    loader = tf.train.import_meta_graph(load_path + '.meta')
    loader.restore(sess, load_path)

    # inputs_ and dropout_ were identified by the names: input and dropout.
    # That's why we can use the below..
    input_data = loaded_graph.get_tensor_by_name('input:0')
    keep_prob = loaded_graph.get_tensor_by_name('dropout:0')

    # tf.identity(infer_logits,"logits") establishes this relationship
    logits = loaded_graph.get_tensor_by_name('logits:0')     

    translate_logits = sess.run(logits, {input_data: [translate_sentence], keep_prob: 1.0})[0]

print('Input')
print('  Word Ids:      {}'.format([i for i in translate_sentence]))
print('  English Words: {}'.format([source_int_to_vocab[i] for i in translate_sentence]))

print('\nPrediction')
print('  Word Ids:      {}'.format([i for i in np.argmax(translate_logits, 1)]))
print('  French Words: {}'.format([target_int_to_vocab[i] for i in np.argmax(translate_logits, 1)]))

Input
  Word Ids:      [163, 99, 96, 225, 134, 50, 180]
  English Words: ['he', 'saw', 'a', 'old', 'yellow', 'truck', '.']

Prediction
  Word Ids:      [52, 52, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
  French Words: ['est', 'est', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
