## Language Translation using a Sequence-to-Sequence encoder-decoder Architecture

Grab the data populated using the Data_Provider.ipynb notebook

In [1]:
import helper
import tensorflow as tf

- **source_int_text**: Represents the source text (in English), with each word mapped to a unique int ID
- **target_int_text**: Represents the source text (in French), where each word is also mapped to a unique int ID

**source_vocab_to_int** and **target_vocab_to_int** is a map that defines what unique ID each word in the source and target corpus is mapped to. **Target_int_to_vocab** and **Source_int_to_vocab** likewise, is the complimenting map that defines the mapping of words to IDs for the source and target corpus

In [2]:
(source_int_text, target_int_text), (source_vocab_to_int, target_vocab_to_int), (source_int_to_vocab, target_int_to_vocab) = helper.load_preprocess()

## 1. Prepare the inputs

In [3]:
batch_size = 300
max_source_seq_len = max([len(sentence) for sentence in source_int_text])
max_target_seq_len = max([len(sentence) for sentence in target_int_text])
print("source seq len: ", max_source_seq_len)
print("target seq len: ", max_target_seq_len)

source_seq_len_ = tf.placeholder_with_default(max_source_seq_len, None)
target_seq_len_ = tf.placeholder_with_default(max_target_seq_len, None)

inputs_ = tf.placeholder(tf.int32, [None, None], name="input")
targets_ = tf.placeholder(tf.int32, [None, None], name="targets")

lr_ = tf.placeholder(tf.float32,name="learn_rate")
dropout_ = tf.placeholder(tf.float32, name="dropout")

source seq len:  17
target seq len:  24


### Processing target input

We transform the target by
- removing the last word from each sentence
- placing the `<GO>` ID to the beginning of each sentence

Also, note that we initially appended each target sentence with a `<EOS>` token. Removing the last word, thus, would only have the effect of getting rid of this.

In [4]:
def process_decoding_input(target_data, target_vocab_to_int, batch_size):
    """
    Preprocess target data for dencoding
    :param target_data: Target Placehoder
    :param target_vocab_to_int: Dictionary to go from the target words to an id
    :param batch_size: Batch Size
    :return: Preprocessed target data
    """
    begin_indice = [0, 0]
    end_indice = [batch_size, -1]
    stride = [1, 1]
    strided_slices = tf.strided_slice(target_data, begin_indice, end_indice, stride)
    target_tensor_rank = [batch_size, 1]
    value_to_fill_with = target_vocab_to_int['<GO>']
    dummy_filled_tensor = tf.fill(target_tensor_rank, value_to_fill_with)
    
    dec_input = tf.concat( [dummy_filled_tensor, strided_slices], 1)
    return dec_input


In [5]:
# #My own little experiment to understand the process_decoding_input method:

def test_process_decoding_input(process_decoding_input):
    ''' 
    Method is used to modify the input to the decoder.
    The input to the decoder is the target translated language
    '''

    batch_size = 5
    seq_length = 9
    target_vocab_to_int = {'<GO>': -99}
    with tf.Graph().as_default():
        target_data = tf.placeholder(tf.int32, [None, None])
        dec_input = process_decoding_input(target_data, target_vocab_to_int, batch_size)

        test_target_data = [[10, 20, 30, 99], [40, 18, 23,23],[12,14,15,33],[33,44,55,87],[11,22,33,78]]
        with tf.Session() as sess:
            test_dec_input = sess.run(dec_input, {target_data: test_target_data})
       
        print("dec_input_got: \n",test_dec_input)
        print ("done with my little experiment")

test_process_decoding_input(process_decoding_input)

dec_input_got: 
 [[-99  10  20  30]
 [-99  40  18  23]
 [-99  12  14  15]
 [-99  33  44  55]
 [-99  11  22  33]]
done with my little experiment


### Apply embedding to input data

In [6]:
source_vocab_size = len(source_vocab_to_int)
enc_embed_size = 255
enc_embed_input = tf.contrib.layers.embed_sequence(inputs_, source_vocab_size, enc_embed_size)

# shape: (batch_size, seq_length, embedding_size)
print("enc embed input: ", enc_embed_input.shape.as_list())

enc embed input:  [None, None, 255]


## 2. Implement the "encoder" layer of the encoder-decoder architecture.
This is just going to be a regular LSTM cell with some dropout. Also, we'll be using the embedding from above to feed the input to this unit

In [7]:
lstm_size = 128
num_layers = 2
lstm = tf.contrib.rnn.BasicLSTMCell(lstm_size)
drop = tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=0.5)
enc_cell = tf.contrib.rnn.MultiRNNCell([drop] * num_layers)

In [8]:
# shape of enc_state is [batch_size, cell.state_size (=64 in this case)]
_ , enc_cell_state = tf.nn.dynamic_rnn(enc_cell, enc_embed_input, dtype=tf.float32)

## 3. Implementing the "decoder" layer of the architecture.

#### a. Get the input to the decoder by running it through the decoding_input_processor.
Remember that this is simply preprocessing, that could (and should) probably have been accomplished through pure numpy functions. The usage of the _striated_stride_, _fill_, and _concat_ operators thoroughly confuse me, especially not being able to see explicitly what is happening to the inputs

In [9]:
dec_input = process_decoding_input(target_data=targets_, target_vocab_to_int=target_vocab_to_int, batch_size=batch_size)
# shape: [batch_size, seq_length]
print(dec_input.shape.as_list())

[300, None]


#### b. Embed the input to the decoding sequence (just like the input to the encoder)

In [10]:
target_vocab_size = len(target_vocab_to_int)
dec_embed_size = 225
dec_embeddings = tf.Variable(tf.random_uniform([target_vocab_size, dec_embed_size]))
dec_embed_input = tf.nn.embedding_lookup(dec_embeddings, dec_input)

#### c. Define the decoder RNN Cell and training function

In [11]:
lstm_size_dec = 128
num_layers = 2
lstm_dec = tf.contrib.rnn.BasicLSTMCell(lstm_size_dec)
drop_dec = tf.contrib.rnn.DropoutWrapper(lstm_dec, output_keep_prob=0.5)
dec_cell = tf.contrib.rnn.MultiRNNCell([drop_dec] * num_layers)

####  c. Decoder training function
This is really weird, and a symptom of tensorflow's API being too low-level (or just plain confusing). Anyways, doing what needs to be done!!

In [12]:
train_decoder_fn = tf.contrib.seq2seq.simple_decoder_fn_train(encoder_state=enc_cell_state)

#### d. Constructing the dynamic_rnn_decoder
This method is listed as being comparable to the 'dynamic_rnn' method that I've used several times before for the dynamic unrolling of RNNs. Doc at: https://www.tensorflow.org/versions/r1.0/api_docs/python/tf/contrib/seq2seq/dynamic_rnn_decoder. Method seems
to require a 'decoder_fn', which is provided by the invocation of the tf.contrib.seq2seq.simple_decoder_fn_train(..) method call.

The most important output at the end of this is the var: **train_logits**. We will be using **train_logits** to evaluate the loss and ultimately run the optimizer.

In [13]:
output_fn = lambda x: tf.contrib.layers.fully_connected(x, target_vocab_size, None, scope=decoding_scope)

with tf.variable_scope("decoding") as decoding_scope:
    train_pred, _ , _ = tf.contrib.seq2seq.dynamic_rnn_decoder(cell=dec_cell, decoder_fn=train_decoder_fn, inputs=dec_embed_input, sequence_length=target_seq_len_, scope=decoding_scope)
    train_logits = output_fn(train_pred)

#### e. Decoding inference
The most important value at the end of this setup is: **infer_logits**. The infer_logits is the node that will be used for getting model outputs (i.e get translations from the model). These outputs can further be used to evaluate validation loss and accuracy.

In [14]:
start_sequence_id = target_vocab_to_int['<GO>']
end_sequence_id = target_vocab_to_int['<EOS>']

infer_decoder_fn = tf.contrib.seq2seq.simple_decoder_fn_inference(output_fn, enc_cell_state, dec_embeddings, start_sequence_id, end_sequence_id, target_seq_len_, target_vocab_size)

with tf.variable_scope("decoding", reuse=True) as decoding_scope:
    infer_logits, _, _= tf.contrib.seq2seq.dynamic_rnn_decoder(cell=dec_cell, decoder_fn=infer_decoder_fn, inputs=None, sequence_length=target_seq_len_,scope=decoding_scope)
    

## 4. Specify the loss and optimizer methods

In [15]:
# just give the tensor a name.. After the model is saved and loaded back, we can then get this tensor by its name
# i.e. 'logits:0' which can be used to form translations for us. 
tf.identity(infer_logits, 'logits') 

<tf.Tensor 'logits:0' shape=(?, ?, 358) dtype=float32>

In [16]:
with tf.name_scope("optimization"):
    cost = tf.contrib.seq2seq.sequence_loss(train_logits, targets_, tf.ones([batch_size, target_seq_len_]))
    optimizer = tf.train.AdamOptimizer(lr_)
    gradients = optimizer.compute_gradients(cost)    
    capped_gradients = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in gradients if grad is not None]
    train_op = optimizer.apply_gradients(capped_gradients)

#### Note
Train_logits is conditioned on the **encoder state** (which accepts input from the _source_ language), and also accepts input also from the **targets** (i.e. the _target_ language). Thus it's reasonable to assume that tf.contrib.seq2seq.sequence_loss legitimately evaluates the cost.

Also, from the documentation of tf.contrib.seq2seq.sequence_loss, the last parameter (weights) constitutes the weighting of each prediction in the sequence. I am not sure I understand this completely...

In [17]:
print("train_logits: ",train_logits.shape.as_list())
print("targets: ", targets_.shape.as_list())
print("weights: ", [batch_size, target_seq_len_]) 

train_logits:  [300, None, 358]
targets:  [None, None]
weights:  [300, <tf.Tensor 'PlaceholderWithDefault_1:0' shape=<unknown> dtype=int32>]


## 5. Training

In [18]:
batch_data = helper.batch_data(source_int_text, target_int_text, batch_size)
num_batch_per_epoch = len(source_int_text) // batch_size
train_source = source_int_text[batch_size:]
train_target = target_int_text[batch_size:]
valid_source = helper.pad_sentence_batch(source_int_text[:batch_size])
valid_target = helper.pad_sentence_batch(target_int_text[:batch_size])

In [19]:
import time
import numpy as np
def get_accuracy(target, logits):
    """
    Calculate accuracy
    """
    max_seq = max(target.shape[1], logits.shape[1])
    if max_seq - target.shape[1]:
        target = np.pad(
            target,
            [(0,0),(0,max_seq - target.shape[1])],
            'constant')
    if max_seq - logits.shape[1]:
        logits = np.pad(
            logits,
            [(0,0),(0,max_seq - logits.shape[1]), (0,0)],
            'constant')

    return np.mean(np.equal(target, np.argmax(logits, 2)))

In [20]:
saver = tf.train.Saver()

sess = tf.Session()
sess.run(tf.global_variables_initializer())

In [25]:
start_epochs = 0
end_epochs = 10

In [26]:
for epoch_i in range(start_epochs, end_epochs):
    batch_data = helper.batch_data(train_source, train_target, batch_size)
    cntr = 0
    for batch_i, (source_batch, target_batch) in enumerate(batch_data):
        if(batch_i < batch_size):
            # ------- compute loss -----------------
            _ , loss = sess.run([train_op, cost], {
                inputs_ : source_batch,
                targets_ : target_batch,
                source_seq_len_ : source_batch.shape[1],
                target_seq_len_ : target_batch.shape[1],
                lr_ : 0.005,
                dropout_ : 0.5
            })

        batch_train_logits = sess.run(
            infer_logits,
            {inputs_: source_batch, dropout_: 1.0})
        batch_valid_logits = sess.run(
            infer_logits,
            {inputs_: valid_source, dropout_: 1.0})

        train_acc = get_accuracy(target_batch, batch_train_logits)
        valid_acc = get_accuracy(np.array(valid_target), batch_valid_logits)

        # ------- print stats ------------------ 
        cntr+=1
        if(cntr%100==0):                
            print('Epoch {:>3} Train Acc: {:>6.3f} Valid Acc: {:>6.3f} Loss: {:>6.3f}'.format(epoch_i, train_acc, valid_acc, loss))

Epoch   0 Train Acc:  0.518 Valid Acc:  0.517 Loss:  1.458
Epoch   0 Train Acc:  0.566 Valid Acc:  0.503 Loss:  0.901
Epoch   0 Train Acc:  0.537 Valid Acc:  0.594 Loss:  0.747
Epoch   0 Train Acc:  0.544 Valid Acc:  0.545 Loss:  0.747
Epoch   1 Train Acc:  0.579 Valid Acc:  0.623 Loss:  0.668
Epoch   1 Train Acc:  0.586 Valid Acc:  0.588 Loss:  0.626
Epoch   1 Train Acc:  0.636 Valid Acc:  0.632 Loss:  0.551
Epoch   1 Train Acc:  0.632 Valid Acc:  0.627 Loss:  0.551
Epoch   2 Train Acc:  0.647 Valid Acc:  0.660 Loss:  0.492
Epoch   2 Train Acc:  0.678 Valid Acc:  0.671 Loss:  0.428
Epoch   2 Train Acc:  0.709 Valid Acc:  0.741 Loss:  0.400
Epoch   2 Train Acc:  0.714 Valid Acc:  0.687 Loss:  0.400
Epoch   3 Train Acc:  0.730 Valid Acc:  0.738 Loss:  0.357
Epoch   3 Train Acc:  0.768 Valid Acc:  0.765 Loss:  0.303
Epoch   3 Train Acc:  0.788 Valid Acc:  0.777 Loss:  0.281
Epoch   3 Train Acc:  0.788 Valid Acc:  0.762 Loss:  0.281
Epoch   4 Train Acc:  0.780 Valid Acc:  0.806 Loss:  0.2

In [27]:
# ------- save the model from the last epoch --------- 
saver.save(sess,"./saved_models/models")

'./saved_models/models'

## 6. Inference and Language Translation

Now that you have the model all set-up, have a little fun with the translation. Of course, you should consider training the model for much longer, or use a more exhaustive data corpus. Think there is a bigger one (English->French) which is many millions of lines long (ours is ~130,000 sentences)

In [28]:
import numpy as np

In [29]:
def sentence_to_seq(sentence, vocab_to_int):
    """
    Convert a sentence to a sequence of ids
    :param sentence: String
    :param vocab_to_int: Dictionary to go from the words to an id
    :return: List of word ids
    """
    # TODO: Implement Function
    ids = []
    for word in sentence.split(" "):
        word_id = -99
        
        if word in vocab_to_int:
            word_id = vocab_to_int[word]
        else:
            word_id = vocab_to_int['<UNK>']

        ids.append(word_id)
    
    return ids

In [38]:
def in_french(english_sentence):

    translate_sentence = sentence_to_seq(english_sentence, source_vocab_to_int)
    load_path = "./saved_models/models"

    loaded_graph = tf.Graph()
    with tf.Session(graph=loaded_graph) as sess:
        # Load saved model
        loader = tf.train.import_meta_graph(load_path + '.meta')
        loader.restore(sess, load_path)

        # inputs_ and dropout_ were identified by the names: input and dropout.
        # That's why we can use the below..
        input_data = loaded_graph.get_tensor_by_name('input:0')
        keep_prob = loaded_graph.get_tensor_by_name('dropout:0')

        # tf.identity(infer_logits,"logits") establishes this relationship
        logits = loaded_graph.get_tensor_by_name('logits:0')     

        translate_logits = sess.run(logits, {input_data: [translate_sentence], keep_prob: 1.0})[0]
        
        translate_text = 'In French: {}'.format([target_int_to_vocab[i] for i in np.argmax(translate_logits, 1)])
        
        return translate_text

In [39]:
in_french("I like yellow apples")

'In French: ["j\'aime", "l\'", \'aimée\', \'.\', \'<EOS>\']'

In [40]:
in_french("where is your man?")

"In French: ['nos', 'est', 'moins', 'été', '.', '<EOS>']"

In [42]:
in_french("I love yellow trucks and cars.")

"In French: ['comment', ',', 'de', 'mangues', 'et', 'les', 'à', 'sec', '.', '<EOS>']"