## Data Processing

We begin by loading data from disk & preprocessing it, to provide input to the encoding layer.

In [0]:
import os;
# util function to load a file
def load_data(path):
    input_file = os.path.join(path)
    with open(input_file, 'r', encoding='utf-8') as f:
        return f.read()
      


In [13]:
from google.colab import files

uploaded = files.upload() 

Saving small_vocab_fr to small_vocab_fr
Saving small_vocab_en to small_vocab_en


In [0]:
#text sentences, one per line. English in 'small_vocab_en', French in 'small_vocab_fr'
source_path = 'small_vocab_en'
source_text = load_data(source_path)

target_path = 'small_vocab_fr'
target_text = load_data(target_path)

Printing some stats about the dataset, and splitting data for training/testing.


In [42]:
import numpy as np

training_source = list();
validation_source=list();
    
training_target = list();
validation_target = list();
    
def print_dataset_stats():
    print('Dataset Stats')
    sentences = source_text.split('\n')
    target_sentences = target_text.split('\n')
    
   
    # split data set - 85.8% for training, 14.2% for testing
    for i in range(len(sentences)):
        if(i %  7 == 0): 
            validation_target.append(target_sentences[i])
            validation_source.append(sentences[i])
        else:
            training_target.append(target_sentences[i])
            training_source.append(sentences[i])

        
    word_count_arr = [len(s.split()) for s in training_source]
    
    print('Unique words: {}'.format(len(np.unique(np.array(source_text.split())))))
    
    print('Number of sentences: {}'.format(len(training_source)))

    print('Average words / sentence: {}'.format(np.average(word_count_arr)))
    
print_dataset_stats()


Dataset Stats
Unique words: 227
Number of sentences: 118166
Average words / sentence: 13.22173044699829


#### Data preprocessing
The following operations are performed on the dataset to make it ready for the encoding layer:


*   Create vocabulary tables (look up dictionaries for word -> id & id -> word). 
*   Preprocess data for the model: convert source & target sentences to a list of integer representations of the sentence's words.



In [0]:

# Create & return vocabulary dicts, mapping int id to word & word to int id.

import copy
import pickle

# special symbols indicating padding, end of sentence, unknown word & start of sentence, respectively

SYMBOLS = {'<PAD>': 0, '<EOS>': 1, '<UNK>': 2, '<GO>': 3 }

def generate_vocabulary_mappings(text):
    word_to_int = copy.copy(SYMBOLS)
    
    vocab = set(text.split())

    for integer, word in enumerate(vocab, len(SYMBOLS)):
        word_to_int[word] = integer

    int_to_word = {index: word for word, index in word_to_int.items()}
    

    return word_to_int, int_to_word

In [0]:
"""
Convert sentences to list of word ids
:param text_sentences: Array of all text sentences.
:param vocab_map: Dictionary from the words to an id
:return: A list of processed sentences_to_ids. Append <UNK> if unknown word is encountered.
"""
def sentences_to_ids(text_sentences, vocab_map):
    source_text_ids = list()
   
    source_text_sentences = text_sentences.split('\n')
    
    for sentence in source_text_sentences:
        sentence_ids = list()
        for word in sentence.split(' '):
            try: 
                sentence_ids.append(vocab_map[word])
            except:
                sentence_ids.append(vocab_map['<UNK>'])
        source_text_ids.append(sentence_ids)

    return source_text_ids
    
    return None

In [0]:
# Preprocess Text Data. This will form the input to the encoding layer. 

source_text_lc = ('\n').join(training_source).lower();
target_text_lc = ('\n').join(training_target).lower();

source_mapping, reverse_source_mapping = generate_vocabulary_mappings(source_text_lc)
target_mapping, reverse_target_mapping = generate_vocabulary_mappings(target_text_lc)

source_text_p = sentences_to_ids(source_text_lc, source_mapping)
target_text_p = sentences_to_ids(target_text_lc, target_mapping)


In [0]:
import tensorflow as tf
from tensorflow.python.layers.core import Dense


## Building the Sequence to Sequence Model

The encoding layer of the Sequence to Sequence model.
RNN size, number of layers, keep_probability for the dropout layer, source sequence lengths, vocabilary size, embedding size of the source data are the parameters that can be used to build/modify this layer.

In [0]:
"""
:return: tuple (RNN output, RNN state)
"""
def encoding_layer(rnn_inputs, nn_size, layer_count, dropout_kp, source_length,
                   source_vocab_size, encoding_embedding_size):
     
    encoder_embedding = tf.contrib.layers.embed_sequence(rnn_inputs,source_vocab_size,encoding_embedding_size)
    
    multi_rnn_cell = tf.contrib.rnn.MultiRNNCell([tf.contrib.rnn.DropoutWrapper(tf.contrib.rnn.LSTMCell(nn_size), dropout_kp) for _ in range(layer_count) ])
    
    output, state = tf.nn.dynamic_rnn(multi_rnn_cell, encoder_embedding, sequence_length=source_length, dtype=tf.float32)
    
    return (output,state)

The decoding layer of the model. It is composed of two layers & we need different kinds of input data for training/inference phases: 
 - Training layer: Takes in inputs from the encoder's state & uses it for training. In the training phase, the input is provided as embedded target labels. 
 - Inference layer:  Takes in output from previous time step as input, makes predictions. The inputs again need to be embeded and the embedding vector should be shared between two different phases.
 
 


In [0]:
"""
:return: Tuple (Training Output, Inference Output) - (BasicDecoderOutput)
"""
def decoding_layer(decoder_input, encoder_state,
                   target_length, max_target_length,
                   rnn_size,
                   num_layers, target_vocab_to_int, target_vocab_size,
                   batch_size, keep_prob, embedding_size):
   
    dec_embeddings=tf.Variable(tf.random_uniform([target_vocab_size, embedding_size]))
    dec_embed_input=tf.nn.embedding_lookup(dec_embeddings,decoder_input)
    
    decoder_cell=tf.contrib.rnn.MultiRNNCell([tf.contrib.rnn.DropoutWrapper(tf.contrib.rnn.LSTMCell(rnn_size),keep_prob) for _ in range(num_layers) ])
    
    output_layer=Dense(target_vocab_size,kernel_initializer=tf.truncated_normal_initializer(mean=0.0,stddev=0.1))
    
    # training layer
    with tf.variable_scope('decode'):
        train_help=tf.contrib.seq2seq.TrainingHelper(inputs=dec_embed_input,sequence_length=target_length)
        basic_dec=tf.contrib.seq2seq.BasicDecoder(decoder_cell,train_help,encoder_state,output_layer)
        training_decoder_output=tf.contrib.seq2seq.dynamic_decode(basic_dec,maximum_iterations=max_target_length)[0]
    
    # inference layer
    with tf.variable_scope('decode',reuse=True):
        start_sentence_id = target_vocab_to_int['<GO>']
        end_sentence_id = target_vocab_to_int['<EOS>']
        start_tokens=tf.tile(tf.constant([start_sentence_id],dtype=tf.int32),[batch_size])
        helper=tf.contrib.seq2seq.GreedyEmbeddingHelper(dec_embeddings,start_tokens,end_sentence_id)
        infer_decoder=tf.contrib.seq2seq.BasicDecoder(decoder_cell,helper,encoder_state,output_layer)
        inference_decoder_output=tf.contrib.seq2seq.dynamic_decode(infer_decoder,maximum_iterations=max_target_length)[0]
    
    
    return (training_decoder_output,inference_decoder_output)

Integrate the above two encoder-decoder layers to build the Sequence to Sequence model.

In [0]:
"""
:return: Tuple (Training Output, Inference Output) - (BasicDecoderOutput)
"""
def sequence_model(input_data, target_data, dropout_keep_prob, batch_size,
                  source_length, target_length,
                  max_target_sentence_length,
                  source_vocab_size, target_vocab_size,
                  enc_embed_size, dec_embed_size,
                  rnn_size, layer_count, target_mapping):
    # prepare input to decoder
    target_slice = tf.strided_slice(target_data, [0, 0], [batch_size, -1], [1, 1])
    dec_input = tf.concat([tf.fill(
        [batch_size, 1], target_mapping['<GO>']), target_slice], 1)
  
    # get encoder state
    _unused ,encoder_state = encoding_layer(input_data, rnn_size, layer_count, dropout_keep_prob, 
                   source_length, source_vocab_size, 
                   enc_embed_size)
    
    # get decoder output
    (training_decoder_output,inference_decoder_output)=decoding_layer(dec_input, encoder_state,
                   target_length, max_target_sentence_length,
                   rnn_size, layer_count, target_mapping, target_vocab_size,
                   batch_size, dropout_keep_prob, dec_embed_size)
    
    return (training_decoder_output,inference_decoder_output)

#### Configure Hyperparameters

Define number of layers, batch size, RNN size, embedding size for encoder/decoder, keep probability for the dropout layer, learning rate.


In [0]:

epochs = 3
batch_size = 256

rnn_size = 196
# Number of Layers
num_layers = 1

encoding_embedding_size = 196
decoding_embedding_size = 196

# Dropout Keep Probability
keep_probability = 0.75
learning_rate = 0.002
display_step = 50

#### Process & Batch Data
 We feed the data to the network in batches, while processing each batch such that all sentences have the same length. This is achieved by appending PAD keyword Integer ID to all sentences in the batch, ensuring that all sentence sequences (or int array representations of sentences) are of the same length as the longest sentence in the batch.


In [0]:
def add_batch_padding(sentence_batch, pad_id):
    max_sentence = max([len(sentence) for sentence in sentence_batch])
    return [sentence + [pad_id] * (max_sentence - len(sentence)) for sentence in sentence_batch]
   
def process_curr_batch(batch, pad_id):
    padded_batch = np.array(add_batch_padding(batch, pad_id))
    return padded_batch, [len(sentence) for sentence in padded_batch]

def get_batches(sources, targets, batch_size, source_pad_id, target_pad_id):
    for batch_i in range(0, len(sources)//batch_size):
        start_i = batch_i * batch_size

        sources_batch = sources[start_i:start_i + batch_size]
        targets_batch = targets[start_i:start_i + batch_size]

        pad_sources_batch, pad_source_lengths = process_curr_batch(sources_batch, source_pad_id)
        pad_targets_batch, pad_targets_lengths = process_curr_batch(targets_batch, target_pad_id)

        yield pad_sources_batch, pad_targets_batch, pad_source_lengths, pad_targets_lengths

#### Initialize Placeholders and Build the Graph
This is where we initialize the graph & define the optimization to be used for training.
First, we create TF placeholders/ initialize the required variables. Then, we take the output of the sequence to sequence into tensors for training & inference respectively. We use the AdamOptimizer from tf.train to feed back the gradients to the model.


In [0]:
save_path = '/tmp/model.ckpt'
train_graph = tf.Graph()
with train_graph.as_default():
  
    input_data = tf.placeholder(tf.int32,[None,None],name='input')
    targets = tf.placeholder(tf.int32,[None,None],name='target')
    
    lr = tf.placeholder(tf.float32,name='learning_rate')
    keep_prob = tf.placeholder(tf.float32,name='keep_prob')
    
    target_length = tf.placeholder(tf.int32,[None,],name='target_length')
    max_target_length = tf.reduce_max(target_length,name='max_target_len')
    source_length = tf.placeholder(tf.int32,[None,],name='source_length')
    
    input_shape = tf.shape(input_data)
    
    reverse_data = tf.reverse(input_data, [-1])
    
    train_logits, inference_logits = sequence_model(reverse_data, targets,
                                                   keep_prob, batch_size,
                                                   source_length, target_length,
                                                   max_target_length,
                                                   len(source_mapping), len(target_mapping),
                                                   encoding_embedding_size,
                                                   decoding_embedding_size,
                                                   rnn_size, num_layers,
                                                   target_mapping)


    training_logits = tf.identity(train_logits.rnn_output, name='logits')
    inference_logits = tf.identity(inference_logits.sample_id, name='predictions')

    masks = tf.sequence_mask(target_length, max_target_length, dtype=tf.float32, name='masks')

    with tf.name_scope("optimization"):
        # Loss function
        cost = tf.contrib.seq2seq.sequence_loss(training_logits, targets, masks)

        # Optimizer
        optimizer = tf.train.AdamOptimizer(lr)
        train_op = optimizer.apply_gradients(optimizer.compute_gradients(cost))
    


#### Train the model
Now, we train the model with the hyperparameters defined above, tweaking them based on the model output. 
*tf.contrib.seq2seq.sequence_loss * is used to compute the loss. It calculates the weighted cross-entropy loss for a sequence of logits.
<br><br>

Here we also compute the accuracy for training & validation steps, using the *BLEU score*. 

BLEU (bilingual evaluation understudy) is an algorithm for evaluating the quality of text which has been machine-translated from one natural language to another. In other words, it is a metric for evaluating a generated sentence to a reference sentence.
A perfect match results in a score of 1.0, whereas a perfect mismatch results in a score of 0.0. 

To compute the BLEU score, we use the corpus_bleu implementation provided by the Natural Language Toolkit
library

In [0]:
def un_pad(myList):
    new_list = list()
    for s in myList:
        new_s = list()
        for w in s:
            if(w != '<PAD>'):
                new_s.append(w)
        new_list.append(new_s)
    return new_list
 

def un_pad_target(myList):
    new_list = list()
    for s in myList:
        new_s = list()
        container_list = list()
        for w in s:
            if(w != '<PAD>'):
                new_s.append(w)
        container_list.append(new_s)
        new_list.append(container_list)
    return new_list

In [0]:
import nltk
from nltk.translate.bleu_score import corpus_bleu

def get_accuracy(target, logits, target_int_to_word):
    """
    Calculate accuracy
    """
    a = np.array(target);
    a_p = [[target_int_to_word[j] for j in i] for i in a]
    b = un_pad_target(a_p)
    l_p = [[target_int_to_word[j] for j in i] for i in logits]
    l_pp = un_pad(l_p)
    bleu = corpus_bleu(b, l_pp)
    return bleu


In [61]:


# Split data to training and validation sets
processed_source_text_sentences = source_text_p;
processed_target_text_sentences = target_text_p;

train_source = processed_source_text_sentences[batch_size:]
train_target = processed_target_text_sentences[batch_size:]

valid_source = processed_source_text_sentences[:batch_size]
valid_target = processed_target_text_sentences[:batch_size]

(valid_sources_batch, valid_targets_batch, valid_sources_lengths, valid_targets_lengths ) = next(get_batches(valid_source,
                                                                                                             valid_target,
                                                                                                             batch_size,
                                                                                                             source_mapping['<PAD>'],
                                                                                                             target_mapping['<PAD>']))                                                                                                  
with tf.Session(graph=train_graph) as sess:
    sess.run(tf.global_variables_initializer())

    for epoch_i in range(epochs):
        for batch_i, (source_batch, target_batch, sources_lengths, targets_lengths) in enumerate(
                get_batches(train_source, train_target, batch_size,
                            source_mapping['<PAD>'],
                            target_mapping['<PAD>'])):

            _, loss = sess.run([train_op, cost], {input_data: source_batch,
                 targets: target_batch, lr: learning_rate,
                 target_length: targets_lengths, source_length: sources_lengths,
                 keep_prob: keep_probability})


            if batch_i % display_step == 0 and batch_i > 0:

                batch_train_logits = sess.run(inference_logits, {input_data: source_batch,
                     source_length: sources_lengths, target_length: targets_lengths,
                     keep_prob: 1.0})
                
                batch_valid_logits = sess.run( inference_logits, {input_data: valid_sources_batch,
                     source_length: valid_sources_lengths, target_length: valid_targets_lengths,
                     keep_prob: 1.0})

                train_acc = get_accuracy(target_batch, batch_train_logits, reverse_target_mapping)

                valid_acc = get_accuracy(valid_targets_batch, batch_valid_logits, reverse_target_mapping)

                print('Epoch {:>3} Batch {:>4}/{} Training Accuracy(BLEU): {:>6.4f} Validation Accuracy(BLEU): {:>6.4f}, Loss: {:>6.4f}'
                      .format(epoch_i, batch_i, len(source_text_p) // batch_size, train_acc, valid_acc, loss))

    # Save Model
    saver = tf.train.Saver()
    saver.save(sess, save_path)
    print('Model Trained and Saved')

Epoch   0 Batch   50/461 Training Accuracy(BLEU): 0.1383 Validation Accuracy(BLEU): 0.1303, Loss: 1.8518
Epoch   0 Batch  100/461 Training Accuracy(BLEU): 0.2357 Validation Accuracy(BLEU): 0.2412, Loss: 0.9703
Epoch   0 Batch  150/461 Training Accuracy(BLEU): 0.3280 Validation Accuracy(BLEU): 0.3056, Loss: 0.7900
Epoch   0 Batch  200/461 Training Accuracy(BLEU): 0.3973 Validation Accuracy(BLEU): 0.3984, Loss: 0.6529
Epoch   0 Batch  250/461 Training Accuracy(BLEU): 0.4866 Validation Accuracy(BLEU): 0.4782, Loss: 0.4926
Epoch   0 Batch  300/461 Training Accuracy(BLEU): 0.5708 Validation Accuracy(BLEU): 0.5836, Loss: 0.4457
Epoch   0 Batch  350/461 Training Accuracy(BLEU): 0.6900 Validation Accuracy(BLEU): 0.6652, Loss: 0.3716
Epoch   0 Batch  400/461 Training Accuracy(BLEU): 0.7528 Validation Accuracy(BLEU): 0.7539, Loss: 0.2692
Epoch   0 Batch  450/461 Training Accuracy(BLEU): 0.8318 Validation Accuracy(BLEU): 0.8187, Loss: 0.2192
Epoch   1 Batch   50/461 Training Accuracy(BLEU): 0.897

## Evaluate Model
Now that the model is saved after the training, we calculate the model's accuracy by feeding it the test data we split from the dataset above. This data is never seen by the model during training & validation.

In [0]:
def sentence_to_seq(sentence, mapping):
    """
    Convert a sentence to a sequence of ids
    :return: List of word ids
    """
    sentence_lower=sentence.lower()
    sequence=[]
    for word in sentence_lower.split():
        if word in mapping:
            sequence.append(mapping[word])
        else:
            sequence.append(mapping['<UNK>'])
    
    
    return sequence

In [0]:
def validate_batch(translate_sentences):
  load_path = '/tmp/model.ckpt'
  loaded_graph = tf.Graph()
  with tf.Session(graph=loaded_graph) as sess:
    
    
      # Load saved model
      loader = tf.train.import_meta_graph(load_path + '.meta')
      loader.restore(sess, load_path)

      input_data = loaded_graph.get_tensor_by_name('input:0')
      logits = loaded_graph.get_tensor_by_name('predictions:0')
      target_length = loaded_graph.get_tensor_by_name('target_length:0')
      source_length = loaded_graph.get_tensor_by_name('source_length:0')
      keep_prob = loaded_graph.get_tensor_by_name('keep_prob:0')
      
      predictions = list()
      for sntc in translate_sentences:
          translate_sentence = sentence_to_seq(sntc, source_mapping)
          translate_logits = sess.run(logits, {input_data: [translate_sentence]*batch_size,
                                         target_length: [len(translate_sentence)*2]*batch_size,
                                         source_length: [len(translate_sentence)]*batch_size,
                                         keep_prob: 1.0})[0]
    
          translated_list = ['0']*len(translate_logits);
          j = 0;
          for i in translate_logits:
            translated_list[j] = (reverse_target_mapping[i])
            j += 1
          predictions.append(un_pad([translated_list])[0])
  return predictions


In [69]:
batchsize = 1000
list_bleu_batch = list()
for i in range(0, len(validation_source), batchsize):
    v_source_batch = validation_source[i:i+batchsize]
    v_target_batch = validation_target[i:i+batchsize]
    
    predictions = validate_batch(v_source_batch)

    targ_sentences = list()
    for ts in v_target_batch:
      targ_sentences.append([ts.split(' ')])

    current_bleu = corpus_bleu(targ_sentences, predictions)
    print("CURRENT: ", current_bleu)
    list_bleu_batch.append(current_bleu)
    print("MEAN: ",  np.mean(np.array(list_bleu_batch)))

INFO:tensorflow:Restoring parameters from /tmp/model.ckpt
CURRENT:  0.903072877861242
MEAN:  0.903072877861242
INFO:tensorflow:Restoring parameters from /tmp/model.ckpt
CURRENT:  0.8925883323898511
MEAN:  0.8978306051255466
INFO:tensorflow:Restoring parameters from /tmp/model.ckpt
CURRENT:  0.8937599081135543
MEAN:  0.8964737061215491
INFO:tensorflow:Restoring parameters from /tmp/model.ckpt
CURRENT:  0.9083818981449657
MEAN:  0.8994507541274033
INFO:tensorflow:Restoring parameters from /tmp/model.ckpt
CURRENT:  0.9042040357490387
MEAN:  0.9004014104517303
INFO:tensorflow:Restoring parameters from /tmp/model.ckpt
CURRENT:  0.8952196649854834
MEAN:  0.8995377862073558
INFO:tensorflow:Restoring parameters from /tmp/model.ckpt
CURRENT:  0.8940507976946362
MEAN:  0.8987539307055388
INFO:tensorflow:Restoring parameters from /tmp/model.ckpt
CURRENT:  0.8916387621432388
MEAN:  0.8978645346352513
INFO:tensorflow:Restoring parameters from /tmp/model.ckpt
CURRENT:  0.8995092218458804
MEAN:  0.89

## Accuracy: MEAN BLEU  0.896215397409363

## Benchmark Model
We use the characted-based sequence to sequence example provided by Keras as our benchmark model. We process and feed it the same data as above. The file en_fr is a tsv of the above dataset, with the first column being the target sentences (in French), and the second column being source (English). This is generated using the fileprocessor.py file provided with this notebook.

In [0]:
'''Sequence to sequence example in Keras (character-level). 
Hosted at: https://github.com/keras-team/keras/blob/master/examples/lstm_seq2seq.py

# Summary of the algorithm

# References

- Sequence to Sequence Learning with Neural Networks
    https://arxiv.org/abs/1409.3215
- Learning Phrase Representations using
    RNN Encoder-Decoder for Statistical Machine Translation
    https://arxiv.org/abs/1406.1078
'''

from __future__ import print_function

from keras.models import Model
from keras.layers import Input, LSTM, Dense
import numpy as np

batch_size = 1024  # Batch size for training.
epochs = 20 # Number of epochs to train for.
latent_dim = 256  # Latent dimensionality of the encoding space.
num_samples = 110000  # Number of samples to train on.
# Path to the data txt file on disk.
data_path = 'en_fr'

# Vectorize the data.
input_texts = []
target_texts = []
input_characters = set()
target_characters = set()
with open(data_path, 'r', encoding='utf-8') as f:
    lines = f.read().split('\n')
for line in lines[: min(num_samples, len(lines) - 1)]:
    target_text, input_text = line.split('\t')
    # We use "tab" as the "start sequence" character
    # for the targets, and "\n" as "end sequence" character.
    target_text = '\t' + target_text + '\n'
    input_texts.append(input_text)
    target_texts.append(target_text)
    for char in input_text:
        if char not in input_characters:
            input_characters.add(char)
    for char in target_text:
        if char not in target_characters:
            target_characters.add(char)
    
print(input_texts[30:40])
print(target_texts[30:40])

input_characters = sorted(list(input_characters))
target_characters = sorted(list(target_characters))
num_encoder_tokens = len(input_characters)
num_decoder_tokens = len(target_characters)
max_encoder_seq_length = max([len(txt) for txt in input_texts])
max_decoder_seq_length = max([len(txt) for txt in target_texts])

print('Number of samples:', len(input_texts))
print('Number of unique input tokens:', num_encoder_tokens)
print('Number of unique output tokens:', num_decoder_tokens)
print('Max sequence length for inputs:', max_encoder_seq_length)
print('Max sequence length for outputs:', max_decoder_seq_length)

input_token_index = dict(
    [(char, i) for i, char in enumerate(input_characters)])
target_token_index = dict(
    [(char, i) for i, char in enumerate(target_characters)])

encoder_input_data = np.zeros(
    (len(input_texts), max_encoder_seq_length, num_encoder_tokens),
    dtype='float32')
decoder_input_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_decoder_tokens),
    dtype='float32')
decoder_target_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_decoder_tokens),
    dtype='float32')

for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
    for t, char in enumerate(input_text):
        encoder_input_data[i, t, input_token_index[char]] = 1.
    for t, char in enumerate(target_text):
        # decoder_target_data is ahead of decoder_input_data by one timestep
        decoder_input_data[i, t, target_token_index[char]] = 1.
        if t > 0:
            # decoder_target_data will be ahead by one timestep
            # and will not include the start character.
            decoder_target_data[i, t - 1, target_token_index[char]] = 1.

# Define an input sequence and process it.
encoder_inputs = Input(shape=(None, num_encoder_tokens))
encoder = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None, num_decoder_tokens))
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
                                     initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Run training
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
          batch_size=batch_size,
          epochs=epochs,
          validation_split=0.1)
# Save model
model.save('s2s.h5')

# Next: inference mode (sampling).
# Here's the drill:
# 1) encode input and retrieve initial decoder state
# 2) run one step of decoder with this initial state
# and a "start of sequence" token as target.
# Output will be the next target token
# 3) Repeat with the current target token and current states

# Define sampling models
encoder_model = Model(encoder_inputs, encoder_states)

decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states)

# Reverse-lookup token index to decode sequences back to
# something readable.
reverse_input_char_index = dict(
    (i, char) for char, i in input_token_index.items())
reverse_target_char_index = dict(
    (i, char) for char, i in target_token_index.items())


def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)

    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    # Populate the first character of target sequence with the start character.
    target_seq[0, 0, target_token_index['\t']] = 1.

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += sampled_char

        # Exit condition: either hit max length
        # or find stop character.
        if (sampled_char == '\n' or
           len(decoded_sentence) > max_decoder_seq_length):
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.

        # Update states
        states_value = [h, c]

    return decoded_sentence

Using TensorFlow backend.


['he dislikes apples , peaches , and grapes .', 'california is usually freezing during december , and it is busy in april .', 'your most feared animal is that shark .', 'paris is usually wet during august , and it is never dry in november .', 'paris is usually beautiful during september , and it is usually snowy in november .', 'the united states is never wet during january , but it is usually hot in october .', 'we like oranges , mangoes , and grapes .', 'they like pears , apples , and mangoes .', 'she dislikes that little red truck .', 'the grapefruit is my most loved fruit , but the banana is her most loved .']
['\til déteste les pommes , les pêches et les raisins .\n', '\tla californie est le gel habituellement en décembre , et il est occupé en avril .\n', '\tvotre animal le plus redouté est que le requin .\n', "\tparis est généralement humide au mois d' août , et il est jamais sec en novembre .\n", '\tparis est généralement beau en septembre , et il est généralement enneigée en no

  str(node.arguments) + '. They will not be included '


In [0]:
import numpy as np

benchmark_batchsize = 100
bleu_batch_list = list()
for index in range(0, 10000, benchmark_batchsize):
    input_batch = encoder_input_data[index: index + benchmark_batchsize]
    target_batch = target_texts[index: index + benchmark_batchsize]

    input_batch_ac = input_texts[index: index + benchmark_batchsize]

    targets = list()
    predictions = list()
    inputs = list()

    for seq_index in range(benchmark_batchsize):
      # Take one sequence (part of the training set)
      # for trying out decoding.
        input_seq = input_batch[seq_index: seq_index + 1]
        target_text = target_batch[seq_index: seq_index + 1]
        input_text = input_batch_ac[seq_index: seq_index + 1]
        decoded_sentence = decode_sequence(input_seq)
        prediction_s = list()
  
    for words in decoded_sentence.split('\n')[0].split(' '):
        prediction_s.append(words)
      
    predictions.append(prediction_s)
    
    target_s_list = list()
    
    for t_w in target_text[0].split('\t')[1].split('\n')[0].split(' '):
        target_s_list.append(t_w)
    
    targets.append([target_s_list]);
    inputs.append(input_text)
  
  corp_bleu = corpus_bleu(targets, predictions)
  print("CORPUS BLEU: ", corp_bleu)
  bleu_batch_list.append(corp_bleu)
  print("CURRENT MEAN BLEU", np.mean(np.array(bleu_batch_list)))


CORPUS BLEU:  0.17685921733223353
CURRENT MEAN BLEU 0.17685921733223353
CORPUS BLEU:  0.20364630199448508
CURRENT MEAN BLEU 0.1902527596633593
CORPUS BLEU:  0.16877319393715523
CURRENT MEAN BLEU 0.1830929044212913
CORPUS BLEU:  0.1665112331524292
CURRENT MEAN BLEU 0.17894748660407578
CORPUS BLEU:  0.16994520127282883
CURRENT MEAN BLEU 0.1771470295378264
CORPUS BLEU:  0.18668272240141323
CURRENT MEAN BLEU 0.17873631168175752
CORPUS BLEU:  0.191778441237082
CURRENT MEAN BLEU 0.18059947304680388
CORPUS BLEU:  0.19854238406678962
CURRENT MEAN BLEU 0.18284233692430207
CORPUS BLEU:  0.1713664300619961
CURRENT MEAN BLEU 0.18156723616182363
CORPUS BLEU:  0.20759440729110767
CURRENT MEAN BLEU 0.18416995327475205
CORPUS BLEU:  0.20597848201905886
CURRENT MEAN BLEU 0.18615254679696178
CORPUS BLEU:  0.20653314090614192
CURRENT MEAN BLEU 0.18785092963939343
CORPUS BLEU:  0.2050670061759916
CURRENT MEAN BLEU 0.18917524321913173
CORPUS BLEU:  0.19771951403424023
CURRENT MEAN BLEU 0.18978554827735378


## Accuracy: MEAN BLEU  0.19103319326113646