# NEURAL MACHINE TRANSLATION
Implementing an English-French translator


In [2]:
import helper
import problem_unittests as tests 
source_path = ('./data/small_vocab_en')
target_path = ('./data/small_vocab_fr')
source_text = helper.load_data(source_path)
target_text = helper.load_data(target_path)

### Exploring the Data

In [3]:
view_sentence_range = (0, 10)

import numpy as np

print('Dataset Stats')
print('Roughly the number of unique words: {}'.format(len({word: None for word in source_text.split()})))

sentences = source_text.split('\n')
word_counts = [len(sentence.split()) for sentence in sentences]
print('Number of sentences: {}'.format(len(sentences)))
print('Average number of words in a sentence: {}'.format(np.average(word_counts)))

print()
print('English sentences {} to {}:'.format(*view_sentence_range))
print('\n'.join(source_text.split('\n')[view_sentence_range[0]:view_sentence_range[1]]))
print()
print('French sentences {} to {}:'.format(*view_sentence_range))
print('\n'.join(target_text.split('\n')[view_sentence_range[0]:view_sentence_range[1]]))

Dataset Stats
Roughly the number of unique words: 227
Number of sentences: 137861
Average number of words in a sentence: 13.225277634719028

English sentences 0 to 10:
new jersey is sometimes quiet during autumn , and it is snowy in april .
the united states is usually chilly during july , and it is usually freezing in november .
california is usually quiet during march , and it is usually hot in june .
the united states is sometimes mild during june , and it is cold in september .
your least liked fruit is the grape , but my least liked is the apple .
his favorite fruit is the orange , but my favorite is the grape .
paris is relaxing during december , but it is usually chilly in july .
new jersey is busy during spring , and it is never hot in march .
our least liked fruit is the lemon , but my least liked is the grape .
the united states is sometimes busy during january , and it is sometimes warm in november .

French sentences 0 to 10:
new jersey est parfois calme pendant l' automne 

### Implementing text_to_ids function 
to transform source_text and target_text by replacing each letter with its corresponding integers index (from source/target_vocab_to_int)


In [4]:
def text_to_ids(source_text, target_text, source_vocab_to_int, target_vocab_to_int):
    
    source_id_text = [[source_vocab_to_int.get(letter) for letter in line.split()] for line in source_text.split('\n')]

    target_id_text = [[target_vocab_to_int.get(letter) for letter in line.split()] for line in target_text.split('\n')]
    
    for rows in target_id_text:
        rows.insert(len(rows), target_vocab_to_int['<EOS>'])
    
    return source_id_text, target_id_text


tests.test_text_to_ids(text_to_ids)

Tests Passed


In [5]:
helper.preprocess_and_save_data(source_path, target_path, text_to_ids)

### Checkpoint

In [6]:
import numpy as np
import helper

(source_int_text, target_int_text), (source_vocab_to_int, target_vocab_to_int), _ = helper.load_preprocess()

### Checking if the version of tensorflow is correct and if it has access to GPU

In [7]:
from distutils.version import LooseVersion
import warnings
import tensorflow as tf

# Check TensorFlow Version
assert LooseVersion(tf.__version__) in [LooseVersion('1.0.0'), LooseVersion('1.0.1')], 'This project requires TensorFlow version 1.0  You are using {}'.format(tf.__version__)
print('TensorFlow Version: {}'.format(tf.__version__))

# Check for a GPU
if not tf.test.gpu_device_name():
    warnings.warn('No GPU found. Please use a GPU to train your neural network.')
else:
    print('Default GPU Device: {}'.format(tf.test.gpu_device_name()))

TensorFlow Version: 1.0.0


  # This is added back by InteractiveShellApp.init_path()


In [8]:
def model_inputs():
    inputs = tf.placeholder(tf.int32, shape=[None, None], name = 'input')
    targets = tf.placeholder(tf.int32, shape = [None, None], name='targets')
    keep_prob = tf.placeholder(tf.float32,  shape= None, name= 'keep_prob')
    learning_rate = tf.placeholder(tf.float32, shape = None, name= 'learning_rate')
    return inputs, targets, learning_rate, keep_prob

tests.test_model_inputs(model_inputs)

Tests Passed


### Process Decoding Input
Removing the last word ID from the end of each batch and concatenating the Go ID to the beginning of each batch

In [9]:
def process_decoding_input(target_data, target_vocab_to_int, batch_size):
    ending = tf.strided_slice(target_data, [0,0], [batch_size, -1], [1,1])
    go_ = tf.concat([tf.fill([batch_size,1], target_vocab_to_int['<GO>']), ending], 1)
    
    return go_

    
tests.test_process_decoding_input(process_decoding_input)

Tests Passed


### Encoding Layer
Encoding using the LSTM Neural Net

In [10]:
def encoding_layer(rnn_inputs, rnn_size, num_layers, keep_prob):    
    lstm_1_layer = tf.contrib.rnn.BasicLSTMCell(rnn_size)
    drop_out = tf.contrib.rnn.DropoutWrapper(lstm_1_layer, output_keep_prob=keep_prob)
    multi_layered_lstm = tf.contrib.rnn.MultiRNNCell([drop_out]*num_layers)
    
    
    #initial_state = multi_layered_lstm.zero_state(rnn_inputs, dtype=tf.float32)
       
    encoded, rnn_state = tf.nn.dynamic_rnn(multi_layered_lstm, rnn_inputs, dtype=tf.float32)


    return rnn_state

tests.test_encoding_layer(encoding_layer)

Tests Passed


### Decoding - Training
Training the LSTM Decoder 

In [11]:
def decoding_layer_train(encoder_state, dec_cell, dec_embed_input, sequence_length, 
                         decoding_scope, output_fn, keep_prob):

    
    decoder = tf.contrib.seq2seq.simple_decoder_fn_train(encoder_state)
    
    outputs, final_state_train, context_state_train = tf.contrib.seq2seq.dynamic_rnn_decoder(
        cell= dec_cell, decoder_fn=decoder, inputs= dec_embed_input, sequence_length = sequence_length, scope = decoding_scope)
    
    dropout_outputs = tf.nn.dropout(outputs, keep_prob=keep_prob)
    train_logits = output_fn(dropout_outputs)
    
    return train_logits

tests.test_decoding_layer_train(decoding_layer_train)

Tests Passed


### Decoding Inference

In [12]:
def decoding_layer_infer(encoder_state, dec_cell, dec_embeddings, start_of_sequence_id, 
                         end_of_sequence_id, maximum_length, vocab_size, decoding_scope, output_fn, keep_prob):

    decoder_fn = tf.contrib.seq2seq.simple_decoder_fn_inference(output_fn, encoder_state, dec_embeddings, 
                                                                start_of_sequence_id, end_of_sequence_id, 
                                                                maximum_length, num_decoder_symbols=vocab_size)
    
    out, final_state, context_final_state = tf.contrib.seq2seq.dynamic_rnn_decoder(dec_cell, decoder_fn, 
                                                                                   scope=decoding_scope)
    inference_logits = tf.nn.dropout(out, keep_prob)

    
    return inference_logits
                                                               


tests.test_decoding_layer_infer(decoding_layer_infer)

Tests Passed


### Building the Decoder Layer 

In [13]:
def decoding_layer(dec_embed_input, dec_embeddings, encoder_state, vocab_size, sequence_length, rnn_size, 
                   num_layers, target_vocab_to_int, keep_prob):
    

    start_of_sequence_id = target_vocab_to_int['<EOS>']
    end_of_sequence_id = target_vocab_to_int['<GO>']
    
    
    lstm_cell = tf.contrib.rnn.BasicLSTMCell(rnn_size)
    dec_cell = tf.contrib.rnn.MultiRNNCell([lstm_cell]*num_layers)
    
    with tf.variable_scope("decoding") as decoding_scope:
        
        output_fn = lambda x: tf.contrib.layers.fully_connected(x, vocab_size, None, scope=decoding_scope) 
        
        
    with tf.variable_scope("decoding") as decoding_scope:
        train_logits = decoding_layer_train(encoder_state, dec_cell, dec_embed_input, sequence_length, decoding_scope, 
                                           output_fn, keep_prob)
        
        
    with tf.variable_scope("decoding", reuse=True) as decoding_scope:    
        inference_logits = decoding_layer_infer(encoder_state, dec_cell, dec_embeddings, start_of_sequence_id, 
                                                end_of_sequence_id, sequence_length, vocab_size, decoding_scope, output_fn, 
                                               keep_prob)
    
    
    return train_logits, inference_logits

tests.test_decoding_layer(decoding_layer)

Tests Passed


### Building the Neural Network


In [19]:
def seq2seq_model(input_data, target_data, keep_prob, batch_size, sequence_length,
                   source_vocab_size, target_vocab_size, enc_embedding_size, dec_embedding_size, 
                  rnn_size, num_layers, target_vocab_to_int):
    
    
    embedding_input = tf.contrib.layers.embed_sequence(input_data, vocab_size=source_vocab_size, 
                                                       embed_dim=enc_embedding_size)
    
    encoder_state = encoding_layer(embedding_input, rnn_size, num_layers, keep_prob)
    
    process_target_data = process_decoding_input(target_data, target_vocab_to_int, batch_size)
    

    
    embedding_output = tf.contrib.layers.embed_sequence(process_target_data, vocab_size=target_vocab_size, 
                                                        embed_dim=dec_embedding_size)
    
                                   
    
    weights = tf.Variable(tf.random_uniform([target_vocab_size, dec_embedding_size], -1., 1.))
    training_logits, inference_logits = decoding_layer(embedding_output, weights, encoder_state, 
                                                       target_vocab_size, sequence_length, rnn_size, 
                                                       num_layers, target_vocab_to_int, keep_prob)
    return (training_logits, inference_logits)


tests.test_seq2seq_model(seq2seq_model)

Tests Passed


### Hyperparameters

In [30]:
epochs= 5
learning_rate = 0.001
batch_size = 256
keep_probability = 0.7
encoding_embedding_size = 300
decoding_embedding_size = 300
num_layers = 3
rnn_size = 512


### Building the graph

In [31]:
save_path = 'checkpoints/dev'
(source_int_text, target_int_text), (source_vocab_to_int, target_vocab_to_int), _ = helper.load_preprocess()
max_source_sentence_length = max([len(sentence) for sentence in source_int_text])
train_graph = tf.Graph()
with train_graph.as_default():
    input_data, targets, lr, keep_prob = model_inputs()
    sequence_length = tf.placeholder_with_default(max_source_sentence_length, None, name='sequence_length')
    input_shape = tf.shape(input_data)
    
    train_logits, inference_logits = seq2seq_model(tf.reverse(input_data, [-1]), targets, keep_prob, batch_size, sequence_length,
                   len(source_vocab_to_int), len(target_vocab_to_int), encoding_embedding_size, decoding_embedding_size, 
                  rnn_size, num_layers, target_vocab_to_int)
    
    tf.identity(inference_logits, 'logits')
    with tf.name_scope('optimization'):
        #Loss function
        cost = tf.contrib.seq2seq.sequence_loss(train_logits, targets, tf.ones([input_shape[0], sequence_length]))
        
        optimizer = tf.train.AdamOptimizer(lr)
        
        #gradient clipping

        gradients = optimizer.compute_gradients(cost)
        capped_gradients = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in gradients if grad is not None]
        train_op = optimizer.apply_gradients(capped_gradients)

### Training 

In [32]:
import time

def get_accuracy(target, logits):
    max_seq = max(target.shape[1], logits.shape[1])
    if max_seq - target.shape[1]:
        target = np.pad(target, [(0,0), (0,max_seq-target.shape[1])], 'constant')
    if max_seq - logits.shape[1]:
        logits = np.pad(logits, [(0,0),(0,max_seq - logits.shape[1]), (0,0)], 'constant')
        
    return np.mean(np.equal(target, np.argmax(logits, 2)))    

    

train_source = source_int_text[batch_size:]
train_target = target_int_text[batch_size:]

valid_source = helper.pad_sentence_batch(source_int_text[:batch_size])
valid_target = helper.pad_sentence_batch(target_int_text[:batch_size])

with tf.Session(graph=train_graph) as sess:
    sess.run(tf.global_variables_initializer())
    for epoch_i in range(epochs):
        for batch_i, (source_batch, target_batch) in enumerate(helper.batch_data(train_source, train_target, batch_size)):
            start_time = time.time()
            _, loss = sess.run([train_op, cost], feed_dict={lr:learning_rate, input_data:source_batch, 
                                                             targets: target_batch, keep_prob:keep_probability, 
                                                            sequence_length:target_batch.shape[1]})
            
            batch_train_logits = sess.run(inference_logits, {input_data:source_batch, keep_prob:1.0})
            batch_valid_logits = sess.run(inference_logits, {input_data: valid_source, keep_prob:1.0})
            
            
            train_acc = get_accuracy(target_batch, batch_train_logits)
            valid_acc = get_accuracy(np.array(target_batch), batch_valid_logits)
            
            end_time = time.time()
            
            print('Epoch {:>3} Batch {:>4}/{} - Train Accuracy: {:>6.3f}, Validation Accuracy: {:>6.3f}, Loss: {:>6.3f}'
                  .format(epoch_i, batch_i, len(source_int_text) // batch_size, train_acc, valid_acc, loss))
                 
    
    saver = tf.train.Saver()
    saver.save(sess, save_path)
    print("Model trained and saved")
        

Epoch   0 Batch    0/538 - Train Accuracy:  0.138, Validation Accuracy:  0.131, Loss:  5.880
Epoch   0 Batch    1/538 - Train Accuracy:  0.268, Validation Accuracy:  0.257, Loss:  5.832
Epoch   0 Batch    2/538 - Train Accuracy:  0.264, Validation Accuracy:  0.264, Loss:  5.332
Epoch   0 Batch    3/538 - Train Accuracy:  0.265, Validation Accuracy:  0.266, Loss:  4.591
Epoch   0 Batch    4/538 - Train Accuracy:  0.278, Validation Accuracy:  0.278, Loss:  4.130
Epoch   0 Batch    5/538 - Train Accuracy:  0.300, Validation Accuracy:  0.298, Loss:  3.810
Epoch   0 Batch    6/538 - Train Accuracy:  0.339, Validation Accuracy:  0.335, Loss:  3.510
Epoch   0 Batch    7/538 - Train Accuracy:  0.290, Validation Accuracy:  0.247, Loss:  3.534
Epoch   0 Batch    8/538 - Train Accuracy:  0.229, Validation Accuracy:  0.199, Loss:  3.375
Epoch   0 Batch    9/538 - Train Accuracy:  0.266, Validation Accuracy:  0.204, Loss:  3.326
Epoch   0 Batch   10/538 - Train Accuracy:  0.244, Validation Accuracy

In [34]:
helper.save_params(save_path)

### Checkpoint

In [35]:
import tensorflow as tf
import numpy as np
import helper
import problem_unittests as tests

_, (source_vocab_to_int, target_vocab_to_int), (source_int_to_vocab, target_int_to_vocab) = helper.load_preprocess()
load_path = helper.load_params()

### Implementing sentence_to_seq 
to process new sentences before feeding them into the model for translation

In [36]:
def sentence_to_seq(sentence, vocab_to_int):
    sentence_list = []
    sentence = sentence.lower()
    for word in sentence.split(' '):
        if word in vocab_to_int.keys():
            sentence_list.append(vocab_to_int[word])
        else:
            sentence_list.append(vocab_to_int['<UNK>'])    
          
    return sentence_list       


tests.test_sentence_to_seq(sentence_to_seq)

Tests Passed


### Translating
from english to french

In [39]:
translate_sentence = 'he saw a old yellow truck .'

translate_sentence = sentence_to_seq(translate_sentence, source_vocab_to_int)

loaded_graph = tf.Graph()
with tf.Session(graph=loaded_graph) as sess:
    loader = tf.train.import_meta_graph(load_path +'.meta')
    loader.restore(sess, load_path)
    
    input_data = loaded_graph.get_tensor_by_name('input:0')
    logits = loaded_graph.get_tensor_by_name('logits:0')
    keep_prob = loaded_graph.get_tensor_by_name('keep_prob:0')
    
    translate_logits = sess.run(logits, {input_data: [translate_sentence], keep_prob:1.0})[0]
    
    
print('Input')
print('  Word Ids:      {}'.format([i for i in translate_sentence]))
print('  English Words: {}'.format([source_int_to_vocab[i] for i in translate_sentence]))

print('\nPrediction')
print('  Word Ids:      {}'.format([i for i in np.argmax(translate_logits, 1)]))
print('  French Words: {}'.format([target_int_to_vocab[i] for i in np.argmax(translate_logits, 1)]))

Input
  Word Ids:      [180, 41, 177, 172, 186, 130, 120]
  English Words: ['he', 'saw', 'a', 'old', 'yellow', 'truck', '.']

Prediction
  Word Ids:      [137, 253, 37, 146, 340, 160, 317, 317, 317, 317, 317, 317, 317, 317, 317, 317, 317, 317]
  French Words: ['elle', 'a', 'vu', 'volant', 'au', 'camion', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.']


### Imperfect Translation
You might notice that some sentences translate better than others.  
Since the dataset only has a vocabulary of 227 English words of the thousands that you use, 
you're only going to see good results using these words.  
Additionally, the translations in this data set were made by Google translate, 
so the translations themselves aren't particularly good.  

To create a better translation model, it's needed better data.

[WMT10 French-English corpus](http://www.statmt.org/wmt10/training-giga-fren.tar).  
This dataset has more vocabulary and richer in topics discussed.  However, this will take days to train.