# **Machine Translation from English to French**

In [None]:
# Enable IPython's autoreload extension. Mode 1 re-imports only the modules
# that have been registered with %aimport before each cell is executed.
%load_ext autoreload
%autoreload 1

In [None]:
import collections
import numpy as np

from keras.layers import Input, Dense, Bidirectional, LSTM
from keras.models import Model
from keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy
from keras.layers.embeddings import Embedding

Using TensorFlow backend.


### Verify access to the GPU

In [None]:
from tensorflow.python.client import device_lib

# List every device TensorFlow can see; a GPU entry in the printed list
# confirms that the accelerator is visible to the backend.
visible_devices = device_lib.list_local_devices()
print(visible_devices)

[name: "/cpu:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 9953843907868990253
, name: "/gpu:0"
device_type: "GPU"
memory_limit: 357302272
locality {
  bus_id: 1
}
incarnation: 14935192867186758903
physical_device_desc: "device: 0, name: Tesla K80, pci bus id: 0000:00:04.0"
]


# 1. Load Data
Because of the limited computing power of the AWS free-tier EC2 instance used here, the dataset for this task has a small vocabulary (roughly 200–300 words).

In [None]:
def _read_lines(path):
    """
    Read a text corpus with one sentence per line
    :param path: Path of the text file to read
    :return: List of lines, split on the newline character
    """
    # The French corpus contains accented characters, so decode explicitly as
    # UTF-8 instead of relying on the platform's locale-dependent default.
    with open(path, 'r', encoding = 'utf-8') as f:
        return f.read().split('\n')


eng_sentences = _read_lines('data/small_vocab_en')
fre_sentences = _read_lines('data/small_vocab_fr')

# The two files are parallel: line i of one is the translation of line i of the other.
assert len(eng_sentences) == len(fre_sentences), "English/French corpora are not aligned"

print('Dataset Loaded')

Dataset Loaded


In [None]:
# Preview the first two aligned sentence pairs (numbered from 1 for display).
for sample_i in (0, 1):
    pair_number = sample_i + 1
    english_line = 'English Sentence {} :  {}'.format(pair_number, eng_sentences[sample_i])
    print(english_line)
    french_line = 'French Sentence {}  :  {}\n'.format(pair_number, fre_sentences[sample_i])
    print(french_line)

English Sentence 1 :  new jersey is sometimes quiet during autumn , and it is snowy in april .
French Sentence 1  :  new jersey est parfois calme pendant l' automne , et il est neigeux en avril .

English Sentence 2 :  the united states is usually chilly during july , and it is usually freezing in november .
French Sentence 2  :  les états-unis est généralement froid en juillet , et il gèle habituellement en novembre .



# 2. Pre-process text
## 2.1. Tokenize function

In [None]:
from keras.preprocessing.text import Tokenizer

def tokenize(x, encode_start_end = False):
    """
    Fit a fresh Keras Tokenizer on x and convert every sentence to a list of word ids
    :param x: List of sentences/strings to be tokenized
    :param encode_start_end: if True, wrap every sentence in the marker words
                             "startofsentence" / "endofsentence" before tokenizing,
                             so that both markers receive their own word ids
    :return: Tuple of (tokenized x data, tokenizer used to tokenize x)
    """
    sentences = x
    if encode_start_end:
        sentences = ["".join(("startofsentence ", sentence, " endofsentence"))
                     for sentence in x]

    fitted_tokenizer = Tokenizer()
    fitted_tokenizer.fit_on_texts(sentences)

    return fitted_tokenizer.texts_to_sequences(sentences), fitted_tokenizer

Using TensorFlow backend.


## 2.2. Padding  function

In [None]:
from keras.preprocessing.sequence import pad_sequences

def pad(x, length=None):
    """
    Bring every sequence in x to a common length
    :param x: List of sequences.
    :param length: Target length. If None, the length of the longest sequence in x is used.
    :return: Result of pad_sequences: sequences padded (and, if needed, truncated) at the end
    """
    target_length = length if length is not None else max(map(len, x))

    return pad_sequences(x, maxlen = target_length, padding = 'post', truncating = 'post')

## 2.3. Execute both functions

In [None]:
# Tokenize both corpora. Only the French (target) side is wrapped in
# start/end marker words.
eng_tokenized, eng_tokenizer = tokenize(eng_sentences)
fre_tokenized, fre_tokenizer = tokenize(fre_sentences, encode_start_end = True)

# Pad every sentence to the length of the longest sentence of its language.
eng_encoded = pad(eng_tokenized)
fre_encoded = pad(fre_tokenized)

# Number of distinct words seen by each tokenizer.
eng_vocab_size = len(eng_tokenizer.word_index)
fre_vocab_size = len(fre_tokenizer.word_index)

print("English vocabulary size: ", eng_vocab_size)
print("French vocabulary size: ", fre_vocab_size)
print()

# After padding, every row has the same length, so the first row is representative.
eng_seq_len = len(eng_encoded[0])
fre_seq_len = len(fre_encoded[0])

print("Length of longest English sentence: ", eng_seq_len)
print("Length of longest French sentence: ", fre_seq_len)
print()

English vocabulary size:  199
frenish vocabulary size:  346

Length of longest English sentence:  15
Length of longest frenish sentence:  23



# 3. Build Seq2Seq Model & Train
## 3.1. Training model

In [None]:
from keras.layers import LSTM

# 0. Prepare training arrays from the pre-processed data
english_input = eng_encoded   # (num_samples, eng_seq_len), e.g. (137861, 15)

# The decoder consumes each word id as a single feature per time step, so give
# the French ids an explicit trailing feature axis: (num_samples, fre_seq_len, 1).
french_sequences = np.expand_dims(fre_encoded, axis = -1)

# Teacher forcing: the decoder sees the sentence without its last step and is
# trained to predict the same sentence shifted one step to the left.
decoder_french_input = french_sequences[:, :-1, :]
decoder_french_target = french_sequences[:, 1:, :]

# 1. Define Encoder
input_seq_encoder = Input(shape = (None, ),
                          name = "encoder_input")     # (batch_size, sentence_length)

embed_dim = 200
lstm_units = 256

# Word ids run from 1 to eng_vocab_size and 0 is used for padding, so the
# embedding table needs eng_vocab_size + 1 rows.
embedded_seq_encoder = Embedding(input_dim = eng_vocab_size + 1,
                                 output_dim = embed_dim)(input_seq_encoder)

encoder_lstm = LSTM(units = lstm_units,
                    activation = 'relu',
                    return_sequences = False,
                    return_state = True,
                    name = "encoder_LSTM")

# Only the final hidden and cell states are kept; they initialise the decoder.
_, last_hidden_encoder, last_cell_encoder = encoder_lstm(embedded_seq_encoder)


# 2. Define Decoder
input_seq_decoder = Input(shape = (None, 1),
                          name = "decoder_input")     # (batch_size, sentence_length, 1)

decoder_lstm = LSTM(units = lstm_units,
                    activation = 'relu',
                    return_sequences = True,
                    return_state = True,
                    name = "decoder_LSTM")

all_hidden_decoder, _, _ = decoder_lstm(input_seq_decoder,
                                        initial_state = [last_hidden_encoder, last_cell_encoder])

# One output class per possible target id: ids 1..fre_vocab_size plus the padding id 0.
# Note: despite its name, `logits` holds softmax probabilities.
decoder_dense = Dense(fre_vocab_size + 1,   # NOT TIMEDISTRIBUTED (NOT RECURSIVE)
                      activation = 'softmax',
                      name = "decoder_dense")
logits = decoder_dense(all_hidden_decoder)


# 3. Define Model
final_rnn_model = Model(inputs = [input_seq_encoder, input_seq_decoder],
                        outputs = logits)

final_rnn_model.compile(loss = sparse_categorical_crossentropy,
                        optimizer = Adam(lr = 0.002),
                        metrics = ['accuracy'])

# 4. Fit the Model
final_rnn_model.fit([english_input, decoder_french_input],
                    decoder_french_target,
                    batch_size = 1024,
                    epochs = 16,
                    validation_split = 0.2)



Train on 110288 samples, validate on 27573 samples
Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16


<keras.callbacks.History at 0x7feddf9ac438>

In [None]:
final_rnn_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encoder_input (InputLayer)      (None, None)         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 200)    39800       encoder_input[0][0]              
__________________________________________________________________________________________________
decoder_input (InputLayer)      (None, None, 1)      0                                            
__________________________________________________________________________________________________
encoder_LSTM (LSTM)             [(None, 256), (None, 467968      embedding_1[0][0]                
__________________________________________________________________________________________________
decoder_LS

## 3.2. Inference model
### 3.2.1. Encoder Model for inference

In [None]:
# Inference-time encoder: maps an encoded English sentence to the final
# hidden and cell states of the trained encoder LSTM.
last_states_encoder = [last_hidden_encoder, last_cell_encoder]
inference_encoder_model = Model(inputs = input_seq_encoder,
                                outputs = last_states_encoder)

  This is separate from the ipykernel package so we can avoid doing imports until


### 3.2.2. Decoder Model for inference

In [None]:
# Inference-time decoder: runs the trained decoder layers one step at a time.
# The hidden and cell states are explicit inputs and outputs so that the
# decoding loop can feed each step's states into the next step.
# The state inputs are sized from the trained layer so they cannot drift apart.
decoder_initial_state = [Input(shape = (decoder_lstm.units,)),
                         Input(shape = (decoder_lstm.units,))]
all_hidden_decoder, last_hidden_decoder, last_cell_decoder = decoder_lstm(input_seq_decoder,
                                                                          initial_state = decoder_initial_state)

# Reuse the trained output layer (softmax over the French vocabulary).
logits = decoder_dense(all_hidden_decoder)

inference_decoder_model = Model(inputs  = [input_seq_decoder] + decoder_initial_state,
                                outputs = [logits,
                                           last_hidden_decoder,
                                           last_cell_decoder])

  # This is added back by InteractiveShellApp.init_path()


### 3.2.3. Decode Sequence Function

In [None]:
# Reverse lookup (word id -> word) for the French tokenizer, used to turn
# predicted ids back into text.
target_id_to_word = {idx: word for word, idx in fre_tokenizer.word_index.items()}

def decode_sequence(input_seq, max_length=None):
    """
    Translate one encoded English sentence by greedy decoding with the inference models
    :param input_seq: encoded English sentence (word ids) with a leading batch axis of
                      size 1, in the form accepted by inference_encoder_model.predict
    :param max_length: maximum number of words to generate. If None, the length of the
                       training target sequences (decoder_french_target.shape[1]) is used.
    :return: (str) translated French sentence, words separated by single spaces,
             without the "endofsentence" marker
    """
    if max_length is None:
        max_length = decoder_french_target.shape[1]

    # Encode the source sentence; the encoder's final states seed the decoder.
    states = list(inference_encoder_model.predict(input_seq))

    # Initialize decoder input as a length 1 sentence containing "startofsentence",
    # --> feeding the start token as the first predicted word
    prev_word = np.zeros((1, 1, 1))
    prev_word[0, 0, 0] = fre_tokenizer.word_index["startofsentence"]

    translation = []
    for _ in range(max_length):
        # 1. Predict the next word from the previous word and the current states
        probabilities, last_h, last_c = inference_decoder_model.predict([prev_word] + states)
        predicted_id = int(np.argmax(probabilities[0, 0, :]))

        # 2. Stop on the end marker, or on an id with no word (e.g. the padding id 0)
        predicted_word = target_id_to_word.get(predicted_id)
        if predicted_word is None or predicted_word == 'endofsentence':
            break
        translation.append(predicted_word)

        # 3. Feed the predicted word and the decoder's new states into the next step
        prev_word[0, 0, 0] = predicted_id
        states = [last_h, last_c]

    return " ".join(translation)

### 3.2.4. Prediction

In [None]:
# Translate a few hand-picked sentences from the dataset and print each
# prediction next to its reference translation.
for i in [293, 296, 393, 418]:
    # Slice instead of index to keep the batch axis: shape (1, sentence_length)
    english_seq = eng_encoded[i:i + 1]
    french_translation = decode_sequence(english_seq)

    print("English Sentence            : ", eng_sentences[i])
    print("Predicted French Translation: ", french_translation)
    print("Correct French Translation  : ", fre_sentences[i])
    print()

English Sentence            :  i plan to visit france in spring .
Predicted French Translation:  je prévois de visiter la france au printemps 
Correct French Translation  :  je prévois de visiter la france au printemps .

English Sentence            :  she likes grapes , apples , and grapefruit.
Predicted French Translation:  elle aime les raisins les pommes et le pamplemousse 
Correct French Translation  :  elle aime les raisins , les pommes et le pamplemousse .

English Sentence            :  my most loved animal was that bird .
Predicted French Translation:  mon animal le plus aimé était cet oiseau 
Correct French Translation  :  mon animal le plus aimé était cet oiseau .

English Sentence            :  france is pleasant during july , but it is usually dry in december .
Predicted French Translation:  la france est agréable en juillet mais il est généralement sec en décembre 
Correct French Translation  :  la france est agréable en juillet , mais il est généralement sec en décembre 

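Apart from punctuation, which the tokenizer strips, the predicted translations match the references word for word. That is a strong result by the standard of this simple dataset with its limited vocabulary!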