<img src="../Pics/MLSb-T.png" width="160">
<br><br>
<center><u><H1>Neural Machine Translation-Optimization</H1></u></center>

In [1]:
import tensorflow as tf
from keras.backend.tensorflow_backend import set_session
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
config.log_device_placement = True
sess = tf.Session(config=config)
set_session(sess)

Using TensorFlow backend.


In [2]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, CuDNNLSTM, CuDNNGRU, Dropout, BatchNormalization, RepeatVector, TimeDistributed
from keras.layers.wrappers import Bidirectional
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint
from keras.models import load_model
import string
import re
import numpy as np
from unicodedata import normalize
from random import shuffle

In [3]:
file = open('../data/machine_translation/spa.txt', mode='rt', encoding='utf-8')
text = file.read()
file.close()

## Splitting the text:

In [4]:
lines = text.strip().split('\n')
text_split = [line.split('\t') for line in  lines]
text_split

[['Go.', 'Ve.'],
 ['Go.', 'Vete.'],
 ['Go.', 'Vaya.'],
 ['Go.', 'Váyase.'],
 ['Hi.', 'Hola.'],
 ['Run!', '¡Corre!'],
 ['Run.', 'Corred.'],
 ['Who?', '¿Quién?'],
 ['Wow!', '¡Órale!'],
 ['Fire!', '¡Fuego!'],
 ['Fire!', '¡Incendio!'],
 ['Fire!', '¡Disparad!'],
 ['Help!', '¡Ayuda!'],
 ['Help!', '¡Socorro! ¡Auxilio!'],
 ['Help!', '¡Auxilio!'],
 ['Jump!', '¡Salta!'],
 ['Jump.', 'Salte.'],
 ['Stop!', '¡Parad!'],
 ['Stop!', '¡Para!'],
 ['Stop!', '¡Pare!'],
 ['Wait!', '¡Espera!'],
 ['Wait.', 'Esperen.'],
 ['Go on.', 'Continúa.'],
 ['Go on.', 'Continúe.'],
 ['Hello!', 'Hola.'],
 ['I ran.', 'Corrí.'],
 ['I ran.', 'Corría.'],
 ['I try.', 'Lo intento.'],
 ['I won!', '¡He ganado!'],
 ['Oh no!', '¡Oh, no!'],
 ['Relax.', 'Tomátelo con soda.'],
 ['Smile.', 'Sonríe.'],
 ['Attack!', '¡Al ataque!'],
 ['Attack!', '¡Atacad!'],
 ['Get up.', 'Levanta.'],
 ['Go now.', 'Ve ahora mismo.'],
 ['Got it!', '¡Lo tengo!'],
 ['Got it?', '¿Lo pillas?'],
 ['Got it?', '¿Entendiste?'],
 ['He ran.', 'Él corrió.'],
 ['Hop in

## Cleaning the text:

In [5]:
def clean_text(lines):   
    text_cleaned = []
    re_print = re.compile('[^%s]' % re.escape(string.printable))
    table = str.maketrans('', '', string.punctuation)
    
    for pair in lines:
        clean_pair = list()
        for line in pair:
            # normalize unicode characters
            line = normalize('NFD', line).encode('ascii', 'ignore')
            line = line.decode('UTF-8')
            # tokenize on white space
            line = line.split()
            # convert to lowercase
            line = [word.lower() for word in line]
            # remove punctuation from each token
            line = [word.translate(table) for word in line]
            # remove non-printable chars form each token
            line = [re_print.sub('', w) for w in line]
            # remove tokens with numbers in them
            line = [word for word in line if word.isalpha()]
            clean_pair.append(' '.join(line))
        text_cleaned.append(clean_pair)
    
    return np.array(text_cleaned)

In [6]:
text_cleaned = clean_text(text_split)
text_cleaned[:30]

array([['go', 've'],
       ['go', 'vete'],
       ['go', 'vaya'],
       ['go', 'vayase'],
       ['hi', 'hola'],
       ['run', 'corre'],
       ['run', 'corred'],
       ['who', 'quien'],
       ['wow', 'orale'],
       ['fire', 'fuego'],
       ['fire', 'incendio'],
       ['fire', 'disparad'],
       ['help', 'ayuda'],
       ['help', 'socorro auxilio'],
       ['help', 'auxilio'],
       ['jump', 'salta'],
       ['jump', 'salte'],
       ['stop', 'parad'],
       ['stop', 'para'],
       ['stop', 'pare'],
       ['wait', 'espera'],
       ['wait', 'esperen'],
       ['go on', 'continua'],
       ['go on', 'continue'],
       ['hello', 'hola'],
       ['i ran', 'corri'],
       ['i ran', 'corria'],
       ['i try', 'lo intento'],
       ['i won', 'he ganado'],
       ['oh no', 'oh no']], dtype='<U328')

In [7]:
text_cleaned[:-10]

array([['go', 've'],
       ['go', 'vete'],
       ['go', 'vaya'],
       ...,
       ['you cant easily put photos on an ipad from more than one computer however you can email photos to yourself from various computers and download these photos to your ipad',
        'usted no puede poner facilmente fotografias en un ipad de mas de una computadora sin embargo puede enviarse fotografias de varias computadoras y descargarlas a su ipad'],
       ['you cant view flash content on an ipad however you can easily email yourself the urls of these web pages and view that content on your regular computer when you get home',
        'usted no puede ver contenido flash en un ipad sin embargo puede enviarse el url de esas paginas web y ver el contenido en su computadora ordinaria al llegar a su hogar'],
       ['you cant view flash content on an ipad however you can easily email yourself the urls of these web pages and view that content on your regular computer when you get home',
        'no puedes 

In [8]:
text_cleaned.shape

(119936, 2)

## Creating training/testing datasets:

In [9]:
n_sentences = 10000
dataset = text_cleaned[:n_sentences, :]

In [10]:
shuffle(dataset)

train, test = dataset[:8000], dataset[8000:]

In [11]:
dataset[0]

array(['go', 've'], dtype='<U328')

## Preprocessing the data:

In [12]:
def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

In [13]:
def max_length(lines):
    return max(len(line.split()) for line in lines)

## English Tokenizer:

In [14]:
eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_vocab = len(eng_tokenizer.word_index) + 1
eng_len = max_length(dataset[:, 0])
print('English Vocabulary Size: %d' % eng_vocab)
print('English Max Length: %d' % (eng_len))

English Vocabulary Size: 1591
English Max Length: 5


## Spanish Tokenizer:

In [15]:
spa_tokenizer = create_tokenizer(dataset[:, 1])
spa_vocab = len(spa_tokenizer.word_index) + 1
spa_len = max_length(dataset[:, 1])
print('Spanish Vocabulary Size: %d' % spa_vocab)
print('Spanish Max Length: %d' % (spa_len))

Spanish Vocabulary Size: 2953
Spanish Max Length: 7


## Creating Datasets:

In [16]:
def encode_sequences(tokenizer, length, lines):
    X = tokenizer.texts_to_sequences(lines)
    X = pad_sequences(X, maxlen=length, padding='post')
    return X

In [17]:
def encode_output(sequences, vocab_size):
    y_list = []
    for sequence in sequences:
        encoded = to_categorical(sequence, num_classes=vocab_size)
        y_list.append(encoded)
    y = np.array(y_list)
    y = y.reshape(sequences.shape[0], sequences.shape[1], vocab_size)
    return y

In [18]:
# Training data
train_X = encode_sequences(spa_tokenizer, spa_len, train[:, 1])
train_Y = encode_sequences(eng_tokenizer, eng_len, train[:, 0])
train_Y = encode_output(train_Y, eng_vocab)

In [19]:
# Test data
test_X = encode_sequences(spa_tokenizer, spa_len, test[:, 1])
test_Y = encode_sequences(eng_tokenizer, eng_len, test[:, 0])
test_Y = encode_output(test_Y, eng_vocab)

In [20]:
test_X[0]

array([ 6, 47,  0,  0,  0,  0,  0])

In [21]:
test_Y[0]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]], dtype=float32)

## Creating the model

In [22]:
embeed_size = 150

In [23]:
model = Sequential()

In [24]:
model.add(Embedding(spa_vocab, embeed_size, input_length=spa_len))

In [25]:
model.add(BatchNormalization())

In [26]:
model.add(Bidirectional(CuDNNLSTM(300)))

In [27]:
model.add(BatchNormalization())

In [28]:
model.add(RepeatVector(eng_len))

In [29]:
model.add(Bidirectional(CuDNNLSTM(300, return_sequences=True)))

In [30]:
model.add(BatchNormalization())

In [31]:
model.add(TimeDistributed(Dense(eng_vocab, activation='softmax')))

In [32]:
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 7, 150)            442950    
_________________________________________________________________
batch_normalization_1 (Batch (None, 7, 150)            600       
_________________________________________________________________
bidirectional_1 (Bidirection (None, 600)               1084800   
_________________________________________________________________
batch_normalization_2 (Batch (None, 600)               2400      
_________________________________________________________________
repeat_vector_1 (RepeatVecto (None, 5, 600)            0         
_________________________________________________________________
bidirectional_2 (Bidirection (None, 5, 600)            2164800   
_________________________________________________________________
batch_normalization_3 (Batch (None, 5, 600)            2400      
__________

In [33]:
model.compile(optimizer=Adam(0.005), loss='categorical_crossentropy', metrics = ['accuracy'])

## Training the model:

In [34]:
filename = '../data/machine_translation/model_translation.h5'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

In [35]:
%%time
model.fit(train_X, train_Y, epochs=20, batch_size=128, validation_data=(test_X, test_Y), callbacks=[checkpoint])

Train on 8000 samples, validate on 2000 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Wall time: 1min 2s


<keras.callbacks.History at 0x29e491e83c8>

## Predicting text with the Test Dataset:

In [36]:
#model_ = load_model('../data/machine_translation/model_translation.h5')

In [37]:
def word_int(integer, tokenizer):
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None

In [38]:
def predict_sequence(model, tokenizer, value):
    prediction = model.predict(value, verbose=0)[0]
    integers = [np.argmax(vector) for vector in prediction]
    target = []
    for i in integers:
        word = word_int(i, tokenizer)
        if word is None:
            break
        target.append(word)
    return ' '.join(target)

In [39]:
actual, predicted = [], []
for i, value in enumerate(test_X):
        value = value.reshape((1, value.shape[0]))
        translation = predict_sequence(model, eng_tokenizer, value)
        _target, _src = test[i]
        if i < 20:
            print('src=[%s], target=[%s], prediction=[%s]' % (_src, _target, translation))
        actual.append(_target.split())
        predicted.append(translation.split())

src=[no voy], target=[im not going], prediction=[im not]
src=[encontralo a tomas], target=[find tom], prediction=[find tom]
src=[me gusta el vino], target=[i like wine], prediction=[i like wine]
src=[copia este archivo], target=[copy this file], prediction=[i the one]
src=[eso es estupido], target=[thats stupid], prediction=[thats good]
src=[soy la ex de tom], target=[im toms ex], prediction=[im toms tom]
src=[orale], target=[lets go], prediction=[awesome]
src=[soy un prisionero], target=[im a prisoner], prediction=[im a prisoner]
src=[no lo aguanto], target=[i cant bear it], prediction=[dont do it]
src=[tom comio solo], target=[tom ate alone], prediction=[tom ate]
src=[necesito un bano], target=[i need a bath], prediction=[i need a cab]
src=[estaban enfermos], target=[they were sick], prediction=[they were sick]
src=[me gusta la historia], target=[i like history], prediction=[i like fruit]
src=[esto es mio], target=[this is mine], prediction=[this is mine]
src=[ella adora los gatos], 

## Reference:
#### Sentence pairs: http://www.manythings.org/anki/