###  Created by Luis Alejandro (alejand@umich.edu)

In [1]:
import tensorflow as tf
import numpy as np
from utils import unicode_to_ascii
from dataset import DatasetBuilder
from basic_translation import preprocess
from basic_translation import Translator
import os
# Ask TensorFlow to grow its GPU memory allocation on demand rather than
# reserving it up front (see the TensorFlow GPU guide for this variable).
os.environ["TF_FORCE_GPU_ALLOW_GROWTH"]="true"

In [2]:
# Is a GPU visible to TensorFlow? (an empty list means no)
tf.config.experimental.list_physical_devices(device_type='GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [3]:
# Builds the train/test datasets from the English-Spanish sentence file.
files = ['../../datasets/nlp/english-spanish.txt']
builder_options = dict(
    preprocessors=(preprocess, preprocess),  # the same preprocessing function is given twice
    batch_size=64,
    max_obs=30000,
    test_obs=20,
)
builder = DatasetBuilder(files, **builder_options)
train_dataset, test_dataset = builder.build()

In [4]:
# Load embedding matrix (glove vectors) -> these will only be used for English (source/input)
embedding_size = 100
glove_path = '../../datasets/glove.6B/glove.6B.%sd.txt' % embedding_size

# Maps each (ascii-normalized) GloVe word to its float32 vector.
word_to_vector = {}
with open(glove_path, encoding='utf8') as file:
    for line in file:
        values = line.split()
        # First field is the word, the remaining fields are the vector components.
        word_to_vector[unicode_to_ascii(values[0])] = np.asarray(values[1:], dtype=np.float32)

# One row per entry of the source vocabulary; rows of words that have no
# GloVe vector are left as zeros.
source_vocabulary = builder.source_tokenizer.word_to_index
embedding_vectors = np.zeros((len(source_vocabulary), embedding_size))
for word, index in source_vocabulary.items():
    # Vocabulary keys are bytes, GloVe keys are str.
    pretrained = word_to_vector.get(word.decode())
    if pretrained is not None:
        embedding_vectors[index,:] = pretrained

In [5]:
# Defining model: the source side receives the embedding matrix built from
# GloVe, the target side only receives an embedding size.
source_tokenizer = builder.source_tokenizer
target_tokenizer = builder.target_tokenizer
translator = Translator(
    source_tokenizer.word_to_index,
    target_tokenizer.word_to_index,
    source_embedding_matrix=embedding_vectors,
    target_embedding_size=embedding_size,
    max_output_length=target_tokenizer.max_seq,
    restore=True,  # NOTE(review): presumably reloads a saved checkpoint when one exists - confirm in basic_translation
)

In [6]:
# Training model; the first argument of train() is the number of epochs
# (the log below reports "Epoch k out of 20").
num_epochs = 20
translator.train(num_epochs, train_dataset, test_dataset)

Epoch 1 out of 20 complete (24.03 secs) -- Train Loss: 1.9192 -- Train Acc: 0.74 -- Test Loss: 1.2630 -- Test Acc: 0.81
Epoch 2 out of 20 complete (17.88 secs) -- Train Loss: 1.3576 -- Train Acc: 0.80 -- Test Loss: 1.0558 -- Test Acc: 0.83
Epoch 3 out of 20 complete (18.21 secs) -- Train Loss: 1.1492 -- Train Acc: 0.82 -- Test Loss: 0.9122 -- Test Acc: 0.85
Epoch 4 out of 20 complete (17.84 secs) -- Train Loss: 0.9945 -- Train Acc: 0.83 -- Test Loss: 0.7961 -- Test Acc: 0.85
Epoch 5 out of 20 complete (17.86 secs) -- Train Loss: 0.8676 -- Train Acc: 0.85 -- Test Loss: 0.7142 -- Test Acc: 0.86
Epoch 6 out of 20 complete (18.19 secs) -- Train Loss: 0.7596 -- Train Acc: 0.86 -- Test Loss: 0.6424 -- Test Acc: 0.87
Epoch 7 out of 20 complete (18.75 secs) -- Train Loss: 0.6659 -- Train Acc: 0.87 -- Test Loss: 0.5750 -- Test Acc: 0.90
Epoch 8 out of 20 complete (17.68 secs) -- Train Loss: 0.5828 -- Train Acc: 0.88 -- Test Loss: 0.5389 -- Test Acc: 0.90
Epoch 9 out of 20 complete (17.86 secs) 

In [7]:
# Creating some input: raw sentence -> preprocessed tokens -> token ids ->
# zero-padded batch of one, stored in `source` for the translation cell.
sentence = b"I'm very happy to see you."
# preprocess is the same function that was handed to the DatasetBuilder.
tokens = tf.strings.split(preprocess(tf.constant(sentence)))
print(tokens)
token_ids = builder.source_tokenizer.encode(tokens.numpy())
max_source_length = builder.source_tokenizer.max_seq
# np.pad rejects negative pad widths with an unhelpful message, so fail early
# (with the same exception type) when the sentence does not fit.
if len(token_ids) > max_source_length:
    raise ValueError('Input has %d tokens but at most %d are supported'
                     % (len(token_ids), max_source_length))
# Right-pad with zeros (np.pad default) up to max_seq and add a batch dimension.
source = tf.constant(np.pad(token_ids, (0, max_source_length - len(token_ids))),
                     shape=[1, max_source_length])
print(source)

tf.Tensor([b'<start>' b'i' b'm' b'very' b'happy' b'to' b'see' b'you' b'.' b'<end>'], shape=(10,), dtype=string)
tf.Tensor([[  0  17  49 946 295 255 104 105   2   3   0]], shape=(1, 11), dtype=int32)


In [8]:
# Outputting model translation: map every predicted index back to its word.
prediction = translator.translate(source)
translated_words = [builder.target_tokenizer.index_to_word[word].decode() for word in prediction]
print(' '.join(translated_words))

estoy muy feliz de verte . <end>


In [9]:
# Check translation for elements in test set
def strip_markers(words):
    """Returns `words` without a leading '<start>' and a trailing '<end>'.

    Each marker is removed only when it is actually present, so a sequence
    that was cut off before '<end>' keeps its last real word.
    """
    if words and words[0] == '<start>':
        words = words[1:]
    if words and words[-1] == '<end>':
        words = words[:-1]
    return words

for batch in test_dataset:
    for source, target in zip(batch[0], batch[1]):
        # Prepares input (adds the batch dimension expected by translate)
        source = tf.expand_dims(source,0)
        # Prints expected translation, reading the target up to and including '<end>'
        words = []
        for word in target.numpy():
            decoded = builder.target_tokenizer.index_to_word[word].decode()
            words.append(decoded)
            if decoded == '<end>':
                break
        print('Expected:', ' '.join(strip_markers(words)))
        # Prints actual translation
        prediction = translator.translate(source)
        words = [builder.target_tokenizer.index_to_word[word].decode() for word in prediction]
        print('Translation:', ' '.join(strip_markers(words)), end='\n\n')

Expected: es canadiense .
Translation: es canadiense .

Expected: ¿ esta loco tom ?
Translation: ¿ tom esta loco ?

Expected: los dos estan vivos .
Translation: los hombres estan vivos .

Expected: ¿ es eso un anillo ?
Translation: ¿ es ese ese sombrero ?

Expected: parese aqui mismo .
Translation: parate aqui mismo .

Expected: estudiad mas .
Translation: estudia mas .

Expected: estoy sin dinero .
Translation: no tengo un chavo .

Expected: esto es mejor .
Translation: esto es mejor .

Expected: confia en dios .
Translation: confia en dios en dios .

Expected: ¿ le conozco ?
Translation: ¿ lo conozco ?

Expected: sos un perdedor .
Translation: tu eres un perdedor .

Expected: sabiamos eso .
Translation: lo sabiamos .

Expected: no era nuestra .
Translation: no fue nuestro .

Expected: ella esta en peligro .
Translation: esta en peligro .

Expected: yo extrano a mi mama .
Translation: echo mi mama .

Expected: ese perro salto .
Translation: ese perro brinco .

Expected: no estoy minti