###  Created by Luis Alejandro (alejand@umich.edu)

In [1]:
import tensorflow as tf
import numpy as np
from utils import unicode_to_ascii
from dataset import DatasetBuilder
from translation import preprocess
from translation import Encoder, Decoder, Translator
import os

# Set TF_FORCE_GPU_ALLOW_GROWTH for this process. Per the TensorFlow GPU guide,
# "true" makes TensorFlow grow its GPU memory allocation on demand instead of
# reserving the whole device up front.
# NOTE(review): this runs after `import tensorflow` above; it presumably still
# takes effect because no GPU work has happened yet -- confirm, or move it
# ahead of the TensorFlow import.
os.environ.update(TF_FORCE_GPU_ALLOW_GROWTH="true")

In [2]:
# Is a GPU visible to TensorFlow? The cell displays the list of detected GPU
# devices; an empty list means none was found.
tf.config.experimental.list_physical_devices(device_type='GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [3]:
# Build the training and test datasets from the English-Spanish sentence file.
files = ['../../datasets/nlp/english-spanish.txt']
builder = DatasetBuilder(
    files,
    # The same `preprocess` function is passed twice -- presumably one entry
    # per language side (source, target); confirm in dataset.py.
    preprocessors=(preprocess, preprocess),
    batch_size=64,
    # NOTE(review): presumably caps the corpus at 30000 sentence pairs and
    # holds out 20 of them for testing -- confirm in dataset.py.
    max_obs=30000,
    test_obs=20,
)
# build() returns a pair that is unpacked as (train, test).
train_dataset, test_dataset = builder.build()

In [4]:
# Load the pre-trained GloVe vectors; they are used only for the English
# (source/input) side of the model.
embedding_size = 100
glove_path = '../../datasets/glove.6B/glove.6B.%sd.txt' % embedding_size
word_to_vector = {}
with open(glove_path, encoding='utf8') as glove_file:
    for row in glove_file:
        # First whitespace-separated field is the word, the rest are its coefficients.
        fields = row.split()
        key = unicode_to_ascii(fields[0])
        # NOTE(review): if unicode_to_ascii maps two distinct GloVe tokens to
        # the same key, the later line silently replaces the earlier one --
        # confirm this is intended.
        word_to_vector[key] = np.asarray(fields[1:], dtype=np.float32)

# One row per entry of the source vocabulary; rows for words that have no
# GloVe vector stay all-zero.
source_vocab = builder.source_tokenizer.word_to_index
embedding_vectors = np.zeros((len(source_vocab), embedding_size))
for token, row_index in source_vocab.items():
    # Vocabulary keys are bytes while the GloVe keys are str, hence the decode.
    glove_vector = word_to_vector.get(token.decode())
    if glove_vector is not None:
        embedding_vectors[row_index, :] = glove_vector

In [5]:
# Define the model from the two vocabularies produced by the dataset builder.
source_vocab = builder.source_tokenizer.word_to_index
target_vocab = builder.target_tokenizer.word_to_index
translator = Translator(
    source_vocab,
    target_vocab,
    # Source side uses the embedding matrix assembled in the GloVe-loading cell;
    # the target side is only given a dimension (the same embedding_size).
    source_embedding_matrix=embedding_vectors,
    target_embedding_dim=embedding_size,
    max_output_length=builder.target_tokenizer.max_seq,
    # NOTE(review): restore=True presumably reloads a previously saved
    # checkpoint, which would make results depend on files left on disk by
    # earlier runs -- confirm in translation.py.
    restore=True,
)

In [6]:
# Train the model on the training dataset, passing the test dataset alongside it.
# The first argument is the number of epochs (the training log reports
# "Epoch k out of 4").
num_epochs = 4
translator.train(num_epochs, train_dataset, test_dataset)

Epoch 1 out of 4 complete (23.83 secs) -- Train Loss: 0.3125 -- Train Acc: 0.93 -- Test Loss: 0.3426 -- Test Acc: 0.93
Epoch 2 out of 4 complete (15.45 secs) -- Train Loss: 0.2775 -- Train Acc: 0.93 -- Test Loss: 0.3466 -- Test Acc: 0.92
Epoch 3 out of 4 complete (15.53 secs) -- Train Loss: 0.2485 -- Train Acc: 0.94 -- Test Loss: 0.3682 -- Test Acc: 0.92
Epoch 4 out of 4 complete (15.46 secs) -- Train Loss: 0.2229 -- Train Acc: 0.94 -- Test Loss: 0.3692 -- Test Acc: 0.93
Creating final checkpoint!


In [7]:
# Prepare one English sentence as model input: preprocess -> split into tokens
# -> encode to vocabulary indices -> right-pad -> add a leading axis of size 1.
raw_sentence = b"I'm very happy to see you."
clean_sentence = preprocess(tf.constant(raw_sentence))
# NOTE(review): the tf.constant re-wrap below is likely redundant if
# preprocess already returns a tensor -- confirm.
tokens = tf.strings.split(tf.constant(clean_sentence))
print(tokens)

token_ids = builder.source_tokenizer.encode(tokens.numpy())
max_len = builder.source_tokenizer.max_seq
# Pad on the right with zeros up to the tokenizer's maximum sequence length.
# np.pad rejects a negative width, so a sentence longer than max_seq fails here.
padded_ids = np.pad(token_ids, (0, max_len - len(token_ids)))
source = tf.constant(padded_ids, shape=[1, max_len])
print(source)

tf.Tensor([b'<start>' b'i' b'm' b'very' b'happy' b'to' b'see' b'you' b'.' b'<end>'], shape=(10,), dtype=string)
tf.Tensor([[  0  17  49 946 295 255 104 105   2   3   0]], shape=(1, 11), dtype=int32)


In [8]:
# Output the model's translation of `source`: map each predicted index back to
# its target-language word (stored as bytes) and join the words with spaces.
predicted_indices = translator.translate(source)
predicted_words = [builder.target_tokenizer.index_to_word[idx].decode()
                   for idx in predicted_indices]
print(' '.join(predicted_words))

estoy feliz de verte .


In [9]:
# Check translation for elements in test set.
#
# The loop variables are deliberately NOT named `source`/`target`: names bound
# by a `for` statement stay bound after the loop ends, so reusing `source`
# would overwrite the demo sentence prepared earlier in the notebook and change
# what the single-sentence translation cell prints when it is re-run.
target_index_to_word = builder.target_tokenizer.index_to_word
target_word_to_index = builder.target_tokenizer.word_to_index
# Marker tokens are looked up once here instead of once per word.
start_index = target_word_to_index[b'<start>']
end_index = target_word_to_index[b'<end>']

for test_batch in test_dataset:
    # test_batch[0] and test_batch[1] are walked in lockstep as (source, target) rows.
    for source_row, target_row in zip(test_batch[0], test_batch[1]):
        # Reference sentence: every target index except the <start>/<end> markers.
        # NOTE(review): padding is not filtered explicitly, yet the saved output
        # shows none -- presumably it shares an index with one of the markers; confirm.
        reference = ' '.join(target_index_to_word[index].decode()
                             for index in target_row.numpy()
                             if index != start_index and index != end_index)
        print('Original', reference)

        # translate() is given the row with a leading axis of size 1 added.
        model_input = tf.expand_dims(source_row, 0)
        hypothesis = ' '.join(target_index_to_word[index].decode()
                              for index in translator.translate(model_input))
        print('Translation:', hypothesis, end='\n\n')

Original deja de quejarte .
Translation: deja de chismorrear .

Original ¿ quien es tom ?
Translation: ¿ quien es tom ?

Original ¿ a que viene esto ?
Translation: ¿ por que esto es ?

Original yo os quise .
Translation: yo te queria .

Original renunciemos .
Translation: desistamos .

Original eso es bueno .
Translation: es perfecto .

Original soy adulta .
Translation: soy adulto .

Original odio el futbol .
Translation: odio el futbol .

Original prueba esto .
Translation: prueben esto .

Original nos estamos mudando .
Translation: nos estamos acercando .

Original yo te respeto .
Translation: te respeto .

Original despues veremos .
Translation: veremos .

Original soy malo .
Translation: estoy de acuerdo .

Original me escape .
Translation: me escapaba .

Original mira aqui .
Translation: mira aqui .

Original ¿ a poco si ?
Translation: ¿ es correcta ?

Original funciona bien .
Translation: funciona bien .

Original ven temprano .
Translation: ven temprano .

Original ¿ quienes so