###  Created by Luis Alejandro (alejand@umich.edu)

In [1]:
import tensorflow as tf
import numpy as np
from utils import unicode_to_ascii
from dataset import DatasetBuilder
from translation import preprocess
from translation import Encoder, Decoder, Translator
import os
# Set TF_FORCE_GPU_ALLOW_GROWTH so TensorFlow allocates GPU memory on demand
# instead of reserving it all up front (see the TensorFlow GPU guide).
os.environ["TF_FORCE_GPU_ALLOW_GROWTH"]="true"

In [2]:
# Sanity check: list the GPU devices TensorFlow can see.
# The list is displayed as the cell output; an empty list means no GPU is visible.
tf.config.experimental.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [3]:
# Build the training dataset from the English-Spanish sentence-pair file.
# The same `preprocess` function is supplied for both entries of `preprocessors`.
files = ['../../datasets/nlp/english-spanish.txt']
builder = DatasetBuilder(
    files,
    preprocessors=(preprocess, preprocess),
    batch_size=64,
    max_obs=100000,  # presumably caps the number of sentence pairs read -- TODO confirm
)
dataset = builder.build()

In [4]:
# Load embedding matrix (GloVe vectors) -> these will only be used for English (source/input)
embedding_size = 100
glove_path = '../../datasets/glove.6B/glove.6B.%sd.txt' % embedding_size

# Map each GloVe token (as returned by unicode_to_ascii) to its float32 vector.
# If two lines yield the same key, the later line wins.
word_to_vector = {}
with open(glove_path, encoding='utf8') as glove_file:
    for row in glove_file:
        fields = row.split()
        token = unicode_to_ascii(fields[0])
        word_to_vector[token] = np.asarray(fields[1:], dtype=np.float32)

# One row per entry of the source vocabulary, indexed by the tokenizer's word index.
# Words that have no GloVe vector keep an all-zero row.
source_vocab = builder.source_tokenizer.word_to_index
embedding_vectors = np.zeros((len(source_vocab), embedding_size))
for raw_word, position in source_vocab.items():
    decoded = raw_word.decode()  # vocabulary keys are decoded before the GloVe lookup
    if decoded in word_to_vector:
        embedding_vectors[position, :] = word_to_vector[decoded]

In [5]:
# Defining model: the source side receives the pre-built embedding matrix,
# while the target side is only given an embedding dimension.
source_tokenizer = builder.source_tokenizer
target_tokenizer = builder.target_tokenizer
translator = Translator(
    source_tokenizer.word_to_index,
    target_tokenizer.word_to_index,
    source_embedding_matrix=embedding_vectors,
    target_embedding_dim=embedding_size,
    max_output_length=target_tokenizer.max_seq,
    restore=True,  # presumably resumes from a saved checkpoint -- TODO confirm in translation.py
)

In [6]:
# Training model for a fixed number of epochs over the dataset built above
num_epochs = 4
translator.train(num_epochs, dataset)

Epoch 1 out of 4 complete (118.91 secs) -- Loss: 0.5697 -- Accuracy: 0.88
Epoch 2 out of 4 complete (104.14 secs) -- Loss: 0.4824 -- Accuracy: 0.89
Epoch 3 out of 4 complete (106.35 secs) -- Loss: 0.4160 -- Accuracy: 0.90
Epoch 4 out of 4 complete (103.82 secs) -- Loss: 0.3651 -- Accuracy: 0.91
Creating final checkpoint!


In [7]:
# Creating some input: preprocess one English sentence and split it into tokens
sentence = b"I'm very happy to see you."
cleaned = preprocess(tf.constant(sentence))
tokens = tf.strings.split(tf.constant(cleaned))
print(tokens)

# Encode the tokens as vocabulary indices, then right-pad with zeros up to the
# tokenizer's max_seq and shape the result as a batch containing one sequence.
token_ids = builder.source_tokenizer.encode(tokens.numpy())
padded_ids = np.pad(token_ids, (0, builder.source_tokenizer.max_seq - len(token_ids)))
source = tf.constant(padded_ids, shape=[1, builder.source_tokenizer.max_seq])
print(source)

tf.Tensor([b'<start>' b'i' b'm' b'very' b'happy' b'to' b'see' b'you' b'.' b'<end>'], shape=(10,), dtype=string)
tf.Tensor([[  0  17  49 946 295 255 104 105   2   3   0   0   0   0   0   0]], shape=(1, 16), dtype=int32)


In [8]:
# Outputting model translation: map each predicted index back to its target-language
# word, decode it to text, and print the words joined by single spaces.
predicted_ids = translator.translate(source)
translated_words = [
    builder.target_tokenizer.index_to_word[token_id].decode()
    for token_id in predicted_ids
]
print(' '.join(translated_words))

estoy muy feliz de conocerte .
