In [2]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import Model,optimizers,layers,losses,preprocessing
from tensorflow.keras.layers import *

import os
import time
from codecs import open
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [16]:
def preprocess_sentence(text):
    """Convert tab-separated "english<TAB>chinese" lines into training sentences.

    For each line, the first tab-separated field is lower-cased and a trailing
    ',', '.', '!' or '?' on any word is split off into its own token; the
    second field is split into individual characters. Both sides are wrapped
    in '<start>' / '<end>' markers. Any further fields are ignored.

    Lines that have fewer than two tab-separated fields after stripping (for
    example blank lines) are skipped instead of raising IndexError, so the two
    returned lists always stay aligned.

    Args:
        text: iterable of raw lines.

    Returns:
        (ens, cns): two equally long lists of space-joined token strings.
    """
    punctuation = (',', '.', '!', '?')
    ens = []
    cns = []

    for line in text:
        fields = line.strip().split('\t')
        # No translation column -> nothing usable on this line.
        if len(fields) < 2:
            continue

        en_words = []
        for word in fields[0].lower().split():
            # Detach trailing punctuation so it becomes a separate token.
            if word[-1] in punctuation:
                word = word[:-1] + ' ' + word[-1]
            en_words.append(word)
        ens.append('<start> ' + ' '.join(en_words) + ' <end>')

        # The target side is tokenised per character.
        cns.append('<start> ' + ' '.join(fields[1]) + ' <end>')

    return ens, cns

def tokenize(text):
    """Fit a Keras tokenizer on `text` and encode `text` with it.

    The tokenizer is created with an empty `filters` string, and the encoded
    sequences are padded at the end ('post').

    Returns:
        (tensor, tokenizer): the padded id sequences and the fitted tokenizer.
    """
    fitted = preprocessing.text.Tokenizer(filters='')
    fitted.fit_on_texts(text)

    id_sequences = fitted.texts_to_sequences(text)
    padded = preprocessing.sequence.pad_sequences(id_sequences, padding='post')

    return padded, fitted

def max_length(tensor):
    """Return the length of the longest element of `tensor`."""
    return max(map(len, tensor))

# Read the parallel corpus. The context manager guarantees the file handle is
# closed even if reading raises.
with open('input/cmn.txt', 'r', encoding='utf-8') as f:
    lines = f.readlines()

# Build the marked-up sentence pairs and encode each side with its own tokenizer.
en_sentences, cn_sentences = preprocess_sentence(lines)
en_tensor, en_dict = tokenize(en_sentences)
cn_tensor, cn_dict = tokenize(cn_sentences)

# Hold out 1% of the pairs as a dev split.
en_train, en_dev, cn_train, cn_dev = train_test_split(en_tensor, cn_tensor, test_size=0.01)
en_max_length, cn_max_length = max_length(en_tensor), max_length(cn_tensor)

In [17]:
# Training hyper-parameters.
BATCH_SIZE = 64
embedding_dim = 256
units = 1024

# The shuffle buffer covers the whole training split.
BUFFER_SIZE = len(en_train)
steps_per_epoch = BUFFER_SIZE // BATCH_SIZE

# One extra slot on top of the tokenizer vocabulary (presumably for the
# padding id 0 -- confirm against the Keras Tokenizer docs).
en_vocab_size = 1 + len(en_dict.word_index)
cn_vocab_size = 1 + len(cn_dict.word_index)

# Training batches: shuffled, and the final partial batch is dropped.
train_tensor = (
    tf.data.Dataset.from_tensor_slices((en_train, cn_train))
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

# Dev batches: shuffled, the final partial batch is kept.
dev_tensor = (
    tf.data.Dataset.from_tensor_slices((en_dev, cn_dev))
    .shuffle(len(en_dev))
    .batch(BATCH_SIZE)
)

# example_en_batch, example_cn_batch = next(iter(train_tensor))
# example_en_batch.shape, example_cn_batch.shape

In [18]:
class Encoder(Model):
    """Embedding + GRU encoder.

    `call` returns the GRU's per-step outputs together with its final state;
    `initialize_hidden_state` provides an all-zero initial state sized for
    `batch_sz` examples.
    """

    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super().__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = Embedding(vocab_size, embedding_dim)
        self.gru = GRU(
            self.enc_units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer='glorot_uniform',
        )

    def call(self, x, hidden_state):
        """Embed the token ids in `x` and run the GRU starting from `hidden_state`."""
        embedded = self.embedding(x)
        sequence_output, final_state = self.gru(embedded, initial_state=hidden_state)
        return sequence_output, final_state

    def initialize_hidden_state(self):
        """Return a zero tensor of shape (batch_sz, enc_units)."""
        state_shape = (self.batch_sz, self.enc_units)
        return tf.zeros(state_shape)

# Encoder over the English vocabulary.
encoder = Encoder(
    vocab_size=en_vocab_size,
    embedding_dim=embedding_dim,
    enc_units=units,
    batch_sz=BATCH_SIZE,
)
# sample_hidden_state = encoder.initialize_hidden_state()
# sample_output, sample_hidden_state = encoder(example_en_batch, sample_hidden_state)
# sample_output.shape,sample_hidden_state.shape

In [19]:
class Attention(Model):
    """Additive attention: score = D(tanh(D1(output) + D2(hidden_state)))."""

    def __init__(self, units):
        super().__init__()
        self.D1 = Dense(units)
        self.D2 = Dense(units)
        self.D = Dense(1)

    def call(self, hidden_state, output):
        """Weight `output` along axis 1 by its match with `hidden_state`.

        Shapes below follow the notebook's original annotations:
        hidden_state is (batch_size, hidden_size) and output is
        (batch_size, max_length, hidden_size).

        Returns:
            (context_vector, attention_weights) with shapes
            (batch_size, hidden_size) and (batch_size, max_length, 1).
        """
        # Insert a time axis so the projected state broadcasts against every
        # position of the projected output.
        query = tf.expand_dims(hidden_state, 1)

        # (batch_size, max_length, units) before the final Dense(1).
        energies = tf.nn.tanh(self.D1(output) + self.D2(query))
        score = self.D(energies)

        # Normalise over axis 1 (the max_length axis).
        attention_weights = tf.nn.softmax(score, axis=1)

        # Weighted sum of the outputs over axis 1.
        context_vector = tf.reduce_sum(attention_weights * output, axis=1)

        return context_vector, attention_weights
    
# attention_layer = Attention(10)
# attention_result, attention_weights = attention_layer(sample_hidden_state, sample_output)
# attention_result.shape,attention_weights.shape

In [20]:
class Decoder(Model):
    """Single-step attention decoder: attention -> embedding -> GRU -> Dense."""

    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        super().__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = Embedding(vocab_size, embedding_dim)
        self.gru = GRU(
            self.dec_units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer='glorot_uniform',
        )

        self.fc = Dense(vocab_size)
        self.attention = Attention(self.dec_units)

    def call(self, x, hidden_state, enc_output):
        """Run one decoding step.

        Note that `hidden_state` is only used as the attention query; the GRU
        is called without an `initial_state`.

        Shapes follow the notebook's original annotations: enc_output is
        (batch_size, max_length, hidden_size).

        Returns:
            (logits, state, attention_weights), where logits is
            (batch_size, vocab) and state is the GRU's returned state.
        """
        context_vector, attention_weights = self.attention(hidden_state, enc_output)

        # (batch_size, 1, embedding_dim)
        embedded = self.embedding(x)

        # Prepend the context along the feature axis:
        # (batch_size, 1, hidden_size + embedding_dim)
        context_step = tf.expand_dims(context_vector, 1)
        gru_input = tf.concat([context_step, embedded], axis=-1)

        output, state = self.gru(gru_input)

        # Collapse the length-1 time axis: (batch_size * 1, hidden_size)
        flat_output = tf.reshape(output, (-1, output.shape[2]))

        # (batch_size, vocab)
        logits = self.fc(flat_output)

        return logits, state, attention_weights
    
# Decoder over the Chinese vocabulary.
decoder = Decoder(
    vocab_size=cn_vocab_size,
    embedding_dim=embedding_dim,
    dec_units=units,
    batch_sz=BATCH_SIZE,
)
# sample_decoder_output, _, _ = decoder(tf.random.uniform((64,1)),
#                                      sample_hidden_state, sample_output)
# sample_decoder_output.shape

In [21]:
# Adam with its default settings.
optimizer = optimizers.Adam()

# Unreduced loss on raw logits, so individual positions can be masked
# before averaging.
loss_object = losses.SparseCategoricalCrossentropy(
    from_logits=True,
    reduction='none',
)
def loss_function(real, pred):
    """Mean of `loss_object(real, pred)` with positions where real == 0 zeroed.

    The mean is taken over every position, including the zeroed ones.
    Id 0 is presumably the padding id -- confirm against the tokenizer setup.
    """
    per_position = loss_object(real, pred)

    is_real_token = tf.math.logical_not(tf.math.equal(real, 0))
    weights = tf.cast(is_real_token, dtype=per_position.dtype)

    return tf.reduce_mean(per_position * weights)

# Checkpoints are written under checkpoint_dir with the file prefix 'ckpt'.
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, 'ckpt')

# Track the optimizer and both models so they are saved and restored together.
checkpoint = tf.train.Checkpoint(
    optimizer=optimizer,
    encoder=encoder,
    decoder=decoder,
)

In [22]:
@tf.function
def train_step(en_input, cn_target, enc_hidden_state):
    """Run one teacher-forced optimisation step on a single batch.

    Args:
        en_input: batch of encoded source sentences.
        cn_target: batch of encoded target sentences; column 0 is skipped as
            a prediction target (decoding starts from the '<start>' id).
        enc_hidden_state: initial hidden state for the encoder.

    Returns:
        The summed per-step loss divided by the target sequence length.
    """
    loss = 0

    with tf.GradientTape() as tape:
        enc_output, enc_hidden_state = encoder(en_input, enc_hidden_state)

        # The decoder starts from the encoder's final state.
        dec_hidden_state = enc_hidden_state

        dec_input = tf.expand_dims([cn_dict.word_index['<start>']] * BATCH_SIZE, 1)

        # Teacher forcing - feeding the target as the next input
        for t in range(1, cn_target.shape[1]):
            # Rebind dec_hidden_state so the next step attends with the
            # updated decoder state. Previously the new state was stored under
            # a different name and the encoder state was reused at every step.
            predictions, dec_hidden_state, _ = decoder(dec_input, dec_hidden_state, enc_output)

            loss += loss_function(cn_target[:, t], predictions)

            # using teacher forcing
            dec_input = tf.expand_dims(cn_target[:, t], 1)

    batch_loss = loss / int(cn_target.shape[1])

    variables = encoder.trainable_variables + decoder.trainable_variables

    gradients = tape.gradient(loss, variables)

    optimizer.apply_gradients(zip(gradients, variables))

    return batch_loss

In [None]:
EPOCHS = 10
for epoch in range(EPOCHS):
    start = time.time()

    enc_hidden_state = encoder.initialize_hidden_state()
    total_loss = 0

    for (batch, (en_input, cn_target)) in enumerate(train_tensor.take(steps_per_epoch)):
        batch_loss = train_step(en_input, cn_target, enc_hidden_state)
        total_loss += batch_loss

        # Progress line every 100 batches.
        if batch % 100 == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(epoch+1, batch, batch_loss.numpy()))

    # saving (checkpoint) the model every 2 epochs
    if (epoch+1) % 2 == 0:
        checkpoint.save(file_prefix=checkpoint_prefix)

    print('Epoch {} Train loss {:.4f}'.format(epoch+1, total_loss/steps_per_epoch))

    # time.time() differences are floats, and '{:d}' rejects floats with a
    # ValueError, so convert to whole seconds first.
    print('Time taken for 1 epoch {:d} sec\n'.format(int(time.time()-start)))

Epoch 1 Batch 0 Loss 1.9656


In [72]:
def evaluate(sentence):
    """Greedily decode a translation for one raw source sentence.

    The sentence is lower-cased, trailing ',', '.', '!' or '?' is split off
    each word, and '<start>' / '<end>' markers are added before the words are
    looked up in `en_dict`.

    Args:
        sentence: raw source sentence.

    Returns:
        (result, sentence, attention_plot): the space-separated predicted
        tokens (with a trailing space, and including '<end>' if it was
        produced), the preprocessed source sentence, and a
        (cn_max_length, en_max_length) array holding one row of attention
        weights per decoding step.

    Raises:
        KeyError: if a word of the preprocessed sentence is not in
            `en_dict.word_index`.
    """
    attention_plot = np.zeros((cn_max_length, en_max_length))

    en_words = []
    for word in sentence.lower().split():
        if word[-1] in [',', '.', '!', '?']:
            word = word[:-1] + ' ' + word[-1]
        en_words.append(word)
    sentence = '<start> ' + ' '.join(en_words) + ' <end>'

    inputs = [en_dict.word_index[i] for i in sentence.split()]
    inputs = preprocessing.sequence.pad_sequences([inputs],
                                                  maxlen=en_max_length, padding='post')
    inputs = tf.convert_to_tensor(inputs)

    result = ''

    # Batch of one, so the initial encoder state has a single row.
    hidden_state = [tf.zeros((1, units))]
    enc_out, enc_hidden_state = encoder(inputs, hidden_state)

    dec_hidden_state = enc_hidden_state
    dec_input = tf.expand_dims([cn_dict.word_index['<start>']], 0)

    for t in range(cn_max_length):
        predictions, dec_hidden_state, attention_weights = decoder(dec_input,
                                                                   dec_hidden_state, enc_out)

        # storing the attention weights to plot later on
        attention_weights = tf.reshape(attention_weights, (-1, ))
        attention_plot[t] = attention_weights.numpy()

        predicted_id = tf.argmax(predictions[0]).numpy()
        predicted_word = cn_dict.index_word[predicted_id]

        result += predicted_word + ' '

        if predicted_word == '<end>':
            return result, sentence, attention_plot

        # Feed the predicted id back in as the next decoder input. Previously
        # it was assigned to an unused name, so the decoder was fed '<start>'
        # at every step.
        dec_input = tf.expand_dims([predicted_id], 0)

    return result, sentence, attention_plot

In [21]:
# function for plotting the attention weights
def plot_attention(attention, sentence, predicted_sentence):
    """Show `attention` as a heat map labelled with source and predicted tokens.

    Args:
        attention: 2-D array; rows are labelled with `predicted_sentence`,
            columns with `sentence`.
        sentence: sequence of source tokens (x-axis labels).
        predicted_sentence: sequence of predicted tokens (y-axis labels).
    """
    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot(1, 1, 1)
    ax.matshow(attention, cmap='viridis')

    fontdict = {'fontsize': 14}

    # Pin one tick per token before labelling. This replaces the previous
    # locator-based alignment, which needed a leading '' label and referenced
    # a `ticker` module that is never imported in this notebook.
    ax.set_xticks(list(range(len(sentence))))
    ax.set_yticks(list(range(len(predicted_sentence))))
    ax.set_xticklabels(list(sentence), fontdict=fontdict, rotation=90)
    ax.set_yticklabels(list(predicted_sentence), fontdict=fontdict)

    plt.show()

In [22]:
def translate(sentence):
    """Translate `sentence`, print the result and plot the attention map."""
    translation, source, attention = evaluate(sentence)

    print('Input: %s' % source)
    print('Predicted translation: {}'.format(translation))

    source_tokens = source.split()
    target_tokens = translation.split()

    # Keep only the rows and columns that correspond to actual tokens.
    trimmed = attention[:len(target_tokens), :len(source_tokens)]
    plot_attention(trimmed, source_tokens, target_tokens)

In [24]:
# restoring the latest checkpoint in checkpoint_dir
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

# The source-side vocabulary (en_dict) is built from the English column of the
# corpus, so the demo input must be English. The previous Spanish sentence
# would almost certainly hit a KeyError during the vocabulary lookup.
translate(u'it is very cold here.')