In [1]:
import tensorflow as tf

In [2]:
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split

In [3]:
import re
import os
import io
import time
import numpy as np
import unicodedata

In [4]:
# Download the Spanish-English sentence-pair archive and extract it.
# The archive is fetched over HTTPS (the original used plain HTTP) so the
# download is protected against tampering in transit.
path_to_zip = tf.keras.utils.get_file(
    'spa-eng.zip',
    origin='https://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip',
    extract=True)

In [5]:
# Locate spa.txt inside the extracted "spa-eng" folder that sits next to the
# downloaded archive. os.path.join picks the right separator for the current
# OS; the previous hard-coded "\\" only produced a valid path on Windows.
path_to_file = os.path.join(os.path.dirname(path_to_zip), "spa-eng", "spa.txt")

In [6]:
# Echo the resolved dataset path as the cell output (sanity check).
path_to_file

'C:\\Users\\personal\\.keras\\datasets\\spa-eng\\spa.txt'

In [7]:
# Strip accents/diacritics from a unicode string.
#   u"Klüft skräms inför på fédéral électoral große"  ---> input
#   u"Kluft skrams infor pa federal electoral große"  ---> output of this function
def unicode_to_ascii(s):
    """Return ``s`` with all combining marks (accents, umlauts, ...) removed.

    The string is decomposed with NFD normalisation, which splits an accented
    letter into its base letter followed by combining marks; every character
    in Unicode category ``Mn`` (nonspacing mark) is then dropped.

    Characters that have no such decomposition (for example ``ß`` or ``¿``)
    are returned unchanged, so the result is not guaranteed to be pure ASCII.
    """
    decomposed = unicodedata.normalize('NFD', s)
    base_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(base_chars)

In [8]:
def preprocess_sentence(w):
    """Normalise one raw sentence into a space-separated token string.

    The sentence is lower-cased, trimmed and stripped of accents via
    ``unicode_to_ascii``. The punctuation marks ``? . ! , ¿`` are padded with
    spaces so they become separate tokens, every other run of characters that
    is not an ASCII letter is replaced by a single space, and the result is
    wrapped in ``<start>`` / ``<end>`` markers.

    Returns:
        The cleaned sentence, e.g. ``'<start> may i borrow this book ? <end>'``.
    """
    text = unicode_to_ascii(w.lower().strip())
    # Put a space on both sides of each kept punctuation mark.
    text = re.sub(r"([?.!,¿])", r" \1 ", text)
    # Collapse runs of spaces and double quotes into one space.
    text = re.sub(r'[" "]+', " ", text)
    # Everything except ASCII letters and the kept punctuation becomes a space.
    text = re.sub(r"[^a-zA-Z?.!,¿]+", " ", text)
    return '<start> ' + text.strip() + ' <end>'

In [9]:
# Sample sentence pair used to demonstrate preprocess_sentence.
# Note: nl_sentence holds the Spanish sentence; the name is kept because the
# following cell refers to it.
en_sentence = "May I borrow this book?"
nl_sentence = "¿Puedo tomar prestado este libro?"

In [10]:
# Show the cleaned, <start>/<end>-wrapped form of both sample sentences.
print(preprocess_sentence(en_sentence))
print(preprocess_sentence(nl_sentence))

<start> may i borrow this book ? <end>
<start> ¿ puedo tomar prestado este libro ? <end>


In [11]:
# Stress-test the preprocessing with heavily accented text.
# Accented letters lose their marks, but "ß" is not split into a base letter
# plus combining mark, so it survives unicode_to_ascii and is then replaced by
# a space by the letters-only filter in preprocess_sentence ("große" -> "gro e").
unk_sentence="Klüft skräms inför på fédéral électoral große"
preprocess_sentence(unk_sentence)

'<start> kluft skrams infor pa federal electoral gro e <end>'

In [12]:
def create_dataset(path,num_examples):
    """Read a tab-separated sentence-pair file and return its cleaned columns.

    Args:
        path: Path to a UTF-8 text file with one example per line, the
            sentences of an example separated by tab characters.
        num_examples: Number of leading lines to use; ``None`` uses all lines.

    Returns:
        A ``zip`` iterator yielding one tuple per column of the file, each
        holding that column's sentences after ``preprocess_sentence``.
        Unpack it (``a, b = create_dataset(...)``) to obtain the columns.
    """
    # Use a context manager so the file handle is closed deterministically
    # (the previous version left it open until garbage collection).
    with io.open(path, encoding='UTF-8') as dataset_file:
        lines = dataset_file.read().strip().split('\n')
    word_pairs = [[preprocess_sentence(w) for w in line.split('\t')]
                  for line in lines[:num_examples]]
    # Transpose [[col0, col1], ...] into (col0 sentences, col1 sentences).
    return zip(*word_pairs)

In [13]:
# Load the whole file (num_examples=None) and print five randomly chosen
# sentence pairs as a sanity check.
# `en` is the first column of the file and `nl` the second; judging by the
# printed samples the second column is Spanish despite the `nl` name.
en,nl=create_dataset(path_to_file,None)
l=len(en)
print(l)
for i in range(5):
    # NOTE(review): the RNG is not seeded, so the printed samples differ on every run.
    n=np.random.randint(l)
    print(en[n])
    print(nl[n])
    print("-----------------------------")


118964
<start> i wonder if tom can hear us . <end>
<start> me pregunto si tom nos puede oir . <end>
-----------------------------
<start> it s really not that cold . <end>
<start> si no esta tan helado . <end>
-----------------------------
<start> tom wanted to see mary again . <end>
<start> tom queria ver a mary de nuevo . <end>
-----------------------------
<start> you re my enemy . <end>
<start> vosotras sois mis enemigas . <end>
-----------------------------
<start> we were never allowed to celebrate christmas . <end>
<start> nunca se nos permitio celebrar la navidad . <end>
-----------------------------


In [14]:
def tokenize(language):
    """Fit a Keras tokenizer on ``language`` and encode it as a padded id matrix.

    Args:
        language: Collection of already pre-processed sentences. Filtering is
            disabled (``filters=''``) so punctuation and the ``<start>`` /
            ``<end>`` markers are kept as tokens.

    Returns:
        ``(tensor, tokenizer)``: the id sequences padded at the end
        (``padding='post'``), and the fitted tokenizer.
    """
    tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
    tokenizer.fit_on_texts(language)
    sequences = tokenizer.texts_to_sequences(language)
    padded = tf.keras.preprocessing.sequence.pad_sequences(sequences, padding='post')
    return padded, tokenizer

In [15]:
# Quick look at what tokenize returns.
# NOTE(review): en[0] is a single string, not a list of sentences, so the
# tokenizer treats every character as its own "text" (hence one row per
# character in the output below). Pass [en[0]] for a word-level demonstration.
tokenize(en[0])

(array([[ 1],
        [ 4],
        [ 2],
        [ 5],
        [ 6],
        [ 2],
        [ 3],
        [ 0],
        [ 7],
        [ 8],
        [ 0],
        [ 9],
        [ 0],
        [ 1],
        [10],
        [11],
        [12],
        [ 3]]), <keras_preprocessing.text.Tokenizer at 0x28a48a64a90>)

In [16]:
def load_dataset(path,num_examples=None):
    """Load sentence pairs from ``path`` and tokenize both languages.

    The first column of the file is used as the target language and the
    second column as the input language.

    Args:
        path: Path to the tab-separated sentence-pair file.
        num_examples: Number of leading lines to load; ``None`` loads all.

    Returns:
        ``(input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer)``.
    """
    target_sentences, input_sentences = create_dataset(path, num_examples)
    input_ids, input_tokenizer = tokenize(input_sentences)
    target_ids, target_tokenizer = tokenize(target_sentences)
    return input_ids, target_ids, input_tokenizer, target_tokenizer

In [17]:
# Use only the first 30,000 lines of the file.
# Note: inp_lang / targ_lang are the fitted tokenizers, not sentence lists.
num_examples = 30000
input_tensor,target_tensor,inp_lang,targ_lang=load_dataset(path_to_file, num_examples)

In [18]:
# The second dimension of each padded tensor is that language's sequence length.
max_length_targ,max_length_inp = target_tensor.shape[1], input_tensor.shape[1]
print(max_length_targ,max_length_inp)

11 16


In [19]:
# Hold out 20% of the sentence pairs for validation.
# A fixed random_state makes the split identical on every run, so results
# derived from it are reproducible after a kernel restart.
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(
    input_tensor, target_tensor, test_size=0.2, random_state=42)
print(len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val))

24000 24000 6000 6000


In [20]:
def convert(lang, tensor):
    """Print an ``id ----> word`` line for every non-zero id in ``tensor``.

    Args:
        lang: Object exposing an ``index_word`` mapping from id to word
            (the fitted tokenizers in this notebook).
        tensor: Iterable of integer token ids; zeros are skipped.
    """
    for token_id in tensor:
        if token_id == 0:
            continue
        print ("%d ----> %s" % (token_id, lang.index_word[token_id]))

In [21]:
# Decode the first training pair back into words to check the id <-> word mapping.
print ("Input Language; index to word mapping")
convert(inp_lang, input_tensor_train[0])
print (" ")
print ("Target Language; index to word mapping")
convert(targ_lang, target_tensor_train[0])

Input Language; index to word mapping
1 ----> <start>
13 ----> la
248 ----> television
8 ----> no
387 ----> funciona
3 ----> .
2 ----> <end>
 
Target Language; index to word mapping
1 ----> <start>
13 ----> the
215 ----> tv
176 ----> doesn
12 ----> t
93 ----> work
3 ----> .
2 ----> <end>


In [22]:
# Training hyper-parameters and the tf.data input pipeline.
# Shuffle buffer covers the whole training set.
buffer_size=len(input_tensor_train)
batch_size=64
steps_per_epoch = len(input_tensor_train)//batch_size
embedding_dim = 256
units = 1024
# +1 on the vocabulary sizes leaves room for id 0, which is the padding value
# (see the zeros in the padded tensors and the mask in loss_function).
vocab_inp_size = len(inp_lang.word_index)+1
vocab_tar_size = len(targ_lang.word_index)+1
dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(buffer_size)
# drop_remainder=True discards the final partial batch so every batch has exactly batch_size rows.
dataset = dataset.batch(batch_size, drop_remainder=True)

In [23]:
# Pull one batch from the pipeline and display its shapes.
example_input_batch, example_target_batch = next(iter(dataset))
example_input_batch.shape, example_target_batch.shape

(TensorShape([64, 16]), TensorShape([64, 11]))

Bahdanau (additive) attention: the decoder attends over the encoder outputs

FC = Fully connected (dense) layer

EO = Encoder output

H = hidden state

X = input to the decoder

pseudo-code:

score = FC(tanh(FC(EO) + FC(H)))

attention weights = softmax(score, axis = 1). By default softmax is applied on the last axis, but here we apply it on axis 1, because the shape of score is (batch_size, max_length, 1), where max_length is the length of the input sequence. Since we are assigning one weight to each input position, the softmax must normalize along that axis.

context vector = sum(attention weights * EO, axis = 1). Same reason as above for choosing axis as 1.

embedding output = The input to the decoder X is passed through an embedding layer.

merged vector = concat(embedding output, context vector)

This merged vector is then given to the GRU

References:

https://www.tensorflow.org/tutorials/text/nmt_with_attention

https://arxiv.org/pdf/1409.0473.pdf

https://github.com/tensorflow/nmt

In [27]:
class Encoder(tf.keras.Model):
    """Embedding + GRU encoder.

    ``call`` returns the GRU output for every time step together with the
    final GRU state (``return_sequences=True`` and ``return_state=True``).
    """

    def __init__(self,vocab_size,embedding_dim,enc_units,batch_size):
        """Create the embedding and GRU layers.

        Args:
            vocab_size: Number of rows of the embedding table.
            embedding_dim: Width of each embedding vector.
            enc_units: Number of GRU units.
            batch_size: Batch size used by ``initialize_hidden_state``.
        """
        super().__init__()
        self.batch_size = batch_size
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(
            self.enc_units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer='glorot_uniform',
        )

    def call(self,x,hidden):
        """Embed the token ids ``x`` and run the GRU starting from ``hidden``.

        Returns:
            ``(output, state)``: per-step GRU outputs and the final state.
        """
        embedded = self.embedding(x)
        sequence_output, final_state = self.gru(embedded, initial_state=hidden)
        return sequence_output, final_state

    def initialize_hidden_state(self):
        """Return an all-zero state of shape ``(batch_size, enc_units)``."""
        state_shape = (self.batch_size, self.enc_units)
        return tf.zeros(state_shape)

In [28]:
# Build the encoder for the input-language vocabulary.
encoder = Encoder(vocab_inp_size, embedding_dim, units,batch_size)

In [29]:
# All-zero initial state, used to smoke-test the encoder on the example batch.
sample_hidden=encoder.initialize_hidden_state()

In [30]:
# Run the encoder once on the example batch.
# Note: sample_hidden is overwritten with the encoder's final state here.
sample_output, sample_hidden = encoder(example_input_batch, sample_hidden)

In [32]:
# Report the shapes returned by the encoder for the example batch.
print (f'Encoder output shape: (batch size, sequence length, units) {sample_output.shape}')
print (f'Encoder Hidden state shape: (batch size, units) {sample_hidden.shape}')

Encoder output shape: (batch size, sequence length, units) (64, 16, 1024)
Encoder Hidden state shape: (batch size, units) (64, 1024)


In [35]:
class BahdanauAttention(tf.keras.layers.Layer):
    """Additive attention: ``score = V(tanh(W1(query) + W2(values)))``."""

    def __init__(self, units):
        """Create the three dense projections.

        Args:
            units: Width of the ``W1`` / ``W2`` projections; ``V`` maps the
                combined projection to a single score.
        """
        super().__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        """Compute the context vector and attention weights.

        Args:
            query: State to attend with; an axis is inserted at position 1 so
                it broadcasts against ``values`` (in this notebook
                ``(batch, units)`` -> ``(batch, 1, units)``).
            values: Sequence to attend over (the encoder outputs here).

        Returns:
            ``(context_vector, attention_weights)``: ``values`` weighted by
            the attention weights and summed over axis 1, and the weights
            themselves (softmax of the score over axis 1).
        """
        expanded_query = tf.expand_dims(query, 1)
        projected_query = self.W1(expanded_query)
        projected_values = self.W2(values)
        score = self.V(tf.nn.tanh(projected_query + projected_values))
        # Normalise over axis 1 so the weights sum to 1 along that axis.
        attention_weights = tf.nn.softmax(score, axis=1)
        context_vector = tf.reduce_sum(attention_weights * values, axis=1)
        return context_vector, attention_weights

In [36]:
# Smoke-test the attention layer (10 projection units) with the encoder's
# final state as the query and the encoder outputs as the values.
attention_layer = BahdanauAttention(10)
attention_result, attention_weights = attention_layer(sample_hidden, sample_output)

print("Attention result shape: (batch size, units) {}".format(attention_result.shape))
print("Attention weights shape: (batch_size, sequence_length, 1) {}".format(attention_weights.shape))

Attention result shape: (batch size, units) (64, 1024)
Attention weights shape: (batch_size, sequence_length, 1) (64, 16, 1)


In [37]:
class Decoder(tf.keras.Model):
    """Single-step GRU decoder with Bahdanau attention over the encoder outputs."""

    def __init__(self, vocab_size, embedding_dim, dec_units, batch_size):
        """Create the embedding, GRU, output projection and attention layers.

        Args:
            vocab_size: Target vocabulary size (embedding rows and number of
                output logits).
            embedding_dim: Width of each embedding vector.
            dec_units: Number of GRU units; also the attention projection width.
            batch_size: Stored on the instance; not used by ``call``.
        """
        super(Decoder, self).__init__()
        self.batch_size = batch_size
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.dec_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')
        self.fc = tf.keras.layers.Dense(vocab_size)
        self.attention = BahdanauAttention(self.dec_units)

    def call(self, x, hidden, enc_output):
        """Run one decoding step.

        Args:
            x: Token ids fed to the decoder for this step.
            hidden: Previous decoder state (the encoder's final state for the
                first step). It is the attention query and the GRU's initial
                state, so its last dimension must equal ``dec_units``.
            enc_output: Encoder outputs to attend over.

        Returns:
            ``(logits, state, attention_weights)``: vocabulary logits for this
            step, the new GRU state, and the attention weights.
        """
        context_vector, attention_weights = self.attention(hidden, enc_output)
        x = self.embedding(x)
        # Prepend the context vector to the embedded input along the feature axis.
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
        # Fix: carry the recurrent state across steps. Previously the GRU was
        # called without initial_state, so it restarted from zeros on every
        # step and `hidden` only influenced the result through attention.
        output, state = self.gru(x, initial_state=hidden)
        # Merge the leading axes so the dense layer sees (rows, dec_units).
        output = tf.reshape(output, (-1, output.shape[2]))
        x = self.fc(output)

        return x, state, attention_weights

In [38]:
# Build the decoder for the target vocabulary and smoke-test one step.
decoder = Decoder(vocab_tar_size, embedding_dim, units, batch_size)

# The random input only stands in for a (batch_size, 1) batch of token ids;
# this call is a shape check, not a meaningful prediction.
sample_decoder_output, _, _ = decoder(tf.random.uniform((batch_size, 1)),
                                      sample_hidden, sample_output)

print ('Decoder output shape: (batch_size, vocab size) {}'.format(sample_decoder_output.shape))

Decoder output shape: (batch_size, vocab size) (64, 4935)


In [39]:
# Adam with default settings.
optimizer = tf.keras.optimizers.Adam()
# from_logits=True: the decoder outputs raw scores (its last layer is a plain Dense).
# reduction='none': keep one loss value per element so loss_function can mask padding.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_function(real, pred):
    """Cross-entropy between ``real`` ids and ``pred`` logits, ignoring padding.

    Positions where ``real`` equals 0 (the padding id) contribute zero loss.
    The result is the mean over all positions, masked ones included.

    Args:
        real: Integer target ids.
        pred: Predicted logits passed to the module-level ``loss_object``.

    Returns:
        A scalar tensor with the masked mean loss.
    """
    per_element_loss = loss_object(real, pred)
    is_real_token = tf.math.logical_not(tf.math.equal(real, 0))
    # Cast the boolean mask to the loss dtype so it can be multiplied in.
    weights = tf.cast(is_real_token, dtype=per_element_loss.dtype)
    return tf.reduce_mean(per_element_loss * weights)

In [40]:
# Checkpointing setup: track the optimizer and both models in one checkpoint
# object; files will use the "ckpt" prefix inside ./training_checkpoints.
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)