In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals

import tensorflow as tf

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split

import unicodedata
import re
import numpy as np
import os
import io
import time

In [2]:
import pickle

# Number of sentence pairs kept from the full corpus. A single constant keeps
# the inputs and the replies truncated to exactly the same length.
NUM_SAMPLES = 40000

# SECURITY: pickle.load can execute arbitrary code while deserializing, so
# these two files must come from a trusted source.
with open('features.pickle', 'rb') as handle:
    input_sent = pickle.load(handle)

with open('labels.pickle', 'rb') as handle:
    output_sent = pickle.load(handle)

input_sent = input_sent[:NUM_SAMPLES]
output_sent = output_sent[:NUM_SAMPLES]

# input_sent[i] is paired with output_sent[i] by the split and the tf.data
# pipeline further down, so fail early if the two files are out of step.
assert len(input_sent) == len(output_sent), (
    'features.pickle and labels.pickle yield different numbers of sentences: '
    '{} vs {}'.format(len(input_sent), len(output_sent)))

print(len(input_sent),input_sent[0])

40000 <start> can we make this quick   roxanne korrine and andrew barrett are having an incredibly horrendous public break up on the quad  again  <end>


In [3]:
def tokenize(lang):
    """Fit a Keras tokenizer on `lang` and return its padded id matrix.

    Args:
        lang: the texts to fit on and convert (passed straight to
            `fit_on_texts` / `texts_to_sequences`).

    Returns:
        (tensor, lang_tokenizer): the id sequences padded at the end
        (`padding='post'`) and the fitted tokenizer.
    """
    # filters='' disables the tokenizer's character filter list.
    lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
    lang_tokenizer.fit_on_texts(lang)

    id_sequences = lang_tokenizer.texts_to_sequences(lang)
    padded = tf.keras.preprocessing.sequence.pad_sequences(
        id_sequences, padding='post')

    return padded, lang_tokenizer

def dataset():
    """Tokenize the global `input_sent` and `output_sent` corpora.

    Returns:
        (input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer)
        where each tensor/tokenizer pair comes from `tokenize`.
    """
    # The inputs are tokenized first, then the replies, each with its own
    # independently fitted tokenizer.
    (source_ids, source_tokenizer), (reply_ids, reply_tokenizer) = (
        tokenize(sentences) for sentences in (input_sent, output_sent)
    )
    return source_ids, reply_ids, source_tokenizer, reply_tokenizer


# Build the padded id matrices and the two fitted tokenizers once; every later
# cell reads these four globals.
input_tensor, output_tensor, inp_lang, out_lang = dataset()


In [4]:
# Sanity check: show the first encoded input (ids followed by 0-padding) and
# the number of encoded replies.
print(input_tensor[0],inp_lang)
print(len(output_tensor))

def max_len(tensor):
    """Return the length of the longest element of `tensor`.

    Raises ValueError (from `max`) when `tensor` is empty.
    """
    return max(map(len, tensor))

# Row lengths of the padded matrices; evaluate() uses them to size its input
# padding and its attention buffer.
max_len_inp = max_len(input_tensor)
max_len_out = max_len(output_tensor)
    
print(max_len_inp,max_len_out)

[   1   35   19  114   23  912 8163 8164   16 4499 2858   30  393   87
 4500 8165 1103  509   58   37    6 5754  197    2    0    0    0    0
    0    0    0    0    0    0    0    0] <keras_preprocessing.text.Tokenizer object at 0x000001A1DB63BE88>
40000
36 35


In [5]:
# Hold out 20% of the sentence pairs for validation. random_state pins the
# shuffle so the same split is produced on every Restart & Run All; without it
# each run trains on a different subset.
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(
    input_tensor, output_tensor, test_size=0.2, random_state=42)

print(len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val))


32000 32000 8000 8000


In [6]:
def convert(lang, tensor):
    """Print every non-zero id in `tensor` next to its word.

    Args:
        lang: object exposing an `index_word` mapping from id to word.
        tensor: iterable of token ids; zeros (padding) are skipped.
    """
    non_padding_ids = (token_id for token_id in tensor if token_id != 0)
    for token_id in non_padding_ids:
        word = lang.index_word[token_id]
        print ("%d ----> %s" % (token_id, word))


# Decode the first training pair back to words to confirm both tokenizers
# round-trip as expected.
print ("Input Language; index to word mapping")
convert(inp_lang, input_tensor_train[0])
print ()
print ("Target Language; index to word mapping")
convert(out_lang, target_tensor_train[0])


Input Language; index to word mapping
1 ----> <start>
4 ----> you
27 ----> have
7 ----> to
1203 ----> quickly
2 ----> <end>

Target Language; index to word mapping
1 ----> <start>
176 ----> .
857 ----> imagine
10 ----> a
144 ----> better
7065 ----> grape
25 ----> for
6 ----> the
5453 ----> region
2 ----> <end>


In [7]:
# --- Hyperparameters -------------------------------------------------------
# Shuffle buffer spans the whole training set.
BUFFER_SIZE = len(input_tensor_train)
BATCH_SIZE = 64
# Number of full batches per epoch (integer division).
steps_per_epoch = len(input_tensor_train)//BATCH_SIZE
embedding_dim = 32
# Width of the encoder/decoder GRU state.
units = 256
# +1 leaves room for id 0, which pad_sequences uses as padding and which is
# not a key of word_index.
vocab_inp_size=len(inp_lang.word_index)+1
vocab_tar_size=len(out_lang.word_index)+1


# --- tf.data pipeline ------------------------------------------------------
# NOTE(review): this rebinds the global name `dataset`, which until now was
# the dataset() helper function. Re-running the tokenization cell afterwards
# turns `dataset` back into the function, and the training loop then fails
# until this cell is run again.
dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
# drop_remainder=True keeps every batch at exactly BATCH_SIZE rows, which the
# encoder's zero state and train_step's <start> column both rely on.
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

# Peek at a single (input, target) batch.
for element in dataset:
    print(element)
    break
    
vocab_inp_size

(<tf.Tensor: shape=(64, 36), dtype=int32, numpy=
array([[  1,  10,  42, ...,   0,   0,   0],
       [  1,  19,  21, ...,   0,   0,   0],
       [  1,   5,  21, ...,   0,   0,   0],
       ...,
       [  1,  41, 104, ...,   0,   0,   0],
       [  1,   4,  89, ...,   0,   0,   0],
       [  1, 170,   2, ...,   0,   0,   0]])>, <tf.Tensor: shape=(64, 35), dtype=int32, numpy=
array([[    1,     5,    35, ...,     0,     0,     0],
       [    1,    63,     3, ...,     0,     0,     0],
       [    1,     5,    23, ...,     0,     0,     0],
       ...,
       [    1,    56,     4, ...,     0,     0,     0],
       [    1,    10, 14798, ...,     0,     0,     0],
       [    1,    13,    23, ...,     0,     0,     0]])>)


14848

In [8]:
# Take one batch to use as a fixture for the shape checks in the cells below.
example_input_batch, example_target_batch = next(iter(dataset))
example_input_batch.shape, example_target_batch.shape,example_input_batch

(TensorShape([64, 36]),
 TensorShape([64, 35]),
 <tf.Tensor: shape=(64, 36), dtype=int32, numpy=
 array([[  1,  69,  74, ...,   0,   0,   0],
        [  1,  10,   8, ...,   0,   0,   0],
        [  1,  13, 190, ...,   0,   0,   0],
        ...,
        [  1,   4,  84, ...,   0,   0,   0],
        [  1,  64,   3, ...,   0,   0,   0],
        [  1, 971, 980, ...,   0,   0,   0]])>)

In [9]:
class Encoder(tf.keras.Model):
    """Embedding layer followed by a single GRU that encodes the input ids.

    The attribute names `embedding` and `gru` are kept as-is because the
    tf.train.Checkpoint defined later in this notebook tracks this object.
    """

    def __init__(self, vocab_size, embedding_dim, enc_units, batch_size):
        super().__init__()
        self.batch_size = batch_size
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        # return_sequences gives the per-step outputs that attention consumes;
        # return_state gives the final state that seeds the decoder.
        self.gru = tf.keras.layers.GRU(
            self.enc_units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer='glorot_uniform',
        )

    def call(self, x, hidden):
        """Encode `x` starting from `hidden`.

        Returns:
            (output, state): the GRU's per-step outputs and its final state.
        """
        embedded = self.embedding(x)
        sequence_output, final_state = self.gru(embedded, initial_state=hidden)
        return sequence_output, final_state

    def initialize_hidden_state(self):
        """Return an all-zero state of shape (batch_size, enc_units)."""
        state_shape = (self.batch_size, self.enc_units)
        return tf.zeros(state_shape)
    
        

In [10]:
# Instantiate the encoder used for training and inference.
encoder=Encoder(vocab_inp_size,embedding_dim, units, BATCH_SIZE)

# Smoke test on the example batch. sample_output / sample_hidden are reused by
# the attention and decoder shape checks below.
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_hidden = encoder(example_input_batch, sample_hidden)
print ('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print ('Encoder Hidden state shape: (batch size, units) {}'.format(sample_hidden.shape))



Encoder output shape: (batch size, sequence length, units) (64, 36, 256)
Encoder Hidden state shape: (batch size, units) (64, 256)


In [11]:
class BahdanauAttention(tf.keras.layers.Layer):
    """Additive attention: score = V(tanh(W1(query) + W2(values))).

    The attribute names `W1`, `W2` and `V` are kept as-is because this layer
    is part of the decoder tracked by the notebook's tf.train.Checkpoint.
    """

    def __init__(self, units):
        super().__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        """Attend over `values` using `query`.

        Args:
            query: hidden state, (batch_size, hidden size).
            values: sequence being attended over,
                (batch_size, max_len, hidden size).

        Returns:
            (context_vector, attention_weights) with shapes
            (batch_size, hidden_size) and (batch_size, max_length, 1).
        """
        # Insert a time axis, (batch_size, 1, hidden size), so the projected
        # query broadcasts against every time step of the projected values.
        query_with_time_axis = tf.expand_dims(query, 1)

        # (batch_size, max_length, units) before self.V collapses the last
        # axis to a single score per time step.
        projected = self.W1(query_with_time_axis) + self.W2(values)
        score = self.V(tf.nn.tanh(projected))

        # Normalise over the time axis (axis 1).
        attention_weights = tf.nn.softmax(score, axis=1)

        # Weighted sum of the values over the time axis.
        context_vector = tf.reduce_sum(attention_weights * values, axis=1)

        return context_vector, attention_weights

In [12]:
# Shape check only: a throwaway attention layer with 10 units, separate from
# the one the decoder creates for itself.
attention_layer = BahdanauAttention(10)
attention_result, attention_weights = attention_layer(sample_hidden, sample_output)

print("Attention result shape: (batch size, units) {}".format(attention_result.shape))
print("Attention weights shape: (batch_size, sequence_length, 1) {}".format(attention_weights.shape))


Attention result shape: (batch size, units) (64, 256)
Attention weights shape: (batch_size, sequence_length, 1) (64, 36, 1)


In [13]:
class Decoder(tf.keras.Model):
    """One-step decoder: attention context + token embedding -> GRU -> logits.

    The attribute names (`embedding`, `gru`, `fc`, `attention`) are kept as-is
    because the notebook's tf.train.Checkpoint tracks this object.

    NOTE(review): `call` invokes the GRU without `initial_state`, so `hidden`
    is used only as the attention query and the GRU starts from its default
    state at every step. Confirm this is intended before changing it; models
    already checkpointed were trained with this behaviour.
    """

    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        super().__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(
            self.dec_units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer='glorot_uniform',
        )
        # Projects the GRU output onto the vocabulary (one logit per id).
        self.fc = tf.keras.layers.Dense(vocab_size)

        # Attention over the encoder outputs.
        self.attention = BahdanauAttention(self.dec_units)

    def call(self, x, hidden, enc_output):
        """Run a single decoding step.

        Args:
            x: the current input token ids, one column per batch row.
            hidden: state used as the attention query.
            enc_output: encoder outputs, (batch_size, max_length, hidden_size).

        Returns:
            (logits, state, attention_weights) where logits has shape
            (batch_size, vocab).
        """
        context_vector, attention_weights = self.attention(hidden, enc_output)

        # (batch_size, 1, embedding_dim)
        embedded = self.embedding(x)

        # Prepend the context on the feature axis:
        # (batch_size, 1, embedding_dim + hidden_size)
        gru_input = tf.concat(
            [tf.expand_dims(context_vector, 1), embedded], axis=-1)

        output, state = self.gru(gru_input)

        # Drop the length-1 time axis: (batch_size * 1, hidden_size)
        flat_output = tf.reshape(output, (-1, output.shape[2]))

        # (batch_size, vocab)
        logits = self.fc(flat_output)

        return logits, state, attention_weights

In [14]:
# Instantiate the decoder used for training and inference.
decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)

# Shape check: a random (BATCH_SIZE, 1) tensor stands in for a column of token
# ids; only the output shape is of interest here.
sample_decoder_output, _, _ = decoder(tf.random.uniform((BATCH_SIZE, 1)),
                                      sample_hidden, sample_output)

print ('Decoder output shape: (batch_size, vocab size) {}'.format(sample_decoder_output.shape))


Decoder output shape: (batch_size, vocab size) (64, 14988)


In [15]:
# Adam with its default settings.
optimizer = tf.keras.optimizers.Adam()
# from_logits=True: the decoder's final Dense layer has no softmax.
# reduction='none': keep one loss value per example so loss_function can zero
# out the padding positions before averaging.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
    """Cross-entropy between `real` ids and `pred` logits, ignoring padding.

    Positions where `real` is 0 (padding) contribute zero loss. They are still
    counted in the denominator of the mean, because the mean is taken over
    every position after masking.
    """
    per_example_loss = loss_object(real, pred)

    # 1.0 where the target is a real token, 0.0 where it is padding.
    keep = tf.cast(tf.math.not_equal(real, 0), dtype=per_example_loss.dtype)

    return tf.reduce_mean(per_example_loss * keep)


In [16]:
# Checkpoints written by the training loop go here.
# NOTE(review): the restore cell further down reads from
# './training_checkpoints1', not from this directory — confirm that is intended.
checkpoint_dir = './training_checkpoints2'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
# Track the optimizer state together with both models so training can resume.
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

In [17]:
@tf.function
def train_step(inp, targ, enc_hidden):
    """Run one teacher-forced optimisation step on a single batch.

    Uses the global `encoder`, `decoder`, `optimizer`, `out_lang` and
    `BATCH_SIZE`.

    Args:
        inp: batch of padded input ids.
        targ: batch of padded target ids; column 0 is expected to be the
            <start> token, so the loss starts at column 1.
        enc_hidden: initial encoder state.

    Returns:
        The summed per-step loss divided by targ.shape[1].
    """
    loss = 0
    with tf.GradientTape() as tape:
        enc_output, enc_hidden = encoder(inp, enc_hidden)

        # The encoder's final state is the decoder's first attention query.
        dec_hidden = enc_hidden

        # First decoder input: a (BATCH_SIZE, 1) column of <start> ids. This
        # hard-codes BATCH_SIZE, so every batch must have exactly that many rows.
        dec_input = tf.expand_dims([out_lang.word_index['<start>']] * BATCH_SIZE, 1)

        # Teacher forcing - feeding the target as the next input
        for t in range(1, targ.shape[1]):
            # passing enc_output to the decoder
            predictions, dec_hidden, _ = decoder(dec_input, 
                                                 dec_hidden, enc_output)
            loss += loss_function(targ[:, t], predictions)
            # using teacher forcing
            dec_input = tf.expand_dims(targ[:, t], 1)
        
    # Reported loss: the sum covers targ.shape[1] - 1 steps but is divided by
    # targ.shape[1].
    batch_loss = (loss / int(targ.shape[1]))
    variables = encoder.trainable_variables + decoder.trainable_variables
        
    # Gradients are taken from the summed loss, not from batch_loss.
    gradients = tape.gradient(loss, variables)
    
    optimizer.apply_gradients(zip(gradients, variables))
    
    return batch_loss

In [101]:
# Number of full passes over the training batches.
EPOCHS = 100
import gc
# Free unreferenced objects before the long-running loop starts.
gc.collect()
for epoch in range(EPOCHS):
    start = time.time()
    
    # Every batch starts from the same all-zero encoder state: train_step
    # returns only the loss, so no state is carried from batch to batch.
    enc_hidden = encoder.initialize_hidden_state()
    total_loss = 0
    
    for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
        batch_loss = train_step(inp, targ, enc_hidden)
        total_loss += batch_loss
        
        # Progress line every 100 batches.
        if batch % 100 == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                         batch,
                                                         batch_loss.numpy()))
    # saving (checkpoint) the model every 2 epochs
    
    if (epoch + 1) % 2 == 0:
        checkpoint.save(file_prefix = checkpoint_prefix)
    
    # Mean of the per-batch losses over the epoch.
    print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                      total_loss / steps_per_epoch))
    
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

Epoch 1 Batch 0 Loss 2.4510
Epoch 1 Batch 100 Loss 1.7852
Epoch 1 Batch 200 Loss 1.6879
Epoch 1 Batch 300 Loss 1.5609
Epoch 1 Batch 400 Loss 1.6729
Epoch 1 Loss 1.6774
Time taken for 1 epoch 116.02587938308716 sec

Epoch 2 Batch 0 Loss 1.6240
Epoch 2 Batch 100 Loss 1.5182
Epoch 2 Batch 200 Loss 1.7699
Epoch 2 Batch 300 Loss 1.3747
Epoch 2 Batch 400 Loss 1.3332
Epoch 2 Loss 1.5347
Time taken for 1 epoch 89.34082388877869 sec

Epoch 3 Batch 0 Loss 1.4305
Epoch 3 Batch 100 Loss 1.5475
Epoch 3 Batch 200 Loss 1.6970
Epoch 3 Batch 300 Loss 1.6634
Epoch 3 Batch 400 Loss 1.4628
Epoch 3 Loss 1.4637
Time taken for 1 epoch 87.19940662384033 sec

Epoch 4 Batch 0 Loss 1.5956
Epoch 4 Batch 100 Loss 1.5673
Epoch 4 Batch 200 Loss 1.5086
Epoch 4 Batch 300 Loss 1.5579
Epoch 4 Batch 400 Loss 1.2083
Epoch 4 Loss 1.4087
Time taken for 1 epoch 87.71989631652832 sec

Epoch 5 Batch 0 Loss 1.3770
Epoch 5 Batch 100 Loss 1.2152
Epoch 5 Batch 200 Loss 1.4408
Epoch 5 Batch 300 Loss 1.3056
Epoch 5 Batch 400 Loss 1.

Epoch 38 Batch 400 Loss 0.9101
Epoch 38 Loss 0.7952
Time taken for 1 epoch 92.69184494018555 sec

Epoch 39 Batch 0 Loss 0.7493
Epoch 39 Batch 100 Loss 0.8005
Epoch 39 Batch 200 Loss 0.8133
Epoch 39 Batch 300 Loss 0.7640
Epoch 39 Batch 400 Loss 0.9139
Epoch 39 Loss 0.7844
Time taken for 1 epoch 89.68784165382385 sec

Epoch 40 Batch 0 Loss 0.6822
Epoch 40 Batch 100 Loss 0.7587
Epoch 40 Batch 200 Loss 0.6765
Epoch 40 Batch 300 Loss 0.7595
Epoch 40 Batch 400 Loss 0.7652
Epoch 40 Loss 0.7742
Time taken for 1 epoch 90.88142848014832 sec

Epoch 41 Batch 0 Loss 0.7202
Epoch 41 Batch 100 Loss 0.7427
Epoch 41 Batch 200 Loss 0.7088
Epoch 41 Batch 300 Loss 0.8278
Epoch 41 Batch 400 Loss 0.6635
Epoch 41 Loss 0.7635
Time taken for 1 epoch 92.21189403533936 sec

Epoch 42 Batch 0 Loss 0.6391
Epoch 42 Batch 100 Loss 0.7857
Epoch 42 Batch 200 Loss 0.7312
Epoch 42 Batch 300 Loss 0.7104
Epoch 42 Batch 400 Loss 0.8752
Epoch 42 Loss 0.7540
Time taken for 1 epoch 91.06829953193665 sec

Epoch 43 Batch 0 Loss 

Epoch 76 Batch 0 Loss 0.4234
Epoch 76 Batch 100 Loss 0.5668
Epoch 76 Batch 200 Loss 0.4980
Epoch 76 Batch 300 Loss 0.5389
Epoch 76 Batch 400 Loss 0.4809
Epoch 76 Loss 0.5175
Time taken for 1 epoch 91.792245388031 sec

Epoch 77 Batch 0 Loss 0.5164
Epoch 77 Batch 100 Loss 0.5051
Epoch 77 Batch 200 Loss 0.4824
Epoch 77 Batch 300 Loss 0.4684
Epoch 77 Batch 400 Loss 0.4107
Epoch 77 Loss 0.5160
Time taken for 1 epoch 89.91190481185913 sec

Epoch 78 Batch 0 Loss 0.4616
Epoch 78 Batch 100 Loss 0.4108
Epoch 78 Batch 200 Loss 0.4901
Epoch 78 Batch 300 Loss 0.5315
Epoch 78 Batch 400 Loss 0.5347
Epoch 78 Loss 0.5102
Time taken for 1 epoch 91.0253746509552 sec

Epoch 79 Batch 0 Loss 0.4768
Epoch 79 Batch 100 Loss 0.5867
Epoch 79 Batch 200 Loss 0.4539
Epoch 79 Batch 300 Loss 0.5167
Epoch 79 Batch 400 Loss 0.4715
Epoch 79 Loss 0.5029
Time taken for 1 epoch 89.02837800979614 sec

Epoch 80 Batch 0 Loss 0.4522
Epoch 80 Batch 100 Loss 0.5198
Epoch 80 Batch 200 Loss 0.5425
Epoch 80 Batch 300 Loss 0.5372
E

In [18]:
def evaluate(sentence):
    """Greedily decode a reply to `sentence` with the trained encoder/decoder.

    Uses the global `encoder`, `decoder`, `inp_lang`, `out_lang`, `units`,
    `max_len_inp` and `max_len_out`.

    Args:
        sentence: whitespace-separated input text, already lower-cased and
            wrapped in the `<start>` / `<end>` markers used during training.

    Returns:
        (result, sentence, attention_plot): the decoded words joined by spaces
        (with a trailing space), the unchanged input sentence, and a
        (max_len_out, max_len_inp) array holding one row of attention weights
        per decoding step (rows after the last step stay zero).
    """
    attention_plot = np.zeros((max_len_out, max_len_inp))

    # split() with no argument collapses runs of whitespace. split(' ') would
    # yield empty-string tokens for repeated spaces, and '' is not a
    # vocabulary entry.
    tokens = sentence.split()

    # The tokenizer has no out-of-vocabulary id, so unknown words cannot be
    # encoded. Report and skip them instead of failing with a bare KeyError.
    unknown_words = [tok for tok in tokens if tok not in inp_lang.word_index]
    if unknown_words:
        print('Ignoring out-of-vocabulary words: {}'.format(' '.join(unknown_words)))
    inputs = [inp_lang.word_index[tok] for tok in tokens
              if tok in inp_lang.word_index]

    # Pad to the training input length. Sentences longer than max_len_inp are
    # truncated by pad_sequences (from the front, its default).
    inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],
                                                         maxlen=max_len_inp,
                                                         padding='post')

    inputs = tf.convert_to_tensor(inputs)

    result = ''

    # Batch of one, starting from an all-zero encoder state.
    hidden = [tf.zeros((1, units))]

    enc_out, enc_hidden = encoder(inputs, hidden)

    dec_hidden = enc_hidden

    dec_input = tf.expand_dims([out_lang.word_index['<start>']], 0)

    for t in range(max_len_out):
        predictions, dec_hidden, attention_weights = decoder(dec_input,
                                                             dec_hidden,
                                                             enc_out)

        # storing the attention weights to plot later on
        attention_weights = tf.reshape(attention_weights, (-1, ))
        attention_plot[t] = attention_weights.numpy()

        # Greedy decoding: take the highest-scoring id.
        predicted_id = tf.argmax(predictions[0]).numpy()

        # Id 0 is the padding slot and has no index_word entry; treat it as an
        # empty word rather than raising KeyError.
        predicted_word = out_lang.index_word.get(predicted_id, '')

        result += predicted_word + ' '

        if predicted_word == '<end>':
            return result, sentence, attention_plot

        # the predicted ID is fed back into the model
        dec_input = tf.expand_dims([predicted_id], 0)

    return result, sentence, attention_plot

In [19]:
def plot_attention(attention, sentence, predicted_sentence):
    """Show the attention matrix as a heat map labelled with the words.

    Args:
        attention: 2-D array of weights, one row per predicted word and one
            column per input word.
        sentence: sequence of input words (x-axis labels).
        predicted_sentence: sequence of predicted words (y-axis labels).
    """
    fig = plt.figure(figsize=(10,10))
    ax = fig.add_subplot(1, 1, 1)
    ax.matshow(attention, cmap='viridis')

    fontdict = {'fontsize': 14}

    # Never label more cells than the matrix has, so the ticks stay inside
    # the plotted area.
    n_rows, n_cols = np.shape(attention)
    x_labels = list(sentence)[:n_cols]
    y_labels = list(predicted_sentence)[:n_rows]

    # Pin one tick per cell before assigning labels, so label i always sits on
    # column/row i. Setting labels without fixed ticks relied on a leading ''
    # placeholder lining up with whatever ticks the locator chose.
    ax.set_xticks(range(len(x_labels)))
    ax.set_xticklabels(x_labels, fontdict=fontdict, rotation=90)

    ax.set_yticks(range(len(y_labels)))
    ax.set_yticklabels(y_labels, fontdict=fontdict)

    plt.show()


In [20]:
def translate(sentence):
    """Lower-case `sentence`, decode a reply with `evaluate`, and print both.

    Returns None; the attention matrix is cropped but not plotted.
    """
    result, sentence, attention_plot = evaluate(sentence.lower())

    print('Input: %s' % (sentence))
    print('Predicted translation: {}'.format(result))

    # Crop the attention buffer to the number of space-separated pieces in the
    # reply (rows) and in the input (columns).
    n_output_pieces = len(result.split(' '))
    n_input_pieces = len(sentence.split(' '))
    attention_plot = attention_plot[:n_output_pieces, :n_input_pieces]

    # Plotting is currently disabled:
#     plot_attention(attention_plot, sentence.split(' '), result.split(' '))


In [21]:
# Load the most recent saved weights into encoder, decoder and optimizer.
# NOTE(review): this reads './training_checkpoints1', while the training loop
# writes to checkpoint_dir ('./training_checkpoints2') — confirm which run is
# meant. Also, tf.train.latest_checkpoint returns None when the directory
# holds no checkpoint, in which case nothing is restored.
checkpoint.restore(tf.train.latest_checkpoint('./training_checkpoints1'))





<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x1a1da979ec8>

In [39]:
# Demo query; the <start>/<end> markers must be included by hand because
# evaluate() looks every whitespace-separated token up in the vocabulary.
translate(u'<start> what are you doing <end>')

Input: <start> what are you doing <end>
Predicted translation: i don t know i m a real berserk <end> 
