<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Imports" data-toc-modified-id="Imports-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Imports</a></span></li><li><span><a href="#Loading-the-dataset" data-toc-modified-id="Loading-the-dataset-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Loading the dataset</a></span></li></ul></div>

The datasets used in this notebook can be found <a href="https://github.com/susanli2016/NLP-with-Python/tree/master/data">here</a>

# Imports

In [1]:
import numpy as np

import tensorflow as tf
from tensorflow.keras.utils import to_categorical, plot_model
from tensorflow.keras.models import Model
from tensorflow.keras import layers
from tensorflow.keras.optimizers import Adam

from distutils.version import LooseVersion
import warnings

import os

data_dir = '../data'

In [2]:
# Check TensorFlow Version
assert LooseVersion(tf.__version__) >= LooseVersion('2.0'), 'Please use TensorFlow version 2.0 or newer.  You are using {}'.format(tf.__version__)
print('TensorFlow Version: {}'.format(tf.__version__))

# Check for a GPU
if not tf.test.gpu_device_name():
    warnings.warn('No GPU found. Please use a GPU to train your neural network.')
else:
    print('Default GPU Device: {}'.format(tf.test.gpu_device_name()))

TensorFlow Version: 2.0.0
Default GPU Device: /device:GPU:0


# Loading the dataset

In [3]:
with open(os.path.join(data_dir, 'small_vocab_en'), 'r') as f:
    en_corpus = f.read()

In [4]:
with open(os.path.join(data_dir, 'small_vocab_fr'), 'r') as f:
    fr_corpus = f.read()

In [5]:
for i in range(5):
    print('-English sentence: ', end='')
    print(en_corpus.split('\n')[i])
    print('-French translation: ', end='')
    print(fr_corpus.split('\n')[i])
    print()

-English sentence: new jersey is sometimes quiet during autumn , and it is snowy in april .
-French translation: new jersey est parfois calme pendant l' automne , et il est neigeux en avril .

-English sentence: the united states is usually chilly during july , and it is usually freezing in november .
-French translation: les états-unis est généralement froid en juillet , et il gèle habituellement en novembre .

-English sentence: california is usually quiet during march , and it is usually hot in june .
-French translation: california est généralement calme en mars , et il est généralement chaud en juin .

-English sentence: the united states is sometimes mild during june , and it is cold in september .
-French translation: les états-unis est parfois légère en juin , et il fait froid en septembre .

-English sentence: your least liked fruit is the grape , but my least liked is the apple .
-French translation: votre moins aimé fruit est le raisin , mais mon moins aimé est la pomme .



In [6]:
en_sentences = en_corpus.split('\n')
fr_sentences = fr_corpus.split('\n')

In [7]:
#Adding <EOS> to the end of each sentence
en_sentences = [stn.split()+['<EOS>'] for stn in en_sentences]
fr_sentences = [stn.split()+['<EOS>'] for stn in fr_sentences]

In [8]:
en_vocab = set()
for stn in en_sentences:
    for word in stn:
        en_vocab.add(word)
        
        
fr_vocab = set()
for stn in fr_sentences:
    for word in stn:
        fr_vocab.add(word)

In [9]:
print("There are {} words in the english vocab".format(len(en_vocab)))
print("There are {} word in the french vocab".format(len(fr_vocab)))

There are 228 words in the english vocab
There are 356 word in the french vocab


In [10]:
en_vocab_to_int = {w: i for i,w in enumerate(en_vocab)}
en_int_to_vocab = dict(enumerate(en_vocab))
fr_vocab_to_int = {w: i for i,w in enumerate(fr_vocab)}
fr_int_to_vocab = dict(enumerate(fr_vocab))

In [11]:
en_encoded = [[en_vocab_to_int[word] for word in stn] for stn in en_sentences]
fr_encoded = [[fr_vocab_to_int[word] for word in stn] for stn in fr_sentences]

In [12]:
en_padded = tf.keras.preprocessing.sequence.pad_sequences(en_encoded, padding='post')
fr_padded = tf.keras.preprocessing.sequence.pad_sequences(fr_encoded, padding='post')

In [13]:
X = np.array(en_padded)
y = np.array(fr_padded)

In [14]:
class AttentionLayer(tf.keras.layers.Layer):
    def __init__(self, Tx):
        super(AttentionLayer, self).__init__()
        
        self.repeat = layers.RepeatVector(Tx)
        self.concat = layers.Concatenate(axis=-1)
        self.fc1 = layers.Dense(50, activation='relu')
        self.fc2 = layers.Dense(1, activation='tanh')
        self.activate = layers.Softmax(axis=1)
        self.dotor = layers.Dot(axes=1)
        
    def call(self, a, s):
        '''
        Returns the context using the lstm previous hidden state (s) and the bidirectional lstm output (a)
        '''
        s = self.repeat(s)
        conc = self.concat([a, s])
        x = self.fc2(self.fc1(conc))
        alphas = self.activate(x)
        context = self.dotor([alphas, a])
        
        return context

In [46]:
class AttentionModel(tf.keras.models.Model):
    def __init__(self, Tx, Ty, input_size, output_size, att_size, hidden_size, embed_size):
        super(AttentionModel, self).__init__()
        
        self.hidden_size = hidden_size
        self.Ty = Ty
        
        self.embed = layers.Embedding(input_size, embed_size)
        
        self.att_layer = AttentionLayer(Tx)
        
        self.bidir = layers.Bidirectional(layers.LSTM(att_size, return_sequences=True))
        
        self.post_activation = layers.LSTM(hidden_size, return_state=True)
        
        self.fc = layers.Dense(output_size, activation='softmax')
        
    def call(self, x):
        
        s = tf.zeros(shape=(x.shape[0],self.hidden_size))
        c = tf.zeros(shape=(x.shape[0],self.hidden_size))
        
        a = self.bidir(self.embed(x))
        
        outputs = []
        
        for t in range(Ty):
            
            context = self.att_layer(a, s)
            
            s, hid, c = self.post_activation(context, initial_state=[s,c])
            
            out = self.fc(hid)
            
            outputs.append(out)
            
        return tf.transpose(tf.stack(outputs), [1,0,2])

In [47]:
att_size = 64
hidden_size = 128
Tx = X.shape[1]
Ty = y.shape[1]
input_size = len(en_int_to_vocab)
output_size = len(fr_int_to_vocab)
embed_size = 100
epochs = 30
batch_size = 64

In [48]:
model = AttentionModel(Tx, Ty, input_size, output_size, att_size, hidden_size, embed_size)

In [54]:
loss_object = tf.keras.losses.SparseCategoricalCrossentropy()
optimizer = tf.keras.optimizers.Adam()

ckpt = tf.train.Checkpoint(step=tf.Variable(0), optimizer=optimizer, net=model)
manager = tf.train.CheckpointManager(ckpt, './tf_ckpts', max_to_keep=3)

In [55]:
def get_batches(X, y, batch_size):
    for i in range(0, len(X), batch_size):
        yield X[i:i+batch_size], y[i:i+batch_size]

In [60]:
X[0]

array([  1,  70,  77,  97, 190,   0,  58,  68, 139, 222,  77,  48,  78,
       227, 158,  81,   0,   0], dtype=int32)

In [84]:
def sample_from_model(X, y, model):
    
    idx = np.random.choice(np.arange(len(X)), size=1)[0]
    output = tf.argmax(model(X[idx:idx+1]), axis=-1).numpy()
    output = np.reshape(output, -1).tolist()
    en_stn = []
    fr_stn = []
    model_stn = []
    
    for i in X[idx]:
        if en_int_to_vocab[i]=='<EOS>':
            break
        en_stn.append(en_int_to_vocab[i])
        
    for i in y[idx]:
        if fr_int_to_vocab[i]=='<EOS>':
            break
        fr_stn.append(fr_int_to_vocab[i])
        
    for i in output:
        if fr_int_to_vocab[i]=='<EOS>':
            break
        model_stn.append(fr_int_to_vocab[i])
      
    print()
    print('-English sentence: ', end='')
    print(' '.join(en_stn))
    print('-French translation: ', end='')
    print(' '.join(fr_stn))
    print('-Model translation: ', end='')
    print(' '.join(model_stn))
    print()

In [None]:
losses = []

ckpt.restore(manager.latest_checkpoint)
if manager.latest_checkpoint:
    print("Restored from {}".format(manager.latest_checkpoint))
else:
    print("Initializing from scratch.")
    
    
for e in range(epochs):
    for inputs, targets in get_batches(X, y, batch_size):
        
        ckpt.step.assign_add(1)
        
        with tf.GradientTape() as tape:
            outputs = model(inputs)
            loss = loss_object(targets, outputs)
        
        grads = tape.gradient(loss, model.trainable_variables)
        
        losses.append(loss/inputs.shape[0])
        
        optimizer.apply_gradients(zip(grads, model.trainable_variables))
        
        if int(ckpt.step)%100==0:
            print("Epoch: {}/{} ... Avg. Loss: {}".format(e+1, epochs, np.mean(losses[-100:])))
                  
            save_path = manager.save()
            print("Saved checkpoint for step {}: {}".format(int(ckpt.step), save_path))
            
        if int(ckpt.step)%1000==0:
            sample_from_model(X, y, model)

Restored from ./tf_ckpts/ckpt-13
Epoch: 1/30 ... Avg. Loss: 0.021463120356202126
Saved checkpoint for step 1400: ./tf_ckpts/ckpt-14
Epoch: 1/30 ... Avg. Loss: 0.020839454606175423
Saved checkpoint for step 1500: ./tf_ckpts/ckpt-15
Epoch: 1/30 ... Avg. Loss: 0.020445620641112328
Saved checkpoint for step 1600: ./tf_ckpts/ckpt-16
Epoch: 1/30 ... Avg. Loss: 0.019953647628426552
Saved checkpoint for step 1700: ./tf_ckpts/ckpt-17
Epoch: 1/30 ... Avg. Loss: 0.019731489941477776
Saved checkpoint for step 1800: ./tf_ckpts/ckpt-18
Epoch: 1/30 ... Avg. Loss: 0.01939842291176319
Saved checkpoint for step 1900: ./tf_ckpts/ckpt-19
Epoch: 1/30 ... Avg. Loss: 0.018929995596408844
Saved checkpoint for step 2000: ./tf_ckpts/ckpt-20

-English sentence: california is sometimes dry during summer , but it is never warm in february .
-French translation: californie est parfois sec pendant l' été , mais il est jamais chaud en février .
-Model translation: paris est jamais agréable au mois de , mais il est es