In [29]:
''' 
 I borrowed a lot from the official TensorFlow tutorial:
 https://www.tensorflow.org/tutorials/text/nmt_with_attention
'''

import tensorflow as tf

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split

import unicodedata
import re
import numpy as np
import os
import io
import time


#################### Ignore from this part to ####################


# Download the Spanish-English sentence-pair archive and extract it.
# The archive is requested over HTTPS rather than plain HTTP so the data
# cannot be altered in transit.
path_to_zip = tf.keras.utils.get_file(
    'spa-eng.zip',
    origin='https://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip',
    extract=True)

# Build the path with os.path.join instead of hard-coding "/" separators.
path_to_file = os.path.join(os.path.dirname(path_to_zip), "spa-eng", "spa.txt")

def unicode_to_ascii(s):
  """Strip accents from `s`.

  The string is decomposed with NFD normalization and every character whose
  Unicode category is 'Mn' is dropped; all other characters are kept in
  their original order.
  """
  decomposed = unicodedata.normalize('NFD', s)
  kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
  return ''.join(kept)


def preprocess_sentence(w):
  """Normalize one raw sentence and wrap it in '<start>' / '<end>' markers.

  Steps, in order: lower-case, trim and strip accents; put a space on both
  sides of each of ? . ! , ¿ ; collapse runs of spaces and double quotes into
  one space; replace every run of characters other than a-z, A-Z, ? . ! , ¿
  with one space; trim again.
  """
  cleaned = unicode_to_ascii(w.lower().strip())

  # (pattern, replacement) pairs, applied in this order.
  # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
  substitutions = (
      (r"([?.!,¿])", r" \1 "),     # eg: "he is a boy." => "he is a boy ."
      (r'[" "]+', " "),            # collapse runs of spaces / double quotes
      (r"[^a-zA-Z?.!,¿]+", " "),   # drop everything but letters and kept punctuation
  )
  for pattern, replacement in substitutions:
    cleaned = re.sub(pattern, replacement, cleaned)

  # The start and end tokens tell the model when to start and stop predicting.
  return '<start> ' + cleaned.strip() + ' <end>'

def create_dataset(path, num_examples):
  """Read tab-separated sentence pairs from `path` and preprocess them.

  Every field of every selected line is passed through preprocess_sentence
  (accent removal, cleaning, start/end tokens).

  Args:
    path: Path to a UTF-8 text file with one tab-separated record per line.
    num_examples: Number of leading lines to use; None uses every line.

  Returns:
    The result of `zip(*word_pairs)`: one tuple per tab-separated column,
    each holding the preprocessed sentences of that column. For spa.txt the
    columns are [ENGLISH, SPANISH].
  """
  # Read inside a context manager so the file handle is closed as soon as the
  # text has been loaded, instead of being left to the garbage collector.
  with io.open(path, encoding='UTF-8') as f:
    lines = f.read().strip().split('\n')

  word_pairs = [[preprocess_sentence(w) for w in l.split('\t')]
                for l in lines[:num_examples]]

  return zip(*word_pairs)

# Preprocess every line of the corpus (num_examples=None) into one tuple of
# English sentences and one tuple of Spanish sentences.
# NOTE(review): `en` and `sp` are not referenced again in the visible part of
# this notebook — confirm they are still needed before keeping this full pass.
en, sp = create_dataset(path_to_file, None)

def max_length(tensor):
    """Return the length of the longest element of `tensor`."""
    return max(map(len, tensor))

def tokenize(lang):
  """Fit a Keras tokenizer on `lang` and encode it as a padded id matrix.

  Returns:
    (tensor, lang_tokenizer): the integer sequences padded at the end
    (padding='post') and the fitted tokenizer.
  """
  # filters='' is passed so the tokenizer applies no character filtering of
  # its own; the sentences are expected to be preprocessed already.
  lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
  lang_tokenizer.fit_on_texts(lang)

  sequences = lang_tokenizer.texts_to_sequences(lang)
  padded = tf.keras.preprocessing.sequence.pad_sequences(sequences, padding='post')

  return padded, lang_tokenizer

def load_dataset(path, num_examples=None):
    """Load sentence pairs from `path` and tokenize both languages.

    Returns:
        (input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer)
    """
    # create_dataset returns the columns in file order: the first one is used
    # as the target language and the second one as the input language.
    targ_sentences, inp_sentences = create_dataset(path, num_examples)

    # Input side is tokenized first, then the target side.
    (input_tensor, inp_lang_tokenizer), (target_tensor, targ_lang_tokenizer) = (
        tokenize(inp_sentences),
        tokenize(targ_sentences),
    )

    return input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer

# Try experimenting with the size of that dataset
# (only the first `num_examples` lines of the corpus are used).
num_examples = 30000
input_tensor, target_tensor, inp_lang, targ_lang = load_dataset(path_to_file, num_examples)

# Longest (padded) sequence length of the target and of the input tensors
max_length_targ, max_length_inp = max_length(target_tensor), max_length(input_tensor)

# Creating training and validation sets using an 80-20 split
# NOTE(review): no random_state is passed, so the split — and therefore every
# example printed from the training set below — differs from run to run.
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.2)

def convert(lang, tensor):
  """Print "<id> ----> <word>" for every non-zero id in `tensor`.

  Ids equal to 0 are skipped; every other id is looked up in
  `lang.index_word`.
  """
  for token_id in (t for t in tensor if t != 0):
    print("%d ----> %s" % (token_id, lang.index_word[token_id]))
    

#################### this part. ####################


In [2]:
'''
 My article is not on NLP, so please just suppose that you get the four tensors below. 
 These shapes mean that the max length of the input sentences is 16, 
 and the max length of the target sentences is 11. 
'''
input_tensor_train.shape, input_tensor_val.shape, target_tensor_train.shape, target_tensor_val.shape

((24000, 16), (6000, 16), (24000, 11), (6000, 11))

In [3]:
'''
 Each sentence is encoded as shown below.
 Each row denotes a sentence, and each integer denotes a token, in this case a word.
'''
input_tensor_train

array([[   1,   54,   18, ...,    0,    0,    0],
       [   1,    6,   29, ...,    0,    0,    0],
       [   1,    8,  494, ...,    0,    0,    0],
       ...,
       [   1,   29, 1243, ...,    0,    0,    0],
       [   1, 1626, 1670, ...,    0,    0,    0],
       [   1,   26,   24, ...,    0,    0,    0]], dtype=int32)

In [4]:
'''
 Let's see the first Spanish sentence of training data. 
 The integer '1' and '2' correspond to "<start>" and "<end>" respectively. 
'''
input_tensor_train[0]

array([  1,  54,  18, 380,  14, 395,   3,   2,   0,   0,   0,   0,   0,
         0,   0,   0], dtype=int32)

In [5]:
'''
 You can see what each code denotes with convert()
'''
convert(inp_lang, input_tensor_train[0])

1 ----> <start>
54 ----> ellos
18 ----> lo
380 ----> quieren
14 ----> de
395 ----> vuelta
3 ----> .
2 ----> <end>


In [6]:
'''
 Let's see the first and last 8 tokens in the dictionary. 
'''
list(inp_lang.word_index.items())[:8]

[('<start>', 1),
 ('<end>', 2),
 ('.', 3),
 ('tom', 4),
 ('?', 5),
 ('¿', 6),
 ('es', 7),
 ('no', 8)]

In [7]:
list(inp_lang.word_index.items())[-8:]

[('caminamos', 9406),
 ('divertir', 9407),
 ('divertiremos', 9408),
 ('divertirnos', 9409),
 ('decepcionaremos', 9410),
 ('viviremos', 9411),
 ('reyes', 9412),
 ('perderemos', 9413)]

In [8]:
'''
 Let's conversely encode a Spanish sentence into integer codes. 
 For the neural net which we are going to make, "Todo sobre mi madre." is represented as 
 [1, 74, 514, 19, 237, 3, 2]
''' 
# Normalize the sentence with the same preprocessing used for the corpus.
sentence = preprocess_sentence('Todo sobre mi madre.')
# Look up the id of each space-separated token; a token that is not in
# inp_lang.word_index raises KeyError here.
inputs = [inp_lang.word_index[i] for i in sentence.split(' ')]
print(inputs)

[1, 74, 514, 19, 237, 3, 2]


In [9]:
# Shuffle buffer covers the whole training set.
BUFFER_SIZE = len(input_tensor_train)
BATCH_SIZE = 64
# Number of full batches per epoch (integer division drops the remainder).
steps_per_epoch = len(input_tensor_train)//BATCH_SIZE
embedding_dim = 256 # Each token id is mapped to a 256-dimensional embedding vector.
units = 1024 # The dimension of the hidden state/vector
# +1 because token ids start at 1 and id 0 is used for padding.
vocab_inp_size = len(inp_lang.word_index)+1 # 9414
vocab_tar_size = len(targ_lang.word_index)+1 # 4935

# Pair each input sentence with its target sentence and shuffle the pairs.
dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
# drop_remainder=True keeps every batch at exactly BATCH_SIZE rows.
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

In [10]:
dataset

<BatchDataset shapes: ((64, 16), (64, 11)), types: (tf.int32, tf.int32)>

In [11]:
class Encoder(tf.keras.Model):
  """GRU encoder: token ids -> embeddings -> per-step outputs and final state."""

  def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
    """
    Args:
      vocab_size: Number of distinct token ids of the input language.
      embedding_dim: Size of each token embedding vector.
      enc_units: Size of the GRU hidden state.
      batch_sz: Batch size, used to build the initial hidden state.
    """
    super(Encoder, self).__init__()
    self.batch_sz = batch_sz    # sentences per batch (64 in this notebook)
    self.enc_units = enc_units  # size of the GRU hidden state (1024 in this notebook)

    # Maps every token id to a trainable embedding_dim-dimensional vector
    # (9414 ids -> 256 dimensions in this notebook).
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)

    # We use an RNN layer named GRU for this seq2seq translation model.
    # At each time step the GRU takes one embedding_dim-dimensional vector and
    # produces an enc_units-dimensional output, passing its hidden state on to
    # the next step.
    #   return_sequences=True -> the output of every time step is returned
    #   return_state=True     -> the final hidden state is returned as well
    self.gru = tf.keras.layers.GRU(self.enc_units,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')

  def call(self, x, hidden):
    """Encode a batch of token-id sequences.

    Args:
      x: Token ids, shape (batch_size, max_length).
      hidden: Initial GRU state, shape (batch_size, enc_units).

    Returns:
      (output, state): output has shape (batch_size, max_length, enc_units),
      state has shape (batch_size, enc_units).
    """
    x = self.embedding(x)
    output, state = self.gru(x, initial_state=hidden)
    return output, state

  def initialize_hidden_state(self):
    """Return an all-zero initial state of shape (batch_sz, enc_units)."""
    return tf.zeros((self.batch_sz, self.enc_units))

In [12]:
# Build the encoder with this notebook's hyperparameters.
encoder = Encoder(vocab_inp_size, # 9414
                  embedding_dim, # 256
                  units, # 1024
                  BATCH_SIZE # 64
                 )

In [13]:
class BahdanauAttention(tf.keras.layers.Layer):
  """Attention layer that scores `values` against a `query` with V(tanh(W1 + W2))."""

  def __init__(self, units):
    super(BahdanauAttention, self).__init__()
    self.W1 = tf.keras.layers.Dense(units)
    self.W2 = tf.keras.layers.Dense(units)
    self.V = tf.keras.layers.Dense(1)

  def call(self, query, values):
    """Return (context_vector, attention_weights).

    query:  (batch_size, hidden size)
    values: (batch_size, max_len, hidden size)
    context_vector:    (batch_size, hidden_size)
    attention_weights: (batch_size, max_length, 1)
    """
    # Add a time axis -> (batch_size, 1, hidden size) so the projected query
    # broadcasts along the time axis when it is added to the projected values.
    query_with_time_axis = tf.expand_dims(query, 1)

    # Both projections have `units` features on the last axis; their sum is
    # (batch_size, max_length, units).
    projected_values = self.W1(values)
    projected_query = self.W2(query_with_time_axis)

    # self.V reduces the last axis to 1 -> (batch_size, max_length, 1)
    score = self.V(tf.nn.tanh(projected_values + projected_query))

    # Normalize over the time axis so the weights of one sentence sum to 1.
    attention_weights = tf.nn.softmax(score, axis=1)

    # Weighted sum of the values over the time axis.
    context_vector = tf.reduce_sum(attention_weights * values, axis=1)

    return context_vector, attention_weights

In [14]:
class Decoder(tf.keras.Model):
  """GRU decoder with attention that predicts one target token per call."""

  def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
    """
    Args:
      vocab_size: Number of distinct token ids of the target language.
      embedding_dim: Size of each token embedding vector.
      dec_units: Size of the GRU hidden state.
      batch_sz: Batch size (stored only; not used by this class itself).
    """
    super(Decoder, self).__init__()
    self.batch_sz = batch_sz
    self.dec_units = dec_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(self.dec_units,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')
    # Projects the GRU output to one logit per target-vocabulary entry.
    self.fc = tf.keras.layers.Dense(vocab_size)

    # used for attention
    self.attention = BahdanauAttention(self.dec_units)

  def call(self, x, hidden, enc_output):
    """Run a single decoding step.

    Args:
      x: Previous target token ids, shape (batch_size, 1).
      hidden: Previous decoder state, shape (batch_size, dec_units). For the
        first step this is the encoder's final state, so the encoder and the
        decoder must use the same number of units.
      enc_output: Encoder outputs, shape (batch_size, max_length, hidden_size).

    Returns:
      (logits, state, attention_weights): logits has shape (batch_size, vocab),
      state has shape (batch_size, dec_units) and attention_weights has shape
      (batch_size, max_length, 1).
    """
    # Attend over the encoder outputs, using the previous state as the query.
    context_vector, attention_weights = self.attention(hidden, enc_output)

    # x shape after passing through embedding == (batch_size, 1, embedding_dim)
    x = self.embedding(x)

    # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
    x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

    # Continue the recurrence from the previous decoder state. Without
    # initial_state the GRU would start from zeros on every call (each call
    # sees a length-1 sequence), so `hidden` would only influence attention.
    output, state = self.gru(x, initial_state=hidden)

    # output shape == (batch_size * 1, hidden_size)
    output = tf.reshape(output, (-1, output.shape[2]))

    # output shape == (batch_size, vocab)
    x = self.fc(output)

    return x, state, attention_weights

In [15]:
# Decoder over the target vocabulary; embedding_dim, units and BATCH_SIZE are
# the same values that are passed to the encoder.
decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)

In [16]:
vocab_tar_size

4935

In [17]:
vocab_inp_size

9414

In [18]:
optimizer = tf.keras.optimizers.Adam()
# reduction='none' keeps one loss value per example so that padded positions
# can be masked out in loss_function.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    reduction='none', from_logits=True)

def loss_function(real, pred):
  """Cross-entropy of logits `pred` against ids `real`, with padding masked.

  Positions where `real` is 0 contribute zero loss. The mean is still taken
  over every position, masked or not.
  """
  per_example = loss_object(real, pred)

  # 1.0 where the target is a real token, 0.0 where it is padding (id 0).
  not_padding = tf.cast(tf.math.not_equal(real, 0), dtype=per_example.dtype)

  return tf.reduce_mean(per_example * not_padding)

In [19]:
# Checkpoints are written under ./training_checkpoints with the prefix "ckpt".
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
# Track the optimizer and both models so their state is saved together.
# NOTE(review): the checkpoint holds the `optimizer` object that exists when
# this cell runs; rebinding the name `optimizer` afterwards leaves the
# checkpoint tracking the old object.
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

In [20]:
# NOTE: `optimizer` is deliberately NOT re-created in this cell. `checkpoint`
# already tracks the optimizer instance built earlier in the notebook; binding
# the name to a fresh Adam() here would make train_step update an optimizer
# whose state is never written by checkpoint.save().
#
# The loss definitions below repeat the earlier cell and are kept only so this
# cell still defines the names it used to define.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
  """Cross-entropy of logits `pred` against ids `real`, with padding masked.

  Positions where `real` is 0 contribute zero loss. The mean is still taken
  over every position, masked or not.
  """
  mask = tf.math.logical_not(tf.math.equal(real, 0))
  loss_ = loss_object(real, pred)

  mask = tf.cast(mask, dtype=loss_.dtype)
  loss_ *= mask

  return tf.reduce_mean(loss_)


In [23]:
@tf.function
def train_step(inp, targ, enc_hidden):
  """Run one teacher-forced optimization step on a batch.

  Args:
    inp: Input token ids, shape (batch size, max input length) (=(64, 16)).
    targ: Target token ids, shape (batch size, max output length) (=(64, 11)).
    enc_hidden: Initial hidden state for the encoder.

  Returns:
    The summed per-step loss divided by the target length. The gradients are
    computed from the undivided sum.
  """
  loss = 0

  with tf.GradientTape() as tape:
    # Encode the input sentences; the encoder's final state becomes the
    # decoder's initial state.
    enc_output, enc_hidden = encoder(inp, enc_hidden)
    dec_hidden = enc_hidden

    # Every sentence starts decoding from the '<start>' token.
    start_id = targ_lang.word_index['<start>']
    dec_input = tf.expand_dims([start_id] * BATCH_SIZE, 1)

    # Teacher forcing - feeding the target as the next input
    for t in range(1, targ.shape[1]):
      # The encoder outputs are needed to calculate the attention.
      predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)

      step_target = targ[:, t]
      loss += loss_function(step_target, predictions)

      # The true token at step t is the decoder input for step t + 1.
      dec_input = tf.expand_dims(step_target, 1)

  batch_loss = loss / int(targ.shape[1])

  # Update the weights of both models from the gradients of the summed loss.
  trainable_vars = encoder.trainable_variables + decoder.trainable_variables
  grads = tape.gradient(loss, trainable_vars)
  optimizer.apply_gradients(zip(grads, trainable_vars))

  return batch_loss


In [None]:
# Number of full passes over the training dataset.
EPOCHS = 10

for epoch in range(EPOCHS):
  start = time.time()
  # The encoder's initial hidden state is an all-zero tensor of shape
  # (BATCH_SIZE, units); the same zero state is passed to every batch of the
  # epoch because train_step only rebinds its own local copy.
  enc_hidden = encoder.initialize_hidden_state()
  total_loss = 0

  for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
    '''
     You input a (batch size, max input length) (=(64, 16)) matrix as an input
     and a (batch size, max output length) (=(64, 11)) as an output, and get a loss. 
     'enc_hidden' is the last 'units' dimensional hidden state vector (1024 dimensional) of the encoder. 
    '''
    # Note: the `enc_hidden` passed here is the zero initial state created
    # above, not a state carried over from the previous batch.
    batch_loss = train_step(inp, targ, enc_hidden)
    total_loss += batch_loss

    # Report progress every 100 batches.
    if batch % 100 == 0:
      print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                   batch,
                                                   batch_loss.numpy()))
  # saving (checkpoint) the model every 2 epochs
  if (epoch + 1) % 2 == 0:
    checkpoint.save(file_prefix = checkpoint_prefix)

  # Mean of the per-batch losses over the epoch.
  print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                      total_loss / steps_per_epoch))
  print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))
