In [None]:
%tensorflow_version 2.x
import tensorflow as tf
from nltk.translate.bleu_score import sentence_bleu
import collections
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split
import unicodedata
import re
import numpy as np
import os
import io
import time

Dataset: Spanish–English sentence pairs (`spa-eng.zip`), downloaded from the URL used in the next cell.

In [None]:
# Download (and cache) the Spanish-English archive and let Keras extract it.
zip_path = tf.keras.utils.get_file('spa-eng.zip', origin='http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip', extract=True)

# The corpus is expected at spa-eng/spa.txt next to the downloaded archive.
# os.path.join is used so the separator is correct on every platform.
file_path = os.path.join(os.path.dirname(zip_path), "spa-eng", "spa.txt")

Preprocessing the data

In [None]:
def unicode_to_ascii(s):
  """Strip accents from `s`.

  The string is decomposed with Unicode NFD normalization and every
  non-spacing combining mark (category 'Mn') is dropped, so an accented
  letter collapses to its base letter.
  """
  decomposed = unicodedata.normalize('NFD', s)
  kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
  return ''.join(kept)
  
def preprocess_sentence(w):
  """Normalize one sentence for tokenization.

  Lower-cases and strips `w`, removes accents, then applies three regex
  passes in order and strips the result. No start/end markers are added.
  """
  cleaned = unicode_to_ascii(w.lower().strip())

  substitutions = (
      (r"([?.!,¿])", r" \1 "),          # put a space before and after each punctuation mark
      (r'[" "]+', " "),                 # collapse runs of spaces (and double quotes) into one space
      (r"[^a-zA-Z?.!,¿]+", " "),        # replace every other run of characters with a space
  )
  for pattern, replacement in substitutions:
    cleaned = re.sub(pattern, replacement, cleaned)

  return cleaned.strip()
 
def preprocess_sentence2(w):
  """Clean `w` exactly like `preprocess_sentence`, then wrap it in markers.

  Returns the cleaned sentence surrounded by '<start> ' and ' <end>' so the
  model knows where a sequence begins and ends.

  NOTE(review): the cleaning step keeps only letters and ?.!,¿ - any other
  character, including a '|' sub-word separator, is turned into a space.
  """
  # Delegate to the shared cleaner instead of duplicating its regex pipeline.
  return '<start> ' + preprocess_sentence(w) + ' <end>'

Change the number of sentences in the following block

In [None]:
def create_dataset(path, num_examples):
  """Read a tab-separated parallel corpus and return its cleaned columns.

  Args:
    path: UTF-8 text file with one tab-separated sentence pair per line.
    num_examples: number of leading lines to use (None uses all of them).

  Returns:
    A zip iterator yielding one tuple of cleaned sentences per column.
  """
  # Use a context manager so the file handle is closed deterministically.
  with io.open(path, encoding='UTF-8') as corpus_file:
    lines = corpus_file.read().strip().split('\n')

  word_pairs = [[preprocess_sentence(w) for w in l.split('\t')]  for l in lines[:num_examples]]
  # Print one sample pair as a sanity check; skipped when the subset is too
  # small to contain index 25 (previously an IndexError).
  if len(word_pairs) > 25:
    print(word_pairs[25])
  return zip(*word_pairs)
 
# Load the whole corpus; pass an integer instead of None to use fewer sentences.
en, sp = create_dataset(file_path, None)          #change number of sentences here

# Keep references to the word-level sentences under separate names.
en2, sp2 = en, sp

['i ran .', 'corria .']


Implementation to Learn the Byte Pair Encoding

In [None]:
from collections import defaultdict, Counter
import re, copy

def get_vocabulary(fobj):
    """Count whitespace-separated tokens over an iterable of text lines.

    Returns a Counter mapping each token to its number of occurrences.
    """
    token_counts = Counter()
    for text_line in fobj:
        token_counts.update(text_line.split())
    return token_counts

def update_pair_statistics(pair, changed, stats, indices):
    """Minimally update the indices and frequency of symbol pairs

    if we merge a pair of symbols, only pairs that overlap with occurrences
    of this pair are affected, and need to be updated.

    pair: the (first, second) symbol pair that was just merged.
    changed: iterable of (j, word, old_word, freq) tuples, where j is the
        word's position in the vocabulary list, word is the symbol tuple
        after the merge and old_word the symbol tuple before it.
    stats: mapping from symbol pair to frequency; updated in place.
    indices: mapping from symbol pair to {word position: count}; updated in place.
    """
    # the merged pair itself no longer occurs anywhere
    stats[pair] = 0
    indices[pair] = defaultdict(int)
    first, second = pair
    new_pair = first+second
    for j, word, old_word, freq in changed:

        # find all instances of pair, and update frequency/indices around it
        # (pass 1: remove the neighbouring pairs that existed before the merge)
        i = 0
        while True:
            try:
                i = old_word.index(first, i)
            except ValueError:
                break
            if i < len(old_word)-1 and old_word[i+1] == second:
                if i:
                    # pair formed by the symbol to the left and `first`
                    prev = old_word[i-1:i+1]
                    stats[prev] -= freq
                    indices[prev][j] -= 1
                if i < len(old_word)-2:
                    # don't double-count consecutive pairs
                    if old_word[i+2] != first or i >= len(old_word)-3 or old_word[i+3] != second:
                        # pair formed by `second` and the symbol to its right
                        nex = old_word[i+1:i+3]
                        stats[nex] -= freq
                        indices[nex][j] -= 1
                # skip past the matched pair
                i += 2
            else:
                i += 1

        # pass 2: add the neighbouring pairs that exist around the merged symbol
        i = 0
        while True:
            try:
                i = word.index(new_pair, i)
            except ValueError:
                break
            if i:
                # pair formed by the symbol to the left and the merged symbol
                prev = word[i-1:i+1]
                stats[prev] += freq
                indices[prev][j] += 1
            # don't double-count consecutive pairs
            if i < len(word)-1 and word[i+1] != new_pair:
                # pair formed by the merged symbol and the symbol to its right
                nex = word[i:i+2]
                stats[nex] += freq
                indices[nex][j] += 1
            i += 1


def get_pair_statistics(vocab):
    """Build the initial pair-frequency table and the pair -> word index.

    `vocab` is a sequence of (symbol tuple, frequency) entries. Returns
    (stats, indices): stats maps each adjacent symbol pair to its total
    frequency, indices maps each pair to {entry position: occurrences}.
    """
    pair_counts = defaultdict(int)
    pair_to_words = defaultdict(lambda: defaultdict(int))

    for word_id, entry in enumerate(vocab):
        symbols, count = entry
        left = symbols[0]
        for right in symbols[1:]:
            bigram = (left, right)
            pair_counts[bigram] += count
            pair_to_words[bigram][word_id] += 1
            left = right

    return pair_counts, pair_to_words


def replace_pair(pair, vocab, indices):
    """Merge every occurrence of the symbol pair ('A', 'B') into the symbol 'AB'.

    Only the vocabulary entries listed in indices[pair] with a positive count
    are rewritten (in place). Returns a list of
    (position, new_word, old_word, freq) tuples, one per rewritten entry.
    """
    first, second = pair
    merged = first + second
    # Match the two symbols only where each one is a complete space-delimited token.
    boundary_pattern = re.compile(r'(?<!\S)' + re.escape(first + ' ' + second) + r'(?!\S)')

    changes = []
    for j, occurrences in indices[pair].items():
        if occurrences < 1:
            continue
        old_word, freq = vocab[j]
        spaced = ' '.join(old_word)
        # A callable replacement inserts `merged` literally (no escape processing).
        new_word = tuple(boundary_pattern.sub(lambda _match: merged, spaced).split())

        vocab[j] = (new_word, freq)
        changes.append((j, new_word, old_word, freq))

    return changes

def prune_stats(stats, big_stats, threshold):
    """Prune statistics dict for efficiency of max()

    The frequency of a symbol pair never increases, so pruning is generally safe
    (until the most frequent pair is less frequent than a pair we previously pruned).
    big_stats keeps full statistics for when we need to access pruned items.

    stats: pair -> frequency mapping; entries below `threshold` are deleted.
    big_stats: full pair -> frequency mapping that receives the pruned values.
    threshold: minimum frequency an entry needs to stay in `stats`.
    """
    # Iterate over a snapshot because entries are deleted during the loop.
    for item, freq in list(stats.items()):
        if freq < threshold:
            del stats[item]
            # Write to the mapping passed in by the caller (not a module-level
            # global), so each language keeps its own full statistics.
            if freq < 0:
                # a negative value is a pending delta: apply it to the stored count
                big_stats[item] += freq
            else:
                big_stats[item] = freq


#ENGLISH


vocab_en = get_vocabulary(en)
# Represent every word as a tuple of characters followed by the '</w>' end-of-word marker.
vocab_en = dict([(tuple(x)+('</w>',) ,y) for (x,y) in vocab_en.items()])
sorted_vocab_en = sorted(vocab_en.items(), key=lambda x: x[1], reverse=True)

stats_en, indices_en = get_pair_statistics(sorted_vocab_en)
big_stats_en = copy.deepcopy(stats_en)
# threshold is inspired by Zipfian assumption, but should only affect speed
threshold_en = max(stats_en.values()) / 10

# Maximum number of merge operations to learn.
symbols_en = 3000

# The context manager guarantees the merge list is flushed and closed before
# codes_en.txt is opened for reading elsewhere in the notebook.
with open("codes_en.txt" , "w") as outF_en:
    for i in range(symbols_en):
        if stats_en:
            most_frequent_en = max(stats_en, key=stats_en.get)

        # we probably missed the best pair because of pruning; go back to full statistics
        if not stats_en or (i and stats_en[most_frequent_en] < threshold_en):
            prune_stats(stats_en, big_stats_en, threshold_en)
            stats_en = copy.deepcopy(big_stats_en)
            most_frequent_en = max(stats_en, key=stats_en.get)
            # threshold_en is inspired by Zipfian assumption, but should only affect speed
            threshold_en = stats_en[most_frequent_en] * i/(i+10000.0)
            prune_stats(stats_en, big_stats_en, threshold_en)

        if stats_en[most_frequent_en] < 2:
            print('no pair has frequency > 1. Stopping\n')
            break

        # Record the merge, apply it to the vocabulary and refresh the statistics.
        outF_en.write('{0} {1}\n'.format(*most_frequent_en))
        changes_en = replace_pair(most_frequent_en, sorted_vocab_en, indices_en)
        update_pair_statistics(most_frequent_en, changes_en, stats_en, indices_en)
        stats_en[most_frequent_en] = 0
        if not i % 100:
            prune_stats(stats_en, big_stats_en, threshold_en)


#SPANISH


vocab_sp = get_vocabulary(sp)
# Represent every word as a tuple of characters followed by the '</w>' end-of-word marker.
vocab_sp = dict([(tuple(x)+('</w>',) ,y) for (x,y) in vocab_sp.items()])
sorted_vocab_sp = sorted(vocab_sp.items(), key=lambda x: x[1], reverse=True)

stats_sp, indices_sp = get_pair_statistics(sorted_vocab_sp)
big_stats_sp = copy.deepcopy(stats_sp)
# threshold is inspired by Zipfian assumption, but should only affect speed
threshold_sp = max(stats_sp.values()) / 10

# Maximum number of merge operations to learn.
symbols_sp = 3000

# The context manager guarantees the merge list is flushed and closed before
# codes_sp.txt is opened for reading elsewhere in the notebook.
with open("codes_sp.txt" , "w") as outF_sp:
    for i in range(symbols_sp):
        if stats_sp:
            most_frequent_sp = max(stats_sp, key=stats_sp.get)

        # we probably missed the best pair because of pruning; go back to full statistics
        if not stats_sp or (i and stats_sp[most_frequent_sp] < threshold_sp):
            prune_stats(stats_sp, big_stats_sp, threshold_sp)
            stats_sp = copy.deepcopy(big_stats_sp)
            most_frequent_sp = max(stats_sp, key=stats_sp.get)
            # threshold_sp is inspired by Zipfian assumption, but should only affect speed
            threshold_sp = stats_sp[most_frequent_sp] * i/(i+10000.0)
            prune_stats(stats_sp, big_stats_sp, threshold_sp)

        if stats_sp[most_frequent_sp] < 2:
            print('no pair has frequency > 1. Stopping\n')
            break

        # Record the merge, apply it to the vocabulary and refresh the statistics.
        outF_sp.write('{0} {1}\n'.format(*most_frequent_sp))
        changes_sp = replace_pair(most_frequent_sp, sorted_vocab_sp, indices_sp)
        update_pair_statistics(most_frequent_sp, changes_sp, stats_sp, indices_sp)
        stats_sp[most_frequent_sp] = 0
        if not i % 100:
            prune_stats(stats_sp, big_stats_sp, threshold_sp)

Implementation of the Encoder System using the Learned Codes from BPE

In [None]:
class BPE(object):
    """Segments whitespace-tokenized text using a list of learned BPE merges."""

    def __init__(self, codes, separator='@@'):
        """Load merge operations.

        codes: iterable of lines, each holding the whitespace-separated
            symbols of one merge; earlier lines get a lower (better) rank.
        separator: string appended to every sub-word that is not word-final.
        """
        merges = [tuple(entry.split()) for entry in codes]
        # Walk the list backwards so that, for a duplicated merge, the rank of
        # its first occurrence is written last and therefore wins.
        self.bpe_codes = {}
        for rank in range(len(merges) - 1, -1, -1):
            self.bpe_codes[merges[rank]] = rank

        self.separator = separator

    def segment(self, sentence):
        """Return `sentence` with every word split into its BPE sub-words."""
        pieces = []
        for token in sentence.split():
            subwords = encode(token, self.bpe_codes)
            # every sub-word except the last one carries the separator suffix
            pieces.extend(part + self.separator for part in subwords[:-1])
            pieces.append(subwords[-1])

        return ' '.join(pieces)

def get_pairs(word):
    """Return set of symbol pairs in a word.

    word is represented as tuple of symbols (symbols being variable-length strings).
    A word with fewer than two symbols yields an empty set.
    """
    return set(zip(word, word[1:]))


# Memoized segmentations, one cache per merge table:
#   id(bpe_codes) -> (bpe_codes, {word: segmentation})
# The table is stored next to its cache so it stays alive and its id() cannot
# be reused by a different table.
_ENCODE_CACHES = {}


def encode(orig, bpe_codes, cache=None):
    """Encode word based on list of BPE merge operations, which are applied consecutively

    orig: the word to segment (a string without whitespace).
    bpe_codes: mapping from symbol pair to merge rank (lower rank is applied first).
    cache: optional dict used for memoization. When omitted, a cache private to
        `bpe_codes` is used, so results learned with one merge table are never
        returned for another one (a single shared default dict used to mix
        them up).

    Returns a tuple of sub-word strings with the '</w>' end marker removed.
    """
    if cache is None:
        _, cache = _ENCODE_CACHES.setdefault(id(bpe_codes), (bpe_codes, {}))

    if orig in cache:
        return cache[orig]

    word = tuple(orig) + ('</w>',)

    # Keep merging until a single symbol is left or no known merge applies.
    while len(word) > 1:
        pairs = get_pairs(word)
        # pick the pair whose merge was learned earliest; unknown pairs rank last
        bigram = min(pairs, key = lambda pair: bpe_codes.get(pair, float('inf')))
        if bigram not in bpe_codes:
            break
        first, second = bigram
        new_word = []
        i = 0
        while i < len(word):
            try:
                j = word.index(first, i)
            except ValueError:
                # `first` does not occur again: copy the remainder unchanged
                new_word.extend(word[i:])
                break
            new_word.extend(word[i:j])
            i = j

            if i < len(word)-1 and word[i+1] == second:
                new_word.append(first+second)
                i += 2
            else:
                new_word.append(word[i])
                i += 1
        word = tuple(new_word)

    # don't print end-of-word symbols
    if word[-1] == '</w>':
        word = word[:-1]
    elif word[-1].endswith('</w>'):
        word = word[:-1] + (word[-1].replace('</w>',''),)

    cache[orig] = word
    return word


def _segment_corpus(sentences, codes_path, output_path, separator):
  """Segment `sentences` with the merges stored in `codes_path`.

  One segmented sentence is written per line to `output_path`. Both files are
  closed before returning, so the output is complete on disk when it is read
  back later. Returns the BPE object that was used.
  """
  # BPE reads every merge line in its constructor, so the codes file can be
  # closed right after construction.
  with open(codes_path , "r") as codes_file:
    bpe = BPE(codes_file, separator)

  with open(output_path , "w") as output_file:
    for line in sentences:
      output_file.write(bpe.segment(line).strip())
      output_file.write('\n')

  return bpe


#ENGLISH

seperator_en = '|'
bpe_en = _segment_corpus(en, "codes_en.txt", "output_en.txt", seperator_en)

#SPANISH

seperator_sp = '|'
bpe_sp = _segment_corpus(sp, "codes_sp.txt", "output_sp.txt", seperator_sp)


In [None]:
# Number of leading lines to show from each segmented file.
PREVIEW_LINES = 500

# Print the first PREVIEW_LINES lines of the English file, then of the Spanish
# file. Each file is closed as soon as its preview has been printed.
for preview_path in ("output_en.txt", "output_sp.txt"):
  with open(preview_path, "r") as preview_file:
    for line_number, line in enumerate(preview_file):
      if line_number >= PREVIEW_LINES:
        break
      print(line)

go .

go .

go .

go .

h| i .

run !

run .

who ?

fire !

fire !

fire !

help !

help !

help !

ju| mp !

ju| mp .

stop !

stop !

stop !

wait !

wait .

go on .

go on .

hel| l| o !

i ran .

i ran .

i try .

i won !

o| h no !

rela| x .

smile .

attack !

attack !

get up .

go now .

got it !

got it ?

got it ?

he ran .

ho| p in .

hu| g me .

i fell .

i know .

i left .

i lied .

i lost .

i quit .

i quit .

i work .

i m .

i m up .

listen .

listen .

listen .

no way !

no way !

no way !

no way !

no way !

no way !

no way !

no way !

no way !

no way !

really ?

really ?

thanks .

thanks .

try it .

we try .

we won .

why me ?

ask tom .

a| we| some !

be calm .

be cool .

be fair .

be kind .

be nice .

beat it .

call me .

call me .

call me .

call us .

come in .

come in .

come in .

come on !

come on .

come on .

drop it !

get tom .

get out !

get out .

get out .

get out .

get out .

get out .

go away !

go away !

go away !

go away

Replacing original dataset with "subworded" dataset

In [None]:
def subworded_dataset(path, num_examples):
  """Load segmented sentences and wrap each one in '<start>'/'<end>' markers.

  Args:
    path: UTF-8 text file with one sentence per line.
    num_examples: number of leading lines to use (None uses all of them).

  Returns:
    A list with each line passed through `preprocess_sentence2`.
  """
  # Use a context manager so the file handle is closed deterministically.
  with io.open(path, encoding='UTF-8') as segmented_file:
    lines = segmented_file.read().strip().split('\n')

  return [preprocess_sentence2(l) for l in lines[:num_examples]]

def subworded_dataset2(path, num_examples):
  """Load segmented sentences without adding start/end markers.

  Args:
    path: UTF-8 text file with one sentence per line.
    num_examples: number of leading lines to use (None uses all of them).

  Returns:
    A list with each line passed through `preprocess_sentence`.
  """
  # Use a context manager so the file handle is closed deterministically.
  with io.open(path, encoding='UTF-8') as segmented_file:
    lines = segmented_file.read().strip().split('\n')

  return [preprocess_sentence(l) for l in lines[:num_examples]]

# Re-bind `en`/`sp` to the first 10000 segmented sentences (with markers).
en, sp = [subworded_dataset(segmented_path, 10000)
          for segmented_path in ("output_en.txt", "output_sp.txt")]

# Compare one segmented pair with its word-level counterpart.
print(en[5],sp[5])
print(en2[5],sp2[5])

<start> run ! <end> <start> c or r e ! <end>
run ! corre !


The seq2seq with attention translation model

In [None]:
def tokenize(lang):
  """Fit a Keras Tokenizer on `lang` and convert it to padded id sequences.

  Returns (tensor, lang_tokenizer): the id sequences padded at the end to a
  common length, and the fitted tokenizer.
  """
  # filters='' disables the tokenizer's character filtering.
  lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
  lang_tokenizer.fit_on_texts(lang)

  sequences = lang_tokenizer.texts_to_sequences(lang)
  tensor = tf.keras.preprocessing.sequence.pad_sequences(sequences, padding='post')

  return tensor, lang_tokenizer

#Change input and output language here
input_tensor, inp_lang = tokenize(sp)
target_tensor, targ_lang = tokenize(en)

# Padded sequence lengths of the source and target sides.
max_length_inp = input_tensor.shape[1]
max_length_targ = target_tensor.shape[1]

# Hold out 20% of the sentence pairs for validation.
(input_tensor_train, input_tensor_val,
 target_tensor_train, target_tensor_val) = train_test_split(input_tensor, target_tensor, test_size=0.2)

print(len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val))

8000 8000 2000 2000


In [None]:
def convert(lang, tensor):
  """Print an `id ----> word` line for every non-zero id in `tensor`.

  Words are looked up in `lang.index_word`; id 0 is skipped.
  """
  for token_id in tensor:
    if token_id == 0:
      continue
    print ("%d ----> %s" % (token_id, lang.index_word[token_id]))

# Show the id -> token mapping for the first training pair.
first_input, first_target = input_tensor_train[0], target_tensor_train[0]

print ("Input Language; index to word mapping")
convert(inp_lang, first_input)
print ()
print ("Target Language; index to word mapping")
convert(targ_lang, first_target)

Input Language; index to word mapping
1 ----> <start>
61 ----> tu
16 ----> c
28 ----> on
183 ----> oc
6 ----> es
33 ----> g
8 ----> en
25 ----> te
3 ----> .
2 ----> <end>

Target Language; index to word mapping
1 ----> <start>
6 ----> you
47 ----> know
612 ----> people
3 ----> .
2 ----> <end>


In [None]:
# Training hyper-parameters.
BATCH_SIZE = 80
BUFFER_SIZE = len(input_tensor_train)          # shuffle over the whole training set
steps_per_epoch = len(input_tensor_train)//BATCH_SIZE
embedding_dim = 256
units = 1024
# +1 because id 0 is used for padding.
vocab_inp_size = len(inp_lang.word_index)+1
vocab_tar_size = len(targ_lang.word_index)+1

# Shuffled batches of (input, target); the incomplete last batch is dropped.
dataset = (
    tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train))
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

example_input_batch, example_target_batch = next(iter(dataset))
example_input_batch.shape, example_target_batch.shape

(TensorShape([80, 22]), TensorShape([80, 11]))

Encoder Model

In [None]:
class Encoder(tf.keras.Model):
  """Embedding + GRU encoder returning per-step outputs and the final state."""

  def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
    super().__init__()
    self.batch_sz = batch_sz
    self.enc_units = enc_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(
        self.enc_units,
        return_sequences=True,
        return_state=True,
        recurrent_initializer='glorot_uniform')

  def call(self, x, hidden):
    """Encode token ids `x`, starting the GRU from `hidden`."""
    embedded = self.embedding(x)
    sequence_output, final_state = self.gru(embedded, initial_state=hidden)
    return sequence_output, final_state

  def initialize_hidden_state(self):
    """Return an all-zero initial state of shape (batch_sz, enc_units)."""
    state_shape = (self.batch_sz, self.enc_units)
    return tf.zeros(state_shape)

encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)

# Run one example batch through the encoder to check the output shapes.
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_hidden = encoder(example_input_batch, sample_hidden)

output_shape_message = 'Encoder output shape: (batch size, sequence length, units) {}'
hidden_shape_message = 'Encoder Hidden state shape: (batch size, units) {}'
print (output_shape_message.format(sample_output.shape))
print (hidden_shape_message.format(sample_hidden.shape))

Encoder output shape: (batch size, sequence length, units) (80, 22, 1024)
Encoder Hidden state shape: (batch size, units) (80, 1024)


In [None]:
class BahdanauAttention(tf.keras.layers.Layer):
  """Additive attention: score = V(tanh(W1(query) + W2(values)))."""

  def __init__(self, units):
    super().__init__()
    self.W1 = tf.keras.layers.Dense(units)
    self.W2 = tf.keras.layers.Dense(units)
    self.V = tf.keras.layers.Dense(1)

  def call(self, query, values):
    """Return (context_vector, attention_weights) for `query` over `values`.

    The weights are normalized with a softmax over axis 1 of `values`; the
    context vector is the weighted sum of `values` along that axis.
    """
    # Insert an axis at position 1 so the query broadcasts against every
    # position along axis 1 of `values`.
    query_with_time_axis = tf.expand_dims(query, 1)

    # One unnormalized score per position (last axis has size 1 because V is Dense(1)).
    projected = self.W1(query_with_time_axis) + self.W2(values)
    score = self.V(tf.nn.tanh(projected))

    # Normalize the scores across positions.
    attention_weights = tf.nn.softmax(score, axis=1)

    # Weighted sum of the values over axis 1.
    context_vector = tf.reduce_sum(attention_weights * values, axis=1)

    return context_vector, attention_weights

# Check the attention output shapes with a small layer on the sample encoder output.
attention_layer = BahdanauAttention(10)
attention_result, attention_weights = attention_layer(sample_hidden, sample_output)

result_shape_message = "Attention result shape: (batch size, units) {}"
weights_shape_message = "Attention weights shape: (batch_size, sequence_length, 1) {}"
print(result_shape_message.format(attention_result.shape))
print(weights_shape_message.format(attention_weights.shape))

Attention result shape: (batch size, units) (80, 1024)
Attention weights shape: (batch_size, sequence_length, 1) (80, 22, 1)


Decoder Model

In [None]:
class Decoder(tf.keras.Model):
  """Single-step GRU decoder with Bahdanau attention over the encoder output."""

  def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
    super().__init__()
    self.batch_sz = batch_sz
    self.dec_units = dec_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(
        self.dec_units,
        return_sequences=True,
        return_state=True,
        recurrent_initializer='glorot_uniform')
    self.fc = tf.keras.layers.Dense(vocab_size)

    # used for attention
    self.attention = BahdanauAttention(self.dec_units)

  def call(self, x, hidden, enc_output):
    """Run one decoding step.

    Returns (logits over the vocabulary, new GRU state, attention weights).
    `hidden` is used as the attention query; the GRU itself is called
    without an explicit initial state.
    """
    # Attend over the encoder output using the previous hidden state.
    context_vector, attention_weights = self.attention(hidden, enc_output)

    # Embed the input ids and prepend the context vector along the last axis.
    embedded = self.embedding(x)
    context_with_time_axis = tf.expand_dims(context_vector, 1)
    gru_input = tf.concat([context_with_time_axis, embedded], axis=-1)

    # passing the concatenated vector to the GRU
    output, state = self.gru(gru_input)

    # Merge the leading axes so each row is one GRU output vector.
    flat_output = tf.reshape(output, (-1, output.shape[2]))

    # Project to vocabulary-sized logits.
    logits = self.fc(flat_output)

    return logits, state, attention_weights

decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)

# Run one random batch through the decoder to check the output shape.
sample_decoder_input = tf.random.uniform((BATCH_SIZE, 1))
sample_decoder_output, _, _ = decoder(sample_decoder_input, sample_hidden, sample_output)

print ('Decoder output shape: (batch_size, vocab size) {}'.format(sample_decoder_output.shape))

Decoder output shape: (batch_size, vocab size) (80, 1701)


Optimizer and Loss Function

In [None]:
# Keras' Adadelta defaults to learning_rate=0.001, with which the recorded
# training loss barely moves across epochs. Adadelta is designed to run with
# a rate of 1.0 (the value used in the original paper).
optimizer = tf.keras.optimizers.Adadelta(learning_rate=1.0)
# reduction='none' keeps one loss value per example so padding can be masked out.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
  """Cross-entropy between `real` ids and `pred` logits, ignoring padding.

  Positions where `real` is 0 contribute zero loss; the mean is taken over
  all positions, padded ones included.
  """
  per_example = loss_object(real, pred)
  not_padding = tf.cast(tf.math.not_equal(real, 0), dtype=per_example.dtype)
  return tf.reduce_mean(per_example * not_padding)

# Checkpoints of the optimizer and both models are written under this directory.
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(
    optimizer=optimizer,
    encoder=encoder,
    decoder=decoder,
)

Training

In [None]:
@tf.function
def train_step(inp, targ, enc_hidden):
  """Run one optimization step on a batch and return its per-step average loss.

  The decoder is trained with teacher forcing: at every step it receives the
  ground-truth token of the previous position as input.
  """
  start_id = targ_lang.word_index['<start>']
  num_steps = targ.shape[1]
  loss = 0

  with tf.GradientTape() as tape:
    enc_output, dec_hidden = encoder(inp, enc_hidden)

    # Every sequence in the batch starts decoding from the '<start>' token.
    dec_input = tf.expand_dims([start_id] * BATCH_SIZE, 1)

    for t in range(1, num_steps):
      step_target = targ[:, t]

      # passing enc_output to the decoder
      predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
      loss += loss_function(step_target, predictions)

      # using teacher forcing: the true token becomes the next input
      dec_input = tf.expand_dims(step_target, 1)

  batch_loss = loss / int(num_steps)

  variables = encoder.trainable_variables + decoder.trainable_variables
  gradients = tape.gradient(loss, variables)
  optimizer.apply_gradients(zip(gradients, variables))

  return batch_loss

In [None]:
EPOCHS = 10

for epoch in range(EPOCHS):
  start = time.time()
  epoch_number = epoch + 1

  enc_hidden = encoder.initialize_hidden_state()
  total_loss = 0

  for batch, (inp, targ) in enumerate(dataset.take(steps_per_epoch)):
    batch_loss = train_step(inp, targ, enc_hidden)
    total_loss += batch_loss

    # Report progress every 20 batches.
    if batch % 20 == 0:
      print('Epoch {} Batch {} Loss {:.4f}'.format(epoch_number, batch, batch_loss.numpy()))

  # saving (checkpoint) the model every 5 epochs
  if epoch_number % 5 == 0:
    checkpoint.save(file_prefix = checkpoint_prefix)

  mean_epoch_loss = total_loss / steps_per_epoch
  print('Epoch {} Loss {:.4f}'.format(epoch_number, mean_epoch_loss))
  print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

Epoch 1 Batch 0 Loss 3.7023
Epoch 1 Batch 20 Loss 3.9047
Epoch 1 Batch 40 Loss 3.7948
Epoch 1 Batch 60 Loss 3.8875
Epoch 1 Batch 80 Loss 3.7265
Epoch 1 Loss 3.8018
Time taken for 1 epoch 373.4365780353546 sec

Epoch 2 Batch 0 Loss 3.7855
Epoch 2 Batch 20 Loss 3.8275
Epoch 2 Batch 40 Loss 4.0554
Epoch 2 Batch 60 Loss 3.8100
Epoch 2 Batch 80 Loss 3.7928
Epoch 2 Loss 3.8005
Time taken for 1 epoch 358.6863694190979 sec

Epoch 3 Batch 0 Loss 3.7671
Epoch 3 Batch 20 Loss 3.8602
Epoch 3 Batch 40 Loss 3.7923
Epoch 3 Batch 60 Loss 3.6987
Epoch 3 Batch 80 Loss 3.7831
Epoch 3 Loss 3.7991
Time taken for 1 epoch 359.6864058971405 sec

Epoch 4 Batch 0 Loss 3.7321
Epoch 4 Batch 20 Loss 3.8842
Epoch 4 Batch 40 Loss 3.7146
Epoch 4 Batch 60 Loss 3.9001
Epoch 4 Batch 80 Loss 3.8914
Epoch 4 Loss 3.7977
Time taken for 1 epoch 358.653746843338 sec

Epoch 5 Batch 0 Loss 3.7726
Epoch 5 Batch 20 Loss 3.8489
Epoch 5 Batch 40 Loss 3.8651
Epoch 5 Batch 60 Loss 3.7296
Epoch 5 Batch 80 Loss 3.7972
Epoch 5 Loss 3.79

Evaluation

In [None]:
def evaluate(sentence):
  """Greedily translate one sentence.

  Returns (result, sentence, attention_plot): the predicted tokens joined by
  spaces (with a trailing space), the preprocessed input sentence, and the
  attention weights of every decoding step.
  """
  attention_plot = np.zeros((max_length_targ, max_length_inp))

  sentence = preprocess_sentence2(sentence)

  # Map tokens to ids (raises KeyError for a token the tokenizer has not seen).
  token_ids = [inp_lang.word_index[token] for token in sentence.split(' ')]
  padded_ids = tf.keras.preprocessing.sequence.pad_sequences([token_ids],
                                                             maxlen=max_length_inp,
                                                             padding='post')
  inputs = tf.convert_to_tensor(padded_ids)

  initial_state = [tf.zeros((1, units))]
  enc_out, enc_hidden = encoder(inputs, initial_state)

  dec_hidden = enc_hidden
  dec_input = tf.expand_dims([targ_lang.word_index['<start>']], 0)

  result = ''
  for t in range(max_length_targ):
    predictions, dec_hidden, attention_weights = decoder(dec_input,
                                                         dec_hidden,
                                                         enc_out)

    # storing the attention weights to plot later on
    attention_plot[t] = tf.reshape(attention_weights, (-1, )).numpy()

    predicted_id = tf.argmax(predictions[0]).numpy()
    predicted_word = targ_lang.index_word[predicted_id]
    result += predicted_word + ' '

    # Stop as soon as the end marker has been produced.
    if predicted_word == '<end>':
      break

    # the predicted ID is fed back into the model
    dec_input = tf.expand_dims([predicted_id], 0)

  return result, sentence, attention_plot

Plotting Attention Weights

In [None]:
# function for plotting the attention weights
def plot_attention(attention, sentence, predicted_sentence):
  """Show `attention` as a heat map labelled with input and predicted tokens.

  `sentence` labels the x axis and `predicted_sentence` the y axis; both must
  be lists of tokens.
  """
  fig, ax = plt.subplots(figsize=(10,10))
  ax.matshow(attention, cmap='viridis')

  label_style = {'fontsize': 14}

  # The leading empty label accounts for the tick placed before the first cell.
  ax.set_xticklabels([''] + sentence, fontdict=label_style, rotation=90)
  ax.set_yticklabels([''] + predicted_sentence, fontdict=label_style)

  # One tick per matrix cell on both axes.
  for axis in (ax.xaxis, ax.yaxis):
    axis.set_major_locator(ticker.MultipleLocator(1))

  plt.show()

Translating

In [None]:
def translate(sentence):
  """Translate `sentence` and return the predicted tokens as one string."""
  result, sentence, attention_plot = evaluate(sentence)

  # Crop the attention matrix to the tokens actually produced/consumed
  # (only needed when the plot below is enabled).
  num_output_tokens = len(result.split(' '))
  num_input_tokens = len(sentence.split(' '))
  attention_plot = attention_plot[:num_output_tokens, :num_input_tokens]
  #plot_attention(attention_plot, sentence.split(' '), result.split(' '))

  return result

# Restore the most recent checkpoint found in checkpoint_dir.
latest_checkpoint_path = tf.train.latest_checkpoint(checkpoint_dir)
checkpoint.restore(latest_checkpoint_path)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f83ec612b70>

In [None]:
# Evaluate on a fixed slice of the corpus. Sources and references must come
# from the SAME slice so that sentence k is scored against its own translation.
EVAL_START, EVAL_END = 3000, 4000
dset = sp2[EVAL_START:EVAL_END]
references = en2[EVAL_START:EVAL_END]

# Segment the source sentences with the Spanish merges; files are closed as
# soon as they are no longer needed so the output is complete when re-read.
seperator_eval = '|'
with open("codes_sp.txt" , "r") as eval_codes:
  bpe_eval = BPE(eval_codes, seperator_eval)

with open("output_eval.txt" , "w") as eval_out:
  for line in dset:
    eval_out.write(bpe_eval.segment(line).strip())
    eval_out.write('\n')

print("----------------------------------")

i = 0
total_score = 0

with open("output_eval.txt" , "r") as t_eval_out:
  for line, reference in zip(t_eval_out, references):
    # sentence_bleu expects (list of reference token lists, hypothesis token list).
    # The '<end>' marker is not part of the translation and is dropped.
    # NOTE(review): the model is trained on sub-word-segmented targets, so the
    # candidate tokens may be word pieces while the reference is word-level.
    candidate = [token for token in translate(line).split() if token != '<end>']
    reference_tokens = str(reference).split()
    score = sentence_bleu([reference_tokens], candidate, weights=(1, 0, 0, 0))
    total_score = total_score + score
    i = i+1

print("--------------------------------\nAVERAGE BLEU SCORE \n--------------------------------")
# i is the number of scored sentences once the loop has finished.
print(total_score/i if i else 0.0)

----------------------------------


Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


--------------------------------
AVERAGE BLEU SCORE 
--------------------------------
0.20140404328716144
