### Neural machine translation

In [1]:
import tensorflow as tf

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split

import unicodedata
import re
import numpy as np
import os
import io
import time

### Download and prepare the dataset

We'll use a language dataset provided by http://www.manythings.org/anki/

In [2]:
# Path to the tab-separated sentence-pair file. This cell does not download
# anything: the file must already exist at this location.
path_to_file = "./data/translation/rus.txt"

In [3]:
def preprocess_sentence(w):
    """Normalize one raw sentence and wrap it in ``<start>``/``<end>`` markers.

    The sentence is lower-cased, the punctuation marks ``? . ! ,`` are
    separated from neighbouring words by spaces, and every run of characters
    other than Latin letters, Cyrillic letters, ``? . ! ,`` and ``'`` is
    replaced by a single space.

    Args:
        w: raw sentence (str).

    Returns:
        The cleaned sentence as ``'<start> ... <end>'``.
    """
    w = w.lower().strip()

    # creating a space between a word and the punctuation following it
    # eg: "he is a boy." => "he is a boy ."
    # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
    w = re.sub(r"([?.!,])", r" \1 ", w)
    # collapse runs of spaces (the character class also turns double quotes into spaces)
    w = re.sub(r'[" "]+', " ", w)

    # replacing everything with space except Latin/Cyrillic letters,
    # ".", "?", "!", "," and "'".
    # "ё"/"Ё" lie outside the а-я / А-Я code-point ranges, so they are listed
    # explicitly; otherwise words such as "идёт" would be split into "ид т".
    w = re.sub(r"[^a-zA-Zа-яА-ЯёЁ?.!,']+", " ", w)

    w = w.strip()

    # adding a start and an end token to the sentence
    # so that the model know when to start and stop predicting.
    w = '<start> ' + w + ' <end>'
    return w

In [4]:
# Sanity check: punctuation is split off and the <start>/<end> markers are added
preprocess_sentence("I can't go.")

"<start> i can't go . <end>"

In [5]:
# 1. Read the tab-separated sentence pairs
# 2. Clean the sentences
# 3. Return word pairs in the format: [ENG, RUS]
def create_dataset(path, num_examples):
    """Read sentence pairs from ``path`` and return them cleaned, one column each.

    Each line is split on tabs and only the first two fields are kept (any
    further fields are ignored); both are passed through
    ``preprocess_sentence``.

    Args:
        path: path to the UTF-8 encoded, tab-separated text file.
        num_examples: number of leading lines to use; ``None`` uses all lines.

    Returns:
        A ``zip`` iterator that yields one tuple per column, i.e. it unpacks
        as ``first_column, second_column = create_dataset(...)``.
    """
    # the context manager guarantees the file handle is closed after reading
    with io.open(path, encoding='UTF-8') as f:
        lines = f.read().strip().split('\n')

    word_pairs = [[preprocess_sentence(w) for w in l.split('\t')[:2]]  for l in lines[:num_examples]]

    # transpose [[a1, b1], [a2, b2], ...] into (a1, a2, ...), (b1, b2, ...)
    return zip(*word_pairs)

In [6]:
# num_examples=None keeps every line of the file
en, ru = create_dataset(path_to_file, None)
# show the first cleaned sentence of each column
print(en[0])
print(ru[0])

<start> go . <end>
<start> марш ! <end>


In [7]:
# Ten consecutive cleaned English sentences
en[10:20]

('<start> run ! <end>',
 '<start> run . <end>',
 '<start> run . <end>',
 '<start> who ? <end>',
 '<start> wow ! <end>',
 '<start> wow ! <end>',
 '<start> wow ! <end>',
 '<start> wow ! <end>',
 '<start> wow ! <end>',
 '<start> wow ! <end>')

In [8]:
# The Russian sentences paired with the ten English sentences above
ru[10:20]

('<start> бегите ! <end>',
 '<start> беги ! <end>',
 '<start> бегите ! <end>',
 '<start> кто ? <end>',
 '<start> вот это да ! <end>',
 '<start> круто ! <end>',
 '<start> здорово ! <end>',
 '<start> ух ты ! <end>',
 '<start> ого ! <end>',
 '<start> вах ! <end>')

In [9]:
def tokenize(lang):
    """Fit a Keras tokenizer on ``lang`` and encode ``lang`` with it.

    Args:
        lang: iterable of already-cleaned sentences (strings).

    Returns:
        ``(tensor, tokenizer)``: the id sequences padded at the end to a
        common length, and the fitted tokenizer.
    """
    # filters='' disables the tokenizer's default character filtering
    fitted = tf.keras.preprocessing.text.Tokenizer(filters='')
    fitted.fit_on_texts(lang)

    # encode, then pad after the last token of each sequence
    padded = tf.keras.preprocessing.sequence.pad_sequences(
        fitted.texts_to_sequences(lang), padding='post')

    return padded, fitted

In [10]:
def load_dataset(path, num_examples=None):
    """Load, clean and tokenize the sentence pairs stored at ``path``.

    Args:
        path: path to the tab-separated sentence-pair file.
        num_examples: number of leading lines to use; ``None`` uses all.

    Returns:
        ``(input_tensor, target_tensor, inp_lang_tokenizer,
        targ_lang_tokenizer)``. The second column of the file is the model
        input and the first column is the target.
    """
    # create_dataset returns the columns in file order: first, second
    first_column, second_column = create_dataset(path, num_examples)

    source_ids, source_tokenizer = tokenize(second_column)
    target_ids, target_tokenizer = tokenize(first_column)

    return source_ids, target_ids, source_tokenizer, target_tokenizer

In [11]:
# Demo on ten Russian sentences: returns the padded id array and the fitted tokenizer
tokenize(ru[10:20])

(array([[ 1,  4,  3,  2,  0,  0],
        [ 1,  5,  3,  2,  0,  0],
        [ 1,  4,  3,  2,  0,  0],
        [ 1,  6,  7,  2,  0,  0],
        [ 1,  8,  9, 10,  3,  2],
        [ 1, 11,  3,  2,  0,  0],
        [ 1, 12,  3,  2,  0,  0],
        [ 1, 13, 14,  3,  2,  0],
        [ 1, 15,  3,  2,  0,  0],
        [ 1, 16,  3,  2,  0,  0]], dtype=int32),
 <keras_preprocessing.text.Tokenizer at 0x7fe03555ea58>)

### Limit the size of the dataset to experiment faster (optional)

In [12]:
# Number of sentence pairs loaded from the full file
len(en), len(ru)

(431097, 431097)

In [13]:
# Try experimenting with the size of that dataset
num_examples = 100000
# inp_lang / targ_lang are the fitted tokenizers returned by load_dataset, not raw text
input_tensor, target_tensor, inp_lang, targ_lang = load_dataset(path_to_file, num_examples)

# Padded sequence lengths (second axis) of the target and the input tensors
max_length_targ, max_length_inp = target_tensor.shape[1], input_tensor.shape[1]

In [14]:
# (padded target length, padded input length)
max_length_targ, max_length_inp

(11, 15)

In [15]:
# Creating training and validation sets using an 80-20 split
# NOTE(review): no random_state is passed, so the split differs on every run
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.2)

# Show length
print(len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val))

80000 80000 20000 20000


In [16]:
def convert(lang, tensor):
    """Print one ``id ----> word`` line for every non-zero id in ``tensor``.

    Args:
        lang: object exposing an ``index_word`` mapping from id to word.
        tensor: iterable of integer token ids; ids equal to 0 are skipped.
    """
    for token_id in tensor:
        if token_id == 0:
            # zero entries are not looked up or printed
            continue
        print("%d ----> %s" % (token_id, lang.index_word[token_id]))

In [17]:
# Decode the first training pair back to words to verify the id mappings
print ("Input Language; index to word mapping")
convert(inp_lang, input_tensor_train[0])
print ()
print ("Target Language; index to word mapping")
convert(targ_lang, target_tensor_train[0])

Input Language; index to word mapping
1 ----> <start>
122 ----> дай
138 ----> им
571 ----> войти
3 ----> .
2 ----> <end>

Target Language; index to word mapping
1 ----> <start>
89 ----> let
99 ----> them
66 ----> come
35 ----> in
3 ----> .
2 ----> <end>


### Create a tf.data dataset

In [18]:
# Shuffle buffer spans the whole training set
BUFFER_SIZE = len(input_tensor_train)
BATCH_SIZE = 64
# Number of full batches per epoch
steps_per_epoch = len(input_tensor_train)//BATCH_SIZE
# Size of the word embedding vectors
embedding_dim = 300
# Number of GRU units in the encoder and the decoder
units = 1024
# +1 leaves room for id 0, which the padded tensors use as filler
vocab_inp_size = len(inp_lang.word_index)+1
vocab_tar_size = len(targ_lang.word_index)+1

dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
# drop_remainder=True: every batch has exactly BATCH_SIZE rows
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

In [19]:
# Pull one batch to inspect the input and target shapes
example_input_batch, example_target_batch = next(iter(dataset))
example_input_batch.shape, example_target_batch.shape

(TensorShape([64, 15]), TensorShape([64, 11]))

In [20]:
class Encoder(tf.keras.Model):
    """Embedding + GRU encoder that returns only the final GRU state."""

    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        """Build the layers.

        Args:
            vocab_size: number of rows in the embedding table.
            embedding_dim: size of each embedding vector.
            enc_units: number of GRU units.
            batch_sz: batch size used by ``initialize_hidden_state``.
        """
        super().__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        # return_sequences=False: per-step outputs are not returned,
        # return_state=True: the final state is returned as a second value
        self.gru = tf.keras.layers.GRU(
            self.enc_units,
            return_sequences=False,
            return_state=True,
            recurrent_initializer='glorot_uniform',
        )

    def call(self, x, hidden):
        """Encode the token ids ``x`` starting from state ``hidden``; return the final state."""
        embedded = self.embedding(x)
        # the first return value is not needed, only the state is passed on
        _, final_state = self.gru(embedded, initial_state=hidden)
        return final_state

    def initialize_hidden_state(self):
        """Return an all-zero initial state of shape ``(batch_sz, enc_units)``."""
        state_shape = (self.batch_sz, self.enc_units)
        return tf.zeros(state_shape)

In [21]:
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)

# sample input
sample_hidden = encoder.initialize_hidden_state()
# the zero state is replaced by the encoder's final state for the example batch
sample_hidden = encoder(example_input_batch, sample_hidden)
# the encoder returns only its final state, so there is no per-step output to print
print ('Encoder Hidden state shape: (batch size, units) {}'.format(sample_hidden.shape))

Encoder Hidden state shape: (batch size, units) (64, 1024)


In [22]:
class Decoder(tf.keras.Model):
    """Embedding + GRU + Dense decoder without attention.

    It is conditioned on the encoder only through the initial GRU state.
    """

    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        """Build the layers.

        Args:
            vocab_size: embedding table size and width of the output layer.
            embedding_dim: size of each embedding vector.
            dec_units: number of GRU units.
            batch_sz: stored on the instance; not used by ``call``.
        """
        super().__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(
            self.dec_units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer='glorot_uniform',
        )
        # no activation: the layer outputs raw scores over the vocabulary
        self.fc = tf.keras.layers.Dense(vocab_size)

    def call(self, x, hidden):
        """Run the decoder on ``x`` starting from state ``hidden``.

        Args:
            x: token ids; the callers in this notebook pass one step per
                example, i.e. shape (batch_size, 1).
            hidden: initial GRU state.

        Returns:
            ``(scores, state)``: vocabulary scores with the batch and time
            axes merged into one, and the GRU state after the last step.
        """
        embedded = self.embedding(x)

        # return_sequences=True, so the GRU output keeps a time axis
        step_outputs, state = self.gru(embedded, initial_state=hidden)

        # merge batch and time axes: (batch, time, units) -> (batch * time, units)
        flat_outputs = tf.reshape(step_outputs, (-1, step_outputs.shape[2]))

        # project every row to vocabulary scores
        scores = self.fc(flat_outputs)

        return scores, state

In [23]:
decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)

# Shape check only, using a dummy one-step input and the encoder state from above.
# NOTE(review): tf.random.uniform yields floats in [0, 1), not real token ids.
decoder_sample_x, decoder_sample_h = decoder(tf.random.uniform((BATCH_SIZE, 1)),
                                             sample_hidden)

In [24]:
# Decoder scores: one row per example, one column per target-vocabulary entry
decoder_sample_x.shape

TensorShape([64, 7260])

In [25]:
# Decoder state: one row per example, one column per GRU unit
decoder_sample_h.shape

TensorShape([64, 1024])

### Define the optimizer and the loss function

In [26]:
# Adam with its default settings
optimizer = tf.keras.optimizers.Adam()

# from_logits=True: the decoder's Dense layer has no activation, so it emits raw scores.
# reduction='none': keep one loss value per example so loss_function can mask them.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
    """Cross-entropy in which positions whose target id is 0 contribute zero.

    Args:
        real: integer target ids.
        pred: unnormalized scores for the same positions.

    Returns:
        Scalar mean of the masked per-position losses. The mean is taken
        over all positions, masked ones included.
    """
    per_position = loss_object(real, pred)

    # 1 where the target id is non-zero, 0 elsewhere, in the loss dtype
    keep = tf.cast(tf.math.not_equal(real, 0), dtype=per_position.dtype)

    return tf.reduce_mean(per_position * keep)

### Checkpoints (Object-based saving)

In [27]:
# Directory that receives the checkpoint files
checkpoint_dir = './training_nmt_checkpoints'

# Checkpoint file names start with "<checkpoint_dir>/ckpt"
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")

# Track the optimizer and both models so they are saved and restored together
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

In [28]:
@tf.function
def train_step(inp, targ, enc_hidden):
    """Run one teacher-forced optimisation step on a single batch.

    Args:
        inp: batch of input token ids.
        targ: batch of target token ids; column 0 is never predicted, the
            loop starts at column 1.
        enc_hidden: initial encoder state.

    Returns:
        The summed per-step loss divided by ``targ.shape[1]``.
    """
    loss = 0

    with tf.GradientTape() as tape:
        enc_hidden = encoder(inp, enc_hidden)
        
        # the decoder starts from the encoder's final state
        dec_hidden = enc_hidden
        
        # first decoder input: one <start> id per example (relies on the global BATCH_SIZE)
        dec_input = tf.expand_dims([targ_lang.word_index['<start>']] * BATCH_SIZE, 1)
        
        # Teacher forcing - feeding the target as the next input
        for t in range(1, targ.shape[1]):
            # one decoder step: previous token plus previous decoder state
            predictions, dec_hidden = decoder(dec_input, dec_hidden)
            
            loss += loss_function(targ[:, t], predictions)
            
            # using teacher forcing
            dec_input = tf.expand_dims(targ[:, t], 1)

    # reported loss: divided by targ.shape[1], although the loop runs targ.shape[1] - 1 steps
    batch_loss = (loss / int(targ.shape[1]))

    variables = encoder.trainable_variables + decoder.trainable_variables

    # gradients are taken of the summed loss, not of batch_loss
    gradients = tape.gradient(loss, variables)

    optimizer.apply_gradients(zip(gradients, variables))

    return batch_loss

In [29]:
%%time

# Number of passes over the training batches
EPOCHS = 50

for epoch in range(EPOCHS):
    start = time.time()
    
    # every batch of the epoch starts from the same all-zero encoder state
    enc_hidden = encoder.initialize_hidden_state()
    total_loss = 0
    
    for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
        batch_loss = train_step(inp, targ, enc_hidden)
        total_loss += batch_loss

        # progress report every 100 batches
        if batch % 100 == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                   batch,
                                                   batch_loss.numpy()))
    # saving (checkpoint) the model every 2 epochs
    if (epoch + 1) % 2 == 0:
        checkpoint.save(file_prefix = checkpoint_prefix)

    # mean of the per-batch losses over the epoch
    print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                      total_loss / steps_per_epoch))
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

Epoch 1 Batch 0 Loss 4.6597
Epoch 1 Batch 100 Loss 1.9972
Epoch 1 Batch 200 Loss 1.7233
Epoch 1 Batch 300 Loss 1.7129
Epoch 1 Batch 400 Loss 1.6030
Epoch 1 Batch 500 Loss 1.5352
Epoch 1 Batch 600 Loss 1.4080
Epoch 1 Batch 700 Loss 1.3399
Epoch 1 Batch 800 Loss 1.2970
Epoch 1 Batch 900 Loss 1.1240
Epoch 1 Batch 1000 Loss 1.1781
Epoch 1 Batch 1100 Loss 1.0053
Epoch 1 Batch 1200 Loss 1.1517
Epoch 1 Loss 1.4725
Time taken for 1 epoch 798.4947366714478 sec

Epoch 2 Batch 0 Loss 0.9337
Epoch 2 Batch 100 Loss 0.8786
Epoch 2 Batch 200 Loss 0.8134
Epoch 2 Batch 300 Loss 0.8870
Epoch 2 Batch 400 Loss 0.9022
Epoch 2 Batch 500 Loss 0.7981
Epoch 2 Batch 600 Loss 0.7729
Epoch 2 Batch 700 Loss 0.7666
Epoch 2 Batch 800 Loss 0.6648
Epoch 2 Batch 900 Loss 0.7020
Epoch 2 Batch 1000 Loss 0.7426
Epoch 2 Batch 1100 Loss 0.6918
Epoch 2 Batch 1200 Loss 0.6418
Epoch 2 Loss 0.7800
Time taken for 1 epoch 770.3030886650085 sec

Epoch 3 Batch 0 Loss 0.4400
Epoch 3 Batch 100 Loss 0.4200
Epoch 3 Batch 200 Loss 0.441

Epoch 18 Batch 1100 Loss 0.0425
Epoch 18 Batch 1200 Loss 0.0814
Epoch 18 Loss 0.0651
Time taken for 1 epoch 771.4019482135773 sec

Epoch 19 Batch 0 Loss 0.0518
Epoch 19 Batch 100 Loss 0.0626
Epoch 19 Batch 200 Loss 0.0624
Epoch 19 Batch 300 Loss 0.0486
Epoch 19 Batch 400 Loss 0.0706
Epoch 19 Batch 500 Loss 0.0684
Epoch 19 Batch 600 Loss 0.0801
Epoch 19 Batch 700 Loss 0.0649
Epoch 19 Batch 800 Loss 0.0729
Epoch 19 Batch 900 Loss 0.0485
Epoch 19 Batch 1000 Loss 0.0684
Epoch 19 Batch 1100 Loss 0.0611
Epoch 19 Batch 1200 Loss 0.1043
Epoch 19 Loss 0.0645
Time taken for 1 epoch 786.1595189571381 sec

Epoch 20 Batch 0 Loss 0.0627
Epoch 20 Batch 100 Loss 0.0773
Epoch 20 Batch 200 Loss 0.0605
Epoch 20 Batch 300 Loss 0.0609
Epoch 20 Batch 400 Loss 0.0534
Epoch 20 Batch 500 Loss 0.0302
Epoch 20 Batch 600 Loss 0.0738
Epoch 20 Batch 700 Loss 0.0260
Epoch 20 Batch 800 Loss 0.0669
Epoch 20 Batch 900 Loss 0.0723
Epoch 20 Batch 1000 Loss 0.0549
Epoch 20 Batch 1100 Loss 0.0810
Epoch 20 Batch 1200 Loss 0

Epoch 36 Batch 200 Loss 0.0496
Epoch 36 Batch 300 Loss 0.0408
Epoch 36 Batch 400 Loss 0.0418
Epoch 36 Batch 500 Loss 0.0671
Epoch 36 Batch 600 Loss 0.0605
Epoch 36 Batch 700 Loss 0.0534
Epoch 36 Batch 800 Loss 0.0619
Epoch 36 Batch 900 Loss 0.0418
Epoch 36 Batch 1000 Loss 0.0831
Epoch 36 Batch 1100 Loss 0.0898
Epoch 36 Batch 1200 Loss 0.0827
Epoch 36 Loss 0.0522
Time taken for 1 epoch 776.7171592712402 sec

Epoch 37 Batch 0 Loss 0.0449
Epoch 37 Batch 100 Loss 0.0372
Epoch 37 Batch 200 Loss 0.0137
Epoch 37 Batch 300 Loss 0.0421
Epoch 37 Batch 400 Loss 0.0348
Epoch 37 Batch 500 Loss 0.0481
Epoch 37 Batch 600 Loss 0.0209
Epoch 37 Batch 700 Loss 0.0461
Epoch 37 Batch 800 Loss 0.0376
Epoch 37 Batch 900 Loss 0.0618
Epoch 37 Batch 1000 Loss 0.0466
Epoch 37 Batch 1100 Loss 0.0630
Epoch 37 Batch 1200 Loss 0.0518
Epoch 37 Loss 0.0522
Time taken for 1 epoch 787.6506843566895 sec

Epoch 38 Batch 0 Loss 0.0483
Epoch 38 Batch 100 Loss 0.0477
Epoch 38 Batch 200 Loss 0.0392
Epoch 38 Batch 300 Loss 0.0

### Translate

- The evaluate function is similar to the training loop, except we don't use teacher forcing here. The input to the decoder at each time step is its own previous prediction along with its previous hidden state.
- Stop predicting when the model predicts the end token.
- This model has no attention mechanism, so there are no attention weights to store.  
Note: The encoder state is calculated only once for one input.

In [30]:
def evaluate(sentence):
    """Greedily translate one raw input sentence with the trained models.

    The sentence is cleaned with ``preprocess_sentence``, mapped to ids with
    the input tokenizer, encoded once, and then decoded one token at a time,
    feeding each predicted id back in as the next decoder input. Decoding
    stops at ``<end>`` or after ``max_length_targ`` steps.

    Args:
        sentence: raw input-language sentence.

    Returns:
        ``(result, sentence)``: the predicted words joined by spaces (each
        followed by a space, ``<end>`` included when it was predicted) and
        the preprocessed input sentence.

    Raises:
        KeyError: if the sentence contains words that are missing from the
            input vocabulary; the message lists them.
    """
    sentence = preprocess_sentence(sentence)

    # split() rather than split(' '): an empty sentence is preprocessed to
    # '<start>  <end>' (two spaces), and split(' ') would then produce an
    # empty-string token that is never in the vocabulary
    tokens = sentence.split()

    # report every out-of-vocabulary word at once instead of a bare KeyError
    unknown = [tok for tok in tokens if tok not in inp_lang.word_index]
    if unknown:
        raise KeyError('words not in the input vocabulary: ' + ', '.join(unknown))

    inputs = [inp_lang.word_index[tok] for tok in tokens]
    inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],
                                                         maxlen=max_length_inp,
                                                         padding='post')
    inputs = tf.convert_to_tensor(inputs)

    result = ''

    # zero initial state for a batch of one
    hidden = [tf.zeros((1, units))]
    enc_hidden = encoder(inputs, hidden)

    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([targ_lang.word_index['<start>']], 0)

    for t in range(max_length_targ):
        predictions, dec_hidden = decoder(dec_input, dec_hidden)

        # greedy choice: id with the highest score
        predicted_id = tf.argmax(predictions[0]).numpy()
        predicted_word = targ_lang.index_word[predicted_id]
        result += predicted_word + ' '

        if predicted_word == '<end>':
            break

        # the predicted ID is fed back into the model
        dec_input = tf.expand_dims([predicted_id], 0)

    return result, sentence

In [31]:
def translate(sentence):
    """Translate ``sentence`` and print the cleaned input and the prediction."""
    predicted, cleaned = evaluate(sentence)

    print('Input: %s' % (cleaned))
    print('Predicted translation: {}'.format(predicted))

### Restore the latest checkpoint and test

In [32]:
# restoring the latest checkpoint in checkpoint_dir
# (optimizer, encoder and decoder are the objects tracked by `checkpoint`)
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fe0234eefd0>

In [33]:
# Input with terminal punctuation
translate('Здесь хорошо.')

Input: <start> здесь хорошо . <end>
Predicted translation: it's fine here . <end> 


In [36]:
# Input without terminal punctuation
translate('Я делаю работу')

Input: <start> я делаю работу <end>
Predicted translation: i'm doing the job . <end> 


In [39]:
# Unfinished input; the trailing space is stripped during preprocessing
translate('Он смотрит в ')

Input: <start> он смотрит в <end>
Predicted translation: he's looking at . <end> 


In [48]:
# Four-word input without terminal punctuation
translate('Она пошла в магазин')

Input: <start> она пошла в магазин <end>
Predicted translation: she went to the shop . <end> 


In [50]:
# Two-word input without terminal punctuation
translate('Получилось неплохо')

Input: <start> получилось неплохо <end>
Predicted translation: this isn't bad . <end> 
