In [1]:
import numpy as np
import tensorflow as tf
tf.enable_eager_execution()
from sklearn.model_selection import train_test_split

import os
import re
import time
import html
import nltk
import string

## Data Gathering

IWSLT'15 English-Vietnamese data
- Train (133K sentence pairs): [train.en] [train.vi]
- Test: [tst2012.en] [tst2012.vi] [tst2013.en] [tst2013.vi]
- Vocabularies (**top 50K** frequent words): [vocab.en] [vocab.vi]
- Dictionary (extracted from alignment data): [dict.en-vi]

Download dataset

In [0]:
# Remote directory that hosts the IWSLT'15 English-Vietnamese files.
SITE_URL = 'https://nlp.stanford.edu/projects/nmt/data/iwslt15.en-vi'
# Recursive download, one level deep, without climbing to the parent directory
# and rejecting the generated index pages.
!wget -q -r -l1 --no-parent -e robots=off -R "index.html*" $SITE_URL

# Local folder the following cells read from: SITE_URL without its scheme.
# NOTE(review): relies on wget's recursive mode mirroring into a host/path
# directory tree under the working directory — confirm on your setup.
DATA_FOLDER = 'nlp.stanford.edu/projects/nmt/data/iwslt15.en-vi'

Load the English and Vietnamese vocabularies

In [3]:
# Read each vocabulary file as raw bytes, then decode explicitly as UTF-8 and
# cut the text into one entry per line.
with open('{}/vocab.vi'.format(DATA_FOLDER), 'rb') as vi_vocab_file:
  vi_vocab_bytes = vi_vocab_file.read()
vi_vocab = vi_vocab_bytes.decode(encoding='utf-8').split('\n')

with open('{}/vocab.en'.format(DATA_FOLDER), 'rb') as en_vocab_file:
  en_vocab_bytes = en_vocab_file.read()
en_vocab = en_vocab_bytes.decode(encoding='utf-8').split('\n')

print('Size of Vietnamese vocabulary:', len(vi_vocab))
print('Size of English vocabulary:', len(en_vocab))

Size of Vietnamese vocabulary: 7710
Size of English vocabulary: 17192


Dataset is organized in parallel order: $i^{th}$ Vietnamese sentence in `train.vi` corresponds to $i^{th}$ English sentence in `train.en`.

In [4]:
# Read both sides of the parallel corpus as raw bytes, decode as UTF-8 and
# split into one sentence per line (line i of one file pairs with line i of the other).
with open('{}/train.vi'.format(DATA_FOLDER), 'rb') as vi_text_file:
  vi_text_bytes = vi_text_file.read()
vi_text = vi_text_bytes.decode(encoding='utf-8').split('\n')

with open('{}/train.en'.format(DATA_FOLDER), 'rb') as en_text_file:
  en_text_bytes = en_text_file.read()
en_text = en_text_bytes.decode(encoding='utf-8').split('\n')

# Show the first two sentences of each language.
vi_preview, en_preview = vi_text[:2], en_text[:2]
print('+ Vietnamese text sample:', *vi_preview, sep='\n', end='\n\n')
print('+ English text sample:', *en_preview, sep='\n')

+ Vietnamese text sample:
Khoa học đằng sau một tiêu đề về khí hậu
Trong 4 phút , chuyên gia hoá học khí quyển Rachel Pike giới thiệu sơ lược về những nỗ lực khoa học miệt mài đằng sau những tiêu đề táo bạo về biến đổi khí hậu , cùng với đoàn nghiên cứu của mình -- hàng ngàn người đã cống hiến cho dự án này -- một chuyến bay mạo hiểm qua rừng già để tìm kiếm thông tin về một phân tử then chốt .

+ English text sample:
Rachel Pike : The science behind a climate headline
In 4 minutes , atmospheric chemist Rachel Pike provides a glimpse of the massive scientific effort behind the bold headlines on climate change , with her team -- one of thousands who contributed -- taking a risky flight over the rainforest in pursuit of data on a key molecule .


## Data Preparation

### Preprocessing

Preprocess the text:
1. Normalize HTML marks
2. Convert all characters to lowercase
3. Remove every non-alphabetic character from each word (a standalone punctuation mark is kept as it is)
4. Add start token and end token (if necessary)

In [5]:
def remove_special_character(w):
  """Strip every non-alphabetic character from a single word.

  A lone punctuation mark (one character that is neither a letter nor a
  digit) is returned unchanged, so sentence punctuation survives as its own
  word. A word made only of digits and symbols collapses to ''.
  """
  # In case of standalone punctuation
  if len(w) == 1 and not w.isalpha() and not w.isdigit():
    return w
  return ''.join(c for c in w if c.isalpha())


def preprocess(s):
  """Normalize one sentence: unescape HTML entities, lowercase, clean each word.

  Args:
    s: raw sentence (str).

  Returns:
    The whitespace-separated words of `s`, lowercased and cleaned with
    `remove_special_character`, joined by single spaces. Words that are
    stripped to nothing (e.g. numbers, '--') are dropped.
  """
  # Lowercase a second time after unescaping: numeric character references
  # such as '&#65;' decode to uppercase letters the first pass cannot see.
  norm_s = html.unescape(s.lower()).lower()
  cleaned_words = (remove_special_character(w) for w in norm_s.split())
  # Skip empty results so no doubled, leading or trailing spaces are produced.
  return ' '.join(w for w in cleaned_words if w)

# Run the shared normalization over every sentence of both corpora.
vi_text = list(map(preprocess, vi_text))
en_text = list(map(preprocess, en_text))

# First 3 pairs of sentences after preprocessing
separator = '-' * 50
for pair_idx in range(3):
  print(vi_text[pair_idx], en_text[pair_idx], separator, sep='\n')

khoa học đằng sau một tiêu đề về khí hậu
rachel pike : the science behind a climate headline
--------------------------------------------------
trong  phút , chuyên gia hoá học khí quyển rachel pike giới thiệu sơ lược về những nỗ lực khoa học miệt mài đằng sau những tiêu đề táo bạo về biến đổi khí hậu , cùng với đoàn nghiên cứu của mình  hàng ngàn người đã cống hiến cho dự án này  một chuyến bay mạo hiểm qua rừng già để tìm kiếm thông tin về một phân tử then chốt .
in  minutes , atmospheric chemist rachel pike provides a glimpse of the massive scientific effort behind the bold headlines on climate change , with her team  one of thousands who contributed  taking a risky flight over the rainforest in pursuit of data on a key molecule .
--------------------------------------------------
tôi muốn cho các bạn biết về sự to lớn của những nỗ lực khoa học đã góp phần làm nên các dòng tít bạn thường thấy trên báo .
i d like to talk to you today about the scale of the scientific effort that goes

### Tokenizing

Let us define the special tokens:
- `<sos>`: start token - appear at the beginning of every sentence
- `<eos>`: end token - appear at the end of every sentence
- `<unk>`: unknown token - replaces words that do not exist in the vocabulary

In [0]:
# Markers for the sentence start, the sentence end and out-of-vocabulary words.
start_token = '<sos>'
end_token = '<eos>'
unk_token = '<unk>'
special_tokens = (start_token, end_token, unk_token)


Notice that the English and Vietnamese texts do not align completely in the titles of the talks: the English title starts with the author's name followed by a colon. Hence, we simply remove the author name from English strings whose last token is not a punctuation mark.

In [0]:
def remove_author(tokens):
  """Drop a leading "author :" prefix from a tokenized title.

  Token lists that are empty, or whose last token is found in
  `string.punctuation`, are returned untouched. Otherwise everything up to and
  including the first ':' token is removed; lists without a ':' token are
  returned as they are.
  """
  if not tokens or tokens[-1] in string.punctuation:
    return tokens
  if ':' in tokens:
    first_colon = tokens.index(':')
    return tokens[first_colon + 1 :]
  return tokens

In [0]:
def tokenize(text, has_author=False):
  """Split `text` on whitespace and wrap the words in start/end tokens.

  When `has_author` is true the word list first goes through `remove_author`.
  """
  words = text.split()
  if has_author:
    words = remove_author(words)
  return [start_token, *words, end_token]

In [0]:
# Only the English side goes through author removal (has_author=True);
# the Vietnamese side uses the default has_author=False.
vi_tokens = [tokenize(vi_sentence) for vi_sentence in vi_text]
en_tokens = [tokenize(en_sentence, has_author=True) for en_sentence in en_text]

Normalize words in vocabularies and add special tokens as new words.

In [10]:
# Apply the corpus normalization to every vocabulary entry, add the special
# tokens and drop the empty string left by entries without any letter.
vi_vocab = sorted({preprocess(w) for w in vi_vocab}.union(special_tokens) - {''})
en_vocab = sorted({preprocess(w) for w in en_vocab}.union(special_tokens) - {''})

# Later mapping offset 1: word indices start at 1, so each size is one larger
# than the number of words.
vi_vocab_size = len(vi_vocab) + 1
en_vocab_size = len(en_vocab) + 1
print('Vietnamese vocab size:', vi_vocab_size)
print('English vocab size:', en_vocab_size)

Vietnamese vocab size: 6026
English vocab size: 15305



Create dictionaries mapping from word to index and from index to word.

In [0]:
def create_mapping(vocab):
  """Build the word -> index and index -> word lookups for a vocabulary list.

  Indices start at 1 in the order of `vocab`; slot 0 of the returned array
  holds None.

  Returns:
    (word2idx, idx2word): a dict and a NumPy array.
  """
  idx2word = np.array([None] + vocab)
  word2idx = dict(zip(vocab, range(1, len(vocab) + 1)))
  return word2idx, idx2word

# One pair of lookups per language: word -> index (dict) and index -> word (array).
vi2idx, idx2vi = create_mapping(vi_vocab)
en2idx, idx2en = create_mapping(en_vocab)

### Vectorizing

1. Replace infrequent words by unknown tokens
2. Map from words to indices
3. Pad vectors to the same length

#### Mapping

In [12]:
def vectorize(tokens, word2idx):
  """Map each token to its integer index.

  Tokens missing from `word2idx` are replaced by the index of `unk_token`.
  """
  unk_idx = word2idx[unk_token]
  return [word2idx.get(w, unk_idx) for w in tokens]

# Sentences differ in length, so request an object array explicitly: recent
# NumPy releases refuse to build an array from ragged lists without it.
vi_int = np.array([vectorize(tok, vi2idx) for tok in vi_tokens], dtype=object)
en_int = np.array([vectorize(tok, en2idx) for tok in en_tokens], dtype=object)

vi_sample_str, en_sample_str = vi_tokens[0], en_tokens[0]
print('+ Vietnamese text and its representation:', vi_sample_str, vectorize(vi_sample_str, vi2idx), sep='\n', end='\n\n')
print('+ English text and its representation:', en_sample_str, vectorize(en_sample_str, en2idx), sep='\n')

+ Vietnamese text and its representation:
['<sos>', 'khoa', 'học', 'đằng', 'sau', 'một', 'tiêu', 'đề', 'về', 'khí', 'hậu', '<eos>']
[19, 2310, 2057, 5894, 4304, 3193, 4875, 5906, 5446, 2359, 2040, 18]

+ English text and its representation:
['<sos>', 'the', 'science', 'behind', 'a', 'climate', 'headline', '<eos>']
[17, 13746, 11988, 1252, 24, 2414, 6243, 16]


#### Padding

Vectors should be of the same length. However, using the maximum length as the standard length for padding is memory-inefficient, as more than 90% of the Vietnamese sentences are shorter than 50 words and more than 90% of the English sentences are shorter than 40 words.

Hence, we use the length at the 90th percentile as the threshold and keep only the sentence pairs whose lengths do not exceed it.

In [13]:
# Length (in tokens, special tokens included) of every vectorized sentence.
vi_lengths = list(map(len, vi_int))
en_lengths = list(map(len, en_int))

# Find threshold: the 90th percentile of the lengths, truncated to an integer
vi_length = int(np.percentile(vi_lengths, 90))
en_length = int(np.percentile(en_lengths, 90))
print('+ VI: max length is {} - 90% is {}'.format(max(vi_lengths), vi_length))
print('+ EN: max length is {} - 90% is {}'.format(max(en_lengths), en_length))

+ VI: max length is 837 - 90% is 48
+ EN: max length is 615 - 90% is 40


In [0]:
num_samples = vi_int.shape[0]

# Keep a pair only when both of its sentences are at most as long as the threshold
indices = []
for i in range(num_samples):
  if len(vi_int[i]) > vi_length or len(en_int[i]) > en_length:
    continue
  indices.append(i)

vi_int = vi_int[indices]
en_int = en_int[indices]

In [15]:
# Pad vectors to the same length
# padding='post' places the padding after the sentence rather than before it.
# NOTE(review): the padding value is presumably the library default 0, which
# matches the 0 that the loss mask treats as padding — confirm.
vi_padded = tf.keras.preprocessing.sequence.pad_sequences(vi_int, maxlen=vi_length, padding='post')
en_padded = tf.keras.preprocessing.sequence.pad_sequences(en_int, maxlen=en_length, padding='post')

print('Shape of data:')
print('+ VI:', vi_padded.shape)
print('+ EN:', en_padded.shape)

Shape of data:
+ VI: (117993, 48)
+ EN: (117993, 40)


### Sample data

In [16]:
# Vietnamese is the model input, English the target.
# NOTE(review): test_size=.7 holds out 70% of the pairs, leaving only 30% for
# training — confirm this down-sampling is intentional.
# random_state is fixed so the split is the same on every run.
train_in, test_in, train_out, test_out = train_test_split(vi_padded, en_padded, test_size=.7, random_state=101)
print('Number of training samples:', len(train_in))
print('Number of testing samples:', len(test_in))

Number of training samples: 35397
Number of testing samples: 82596


## Batching

In [17]:
BATCH_SIZE = 64
# Size of the buffer handed to Dataset.shuffle below.
BUFFER_SIZE = 5000
# Number of full batches in the training split; the training loop takes this
# many batches per epoch.
steps_per_epochs = len(train_in) // BATCH_SIZE

# Stream of (Vietnamese, English) pairs, shuffled and grouped into batches.
pair_stream = tf.data.Dataset.from_tensor_slices((train_in, train_out))
# drop_remainder=True drops the last, smaller batch so every batch has exactly
# BATCH_SIZE rows (the models below are built for that fixed batch size).
batch_data = pair_stream.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
batch_data

<DatasetV1Adapter shapes: ((64, 48), (64, 40)), types: (tf.int32, tf.int32)>

In [0]:
# Pull a single batch; sample_input is used further down to check the model output shapes.
sample_input, sample_target = next(iter(batch_data))

## Modeling

In [0]:
# Width of the word embeddings and number of GRU units shared by every model below.
embedding_dim, num_units = 256, 1024

#### Attention

$score(h_t, h_s) = v_a^{T} \tanh{(W_1 h_t + W_2 h_s)}$

Attention weights: $\alpha_{ts} = \frac{\exp{(score(h_t, h_s))}}{\sum_{s'=1}^{S} \exp{(score(h_t, h_{s'}))}}$

Context vector: $c_t = \sum_{s} \alpha_{ts} h_s$

In [0]:
class BahdanauAttention(tf.keras.layers.Layer):
  """Additive attention: score = V(tanh(W1(h_t) + W2(h_s))).

  `call` returns only the context vector, i.e. the encoder outputs summed
  over axis 1 after weighting them with the softmax of the scores.
  """

  def __init__(self, num_units):
    super().__init__()
    dense = tf.keras.layers.Dense
    self.W1 = dense(num_units)
    self.W2 = dense(num_units)
    self.V = dense(1)

  def call(self, hidden_state, enc_output):
    # Insert a length-1 axis at position 1 so the decoder state broadcasts
    # against every position of enc_output.
    query = tf.expand_dims(hidden_state, axis=1)
    energy = tf.nn.tanh(self.W1(query) + self.W2(enc_output))
    # Normalize the scores along axis 1, then take the weighted sum.
    weights = tf.nn.softmax(self.V(energy), axis=1)
    weighted_outputs = weights * enc_output
    return tf.reduce_sum(weighted_outputs, axis=1)

#### Encoder Decoder

In [0]:
class Encoder(tf.keras.Model):
  """Embedding layer followed by a GRU that returns its full output sequence
  together with its final state."""

  def __init__(self, vocab_size, embedding_dim, num_units, batch_size):
    super().__init__()
    self.num_units = num_units
    self.batch_size = batch_size
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.hidden = tf.keras.layers.GRU(
        num_units,
        return_sequences=True,
        return_state=True,
        recurrent_initializer='glorot_uniform')

  def init_hidden_state(self):
    """Return an all-zero initial state of shape (batch_size, num_units)."""
    state_shape = (self.batch_size, self.num_units)
    return tf.zeros(state_shape)

  def call(self, x, hidden_state):
    embedded = self.embedding(x)
    sequence_output, final_state = self.hidden(embedded, initial_state=hidden_state)
    return sequence_output, final_state


class Decoder(tf.keras.Model):
  """GRU decoder producing vocabulary logits, optionally attending over the
  encoder outputs before the recurrent step."""

  def __init__(self, vocab_size, embedding_dim, num_units, use_attention=True):
    super().__init__()
    self.use_attention = use_attention
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.hidden = tf.keras.layers.GRU(
        num_units,
        return_sequences=True,
        return_state=True,
        recurrent_initializer='glorot_uniform')
    self.dense = tf.keras.layers.Dense(vocab_size)
    # Built even when use_attention is False; it is simply never called then.
    self.attention = BahdanauAttention(num_units)

  def call(self, x, hidden_state, enc_output):
    step_input = self.embedding(x)

    if self.use_attention:
      # Put the context vector in front of the embedded input along the last axis.
      context = self.attention(hidden_state, enc_output)
      context = tf.expand_dims(context, 1)
      step_input = tf.concat([context, step_input], axis=-1)

    rnn_output, new_state = self.hidden(step_input, initial_state=hidden_state)
    # Merge the leading axes so the dense layer sees one row per output position.
    flat_output = tf.reshape(rnn_output, (-1, rnn_output.shape[2]))
    logits = self.dense(flat_output)
    return logits, new_state

#### Testing

In [22]:
# One encoder/decoder pair per variant: a baseline without attention and a model with attention.
simple_encoder = Encoder(vi_vocab_size, embedding_dim, num_units, BATCH_SIZE)
simple_decoder = Decoder(en_vocab_size, embedding_dim, num_units, use_attention=False)
attention_encoder = Encoder(vi_vocab_size, embedding_dim, num_units, BATCH_SIZE)
attention_decoder = Decoder(en_vocab_size, embedding_dim, num_units, use_attention=True)

# Shape check only: push the sample batch through the untrained models.
# The decoders receive a random (BATCH_SIZE, 1) tensor in place of real token
# ids, and both of them reuse the output and state produced by simple_encoder.
sample_output, sample_state = simple_encoder(sample_input, simple_encoder.init_hidden_state())
sample_simple_decoder_output, _ = simple_decoder(tf.random.uniform((BATCH_SIZE, 1)),
                                                 sample_state, enc_output=sample_output)
sample_attention_decoder_output, _ = attention_decoder(tf.random.uniform((BATCH_SIZE, 1)),
                                                       sample_state, enc_output=sample_output)

print('Encoder output shape:', sample_output.shape)
print('Hidden state shape:', sample_state.shape)
print('Decoder output shape:', sample_simple_decoder_output.shape)
print('Decoder with attention output shape:', sample_attention_decoder_output.shape)

Encoder output shape: (64, 48, 1024)
Hidden state shape: (64, 1024)
Decoder output shape: (64, 15305)
Decoder with attention output shape: (64, 15305)


### Optimizer and Loss function

In [0]:
# NOTE(review): this single optimizer instance is used by the training step and
# stored in the checkpoints of both models, so its state is shared between the
# two training runs — confirm that is intended.
optimizer = tf.keras.optimizers.Adam()
# from_logits=True: the decoder's last Dense layer has no activation.
# reduction='none' keeps the individual losses so padding can be masked out afterwards.
loss_obj = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss(target, pred):
  """Cross-entropy between `target` and `pred` with padding positions zeroed.

  Entries whose target equals 0 contribute a loss of 0; the mean is still
  taken over every entry, masked ones included.
  """
  per_token_loss = loss_obj(target, pred)
  is_real_token = tf.math.not_equal(target, 0)
  mask = tf.cast(is_real_token, dtype=per_token_loss.dtype)
  masked_loss = per_token_loss * mask
  return tf.reduce_mean(masked_loss)

### Configure checkpoints

In [0]:
def config_checkpoints(encoder, decoder, prefix):
  """Create the checkpoint object and paths for one encoder/decoder pair.

  Args:
    encoder: encoder model to track.
    decoder: decoder model to track.
    prefix: name used to build the checkpoint directory,
      './<prefix>_training_checkpoints'.

  Returns:
    (checkpoint_dir, checkpoint_prefix, checkpoint): the directory, the file
    prefix inside it ('<dir>/ckpt') and a tf.train.Checkpoint tracking the
    module-level `optimizer`, the encoder and the decoder.
  """
  checkpoint_dir = './{}_training_checkpoints'.format(prefix)
  checkpoint_prefix = os.path.join(checkpoint_dir, 'ckpt')
  # The decoder stays registered under the `decoder_name` key so checkpoints
  # that were already written with that key can still be restored.
  checkpoint = tf.train.Checkpoint(optimizer=optimizer, encoder=encoder, decoder_name=decoder)
  return checkpoint_dir, checkpoint_prefix, checkpoint

## Training

In [0]:
def batch_train(target_start_token, input, target, enc_hidden, encoder, decoder):
  """Run one teacher-forced training step on a batch and update both models.

  Args:
    target_start_token: index of the start token in the target vocabulary.
    input: batch of source sequences fed to the encoder.
    target: batch of target sequences; axis 1 is iterated position by position.
    enc_hidden: initial hidden state for the encoder.
    encoder: encoder model.
    decoder: decoder model.

  Returns:
    The summed per-position loss divided by the target length.
  """
  total_loss = 0
  num_words = int(target.shape[1])

  with tf.GradientTape() as tape:
    # Encoder Decoder
    enc_output, enc_hidden = encoder(input, enc_hidden)
    # First decoder input: one start token per row (relies on the global BATCH_SIZE).
    dec_input = tf.expand_dims([target_start_token] * BATCH_SIZE, 1)
    # The decoder starts from the encoder's final state.
    dec_hidden = enc_hidden
    
    # Predict next words of i-th word, except start token
    for i in range(1, num_words):
      pred, dec_hidden = decoder(dec_input, dec_hidden, enc_output)
      total_loss += loss(target[:, i], pred)
      dec_input = tf.expand_dims(target[:, i], 1)     # teacher forcing
  
  # Gradients are taken with respect to the variables of both models and applied
  # with the module-level optimizer.
  variables = encoder.trainable_variables + decoder.trainable_variables
  optimizer.apply_gradients(zip(tape.gradient(total_loss, variables), variables))
  # NOTE(review): the loop adds num_words - 1 terms but the sum is divided by
  # num_words — confirm this normalization is intended.
  batch_loss = total_loss / num_words
  return batch_loss

In [0]:
def train(sep_token, encoder, decoder, epochs, checkpoint, checkpoint_prefix):
  """Train an encoder/decoder pair on `batch_data` for `epochs` epochs.

  `sep_token` is forwarded to `batch_train` as the target start token. The
  batch loss is printed every 200 batches, a checkpoint is saved every 5th
  epoch, and the elapsed time and mean loss are printed after each epoch.
  """
  for epoch_idx in range(epochs):
    epoch_no = epoch_idx + 1
    start = time.time()
    # The same zero state is passed to the encoder for every batch of the epoch.
    enc_hidden = encoder.init_hidden_state()
    epoch_loss = 0

    epoch_batches = batch_data.take(steps_per_epochs)
    for step, (source_batch, target_batch) in enumerate(epoch_batches):
      batch_loss = batch_train(sep_token, source_batch, target_batch,
                               enc_hidden, encoder, decoder)
      epoch_loss += batch_loss
      if step % 200 == 0:
        print('Epoch {}, batch {}: Loss {:.4f}'.format(epoch_no, step, batch_loss.numpy()))

    if epoch_no % 5 == 0:
      checkpoint.save(file_prefix=checkpoint_prefix)
    print('[Epoch {}] Time: {:.2f}, Loss: {:.4f}'.format(epoch_no, time.time() - start,
                                                        epoch_loss / steps_per_epochs))

### Without Attention

In [0]:
# Baseline model: checkpoints go to './simple_training_checkpoints'.
cp_dir, cp_prefix, cp = config_checkpoints(simple_encoder, simple_decoder, 'simple')
# The English start-token index is the first input given to the decoder.
train(en2idx[start_token], simple_encoder, simple_decoder,
      epochs=10, checkpoint=cp, checkpoint_prefix=cp_prefix)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Epoch 1, batch 0: Loss 3.9561


In [0]:
# Restore checkpoint: load the most recent checkpoint found in cp_dir into the
# optimizer, encoder and decoder tracked by cp
cp.restore(tf.train.latest_checkpoint(cp_dir))

### With Attention

In [0]:
# Attention model: checkpoints go to './attention_training_checkpoints'.
attcp_dir, attcp_prefix, attcp = config_checkpoints(attention_encoder, attention_decoder, 'attention')
# The English start-token index is the first input given to the decoder.
train(en2idx[start_token], attention_encoder, attention_decoder,
      epochs=10, checkpoint=attcp, checkpoint_prefix=attcp_prefix)

In [0]:
# Restore checkpoint: load the most recent checkpoint found in attcp_dir into
# the optimizer, encoder and decoder tracked by attcp
attcp.restore(tf.train.latest_checkpoint(attcp_dir))

## Translation

In [0]:
def translate(sentence, encoder, decoder, max_length=15):
  """Greedily translate one Vietnamese sentence into English.

  Args:
    sentence: raw Vietnamese sentence (str).
    encoder: trained encoder model.
    decoder: trained decoder model.
    max_length: maximum number of words to generate.

  Returns:
    The generated English words joined by single spaces. Generation stops at
    the end token, at a padding prediction, or after `max_length` words.
  """
  # Vectorize the sentence and pad to the standard length
  # NOTE(review): sentences longer than vi_length are cut by pad_sequences;
  # presumably from the front (library default) — confirm.
  s_int = vectorize(tokenize(preprocess(sentence)), vi2idx)
  s_padded = tf.keras.preprocessing.sequence.pad_sequences([s_int], maxlen=vi_length,
                                                           padding='post')
  enc_input = tf.convert_to_tensor(s_padded)

  # Feed input and initial zeros states to encoder
  enc_out, dec_hidden = encoder(enc_input, [tf.zeros((1, num_units))])
  dec_input = tf.expand_dims([en2idx[start_token]], 0)

  trans = []
  for _ in range(max_length):
    pred, dec_hidden = decoder(dec_input, dec_hidden, enc_out)
    next_word_id = tf.argmax(pred[0]).numpy()
    next_word = idx2en[next_word_id]

    # Index 0 is the padding slot and maps to None, which cannot be joined
    # into the output string; treat it like the end of the sentence.
    if next_word is None or next_word == end_token:
      break
    trans.append(next_word)
    # Feed the predicted word back in as the next decoder input.
    dec_input = tf.expand_dims([next_word_id], 0)

  return ' '.join(trans)

In [0]:
# Sample translation with the baseline model (decoder built with use_attention=False).
print(translate(u'ngày mai trời sẽ mưa .', simple_encoder, simple_decoder))

In [0]:
# Same sentence with the attention model (decoder built with use_attention=True).
print(translate(u'ngày mai trời sẽ mưa .', attention_encoder, attention_decoder))