# English to Vietnamese Translator using RNNs
## Author: Ankit Gupta

In [1]:
import numpy as np
import tensorflow as tf

In [2]:
# data can be downloaded from https://nlp.stanford.edu/projects/nmt/
# data is already formatted so that there is a space between every two tokens

# Read each corpus inside a `with` block so the file handle is closed as soon
# as the text is in memory (a bare open(...).read() leaves it open).
with open('./datasets/eng-vietnamese/train.vi', encoding='utf-8', errors='strict') as vi_file:
    vi_text = vi_file.read()
with open('./datasets/eng-vietnamese/train.en', encoding='utf-8', errors='strict') as en_file:
    en_text = en_file.read()

# word-level view (whitespace tokenised) and line-level view of both corpora
vi_words = vi_text.split()
en_words = en_text.split()
vi_lines = vi_text.split('\n')
en_lines = en_text.split('\n')
print(len(vi_lines), len(en_lines))
print(vi_lines[0], en_lines[0])

n_lines = len(vi_lines)

133318 133318
Khoa học đằng sau một tiêu đề về khí hậu The science behind a climate headline


In [3]:
# Split every line pair into '.'-delimited pieces, keeping only the pairs where
# both languages yield the same number of pieces so they stay aligned.
vi_segments, en_segments = [], []

for idx in range(n_lines):
    en_parts = en_lines[idx].split('.')
    vi_parts = vi_lines[idx].split('.')
    if len(en_parts) != len(vi_parts):
        # piece counts disagree: this pair cannot be aligned, skip it
        continue
    vi_segments.extend(vi_parts)
    en_segments.extend(en_parts)

vi_lines, en_lines = vi_segments, en_segments
# drop the temporary names; the lists stay alive through vi_lines / en_lines
del vi_segments, en_segments

n_lines = len(vi_lines)
print(n_lines)

220091


In [41]:
# build the label mapping

from collections import Counter

# word -> number of occurrences in the corresponding corpus
vi_counts = Counter(vi_words)
en_counts = Counter(en_words)

# Vietnamese vocabulary: special tokens (padding, out-of-vocabulary, decoder start)
# followed by the most frequent words, 20000 entries in total
vi_specials = ['<PAD>', '<UNK>', '<GO>']
vi_vocab_size = 20000
vi_vocab = vi_specials + [word for word, _ in vi_counts.most_common(vi_vocab_size - len(vi_specials))]
print('vi_vocab', len(vi_vocab))

# English vocabulary: same construction, but without the <GO> token
en_specials = ['<PAD>', '<UNK>']
en_vocab_size = 20000
en_vocab = en_specials + [word for word, _ in en_counts.most_common(en_vocab_size - len(en_specials))]
print('en_vocab', len(en_vocab))

# word -> integer label, where the label is the word's position in the vocabulary list
vi_label = {word: idx for idx, word in enumerate(vi_vocab)}
en_label = {word: idx for idx, word in enumerate(en_vocab)}

vi_vocab 20000
en_vocab 20000


In [42]:
# encoding the dataset using the above encoder

from bisect import bisect_left

def search(x, a):
    '''Binary-search the sorted sequence a and report whether x occurs in it.'''
    # bisect_left gives the leftmost position at which x could be inserted
    pos = bisect_left(a, x)
    if pos == len(a):
        # x is greater than every element of a
        return False
    return a[pos] == x

# sorted copies of the vocabularies, as required by the binary search in search()
vi_vocab_sorted = list(vi_vocab)
vi_vocab_sorted.sort()
en_vocab_sorted = list(en_vocab)
en_vocab_sorted.sort()

# encoding the words by labels
def encode(word_list, label):
    '''Map every word of word_list through the dict label; return an int32 array.

    Raises KeyError if a word is missing from label.
    '''
    return np.array([label[word] for word in word_list]).astype('int32')

def unk_n_encode(word_list, lang):
    '''Encode word_list as int32 labels, mapping out-of-vocabulary words to <UNK>.

    word_list: iterable of tokens (strings); it is NOT modified.
    lang: 'vi' selects the Vietnamese label map, any other value the English one.
    Returns a 1-D int32 numpy array with one label per input token.
    '''
    label = vi_label if lang == 'vi' else en_label
    # Build a new list instead of overwriting the caller's tokens in place, so
    # the module-level word lists survive for later cells and for re-runs.
    # The keys of `label` are exactly the vocabulary, so dict membership gives
    # the same answer as binary-searching the sorted vocabulary copy.
    known_words = [word if word in label else '<UNK>' for word in word_list]
    return encode(known_words, label)

# label-encode every token of the two full corpora and report the token counts
en_labels = unk_n_encode(en_words, 'en')
vi_labels = unk_n_encode(vi_words, 'vi')
print(len(vi_labels), len(en_labels))

3311508 2706252


In [43]:
# storing the datasets in form of a numpy matrix

# the longest sentence (in whitespace tokens) of each language fixes the matrix width
max_en = max((len(sentence.strip().split()) for sentence in en_lines), default=0)
max_vi = max((len(sentence.strip().split()) for sentence in vi_lines), default=0)

print(max_en, max_vi)

# one row per sentence; zero-initialised, and 0 is also the <PAD> label
vi_lines_mat = np.zeros((len(vi_lines), max_vi), dtype='int32')
en_lines_mat = np.zeros((len(en_lines), max_en), dtype='int32')

for row, sentence in enumerate(en_lines):
    row_labels = unk_n_encode(sentence.strip().split(), 'en')
    # left-aligned: labels first, the rest of the row stays 0
    en_lines_mat[row, :len(row_labels)] = row_labels

for row, sentence in enumerate(vi_lines):
    row_labels = unk_n_encode(sentence.strip().split(), 'vi')
    vi_lines_mat[row, :len(row_labels)] = row_labels

# display the padding label to confirm it matches the zero fill above
en_label['<PAD>']

523 558


0

In [None]:
# Remember that when constructing decoder inputs we prepend the special GO symbol to the input data. This is done in the get_batch() function in 
# seq2seq_model.py, which also reverses the input English sentence. Reversing the inputs was shown to improve results for the neural translation model in 
# Sutskever et al., 2014. To put it all together, imagine we have the sentence "I go.", tokenized as ["I", "go", "."] as input and the sentence 
# "Je vais." as output, tokenized ["Je", "vais", "."]. It will be put in the (5, 10) bucket, with encoder inputs representing [PAD PAD "." "go" "I"] and 
# decoder inputs [GO "Je" "vais" "." EOS PAD PAD PAD PAD PAD].

In [7]:
# reverse the encoder inputs, add <GO> to the decoder inputs 

# number of time steps fed to the encoder and to the decoder
n_steps = sequence_length = 20

def get_next_batch(epoch, batch, batch_size):
    '''Return the batch-th mini-batch of the epoch-th shuffle of the dataset.

    epoch: non-negative int; seeds the shuffle, so all batches of one epoch are
        cut from the same permutation of the lines.
    batch: index of the mini-batch inside that permutation.
    batch_size: number of lines per mini-batch; the final slice of the
        permutation may contain fewer lines.

    Returns (X, y), both int32:
        X: English labels of n_steps consecutive positions, reversed along the
           time axis.
        y: a <GO> column followed by the Vietnamese labels of n_steps - 1
           consecutive positions.
    '''
    # A private RandomState seeded with the epoch yields the same permutation as
    # seeding numpy's global generator, without clobbering that global state.
    shuffled = np.random.RandomState(epoch).permutation(n_lines)
    line_indices = shuffled[batch*batch_size: (batch+1)*batch_size]
    vi_mat, en_mat = vi_lines_mat[line_indices], en_lines_mat[line_indices]
    # NOTE(review): the window start grows with `epoch`, so from epoch 1 onwards
    # the first `epoch` tokens of every sentence are skipped - confirm intended.
    start = min(epoch, max_vi - n_steps - 1, max_en - n_steps - 1)
    X_ret = en_mat[:, start: start + n_steps]
    y_ret = vi_mat[:, start: start + n_steps - 1]
    # reverse encoder inputs
    X_ret = np.flip(X_ret, axis=1)
    # add <GO> to decoder inputs; size the column by the rows actually selected
    # so that a short final batch does not break the concatenation
    go_column = vi_label['<GO>'] * np.ones((len(line_indices), 1), 'int32')
    y_ret = np.concatenate((go_column, y_ret), axis=1)
    return X_ret.astype('int32'), y_ret.astype('int32')

# smoke test: fetch a tiny batch and print the labels of its first example
X_temp, y_temp = get_next_batch(0, 0, 3)
print(X_temp.shape, y_temp.shape)

x_row, y_row = X_temp[0], y_temp[0]
n_shown = len(y_row)

# encoder labels on the first line, decoder labels on the second
for col in range(n_shown):
    print(x_row[col], end=' ')
print()
for col in range(n_shown):
    print(y_row[col], end=' ')

(3, 20) (3, 20)
7 2 1684 9748 6 627 1391 146 45 9502 58 6 3805 6 801 614 23 16 90 2221 
2 1017 16 11 6 21 647 23 178 12 173 1894 10 244 460 37 172 52 55 58 

In [8]:
# constructing the RNN using encoder-decoder model
# (TensorFlow 1.x graph-mode API; relies on tf.contrib)

# hyperparameters of the recurrent cell and of the embeddings
n_neurons = 400
n_layers = 1
num_encoder_symbols = len(en_vocab)
num_decoder_symbols = len(vi_vocab)
embedding_size = 150

# start from an empty graph so that re-running this cell does not pile up
# duplicate nodes in the default graph
tf.reset_default_graph()

X = tf.placeholder(dtype='int32', shape=[None, n_steps]) # English labels (get_next_batch returns them time-reversed)
y = tf.placeholder(dtype='int32', shape=[None, n_steps]) # Vietnamese labels, first column is <GO> in get_next_batch
# NOTE: `weights` is only referenced by the commented-out weighted loss in the
# next cell; the loss that is currently active does not use it.
weights = tf.placeholder(dtype='float32', shape=[None]) # [batch_size*(n_steps - 1)]

# teacher forcing: the decoder is fed every column of y but the last and is
# trained to predict every column but the first, i.e. the following token
y_input = y[:, :-1]
y_target = y[:, 1:]

# transpose to time-major and unstack along the first axis: one tensor per time step
encoder_inputs = tf.unstack(tf.transpose(X)) # list of 1D tensors
decoder_inputs = tf.unstack(tf.transpose(y_input)) # list of 1D tensors

# NOTE: despite the name, these are GRU cells (with ELU activation), not LSTM cells
lstm_cells = [tf.contrib.rnn.GRUCell(num_units=n_neurons, activation=tf.nn.elu)
              for layer in range(n_layers)]
cell = tf.contrib.rnn.MultiRNNCell(lstm_cells)

# NOTE(review): feed_previous is left at its default here; the inference cell
# further down re-evaluates the graph once per generated token and writes each
# prediction back into the decoder input - check the TF 1.x docs for details.
output_seqs, states = tf.contrib.legacy_seq2seq.embedding_rnn_seq2seq(
    encoder_inputs,
    decoder_inputs,
    cell,
    num_encoder_symbols,
    num_decoder_symbols,
    embedding_size)

# swap the first two axes of the per-step outputs so the batch axis comes first
logits = tf.transpose(tf.unstack(output_seqs), perm=[1, 0, 2])  # [?, n_steps - 1, 20000]

In [9]:
# flatten batch and time so that every decoder position is one classification example
logits_flat = tf.reshape(logits, [-1, num_decoder_symbols])
y_target_flat = tf.reshape(y_target, [-1])
xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y_target_flat,
                                                          logits=logits_flat
                                                         ) # [batch_size*(n_steps-1)]
# summed (not averaged) cross-entropy over all positions of the batch
loss = tf.reduce_sum(xentropy)
# optional per-position weighting, currently disabled:
#w_entropy = xentropy * weights
#loss = tf.reduce_sum(w_entropy) # instead of reduce_mean(xentropy)

# the learning rate is a placeholder so that it can be changed from step to step
learning_rate = tf.placeholder(dtype='float32', shape=None)
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
training_op = optimizer.minimize(loss)

# checking for exploding/vanishing grads
bottom_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)[1]
bottom_grads = optimizer.compute_gradients(loss, var_list=[bottom_vars])
# compute_gradients returns a list of (gradient, variable) pairs; take the norm
# of the gradient alone - passing the whole list to tf.norm would also add the
# variable's own values into the result.
grads_norm = tf.norm(bottom_grads[0][0], ord=1)

init = tf.global_variables_initializer()

saver = tf.train.Saver()

In [10]:
# running a tf session

sess = tf.InteractiveSession()
sess.run(init)

# training position, kept at module level so that the training cell can be
# interrupted and resumed where it stopped
epoch = 0
batch = 0

In [72]:
def lr(epoch, batch):
    '''Return the learning rate for the given epoch and batch.

    The schedule is constant: 0.0001 for every step. (The statements that used
    to follow the unconditional return could never execute and were removed.)
    '''
    return 0.0001

n_epochs = 4
batch_size = 100
# Per-position loss weights are currently disabled. To re-enable them, uncomment
# the lines below, switch the loss to the weighted variant and add `weights: w`
# to the feed dict.
#w = np.array([(0.95)**i for i in range(n_steps - 1)]) # n_steps - 1
#w = w / w.sum()
#w = list(w) * batch_size
#w = np.array(w).astype('float32') # [batch_size*(n_steps-1)]


# `epoch` and `batch` live at module level, so re-running this cell after an
# interruption resumes from the batch that was reached.
while epoch < n_epochs:
    while batch < n_lines // batch_size:
        X_batch, y_batch = get_next_batch(epoch, batch, batch_size)
        # `weights` is not fed: the active loss does not use it, and `w` is not
        # defined while the block above stays commented out.
        feed = {X: X_batch, y: y_batch, learning_rate: lr(epoch, batch)}
        _ , loss_eval = sess.run([training_op, loss], feed_dict=feed)
        #if batch % 10 == 0:
        print(batch, end=' ')
        if batch % 2 == 0:
            print('loss', loss_eval, end=' ')
        #if batch % 200 == 0:
        #    print('grads', grads_norm.eval(feed_dict={X: X_batch, y: y_batch, 
        #                                              learning_rate: lr(epoch, batch)}
        #                                  ), end=' ')
        batch += 1
    print('-----------|epoch|----------', end=' ')
    batch = 0
    epoch += 1

# without dropout it overfits
# loss < 270 for it to translate meaningfully

129 130 loss 270.834 131 132 loss 271.81 133 134 loss 281.535 135 136 loss 282.759 137 138 loss 244.341 139 140 loss 279.327 141 142 loss 262.681 143 144 loss 269.195 145 146 loss 286.826 147 148 loss 273.54 149 150 loss 261.926 151 152 loss 277.532 153 154 loss 278.933 155 156 loss 285.147 157 158 loss 291.245 159 160 loss 287.754 161 162 loss 294.469 163 164 loss 271.15 165 166 loss 286.626 167 168 loss 280.354 169 170 loss 258.536 171 172 loss 275.046 173 174 loss 274.365 175 176 loss 275.493 177 178 loss 284.771 179 180 loss 280.304 181 182 loss 281.318 183 184 loss 275.541 185 186 loss 290.393 187 188 loss 273.735 189 

KeyboardInterrupt: 

In [12]:
# saving the model

# the same checkpoint prefix is used for saving and for the (optional) restore
checkpoint_path = './datasets/eng-vietnamese/dec3/en_vi.ckpt'
saver.save(sess, checkpoint_path)
#saver.restore(sess, checkpoint_path)

INFO:tensorflow:Restoring parameters from ./datasets/eng-vietnamese/dec3/en_vi.ckpt


In [13]:
test_en_sentence = "This is to inform you about the latest" 
# must have <= n_steps words, should have space between words . , ; ' etc

test_words = test_en_sentence.strip().split() 
if len(test_words) > n_steps:
    # fail with a clear message instead of an IndexError while filling test_enc
    raise ValueError('test_en_sentence has %d tokens but the encoder accepts at most %d'
                     % (len(test_words), n_steps))

# encoder input: labels left-aligned in a zero (<PAD>) row, then reversed in
# time, which is the layout get_next_batch produces for training
test_enc = np.zeros((1, n_steps), 'int32')
for j, lab in enumerate(unk_n_encode(test_words, 'en')):
    test_enc[0, j] = lab
test_enc = np.flip(test_enc, axis=1)
#print(test_enc)

# decoder input starts with <GO>; the predicted tokens are written in one by one
test_dec = np.zeros((1, n_steps), 'int32')
test_dec[0, 0] = vi_label['<GO>']

# Greedy decoding. The graph yields n_steps - 1 output positions and test_dec
# has n_steps columns, so never decode more than n_steps - 1 tokens.
n_decode = min(2*len(test_words), n_steps - 1)
for i in range(n_decode):
    logs = logits.eval(feed_dict={X: test_enc, y: test_dec})[0]
    # most likely Vietnamese token at output position i
    lab = np.argmax(logs[i])
    print(vi_vocab[lab], end=' ')
    # feed the prediction back as the decoder input of the next position
    test_dec[0, i+1] = lab

Điều này cho phép bạn nghe về vấn đề . <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> 

In [None]:
# Hello, my name is John. chào chào , John là tôi 
# I am going home . Tôi sẽ mua một nhà .
# Yes , I am coming . Vâng , tôi đã bị mất . 
# Will you be joining us for dinner ? Bạn có thể cho chúng ta ? 
# I hope it doesn't rain . Tôi hi vọng nó sẽ có thể . 
# The weather is better today. Nền tảng này được ngày càng tốt hơn .
# Can you meet me in one hour . Bạn có thể tôi ở trong một năm .
# The police have comfirmed Người ta có những người bị đánh lừa 
# We were lost in the forest . Chúng tôi đang ở trong khu vực .
# This is to inform you about the latest Điều này cho phép bạn nghe về vấn đề .