# Seq2Seq
Seq2Seq is built from two RNNs: an encoder and a decoder. The encoder first encodes the input sequence into a hidden vector, and the decoder then generates words one by one based on that vector.

# Neural Machine Translation
Neural Machine Translation (NMT) is an NLP task that uses deep learning to translate text from one language into another.


In [0]:
import os

os.environ["CUDA_VISIBLE_DEVICES"] = '3'
import numpy as np

# The vocab / reverse-vocab files store pickled dicts (and the corpora are
# presumably ragged object arrays -- TODO confirm). NumPy >= 1.16.3 refuses to
# load pickled objects unless allow_pickle=True is passed explicitly.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
en_corpus = np.load('./dataset/translate/enCorpus.npy', allow_pickle=True)
en_vocab = np.load('./dataset/translate/enVocab.npy', allow_pickle=True).tolist() # use tolist() to transform back to dict()
en_rev = np.load('./dataset/translate/enRev.npy', allow_pickle=True).tolist()

ch_corpus = np.load('./dataset/translate/chCorpus.npy', allow_pickle=True)
ch_vocab = np.load('./dataset/translate/chVocab.npy', allow_pickle=True).tolist()
ch_rev = np.load('./dataset/translate/chRev.npy', allow_pickle=True).tolist()

# Show the first four sentence pairs, mapping token ids back to words.
for i in range(4):
    print(' '.join(en_rev[token] for token in en_corpus[i]))
    print(' '.join(ch_rev[token] for token in ch_corpus[i]))
    print('')

In [0]:
# For presentation purposes, keep only the sentence pairs in which neither
# side contains the '<UNK>' token.
en_corpus_clean = []
ch_corpus_clean = []

for idx, en_sentence in enumerate(en_corpus):
    # skip the pair as soon as either side holds an '<UNK>' id
    if en_vocab['<UNK>'] in en_sentence or ch_vocab['<UNK>'] in ch_corpus[idx]:
        continue
    en_corpus_clean.append(en_sentence)
    ch_corpus_clean.append(ch_corpus[idx])

# Print the first four surviving pairs as words.
for row in range(4):
    en_words = [en_rev[token] for token in en_corpus_clean[row]]
    ch_words = [ch_rev[token] for token in ch_corpus_clean[row]]
    print(' '.join(en_words))
    print(' '.join(ch_words))
    print('')

In [0]:
# Rebuild the '<UNK>'-free corpora from scratch (same filtering rule as the
# preceding cell): a pair is kept only when neither sentence contains '<UNK>'.
kept_pairs = [
    (en_corpus[k], ch_corpus[k])
    for k in range(len(en_corpus))
    if not (en_vocab['<UNK>'] in en_corpus[k] or ch_vocab['<UNK>'] in ch_corpus[k])
]
en_corpus_clean = [pair[0] for pair in kept_pairs]
ch_corpus_clean = [pair[1] for pair in kept_pairs]

# Print the first four kept pairs as words.
for k in range(4):
    for rev_table, sentence in ((en_rev, en_corpus_clean[k]), (ch_rev, ch_corpus_clean[k])):
        print(' '.join(rev_table[token] for token in sentence))
    print('')

## Prepare Translation Batch
In this lab, we use legacy_seq2seq, which provides a fast way to build a seq2seq model with attention. Be careful: legacy_seq2seq is time-major, which means the input and output must each be a list with one entry per time step, where each entry holds that step's words for the whole batch.

In [0]:
# Longest sentence length on each side of the cleaned corpus (0 when empty).
pair_count = len(en_corpus_clean)
en_max_len = max((len(en_corpus_clean[k]) for k in range(pair_count)), default=0)
ch_max_len = max((len(ch_corpus_clean[k]) for k in range(pair_count)), default=0)

print(en_max_len, ch_max_len)
#OUTPUT:
#32 32

In [0]:
class BatchGenerator:
    """Pre-build padded, time-major seq2seq batches from two parallel corpora.

    Only the first ``(len(en_corpus) // batch_size) * batch_size`` sentence
    pairs are used, so every batch is full.

    Attributes (each a list with one 1-D array per time step):
        xs: encoder inputs -- the English sentence without its first and last
            token, padded with ``en_pad``.
        ys: decoder inputs -- the Chinese sentence without its last token,
            padded with ``ch_pad``.
        gs: decoder targets -- the Chinese sentence without its first token,
            padded with ``ch_pad``.
        ws: loss weights -- 1.0 on real target positions, 0.0 on padding.
    """

    def __init__(self, en_corpus, ch_corpus, en_pad, ch_pad, en_max_len, ch_max_len, batch_size):
        """Fill the per-time-step arrays for every usable sentence pair.

        Args:
            en_corpus: sequence of English sentences (sequences of token ids).
            ch_corpus: sequence of Chinese sentences, parallel to ``en_corpus``.
            en_pad: token id used to pad encoder inputs.
            ch_pad: token id used to pad decoder inputs and targets.
            en_max_len: number of encoder time steps.
            ch_max_len: number of decoder time steps.
            batch_size: number of sentence pairs per batch.

        Raises:
            AssertionError: if the two corpora differ in length.
            IndexError: if a sentence needs more time steps than allotted.
        """
        assert len(en_corpus) == len(ch_corpus)

        batch_num = len(en_corpus) // batch_size
        n = batch_num * batch_size

        self.en_max_len = en_max_len
        self.ch_max_len = ch_max_len
        self.batch_size = batch_size

        # Start from fully padded arrays (weights already 0.0) and overwrite
        # only the real tokens. This avoids relying on a leaked loop variable
        # to find where padding starts, which broke on sentences with no
        # content tokens.
        self.xs = [np.full(n, en_pad, dtype=np.int32) for _ in range(en_max_len)] # encoder inputs
        self.ys = [np.full(n, ch_pad, dtype=np.int32) for _ in range(ch_max_len)] # decoder inputs
        self.gs = [np.full(n, ch_pad, dtype=np.int32) for _ in range(ch_max_len)] # decoder outputs
        self.ws = [np.zeros(n, dtype=np.float32) for _ in range(ch_max_len)] # decoder weight for loss calculation

        for i in range(n):
            en_sentence = en_corpus[i]
            ch_sentence = ch_corpus[i]

            # encoder sees the sentence minus its first and last token
            for j in range(len(en_sentence) - 2):
                self.xs[j][i] = en_sentence[j + 1]

            # decoder input is the sentence shifted right of its target by one
            for j in range(len(ch_sentence) - 1):
                self.ys[j][i] = ch_sentence[j]
                self.gs[j][i] = ch_sentence[j + 1]
                self.ws[j][i] = 1.0

    def get(self, batch_id):
        """Return ``(x, y, g, w)`` for batch ``batch_id``.

        Each element is a list with one array of ``batch_size`` entries per
        time step (``en_max_len`` steps for ``x``, ``ch_max_len`` for the rest).
        """
        window = slice(batch_id * self.batch_size, (batch_id + 1) * self.batch_size)

        x = [self.xs[t][window] for t in range(self.en_max_len)]
        y = [self.ys[t][window] for t in range(self.ch_max_len)]
        g = [self.gs[t][window] for t in range(self.ch_max_len)]
        w = [self.ws[t][window] for t in range(self.ch_max_len)]

        return x, y, g, w

# Build a tiny generator (4 pairs per batch) just to eyeball one batch.
batch = BatchGenerator(
    en_corpus_clean,
    ch_corpus_clean,
    en_vocab['<PAD>'],
    ch_vocab['<PAD>'],
    en_max_len,
    ch_max_len,
    4,
)

# Legend: each sample below is printed as these three lines, in this order.
for caption in ("Encoder input", "Decoder input", "Decoder output"):
    print(caption)
print()

x, y, g, w = batch.get(2)
for col in range(4):
    # the batch is time-major: gather sample `col` across every time step
    encoder_words = [en_rev[x[t][col]] for t in range(en_max_len)]
    decoder_in_words = [ch_rev[y[t][col]] for t in range(ch_max_len)]
    decoder_out_words = [ch_rev[g[t][col]] for t in range(ch_max_len)]
    print(' '.join(encoder_words))
    print(' '.join(decoder_in_words))
    print(' '.join(decoder_out_words))
    print('')

## Build Seq2Seq Graph
When training Seq2Seq, we usually use a trick called **teacher forcing**, which makes training more efficient. At test time, however, there is no teacher any more. In the TensorFlow implementation, we therefore need to **build two copies of the same model**, one of which feeds its previous output back in as the next input. Since both copies **share the same weights**, don't forget to **reuse the RNN cell and the model within the variable scope**. The **attention mechanism** lets the decoder **focus on specific inputs** when deciding each output, and so generates more accurate results. Thanks to legacy_seq2seq, attention is already implemented as well.

In [0]:
import tensorflow as tf
class MachineTranslationSeq2Seq:
    """Attention seq2seq translator built with TF1 ``tf.contrib.legacy_seq2seq``.

    Two networks are created in the ``seq2seq_rnn`` variable scope:
    ``outputs`` consumes the fed decoder inputs (teacher forcing) and is used
    for the loss, while ``predictions`` is built with ``reuse=True`` and
    ``feed_previous=True`` so it shares the same variables.

    All inputs are time-major: each placeholder list holds one ``[batch]``
    placeholder per time step.
    """

    def __init__(self, en_max_len, ch_max_len, en_size, ch_size):
        """Build the graph, create a session and initialize all variables.

        Args:
            en_max_len: number of encoder time steps.
            ch_max_len: number of decoder time steps.
            en_size: number of encoder (source) symbols.
            ch_size: number of decoder (target) symbols.
        """
        self.en_max_len = en_max_len
        self.ch_max_len = ch_max_len

        with tf.variable_scope('seq2seq_intput/output'):
            self.enc_inputs = [tf.placeholder(tf.int32, [None]) for i in range(en_max_len)] # time major feed
            self.dec_inputs = [tf.placeholder(tf.int32, [None]) for i in range(ch_max_len)]
            self.groundtruths = [tf.placeholder(tf.int32, [None]) for i in range(ch_max_len)]
            self.weights = [tf.placeholder(tf.float32, [None]) for i in range(ch_max_len)]

        with tf.variable_scope('seq2seq_rnn'): # training by teacher forcing
            self.out_cell = tf.contrib.rnn.LSTMCell(512)
            self.outputs, _ = tf.contrib.legacy_seq2seq.embedding_attention_seq2seq(self.enc_inputs, self.dec_inputs,
                                                                                    self.out_cell,
                                                                                    en_size, ch_size, 300)
        with tf.variable_scope('seq2seq_rnn', reuse=True): # predict by feeding previous
            self.pred_cell = tf.contrib.rnn.LSTMCell(512, reuse=True) # reuse cell for train and test
            self.predictions, _ = tf.contrib.legacy_seq2seq.embedding_attention_seq2seq(self.enc_inputs, self.dec_inputs,
                                                                                        self.pred_cell,
                                                                                        en_size, ch_size, 300,
                                                                                        feed_previous=True)

        with tf.variable_scope('loss'):
            # calculate weighted loss (padding positions are fed weight 0)
            self.loss = tf.reduce_mean(tf.contrib.legacy_seq2seq.sequence_loss_by_example(self.outputs,
                                                                                          self.groundtruths,
                                                                                          self.weights))
            self.optimizer = tf.train.AdamOptimizer(0.002).minimize(self.loss)

        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=config)
        self.saver = tf.train.Saver()
        self.sess.run(tf.global_variables_initializer())

    def _encoder_feed(self, x):
        """Return a new feed dict mapping each encoder placeholder to ``x[i]``."""
        return {self.enc_inputs[i]: x[i] for i in range(self.en_max_len)}

    def train(self, x, y, g, w):
        """Run one optimizer step on a batch and return its loss.

        Args:
            x: encoder inputs, one array per encoder time step.
            y: decoder inputs, one array per decoder time step.
            g: decoder targets, one array per decoder time step.
            w: loss weights, one array per decoder time step.
        """
        fd = self._encoder_feed(x)

        for i in range(self.ch_max_len):
            fd[self.dec_inputs[i]] = y[i]
            fd[self.groundtruths[i]] = g[i]
            fd[self.weights[i]] = w[i]

        loss, _ = self.sess.run([self.loss, self.optimizer], fd)

        return loss

    def output(self, x, y):
        """Evaluate the teacher-forced network ``outputs`` for inputs ``x``, ``y``."""
        fd = self._encoder_feed(x)

        for i in range(self.ch_max_len):
            fd[self.dec_inputs[i]] = y[i]

        out = self.sess.run(self.outputs, fd)

        return out

    def predict(self, x, ch_beg):
        """Evaluate the feed-previous network ``predictions`` for ``x``.

        Args:
            x: encoder inputs, one array per encoder time step.
            ch_beg: token id fed as the first decoder input.
        """
        fd = self._encoder_feed(x)

        # Size the dummy decoder inputs from the encoder batch itself rather
        # than from an unrelated global variable.
        batch_shape = np.shape(x[0])

        for i in range(self.ch_max_len): # when feed previous, the first token should be '<BEG>', and others are useless
            if i == 0:
                fd[self.dec_inputs[i]] = np.full(batch_shape, ch_beg, dtype=np.int32)
            else:
                fd[self.dec_inputs[i]] = np.zeros(batch_shape, dtype=np.int32)

        pd = self.sess.run(self.predictions, fd)

        return pd

    def save(self, e):
        """Save a checkpoint numbered ``e + 1`` (``e`` is a 0-based epoch index)."""
        self.saver.save(self.sess, 'model/seq2seq/seq2seq_%d.ckpt'%(e+1))

    def restore(self, e):
        """Restore the checkpoint numbered ``e`` (the number as written by ``save``)."""
        self.saver.restore(self.sess, 'model/seq2seq/seq2seq_%d.ckpt'%(e))

In [0]:
# Start from an empty default graph so re-running this cell does not stack a
# second copy of the model on top of an earlier one.
tf.reset_default_graph()
model = MachineTranslationSeq2Seq(
    en_max_len=en_max_len,
    ch_max_len=ch_max_len,
    en_size=len(en_vocab),
    ch_size=len(ch_vocab),
)

In [0]:
# hyperparameters
EPOCHS = 40
BATCH_SIZE = 256

# number of full batches per epoch (a trailing partial batch is dropped)
batch_num = len(en_corpus_clean) // BATCH_SIZE

batch = BatchGenerator(
    en_corpus=en_corpus_clean,
    ch_corpus=ch_corpus_clean,
    en_pad=en_vocab['<PAD>'],
    ch_pad=ch_vocab['<PAD>'],
    en_max_len=en_max_len,
    ch_max_len=ch_max_len,
    batch_size=BATCH_SIZE,
)

In [0]:
import os

# Checkpoints and the loss record are both written under model/seq2seq; make
# sure it exists up front so a missing directory cannot abort the run after
# training time has already been spent.
os.makedirs('./model/seq2seq', exist_ok=True)

rec_loss = []  # mean training loss of each epoch
for e in range(EPOCHS):
    train_loss = 0

    for b in range(batch_num):
        x, y, g, w = batch.get(b)
        batch_loss = model.train(x, y, g, w)
        train_loss += batch_loss

    train_loss /= batch_num  # average over the batches of this epoch
    rec_loss.append(train_loss)
    print("epoch %d loss: %f" % (e, train_loss))

    model.save(e)  # one checkpoint per epoch

np.save('./model/seq2seq/rec_loss.npy', rec_loss)

In [0]:
import matplotlib.pyplot as plt

# Reload the per-epoch training loss saved by the training cell and plot it.
rec_loss = np.load('./model/seq2seq/rec_loss.npy')

plt_loss = plt.plot(list(rec_loss))
plt.legend(['Loss'])
plt.show()

In [0]:
# Load the checkpoint written after the last epoch: the training loop ends
# with model.save(EPOCHS-1), and save(e) names its file with e+1, i.e. EPOCHS.
model.restore(EPOCHS)

# Cherry Pick and Show Result
**BLEU is a metric for supervised text generation that measures the similarity between two sentences based on their n-gram tokens.** Now we want to show some of the best translation results, which is called **cherry picking**.



In [0]:
import nltk

def cherry_pick(records, n, upper_bound=1.0):
    """Return up to ``n`` records with the highest sentence-level BLEU.

    Args:
        records: sequence of ``[en, ch_gr, ch_pd]`` triples; ``ch_gr`` is the
            reference token list and ``ch_pd`` the predicted token list.
        n: maximum number of records to return.
        upper_bound: records whose BLEU exceeds this value are excluded.

    Returns:
        The qualifying records sorted by BLEU, best first. Fewer than ``n``
        records are returned when fewer qualify (instead of raising
        IndexError); ties keep their original relative order.
    """
    scored = []

    for record in records:
        _, ch_gr, ch_pd = record
        # calculate BLEU by nltk
        bleu = nltk.translate.bleu_score.sentence_bleu([ch_gr], ch_pd)
        if bleu <= upper_bound:
            scored.append((bleu, record))

    scored.sort(key=lambda pair: pair[0], reverse=True) # sort by BLEU score

    return [record for _, record in scored[:max(n, 0)]]

In [0]:
import random as rd

records = []  # [english tokens, reference tokens, predicted tokens] triples

for _ in range(10):
    batch_id = rd.randint(0, batch_num-1) # random pick one to translate

    x, y, g, w = batch.get(batch_id)
    out = model.output(x, y)  # teacher-forced outputs; not used further in this cell
    pd = model.predict(x, ch_vocab['<BEG>'])

    for _ in range(10):
        col = rd.randint(0, BATCH_SIZE-1)  # random sample within the batch

        # Batches are time-major: gather sample `col` across all time steps,
        # then cut each sentence at its first terminator token if present.
        en = [en_rev[x[t][col]] for t in range(en_max_len)]
        if '<PAD>' in en:  # guarded like the '<END>' lookups below
            en = en[:en.index('<PAD>')]
        ch_gr = [ch_rev[g[t][col]] for t in range(ch_max_len)]
        if '<END>' in ch_gr:
            ch_gr = ch_gr[:ch_gr.index('<END>')]
        # predicted word at each step = highest-scoring entry of that step's output
        ch_pd = [ch_rev[np.argmax(pd[t][col, :])] for t in range(ch_max_len)]
        if '<END>' in ch_pd:
            ch_pd = ch_pd[:ch_pd.index('<END>')]

        records.append([en, ch_gr, ch_pd])

n = 12 # how many result we show
rec_cherry = cherry_pick(records, n)

# Legend: each result below is printed as these three lines, in this order.
print("Encoder input")
print("Ground truth")
print("Decoder output")
print()

for picked in rec_cherry[:n]:
    for tokens in picked[:3]:
        print(' '.join(tokens))

    print('')