107062566 Yu-Cheng Huang, requires tqdm and the latest tensorflow

In [5]:
import re
import os
from collections import defaultdict
from tqdm import tqdm

import numpy as np
import tensorflow as tf

os.environ["CUDA_VISIBLE_DEVICES"] = '1'

In [6]:
BATCH_SIZE = 256
MAX_LENGTH = 10


class Corpus:
    def __init__(self):
        self.word2cnt = defaultdict(int)
        self.word2idx = dict()
        self.idx2word = dict()
        self.special = ['SOS', 'EOS', 'UKN', 'PAD']
        
    def __len__(self):
        return len(self.word2idx)

    def add_word(self, word):
        self.word2cnt[word] += 1

    def add_words(self, words):
        for word in words:
            self.add_word(word)

    def freeze(self):
        rare_words = [word for word, cnt in self.word2cnt.items() if cnt < 5]
        for word in rare_words:
            del self.word2cnt[word]
            
        self.word2idx = dict()
        self.idx2word = dict()
        freq_words = self.special + list(self.word2cnt.keys())
        for idx, word in enumerate(freq_words):
            self.word2idx[word] = idx
            self.idx2word[idx] = word
    
    def transform(self, tokens):
        ukn = self.word2idx['UKN']
        pad = self.word2idx['PAD']
        result = [self.word2idx.get(t, ukn) for t in tokens]
        n_need = MAX_LENGTH - len(result)
        if n_need > 0:
            result.extend([pad] * n_need)
        return result
    
    def restore(self, idxs):
        tokens = [self.idx2word[idx] for idx in idxs]
        return tokens

In [7]:
regex = re.compile(r'[-:;,.?!\'\"+*/]*')
line_data = dict() # line_id -> tokens
with open('./data/movie_lines.txt', errors='ignore') as f:
    data = f.read().split('\n')
    for line in tqdm(data):
        line = line.split(' ')
        line_id = line[0]
        tokens = [t for t in line[8:] if t]
        tokens = [regex.sub('', t.lower()) for t in tokens]
        line_data[line_id] = tokens

corpus = Corpus()
ques = []
anss = []
with open('./data/movie_conversations.txt', errors='ignore') as f:
    data = f.read().split('\n')
    for line in tqdm(data):
        idxs = re.findall(r'L\d+', line)
        for que_idx, ans_idx in zip(idxs[:-1], idxs[1:]):
            que = line_data[que_idx]
            ans = line_data[ans_idx]
            if len(que) < MAX_LENGTH - 1 and len(ans) < MAX_LENGTH - 1:
                corpus.add_words(que)
                corpus.add_words(ans)
                ques.append(que)
                anss.append(ans)

corpus.freeze()

100%|██████████| 304714/304714 [00:03<00:00, 86463.98it/s]
100%|██████████| 83098/83098 [00:00<00:00, 197302.31it/s]


In [8]:
encoder_inp = []
decoder_inp = []
decoder_true = []
decoder_mask = []
for que, ans in zip(ques, anss):
    encoder_inp.append(np.int32(corpus.transform(que + ['EOS'])))
    decoder_inp.append(np.int32(corpus.transform(['SOS'] + ans)))
    decoder_true.append(np.int32(corpus.transform(ans + ['EOS'])))
    decoder_mask.append(np.float32([1 if t >= 4 else 0 for t in decoder_true[-1]]))

encoder_inp = np.stack(encoder_inp, axis=0)
decoder_inp = np.stack(decoder_inp, axis=0)
decoder_true = np.stack(decoder_true, axis=0)             
decoder_mask = np.stack(decoder_mask, axis=0) 

def batchify(arr):
    n_part = len(arr) // BATCH_SIZE
    arr = np.split(arr[:n_part * BATCH_SIZE], n_part, axis=0)
    arr = np.stack(arr, axis=0)
    return arr

def transpose(arr):
    return np.transpose(arr, (0, 2, 1))
                
encoder_inp = transpose(batchify(encoder_inp))
decoder_inp = transpose(batchify(decoder_inp))
decoder_true = transpose(batchify(decoder_true))
decoder_mask = transpose(batchify(decoder_mask))

# [n_part, seq_len, batch_size]
print(encoder_inp.shape, encoder_inp.dtype)
print(decoder_inp.shape, decoder_inp.dtype)
print(decoder_true.shape, decoder_true.dtype)
print(decoder_mask.shape, decoder_mask.dtype)
print(len(corpus))

(307, 10, 256) int32
(307, 10, 256) int32
(307, 10, 256) int32
(307, 10, 256) float32
6205


In [9]:
class Seq2Seq:
    def __init__(self):
        dict_sz = len(corpus)
        
        with tf.variable_scope('var'):
            self.enc_inp = [tf.placeholder(tf.int32, [None]) for _ in range(MAX_LENGTH)]
            self.dec_inp = [tf.placeholder(tf.int32, [None]) for _ in range(MAX_LENGTH)]
            self.dec_true = [tf.placeholder(tf.int32, [None]) for _ in range(MAX_LENGTH)]
            self.dec_mask = [tf.placeholder(tf.float32, [None]) for _ in range(MAX_LENGTH)]
        
        with tf.variable_scope('rnn'):
            self.rnn_cell_tf = tf.contrib.rnn.LSTMCell(512)
            self.dec_pred_tf, _ = tf.contrib.legacy_seq2seq.embedding_attention_seq2seq(
                self.enc_inp, self.dec_inp, self.rnn_cell_tf, dict_sz, dict_sz, 300
            )
            
        with tf.variable_scope('rnn', reuse=True):
            self.rnn_cell = tf.contrib.rnn.LSTMCell(512, reuse=True)
            self.dec_pred, _ = tf.contrib.legacy_seq2seq.embedding_attention_seq2seq(
                self.enc_inp, self.dec_inp, self.rnn_cell, dict_sz, dict_sz, 300, feed_previous=True
            )
            
        with tf.variable_scope('loss'):
            self.loss = tf.reduce_mean(tf.contrib.legacy_seq2seq.sequence_loss_by_example(
                self.dec_pred_tf, self.dec_true, self.dec_mask
            ))
            self.optimizer = tf.train.AdamOptimizer(0.002).minimize(self.loss)
            
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=config)
        self.saver = tf.train.Saver()
        self.sess.run(tf.global_variables_initializer())
        
    def train_step(self, encoder_inp, decoder_inp, decoder_true, decoder_mask):
        feed_dict = dict()
        for i in range(MAX_LENGTH):
            feed_dict[self.enc_inp[i]] = encoder_inp[i]
            feed_dict[self.dec_inp[i]] = decoder_inp[i]
            feed_dict[self.dec_true[i]] = decoder_true[i]
            feed_dict[self.dec_mask[i]] = decoder_mask[i]
        loss, _ = self.sess.run([self.loss, self.optimizer], feed_dict)
        return loss
    
    def predict(self, encoder_inp):
        feed_dict = dict()
        for i in range(MAX_LENGTH):
            feed_dict[self.enc_inp[i]] = encoder_inp[i]
            if i == 0:
                feed_dict[self.dec_inp[i]] = np.full(encoder_inp.shape[1], corpus.word2idx['SOS'])
            else:
                feed_dict[self.dec_inp[i]] = np.zeros(encoder_inp.shape[1])
                
        pred = self.sess.run(self.dec_pred, feed_dict)
        pred = np.float32(pred) # [seq_len, batch_sizes, dict_sz]
        pred = np.argmax(pred, axis=2)
        
        return pred
    
    def save(self, e):
        self.saver.save(self.sess, 'data/rnn_%d.ckpt'%(e+1))
    
    def restore(self, e):
        self.saver.restore(self.sess, 'data/rnn_%d.ckpt'%(e))
    
tf.reset_default_graph()
model = Seq2Seq()

In [10]:
# # This cell is executed in tmux for long training time
# for epoch in range(50):
#     n_step = len(encoder_inp)
    
#     indices = np.random.permutation(n_step)
#     encoder_inp = encoder_inp[indices]
#     decoder_inp = decoder_inp[indices]
#     decoder_true = decoder_true[indices]
#     decoder_mask = decoder_mask[indices]
    
#     avg_loss = 0.0
#     with tqdm(total=n_step, desc=f'Epoch {epoch:02d}') as pbar:
#         for i in range(n_step):
#             loss = model.train_step(encoder_inp[i], decoder_inp[i], decoder_true[i], decoder_mask[i])
#             avg_loss = (avg_loss * i + loss) / (i + 1)
#             pbar.set_postfix(loss=avg_loss)
#             pbar.update(1)
    
#     model.save(epoch)

In [12]:
model.restore(98)

ques = [
    ['hello', 'EOS'],
    ['how', 'are', 'you', 'EOS'],
    ['where', 'are', 'you', 'going', 'EOS'],
    ['you', 'look', 'great', 'EOS'],
    ['good', 'night', 'EOS'],
]
ques_nd = np.int32(list(map(corpus.transform, ques))).transpose()
anss_nd = model.predict(np.int32(ques_nd)).transpose()
anss = list(map(corpus.restore, anss_nd))
for que, ans in zip(ques, anss):
    print('>', ' '.join(que[:-1]))
    print('<', ' '.join(ans))

INFO:tensorflow:Restoring parameters from data/rnn_98.ckpt
> hello
< hi mike its frank  me in the bathroom where
> how are you
< fine watch well im feelin for you thats a better
> where are you going
< nowhere you were fucking around bed inn im leaving from
> you look great
< thanks for calling the tv lights here calling me back
> good night
< sleep tight its friday a good land mine land might


# Report

資料前處理的部份我使用 regular expression 來抽取文字，然後使用 np.split 來分切成 batch。我覺得我寫了一個 corpus 的 class 做得非常好，簡化了很多邏輯，程式碼看起來就很好看~

而結果的部份，model 回答的前半部份都是合理的，但之後就生成奇怪的單詞，我想這是 teacher forcing 造成的。我研究了一些網路上其他人實現的 seq2seq，發現所有人都是「機率性」地使用 teacher forcing（0.5 的機率使用，0.5 的機率使用前一個 hidden state），他們並沒有像 TA 的程式一樣「全部」使用 teacher forceing。我想這是造成 model 回答不甚理想的原因之一。

同時，另一個觀察到的現像為：model 不會輸出 EOS，即使訓練資料（`decode_true`）是有包含 EOS 的。我不確定為什麼會這樣，不過這是目前我訓練出來最好的結果了。

心得：tensorflow 的 rnn 怎麼這麼吃 cpu 啊…