## 1. 数据预处理

### 查看数据

In [1]:
%%bash

# Print the line count of every corpus file in the parent directory.
for i in ../*txt
do
    # Quote the path so names containing spaces or glob characters are not word-split.
    wc -l "$i"
done

14871 ../TED_en_test.txt
300000 ../TED_en_train.txt
14871 ../TED_zh_test.txt
300000 ../TED_zh_train.txt


train 文件足足 30 万行，所以 shuffle 一部分行数查看编码

In [2]:
%%bash

# Guess each file's encoding from a random sample of 50 lines.
for i in ../*txt
do
    # Quote the path so names containing spaces or glob characters are not word-split.
    echo "$i"
    shuf -n 50 "$i" | chardet
done

../TED_en_test.txt
<stdin>: ascii with confidence 1.0
../TED_en_train.txt
<stdin>: ascii with confidence 1.0
../TED_zh_test.txt
<stdin>: utf-8 with confidence 0.99
../TED_zh_train.txt
<stdin>: utf-8 with confidence 0.99


└ 英文数据是 ASCII 编码，中文数据是 UTF-8 编码，Python3 两者都支持，所以不需用 `iconv` 转码了

In [3]:
%%bash

# Show the first two lines of every corpus file under a small header.
for i in ../*txt
do
    # Quote the path so names containing spaces or glob characters are not word-split.
    echo "$i"
    echo '---------------'
    head -n 2 "$i"
    echo ''
done

../TED_en_test.txt
---------------
We could use sales, anything you like.
There it is: after some little fluctuations at the beginning,

../TED_en_train.txt
---------------
It can be a very complicated thing, the ocean.
And it can be a very complicated thing, what human health is.

../TED_zh_test.txt
---------------
我们还可以用销售量 什么都行
看 当公司进行革新

../TED_zh_train.txt
---------------
海洋是一个非常复杂的事物。
人类的健康也是一件非常复杂的事情。



└ 英文句子中的标点符号，在中文译文里有时对应的是空格（见上面 test 文件的样例）。为了方便翻译，可以先把中英文的标点都去除，以提升文字翻译的准确性

### 数据清理

先创建个小数据文件方便测试

In [4]:
%%bash

# Copy the first 50 lines of every corpus file into ../data/tst_<name> for quick experiments.
mkdir -p ../data && \
for i in ../*txt
do
    # ${i#../} strips the leading "../"; the source path is quoted to avoid word-splitting.
    head -n 50 "$i" > "../data/tst_${i#../}"
done

In [5]:
!ls ../data

tst_TED_en_test.txt   tst_TED_zh_test.txt
tst_TED_en_train.txt  tst_TED_zh_train.txt


读取数据

In [6]:
import os
import itertools
import operator
import string
from collections import deque, Counter

import jieba
import numpy as np
import tensorflow as tf
import zhon.hanzi as zh

from tensorflow.contrib.legacy_seq2seq import basic_rnn_seq2seq, embedding_rnn_seq2seq, sequence_loss
from tensorflow.python.ops import variable_scope

`exec` 可以执行由字符串拼接出来的赋值语句，从而把 str 动态地变成变量名（variable name）

In [7]:
# Load the four sample corpora into module-level lists of lines named
# en_train, en_test, zh_train and zh_test.
basepath = '../data'
for s in ('en_train', 'en_test', 'zh_train', 'zh_test'):
    path = os.path.join(basepath, 'tst_TED_' + s + '.txt')
    # Decode explicitly as UTF-8 (a superset of the ASCII English files) so the
    # Chinese text loads correctly even when the platform default encoding differs.
    with open(path, 'r', encoding='utf-8') as file:
        # exec runs the assignment "<name> = file.read().splitlines()" in this cell's scope.
        exec(s + " = file.read().splitlines()")

中文数据先做分词

In [8]:
def cut_chinese_data(sequences):
    """Lazily segment Chinese sentences into space-separated words.

    This is a generator: the jieba configuration below only runs when the
    first item is requested, and it runs again for every new generator.

    Args:
        sequences: iterable of sentence strings.

    Yields:
        One string per sentence, with the jieba segments joined by single spaces.
    """
    jieba.setLogLevel(20)
    jieba.enable_parallel(4)
    yield from (' '.join(jieba.cut(sentence)) for sentence in sequences)

In [9]:
def clean_data(sequences):
    """Remove punctuation from each sentence and split it into tokens.

    Args:
        sequences: iterable of sentence strings.

    Returns:
        A list with one list of whitespace-separated tokens per sentence.
        Every character found in ``string.punctuation`` or ``zh.punctuation``
        is deleted before splitting.
    """
    # One deletion table covering both the ASCII and the Chinese punctuation sets.
    drop_punctuation = str.maketrans('', '', string.punctuation + zh.punctuation)
    # A list (not a generator) is returned because the cleaned data is reused several times.
    return [sentence.translate(drop_punctuation).split() for sentence in sequences]

### 数据转换

In [10]:
# Segment the Chinese sentences into space-separated words (lazy generators).
zh_train = cut_chinese_data(zh_train)
zh_test = cut_chinese_data(zh_test)

# Strip punctuation and tokenise every split into a list of token lists.
# NOTE: the names are rebound in place, so re-run the data-loading cell before
# re-running this one.
zh_train = clean_data(zh_train)
en_train = clean_data(en_train)
zh_test = clean_data(zh_test)
en_test = clean_data(en_test)

data = itertools.chain(zh_train, en_train, zh_test, en_test)  # Concat all data
# Materialise the flattened word stream: a bare iterator would be exhausted by
# its first consumer, so re-running the vocabulary cell would silently see no words.
words = list(itertools.chain.from_iterable(data))  # Flat data

In [11]:
def get_common_words(words, n):
    """Count words and keep those that occur more than ``n`` times.

    Args:
        words: iterable of hashable tokens.
        n: exclusive lower bound on the occurrence count.

    Returns:
        A list of ``(word, count)`` tuples ordered by descending count; words
        with equal counts keep the order in which they were first seen.
    """
    ranked = Counter(words).most_common()
    return [(word, freq) for word, freq in ranked if freq > n]

In [12]:
def build_dict(word_counts):
    """Build the forward and reverse vocabulary lookups.

    Index 0 is reserved for ``'<UNK>'``; the words follow in the order given,
    and ``'<GO>'`` then ``'<EOS>'`` take the next two free indices.

    Args:
        word_counts: iterable of ``(word, count)`` pairs; the counts are ignored.

    Returns:
        ``(vocab2ix, ix2vocab)``: a word -> index dict and its inverse.
    """
    tokens = ['<UNK>'] + [word for word, _ in word_counts]
    vocab2ix = {}
    for position, token in enumerate(tokens):
        vocab2ix[token] = position
    # The special markers are appended after the largest index assigned so far.
    for marker in ('<GO>', '<EOS>'):
        vocab2ix[marker] = max(vocab2ix.values()) + 1
    ix2vocab = {index: token for token, index in vocab2ix.items()}
    return vocab2ix, ix2vocab

In [13]:
def word_to_number(sequences, word_dict, unknown_index=0):
    """Map every word of every sequence to its vocabulary index.

    Args:
        sequences: iterable of word sequences.
        word_dict: mapping word -> index.
        unknown_index: index used for words missing from ``word_dict``
            (defaults to 0, the index ``build_dict`` assigns to '<UNK>').

    Returns:
        A list with one list of indices per input sequence.
    """
    data = []
    for sequence in sequences:
        sequence_data = []
        for word in sequence:
            try:
                sequence_data.append(word_dict[word])
            except KeyError:
                # Only a missing word is mapped to the unknown index; any other
                # error (for example an unhashable token) is a real bug and propagates.
                sequence_data.append(unknown_index)
        data.append(sequence_data)
    return data

In [14]:
def append_go_eos(nested_list):
    """Return copies of the sequences wrapped in '<GO>' ... '<EOS>' markers.

    Args:
        nested_list: iterable of token sequences.

    Returns:
        A new list of lists; the input sequences are not modified.
    """
    # Built eagerly as a list so the result can be iterated more than once.
    return [['<GO>'] + list(tokens) + ['<EOS>'] for tokens in nested_list]

In [15]:
# A threshold of 0 keeps every word that occurs at least once, so nothing is dropped.
word_counts = get_common_words(words, 0)
# Forward (word -> index) and reverse (index -> word) lookup tables.
vocab2ix, ix2vocab = build_dict(word_counts)

In [16]:
# Wrap every Chinese sentence in <GO> ... <EOS> markers.
# The names are rebound, so re-running this cell adds the markers a second time.
zh_train = append_go_eos(zh_train)
zh_test  = append_go_eos(zh_test)

In [17]:
def seq2seq_pad(encoder_inputs, encoder_length, decoder_inputs, decoder_length, vocab, pad_symbol='<UNK>'):
    """Convert token sequences into fixed-length index sequences for a seq2seq model.

    Encoder sequences are padded on the left and decoder sequences on the
    right. Sequences longer than the requested length are truncated at the end.
    Tokens missing from ``vocab`` map to the index of '<UNK>' (or to the pad
    index when ``vocab`` has no '<UNK>' entry) instead of raising ``KeyError``.

    Args:
        encoder_inputs: nested list of symbol strings for encoding, one per example.
        encoder_length: fixed length of every encoder sequence.
        decoder_inputs: nested list of symbol strings for decoding, one per example.
        decoder_length: fixed length of every decoder sequence.
        vocab: vocabulary index, symbol (str) -> index (int).
        pad_symbol: symbol whose index is used for padding; it must be in ``vocab``.

    Returns:
        ``(encoder_indices, decoder_indices)``, two nested lists of ints.

    Raises:
        KeyError: if ``pad_symbol`` is not in ``vocab``.

    Example (assuming the index of ``pad_symbol`` is 0):
        seq2seq_pad([['hello', 'world'], ['cover', 'me']], 4,
                    [['hi', '<EOS>'], ['roger', '<EOS>']], 4, vocab)

        returns

        [[0, 0, <index of 'hello'>, <index of 'world'>], [0, 0, <index of 'cover'>, <index of 'me'>]],
        [[<index of 'hi'>, <index of '<EOS>'>, 0, 0], [<index of 'roger'>, <index of '<EOS>'>, 0, 0]]
    """
    pad_index = vocab[pad_symbol]
    unknown_index = vocab.get('<UNK>', pad_index)

    def to_index(inputs, length, pad_from_start=True):
        # Truncate first, then add padding on the requested side.
        rows = []
        for cur_input in inputs:
            indices = [vocab.get(token, unknown_index) for token in cur_input[:length]]
            padding = [pad_index] * (length - len(indices))
            rows.append(padding + indices if pad_from_start else indices + padding)
        return rows

    return to_index(encoder_inputs, encoder_length, True), to_index(decoder_inputs, decoder_length, False)

## 2. 构建模型

In [18]:
# Print the longest sentence length (in tokens) of each split; the training
# maxima are used below as the encoder/decoder lengths.
for i, j in zip('en_train zh_train en_test zh_test'.split(),
                (en_train, zh_train, en_test, zh_test)):
    print(i, max(len(x) for x in j))

en_train 13
zh_train 14
en_test 13
zh_test 12


In [19]:
# Start from an empty default graph before the model is defined in the next cell.
tf.reset_default_graph()

In [20]:
# Sequence lengths come from the longest tokenised training sentence of each
# language (zh_train already includes the <GO>/<EOS> markers at this point).
encoder_length = max(len(x) for x in en_train)
decoder_length = max(len(x) for x in zh_train)

# Hyper-parameters. One shared vocabulary serves both languages, so the
# encoder and decoder symbol counts are the same.
cell = tf.contrib.rnn.BasicRNNCell(128)
num_encoder_symbols = len(vocab2ix)
num_decoder_symbols = len(vocab2ix)
batch_size = 5
embedding_size = 128
epochs = 50
print_loss_every = 5


# One placeholder per time step; shape [None] presumably leaves the batch
# dimension free -- TODO confirm against the feeding code.
encoder_placeholders = [tf.placeholder(tf.int32, shape=[None],
                                       name="encoder_%d" % i) for i in range(encoder_length)]
decoder_placeholders = [tf.placeholder(tf.int32, shape=[None],
                                       name="decoder_%d" % i) for i in range(decoder_length)]
target_placeholders = [tf.placeholder(tf.int32, shape=[None],
                                       name="target_%d" % i) for i in range(decoder_length)]
target_weights_placeholders = [tf.placeholder(tf.float32, shape=[None],
                                       name="decoder_weight_%d" % i) for i in range(decoder_length)]
# Training graph. NOTE(review): feed_previous=False should mean the decoder reads
# the supplied decoder inputs at every step -- confirm in the legacy_seq2seq docs.
outputs, states = embedding_rnn_seq2seq(
    encoder_placeholders, decoder_placeholders, cell,
    num_encoder_symbols, num_decoder_symbols,
    embedding_size, output_projection=None,
    feed_previous=False)

# Weighted sequence loss over the decoder outputs, minimised with Adam (learning rate 0.01).
loss = sequence_loss(outputs, target_placeholders, target_weights_placeholders)
train_step = tf.train.AdamOptimizer(0.01).minimize(loss)

**数据准备步骤：**

1. 先做 padding
2. 通过 batch size 获取 mini-batch data
3. left_shift 
4. 获取 feed_dict

In [21]:
def get_batch_data(en, zh, batch_size):
    """Draw a random mini-batch of aligned rows from two parallel datasets.

    Args:
        en: array-like of encoder rows.
        zh: array-like of decoder rows, aligned with ``en``.
        batch_size: number of rows to draw.

    Returns:
        ``(en_batch, zh_batch, idx)``: the selected rows of both datasets and
        the integer row indices that were drawn (the same index may repeat).
    """
    en_array, zh_array = np.asarray(en), np.asarray(zh)
    idx = np.random.randint(en_array.shape[0], size=batch_size)
    return en_array[idx], zh_array[idx], idx

In [22]:
def left_shift(decoder_inputs, pad_idx):
    """Build decoder targets by shifting every row one position to the left.

    The first element of each row is dropped and ``pad_idx`` is appended, so
    every returned row has the same length as its input row.

    Args:
        decoder_inputs: iterable of sliceable index rows.
        pad_idx: value appended at the end of each shifted row.

    Returns:
        A list of lists containing the shifted rows.
    """
    shifted = []
    for row in decoder_inputs:
        target = list(row[1:])
        target.append(pad_idx)
        shifted.append(target)
    return shifted

In [23]:
def get_feed_dict(encoder_inputs, decoder_inputs):
    """Build the feed dict for one batch of padded index sequences.

    The batch-major inputs are regrouped per time step, because the model has
    one placeholder per step. Targets are the decoder inputs shifted left by
    one position. A target weight is 0.0 wherever the target equals the index
    of '<UNK>' (which is also the padding index here) and 1.0 elsewhere.

    Args:
        encoder_inputs: batch of padded encoder index rows.
        decoder_inputs: batch of padded decoder index rows.

    Returns:
        A dict mapping placeholder names to numpy arrays, one entry per step
        for the encoder, decoder, target and target-weight placeholders.
    """
    pad_idx = vocab2ix['<UNK>']
    # zip(*rows) regroups the batch so that each tuple holds one time step.
    encoder_steps = list(zip(*encoder_inputs))
    target_steps = list(zip(*left_shift(decoder_inputs, pad_idx)))
    decoder_steps = list(zip(*decoder_inputs))

    feed_dict = {}
    # Keys are placeholder names; the placeholder objects themselves would work as well.
    for step, placeholder in enumerate(encoder_placeholders):
        feed_dict[placeholder.name] = np.asarray(encoder_steps[step], dtype=int)
    for step, decoder_placeholder in enumerate(decoder_placeholders):
        step_targets = target_steps[step]
        feed_dict[decoder_placeholder.name] = np.asarray(decoder_steps[step], dtype=int)
        feed_dict[target_placeholders[step].name] = np.asarray(step_targets, dtype=int)
        # Zero weights remove the padded positions from the loss.
        step_weights = [float(target != pad_idx) for target in step_targets]
        feed_dict[target_weights_placeholders[step].name] = np.asarray(step_weights, dtype=float)
    return feed_dict

In [24]:
# Pad or truncate both splits to the lengths derived from the training data.
en_train_data, zh_train_data = seq2seq_pad(en_train, encoder_length,
                                           zh_train, decoder_length, vocab2ix)

# The decoder side of the test set must be the Chinese sentences (zh_test).
# Passing en_test here would make the test targets English.
en_test_data, zh_test_data = seq2seq_pad(en_test, encoder_length,
                                         zh_test, decoder_length, vocab2ix)

# For test prediction
en_test_batch_data, zh_test_batch_data, idx = get_batch_data(en_test_data, zh_test_data, batch_size=5)
test_feed_dict = get_feed_dict(en_test_batch_data, zh_test_batch_data)

In [25]:
def get_train_batch_input_target(idx):
    """Return the readable source and target sentences of a training batch.

    Args:
        idx: iterable of integer row indices into ``en_train`` / ``zh_train``.

    Returns:
        ``(inputs, targets)``: two lists of space-joined sentences. The targets
        have their leading <GO> and trailing <EOS> markers removed.
    """
    # Index the Python lists directly: the sentences have different lengths, and
    # converting such ragged nested lists with np.asarray is deprecated and is
    # rejected by newer NumPy releases.
    test_target_list = [' '.join(zh_train[i][1:-1]) for i in idx]  # Remove <GO> and <EOS>
    test_input_list = [' '.join(en_train[i]) for i in idx]
    return test_input_list, test_target_list

In [27]:
def get_test_batch_input_target(idx):
    """Return the readable source and target sentences of a test batch.

    Args:
        idx: iterable of integer row indices into ``en_test`` / ``zh_test``.

    Returns:
        ``(inputs, targets)``: two lists of space-joined sentences. The targets
        have their leading <GO> and trailing <EOS> markers removed.
    """
    # Index the Python lists directly: the sentences have different lengths, and
    # converting such ragged nested lists with np.asarray is deprecated and is
    # rejected by newer NumPy releases.
    test_target_list = [' '.join(zh_test[i][1:-1]) for i in idx]  # Remove <GO> and <EOS>
    test_input_list = [' '.join(en_test[i]) for i in idx]
    return test_input_list, test_target_list

In [28]:
def get_out_sequences(outputs_list):
    """Turn per-step predicted indices into readable sentences.

    Args:
        outputs_list: one sequence of predicted word indices per decoding
            step, each holding one entry per batch example.

    Returns:
        A list with one space-joined sentence per batch example, cut off just
        before the first '<EOS>' token.
    """
    decoded = []
    # Transposing turns the step-major layout into one row per example.
    for token_ids in np.asarray(outputs_list).T:
        tokens = [ix2vocab[token_id] for token_id in token_ids]
        # Drop '<EOS>' and everything after it.
        kept = itertools.takewhile(lambda token: token != '<EOS>', tokens)
        decoded.append(' '.join(kept))
    return decoded

In [29]:
# Train on random mini-batches, then greedily decode the last training batch.
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # The padded training data was already computed above; reuse it instead of
    # padding the whole corpus a second time (the names are kept for later cells).
    encoder_inputs, decoder_inputs = en_train_data, zh_train_data
    for step in range(epochs):
        # Keep the sampled indices under a real name: assigning to `_` would
        # shadow IPython's "last result" variable.
        en_train_batch_data, zh_train_batch_data, batch_idx = get_batch_data(
            en_train_data, zh_train_data, batch_size)
        feed_dict = get_feed_dict(en_train_batch_data, zh_train_batch_data)
        sess.run(train_step, feed_dict)
        if step % print_loss_every == 0:
            print('After {:4d} steps, cost is {:.4f}'.format(step, sess.run(loss, feed_dict)))

    print("\n---Deocding---\n")

    # Decoding: rebuild the seq2seq graph on the same variables (reuse=True)
    # with feed_previous=True.
    with variable_scope.variable_scope(variable_scope.get_variable_scope(), reuse=True):
        outputs, states = embedding_rnn_seq2seq(
            encoder_placeholders, decoder_placeholders, cell,
            num_encoder_symbols, num_decoder_symbols,
            embedding_size, output_projection=None,
            feed_previous=True)

        # Fetch all decoder steps in a single run (a feed_dict is still required);
        # evaluating each step tensor separately would re-run the graph once per step.
        step_logits = sess.run(outputs, feed_dict)
        outputs_list = [np.argmax(logits, axis=1) for logits in step_logits]

        outputs_ = get_out_sequences(outputs_list)
        inputs_, targets_ = get_train_batch_input_target(batch_idx)
        for input_text, output_text, target_text in zip(inputs_, outputs_, targets_):
            print('Input:', input_text)
            print('Target:', target_text)
            print('Output:', output_text)
            print('-=-' * 10)

After    0 steps, cost is 5.5506
After    5 steps, cost is 4.7260
After   10 steps, cost is 4.7547
After   15 steps, cost is 5.1344
After   20 steps, cost is 3.5311
After   25 steps, cost is 3.7506
After   30 steps, cost is 2.5597
After   35 steps, cost is 2.2508
After   40 steps, cost is 3.2460
After   45 steps, cost is 1.8039

---Deocding---

Input: You smell money
Target: 你 闻到 的 是 钱 的 气味
Output: 你 闻到 的 是 钱 的 气味
-=--=--=--=--=--=--=--=--=--=-
Input: And bringing those two together might seem a very daunting task
Target: 将 两者 统一 起来 看起来 是 一件 艰巨 的 任务
Output: 将 两者 还 不再 健康
-=--=--=--=--=--=--=--=--=--=-
Input: And were making the ocean pretty unhappy in a lot of different ways
Target: 我们 正在 通过 许多 不同 的 方法 惹怒 海洋
Output: 我们 正在 通过 浮沫
-=--=--=--=--=--=--=--=--=--=-
Input: and absorbed it into their skin and into their bodies
Target: 并 把 污染 吸入 了 它们 的 皮肤 和 身体
Output: 在 西海岸 最大 的
-=--=--=--=--=--=--=--=--=--=-
Input: The pyramid of ocean life
Target: 海洋生物 的 食物链
Output: 海洋生物 的 是
-=--=--=--=--=--=--

└ 对比 Input、Target、Output 三者：只训练了 50 步，模型已经能复现部分训练句子（第一句完全一致，其余几句的开头也基本正确），说明在训练集上训练和解码的流程是没问题的；不过多数输出与 Target 仍有差距，需要更多的训练步数