In [1]:
import tensorflow as tf

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split

import unicodedata
import re
import numpy as np
import os
import io
import time

In [2]:
# Corpus file read by create_dataset: one sentence pair per line, tab-separated,
# English in the first field and Chinese in the second. The path is relative to
# the notebook's working directory.
path = './datasets/cmn-eng/cmn.txt'

# 一、文本预处理：中英文不同

In [None]:
def preprocess_eng(w):
    """Normalize an English sentence and wrap it in ``<start>`` / ``<end>`` markers.

    The text is lower-cased, the punctuation marks ``? . ! , ¿`` are separated
    from neighbouring words by spaces, and every other character outside
    ``a-z`` / ``A-Z`` is replaced by a space.

    Args:
        w: Raw English sentence.

    Returns:
        ``'<start> ' + cleaned_text + ' <end>'``.
    """
    text = w.lower().strip()

    # Applied in this order:
    #   1. pad punctuation with spaces: "he is a boy." => "he is a boy . "
    #   2. collapse runs of spaces / double quotes into a single space
    #   3. turn every run of characters other than
    #      (a-z, A-Z, ".", "?", "!", ",", "¿") into a single space
    substitutions = (
        (r"([?.!,¿])", r" \1 "),
        (r'[" "]+', " "),
        (r"[^a-zA-Z?.!,¿]+", " "),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    # The markers tell the model where a prediction starts and where it stops.
    return '<start> ' + text.strip() + ' <end>'


def preprocess_ch(w):
    """Segment a Chinese sentence into space-separated tokens with sentence markers.

    The text is passed through ``zhconv.convert(w, 'zh-cn')``, split into
    tokens with ``jieba.cut`` and re-joined with single spaces.

    NOTE(review): ``zhconv`` and ``jieba`` are not imported anywhere in the
    visible part of this notebook. Unless they are imported elsewhere, this
    function raises ``NameError`` on a fresh kernel -- confirm and add the
    imports to the import cell.

    Args:
        w: Raw Chinese sentence.

    Returns:
        ``'<start> ' + segmented_text + ' <end>'``.
    """
    converted = zhconv.convert(w, 'zh-cn')
    tokens = jieba.cut(converted)
    segmented = ' '.join(tokens).strip()
    return '<start> ' + segmented + ' <end>'


def create_dataset(path, num_examples):
    """Read the parallel corpus and return preprocessed sentence lists.

    Each line of the file is split on tabs; the first field is treated as the
    English sentence and the second as the Chinese sentence. Any further
    fields on the line are ignored.

    Args:
        path: Path of the UTF-8 encoded, tab-separated corpus file.
        num_examples: Maximum number of lines to use; ``None`` uses all lines.

    Returns:
        ``[eng, ch]`` -- two parallel lists of preprocessed sentences.

    Raises:
        IndexError: If a used line contains no tab character.
    """
    # Use a context manager so the file handle is closed deterministically
    # instead of being left to the garbage collector.
    with io.open(path, encoding='utf-8') as corpus_file:
        lines = corpus_file.read().strip().split('\n')

    eng, ch = [], []
    for line in lines[:num_examples]:
        fields = line.split('\t')
        eng.append(preprocess_eng(fields[0]))
        ch.append(preprocess_ch(fields[1]))
    return [eng, ch]

In [None]:
# Preprocess the whole corpus (num_examples=None applies no limit) and print
# the last sentence pair as a quick sanity check of both preprocessors.
eng, ch = create_dataset(path, None)
print(eng[-1])
print(ch[-1])

# 二、文本向量化

In [None]:
def max_length(tensor):
    """Return the length of the longest sequence contained in ``tensor``.

    Args:
        tensor: Iterable of sized sequences (e.g. rows of a padded id matrix).

    Returns:
        The largest ``len(row)`` over all rows.

    Raises:
        ValueError: If ``tensor`` is empty.
    """
    return max(map(len, tensor))


def tokenize(lang):
    """Fit a Keras tokenizer on ``lang`` and return the padded id sequences.

    Args:
        lang: List of preprocessed, space-separated sentences.

    Returns:
        ``(tensor, lang_tokenizer)`` where ``tensor`` holds the id sequences
        padded at the end (``padding='post'``) to a common length, and
        ``lang_tokenizer`` is the fitted ``Tokenizer``.
    """
    # filters=' ' means only the space character is filtered, so the
    # '<start>' / '<end>' markers and punctuation tokens are kept.
    lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters=' ')
    lang_tokenizer.fit_on_texts(lang)

    sequences = lang_tokenizer.texts_to_sequences(lang)
    padded = tf.keras.preprocessing.sequence.pad_sequences(sequences,
                                                           padding='post')
    return padded, lang_tokenizer


def load_dataset(path, num_examples=None):
    """Load the corpus and vectorize both languages.

    Args:
        path: Path of the tab-separated corpus file.
        num_examples: Maximum number of sentence pairs; ``None`` uses all.

    Returns:
        ``(ch_tensor, eng_tensor, ch_tokenizer, eng_tokenizer)`` -- padded id
        matrices and fitted tokenizers, Chinese first.
    """
    eng_sentences, ch_sentences = create_dataset(path, num_examples)
    ch_ids, ch_tok = tokenize(ch_sentences)
    eng_ids, eng_tok = tokenize(eng_sentences)
    return ch_ids, eng_ids, ch_tok, eng_tok

In [None]:
# None applies no limit: every sentence pair in the file is vectorized.
# Set an integer here to work on a smaller subset.
num_examples = None
ch_tensor, eng_tensor, ch_tokenizer, eng_tokenizer = load_dataset(
    path, num_examples)

In [None]:
# 保存分词器，模型复用
# tokenizer_json = ch_tokenizer.to_json()
# with io.open('ch_tokenizer.json', 'w', encoding='utf-8') as f:
#     f.write(json.dumps(tokenizer_json, ensure_ascii=False))
#
# with open('ch_tokenizer.json') as f:
#     data = json.load(f)
#     tokenizer = tokenizer_from_json(data)

In [None]:
# Row lengths of the padded Chinese (input) and English (target) id matrices.
max_length_ch, max_length_eng = max_length(ch_tensor), max_length(eng_tensor)

# Hold out 20% of the sentence pairs for validation.
# NOTE(review): no random_state is passed and no seed is set in the visible
# cells, so the split presumably differs from run to run -- confirm whether
# that is intended.
ch_tensor_train, ch_tensor_val, eng_tensor_train, eng_tensor_val = train_test_split(
    ch_tensor, eng_tensor, test_size=0.2)

# Sizes of the four resulting arrays (train/val for each language).
print(len(ch_tensor_train), len(ch_tensor_val), len(eng_tensor_train),
      len(eng_tensor_val))

In [None]:
def convert(tokenizer, tensor):
    """Print the ``id ----> word`` mapping for every non-zero id in ``tensor``.

    Args:
        tokenizer: Object exposing an ``index_word`` mapping from id to word.
        tensor: Iterable of integer token ids; zeros are skipped.
    """
    non_zero_ids = (token_id for token_id in tensor if token_id != 0)
    for token_id in non_zero_ids:
        word = tokenizer.index_word[token_id]
        print("%d ----> %s" % (token_id, word))

In [None]:
# Show how the first training pair maps from token ids back to words:
# Chinese is the input language, English the target language.
print("Input Language; index to word mapping")
convert(ch_tokenizer, ch_tensor_train[0])
print()
print("Target Language; index to word mapping")
convert(eng_tokenizer, eng_tensor_train[0])

# 三、创建 tf.data 数据集

In [None]:
# Shuffle buffer as large as the training set.
BUFFER_SIZE = len(ch_tensor_train)
BATCH_SIZE = 64
# Number of full batches per epoch (the incomplete last batch is not counted).
steps_per_epoch = len(ch_tensor_train) // BATCH_SIZE
# Embedding dimension passed to both Encoder and Decoder.
embedding_size = 256
# GRU width passed to both Encoder and Decoder.
units = 1024
# Vocabulary sizes are len(word_index) + 1; the extra slot leaves room for
# id 0, which pad_sequences uses as the padding value.
vocab_inp_size = len(ch_tokenizer.word_index) + 1
vocab_targ_size = len(eng_tokenizer.word_index) + 1

In [None]:
# Build (input, target) pairs from the training arrays and shuffle them with
# a buffer of BUFFER_SIZE elements.
dataset = tf.data.Dataset.from_tensor_slices(
    (ch_tensor_train, eng_tensor_train)).shuffle(BUFFER_SIZE)
# drop_remainder=True discards a final batch smaller than BATCH_SIZE, so every
# batch has exactly BATCH_SIZE rows.
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

In [None]:
# Pull a single batch to inspect shapes; the same batch is reused below to
# smoke-test the encoder, attention layer and decoder.
example_inp_batch, example_targ_batch = next(iter(dataset))
print(example_inp_batch.shape, example_targ_batch.shape)

# 四、编码器和解码器模型

In [None]:
# 1. 编码器结构：

# 输入的中文数据，[batch_size, max_length_inp]
# ---> Embedding 层 ---> [batch_size, max_length_inp, embedding_dim]
# ---> GRU 层 ---> 输出状态 [batch_size, max_length_inp, enc_units]，
#                  隐藏状态 [batch_size, enc_units]

In [None]:
class Encoder(tf.keras.Model):
    """Embedding + GRU encoder for the input (Chinese) token ids.

    Args:
        vocab_size: Size of the input vocabulary (embedding rows).
        embedding_dim: Dimension of the token embeddings.
        enc_units: Number of GRU units.
        batch_size: Batch size used by ``initialize_hidden_state``.
    """

    def __init__(self, vocab_size, embedding_dim, enc_units, batch_size):
        super().__init__()
        self.batch_size = batch_size
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        # return_sequences gives the per-step outputs, return_state
        # additionally gives the final hidden state.
        self.gru = tf.keras.layers.GRU(
            self.enc_units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer='glorot_uniform',
        )

    def call(self, x, hidden):
        """Encode a batch of id sequences.

        Args:
            x: Input token ids.
            hidden: Initial GRU state.

        Returns:
            ``(output, state)`` -- the GRU output sequence and final state.
        """
        embedded = self.embedding(x)
        sequence_output, final_state = self.gru(embedded, initial_state=hidden)
        return sequence_output, final_state

    def initialize_hidden_state(self):
        """Return an all-zero state of shape ``(batch_size, enc_units)``."""
        state_shape = (self.batch_size, self.enc_units)
        return tf.zeros(state_shape)

In [None]:
# Instantiate the encoder and run it once on the example batch to check the
# output shapes. `encoder` is the model instance trained and checkpointed below.
encoder = Encoder(vocab_inp_size, embedding_size, units, BATCH_SIZE)
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_hidden = encoder(example_inp_batch, sample_hidden)
print('Encoder output shape: (batch size, sequence length, units) {}'.format(
    sample_output.shape))
print('Encoder Hidden state shape: (batch size, units) {}'.format(
    sample_hidden.shape))

In [None]:
# 2. 注意力机制
# values 为上一步的输出 [batch_size, max_length_inp, enc_units]
# query 为上一步的隐藏状态 [batch_size, enc_units]


In [None]:
class BahdanauAttention(tf.keras.layers.Layer):
    """Additive attention: ``score = V(tanh(W1(values) + W2(query)))``.

    Args:
        units: Width of the two projection layers ``W1`` and ``W2``.
    """

    def __init__(self, units):
        super().__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        """Compute the context vector and attention weights.

        Shapes below are the ones used in this notebook.

        Args:
            query: Hidden state, ``[batch_size, enc_units]``.
            values: Encoder outputs, ``[batch_size, max_length_inp, enc_units]``.

        Returns:
            ``(context_vector, attention_weights)`` with shapes
            ``[batch_size, enc_units]`` and ``[batch_size, max_length_inp, 1]``.
        """
        # Insert a time axis so the query broadcasts against every input
        # position: [batch_size, 1, enc_units]
        query_with_time_axis = tf.expand_dims(query, 1)

        # One unnormalized score per input position:
        # [batch_size, max_length_inp, 1]
        projected = tf.nn.tanh(
            self.W1(values) + self.W2(query_with_time_axis))
        score = self.V(projected)

        # Normalize over the input positions (axis 1):
        # [batch_size, max_length_inp, 1]
        attention_weights = tf.nn.softmax(score, axis=1)

        # Weighted sum of the encoder outputs over the input positions:
        # [batch_size, max_length_inp, enc_units] -> [batch_size, enc_units]
        context_vector = tf.reduce_sum(attention_weights * values, axis=1)

        return context_vector, attention_weights

In [None]:
# Smoke-test the attention layer on the encoder outputs of the example batch.
# The projection width 10 is only used for this shape check; the decoder
# builds its own attention layer.
attention_layer = BahdanauAttention(10)
attention_result, attention_weights = attention_layer(sample_hidden,
                                                      sample_output)

print("Attention result shape: (batch size, units) {}".format(
    attention_result.shape))
print("Attention weights shape: (batch_size, sequence_length, 1) {}".format(
    attention_weights.shape))

In [None]:
# 3. 解码器结构：


In [None]:
class Decoder(tf.keras.Model):
    """Single-step GRU decoder with Bahdanau attention over the encoder outputs.

    Args:
        vocab_size: Size of the target vocabulary (embedding rows and number
            of output logits).
        embedding_dim: Dimension of the token embeddings.
        dec_units: Number of GRU units; also the attention projection width.
        batch_size: Stored on the instance; not used inside ``call``.
    """

    def __init__(self, vocab_size, embedding_dim, dec_units, batch_size):
        super(Decoder, self).__init__()
        self.batch_size = batch_size
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.dec_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')
        self.fc = tf.keras.layers.Dense(vocab_size)
        self.attention = BahdanauAttention(self.dec_units)

    def call(self, x, hidden, enc_output):
        """Decode one target token per batch row.

        Shapes below are the ones used in this notebook.

        Args:
            x: Previous target token ids, ``[batch_size, 1]``.
            hidden: Previous decoder state (the encoder's final state on the
                first step), ``[batch_size, units]``. Because it seeds the
                GRU, its width must equal ``dec_units``.
            enc_output: Encoder outputs,
                ``[batch_size, max_length_inp, enc_units]``.

        Returns:
            ``(logits, state, attention_weights)`` -- vocabulary logits
            ``[batch_size, vocab_size]``, the new decoder state, and the
            attention weights ``[batch_size, max_length_inp, 1]``.
        """
        # Attend over the encoder outputs, using the previous state as query:
        # [batch_size, enc_units], [batch_size, max_length_inp, 1]
        context_vector, attention_weights = self.attention(hidden, enc_output)

        x = self.embedding(x)
        # Prepend the context vector to the token embedding:
        # [batch_size, 1, embedding_dim + enc_units]
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

        # Seed the GRU with the previous state. Without initial_state the GRU
        # starts from zeros on every call, so the decoder would carry no
        # recurrent memory between time steps.
        output, state = self.gru(x, initial_state=hidden)

        # Drop the length-1 time axis: [batch_size, dec_units]
        output = tf.reshape(output, (-1, output.shape[2]))
        x = self.fc(output)

        return x, state, attention_weights

In [None]:
# Instantiate the decoder and run one decoding step to check the output shape.
# `decoder` is the model instance trained and checkpointed below.
decoder = Decoder(vocab_targ_size, embedding_size, units, BATCH_SIZE)
# The dummy input's batch dimension must match sample_hidden / sample_output,
# which were produced with BATCH_SIZE rows, so use the constant rather than a
# literal 64.
sample_decoder_output, _, _ = decoder(tf.random.uniform((BATCH_SIZE, 1)),
                                      sample_hidden, sample_output)
print('Decoder output shape: (batch_size, vocab size) {}'.format(
    sample_decoder_output.shape))

In [None]:
# 4. 优化器及损失函数：

In [None]:
optimizer = tf.keras.optimizers.Adam()
# from_logits=True: the decoder's final Dense layer has no activation.
# reduction='none' keeps one loss value per example so that loss_function can
# zero out the padded positions before averaging.
loss_objects = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True,
                                                             reduction='none')


def loss_function(real, pred):
    """Cross-entropy for one decoding step with padded positions zeroed out.

    Args:
        real: Target token ids for this step; id 0 marks padding.
        pred: Logits predicted by the decoder for this step.

    Returns:
        Scalar mean of the masked per-example losses. The mean is taken over
        all examples, including those whose loss was masked to zero.
    """
    per_example_loss = loss_objects(real, pred)

    # 1 where the target is a real token, 0 where it is padding. Sequences
    # are padded with trailing zeros during vectorization.
    is_real_token = tf.math.not_equal(real, 0)
    weights = tf.cast(is_real_token, dtype=per_example_loss.dtype)

    return tf.reduce_mean(per_example_loss * weights)

In [None]:
# 5. 训练模型：

In [None]:
# Directory (relative to the notebook's working directory) where checkpoints
# are written and later restored from.
checkpoint_dir = '../H/save/zh2eng_attention'
# checkpoint_prefix = os.path.join(checkpoint_dir, "cpkt")
# Track the optimizer and both models so they are saved and restored together.
checkpoint = tf.train.Checkpoint(optimizer=optimizer, encoder=encoder,
                                 decoder=decoder)

# max_to_keep=3: the manager is configured to retain the three most recent
# checkpoints.
manager = tf.train.CheckpointManager(checkpoint, directory=checkpoint_dir,
                                     checkpoint_name='model.ckpt',
                                     max_to_keep=3)

In [None]:
@tf.function
def train_step(inp, targ, enc_hidden):
    """Run one optimization step on a batch using teacher forcing.

    Uses the notebook-level ``encoder``, ``decoder``, ``optimizer``,
    ``eng_tokenizer`` and ``BATCH_SIZE``.

    Args:
        inp: Batch of input (Chinese) token ids.
        targ: Batch of target (English) token ids; column 0 is expected to
            hold the ``<start>`` token.
        enc_hidden: Initial encoder state.

    Returns:
        The summed per-step loss divided by the target sequence length.
    """
    loss = 0
    with tf.GradientTape() as tape:
        enc_output, enc_hidden = encoder(inp, enc_hidden)
        # The decoder starts from the encoder's final state.
        dec_hidden = enc_hidden
        # First decoder input: a column of <start> ids, one per batch row.
        dec_input = tf.expand_dims([eng_tokenizer.word_index['<start>']] *
                                   BATCH_SIZE, 1)

        # Step t predicts target token t; position 0 (<start>) is never predicted.
        for t in range(1, targ.shape[1]):
            predictions, dec_hidden, _ = decoder(dec_input, dec_hidden,
                                                 enc_output)
            loss += loss_function(targ[:, t], predictions)
            # Teacher forcing: feed the ground-truth token as the next input.
            dec_input = tf.expand_dims(targ[:, t], 1)

    # Reported loss is normalized by sequence length; gradients are taken of
    # the un-normalized sum.
    batch_loss = (loss / int(targ.shape[1]))
    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(grads_and_vars=zip(gradients, variables))
    return batch_loss

In [None]:
# Train for EPOCHS passes over the training set, logging every 100 batches,
# checkpointing every second epoch and printing a summary after each epoch.
EPOCHS = 10
for epoch in range(EPOCHS):
    start = time.time()

    enc_hidden = encoder.initialize_hidden_state()
    total_loss = 0

    for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
        batch_loss = train_step(inp, targ, enc_hidden)
        total_loss += batch_loss

        if batch % 100 == 0:
            # '{:.4f}' (four decimals) matches the per-epoch summary below;
            # the previous '{:4f}' set a minimum width of 4, not a precision.
            print("Epoch {} Batch {} Loss {:.4f}".format(
                epoch + 1, batch, batch_loss.numpy()))

    # Everything below runs once per epoch, after the batch loop. When it was
    # nested inside the batch loop, a checkpoint was written for every batch
    # of every even epoch and the "epoch" summary printed after every batch.
    if (epoch + 1) % 2 == 0:
        manager.save()

    print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                        total_loss / steps_per_epoch))
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

In [None]:
def evaluate(sentence):
    """Greedily translate one Chinese sentence and record the attention weights.

    Uses the notebook-level ``encoder``, ``decoder``, both tokenizers,
    ``units``, ``max_length_ch`` and ``max_length_eng``.

    Tokens that are not in the Chinese vocabulary are skipped (with a printed
    notice) instead of raising ``KeyError``. Empty tokens produced by
    consecutive spaces are ignored, and vocabulary lookup is done on the
    lower-cased token. The returned sentence contains only the tokens that
    were fed to the encoder, so it lines up with the attention columns.

    Args:
        sentence: Raw Chinese sentence.

    Returns:
        ``(result, sentence, attention_plot)`` -- the predicted English tokens
        joined by spaces (with a trailing space), the preprocessed input
        sentence, and an array of shape ``(max_length_eng, max_length_ch)``
        whose row ``t`` holds the attention weights of decoding step ``t``.
    """
    attention_plot = np.zeros((max_length_eng, max_length_ch))

    sentence = preprocess_ch(sentence)

    # Split exactly on single spaces and drop empty strings before looking
    # tokens up in the vocabulary.
    known_tokens, unknown_tokens = [], []
    for token in sentence.split(' '):
        if not token:
            continue
        # Lower-case the lookup key: the Keras Tokenizer lower-cases text by
        # default, so upper-case tokens would otherwise never be found.
        if token.lower() in ch_tokenizer.word_index:
            known_tokens.append(token)
        else:
            unknown_tokens.append(token)
    if unknown_tokens:
        print('Skipping out-of-vocabulary tokens: {}'.format(
            ' '.join(unknown_tokens)))
    sentence = ' '.join(known_tokens)

    inputs = [ch_tokenizer.word_index[token.lower()] for token in known_tokens]
    inputs = tf.keras.preprocessing.sequence.pad_sequences(
        [inputs], maxlen=max_length_ch, padding='post')
    inputs = tf.convert_to_tensor(inputs)

    result = ''
    # Batch of one sentence, so the initial encoder state has a single row.
    hidden = [tf.zeros((1, units))]
    enc_out, enc_hidden = encoder(inputs, hidden)

    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([eng_tokenizer.word_index['<start>']], 0)
    for t in range(max_length_eng):
        predictions, dec_hidden, attention_weights = decoder(
            dec_input, dec_hidden, enc_out)

        # Flatten the weights to one value per input position and store them
        # as row t of the plot.
        attention_weights = tf.reshape(attention_weights, (-1, ))
        attention_plot[t] = attention_weights.numpy()

        # Greedy decoding: take the most likely token.
        predicted_id = tf.argmax(predictions[0]).numpy()
        predicted_word = eng_tokenizer.index_word[predicted_id]
        result += predicted_word + ' '

        if predicted_word == '<end>':
            return result, sentence, attention_plot

        # Feed the prediction back in as the next decoder input.
        dec_input = tf.expand_dims([predicted_id], 0)

    return result, sentence, attention_plot

In [None]:
def plot_attention(attention, sentence, predicted_sentence):
    """Show the attention matrix as a heat map with token labels.

    Args:
        attention: 2-D array of attention weights (rows: predicted tokens,
            columns: input tokens).
        sentence: List of input tokens, used as x tick labels.
        predicted_sentence: List of predicted tokens, used as y tick labels.
    """
    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot(1, 1, 1)
    ax.matshow(attention, cmap='viridis')

    label_font = {'fontsize': 14}
    # A leading empty label is prepended on both axes before the tokens.
    x_labels = [''] + sentence
    y_labels = [''] + predicted_sentence
    ax.set_xticklabels(x_labels, fontdict=label_font)
    ax.set_yticklabels(y_labels, fontdict=label_font)

    # One major tick per unit on both axes.
    for axis in (ax.xaxis, ax.yaxis):
        axis.set_major_locator(ticker.MultipleLocator(1))

    plt.show()

In [None]:
def translate(sentence):
    """Translate ``sentence``, print the result and plot the attention map.

    Args:
        sentence: Raw Chinese sentence.
    """
    result, sentence, attention_plot = evaluate(sentence)

    print("Input : %s" % (sentence))
    print("Predicted translation : {}".format(result))

    source_tokens = sentence.split(' ')
    predicted_tokens = result.split(' ')

    # Crop the fixed-size attention array to the tokens actually present.
    n_rows, n_cols = len(predicted_tokens), len(source_tokens)
    cropped_attention = attention_plot[:n_rows, :n_cols]
    plot_attention(cropped_attention, source_tokens, predicted_tokens)


# Load the most recent checkpoint found in checkpoint_dir into the encoder,
# decoder and optimizer before translating.
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

# Example translation of a short Chinese sentence.
translate(u'我迷失了')