In [14]:
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import sklearn
import sys
import tensorflow as tf
import time

from tensorflow import keras

print(tf.__version__)
print(sys.version_info)
for module in mpl, np, pd, sklearn, tf, keras:
    print(module.__name__, module.__version__)

2.0.0
sys.version_info(major=3, minor=7, micro=9, releaselevel='final', serial=0)
matplotlib 3.3.1
numpy 1.19.1
pandas 1.1.3
sklearn 0.23.2
tensorflow 2.0.0
tensorflow_core.keras 2.2.4-tf


In [15]:

# 1. preprocessing data
# 2. build model
# 2.1 encoder
# 2.2 attention
# 2.3 decoder
# 2.4 loss & optimizer
# 2.5 train
# 3. evaluation
# 3.1 given sentence, return translated results
# 3.2 visualize results (attention)

In [16]:
import unicodedata
import re
from sklearn.model_selection import train_test_split

In [17]:
# http://www.manythings.org/anki/
# http://www.manythings.org/anki/spa-eng.zip
en_spa_file_path = './spa.txt'

def unicode_to_ascii(s):
    # NFD 是一种 normalize 方法，如果有一个 unicode 是由多个 ascii 组成的，
    # 就将多个 ascii 拆开
    # Mn 表示 重音。
    return ''.join(c for c in unicodedata.normalize('NFD', s)
                   if unicodedata.category(c) != 'Mn')

en_sentence = u"May I borrow this book?"
sp_sentence = u"¿Puedo tomar prestado este libro?"

print(unicode_to_ascii(en_sentence))
print(unicode_to_ascii(sp_sentence))

def preprocess_sentence(w):
    w = unicode_to_ascii(w.lower().strip())

    # creating a space between a word and the punctuation following it
    # eg: "he is a boy." => "he is a boy ."
    # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
    # 标点符号前后加空格
    w = re.sub(r"([?.!,¿])", r" \1 ", w)
    #  多余的空格变成一个空格
    w = re.sub(r'[" "]+', " ", w)

    # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
    # 除了标点符号和字母外都是空格
    w = re.sub(r"[^a-zA-Z?.!,¿]+", " ", w)

    # 去掉前后空格
    w = w.rstrip().strip()

    # adding a start and an end token to the sentence
    # so that the model know when to start and stop predicting.
    w = '<start> ' + w + ' <end>'
    return w

print(preprocess_sentence(en_sentence))
print(preprocess_sentence(sp_sentence).encode('utf-8'))

May I borrow this book?
¿Puedo tomar prestado este libro?
<start> may i borrow this book ? <end>
b'<start> \xc2\xbf puedo tomar prestado este libro ? <end>'


In [18]:
def parse_data(file_name):
    lines = open(file_name, encoding='UTF-8').read().strip().split('\n')
    sentence_pairs = [line.split('\t') for line in lines]
    preprocessed_sentence_pairs = [
        (preprocess_sentence(en), preprocess_sentence(sp)) for en, sp in sentence_pairs]
    return zip(*preprocessed_sentence_pairs)

en_dataset, sp_dataset = parse_data(en_spa_file_path)
print(en_dataset[-1])
print(sp_dataset[-1])

<start> if you want to sound like a native speaker , you must be willing to practice saying the same sentence over and over in the same way that banjo players practice the same phrase over and over until they can play it correctly and at the desired tempo . <end>
<start> si quieres sonar como un hablante nativo , debes estar dispuesto a practicar diciendo la misma frase una y otra vez de la misma manera en que un musico de banjo practica el mismo fraseo una y otra vez hasta que lo puedan tocar correctamente y en el tiempo esperado . <end>


In [19]:
a = [(1, 2), (3, 4), (5, 6)]
c, d = zip(*a)
print(c, d)

(1, 3, 5) (2, 4, 6)


In [20]:
def tokenizer(lang):
    lang_tokenizer = keras.preprocessing.text.Tokenizer(
        num_words=None, filters='', split=' ')
    lang_tokenizer.fit_on_texts(lang)
    tensor = lang_tokenizer.texts_to_sequences(lang)
    tensor = keras.preprocessing.sequence.pad_sequences(
        tensor, padding='post')
    return tensor, lang_tokenizer
input_tensor, input_tokenizer = tokenizer(sp_dataset[0: 30000])
output_tensor, output_tokenizer = tokenizer(en_dataset[0: 30000])

def max_length(tensor):
    return max(len(t) for t in tensor)

max_length_input = max_length(input_tensor)
max_length_output = max_length(output_tensor)
print(max_length_input, max_length_output)

16 11


In [21]:
input_train, input_eval, output_train, output_eval = train_test_split(
    input_tensor, output_tensor, test_size=0.2)
print(len(input_train), len(input_eval), len(output_train), len(output_eval))

24000 6000 24000 6000


In [22]:
def convert(example, tokenizer):
    for t in example:
        if t != 0:
            print('%d --> %s' % (t, tokenizer.index_word[t]))
convert(input_train[0], input_tokenizer)
print()
convert(output_train[0], output_tokenizer)

1 --> <start>
1420 --> ensename
94 --> algo
103 --> nuevo
3 --> .
2 --> <end>

1 --> <start>
255 --> show
17 --> me
187 --> something
146 --> new
3 --> .
2 --> <end>


In [23]:
def make_dataset(input_tensor, output_tensor, batch_size, epochs, shuffle):
    dataset = tf.data.Dataset.from_tensor_slices((input_tensor, output_tensor))
    if shuffle:
        dataset = dataset.shuffle(30000)
    dataset = dataset.repeat(epochs).batch(batch_size, drop_remainder=True)
    return dataset

batch_size = 64
epochs = 20
train_dataset = make_dataset(input_train, output_train, batch_size, epochs, True)
eval_dataset = make_dataset(input_eval, output_eval, batch_size, epochs, False)

In [24]:
for x, y in train_dataset.take(1):
    print(x.shape)
    print(y.shape)
    print(x)
    print(y)

(64, 16)
(64, 11)
tf.Tensor(
[[   1 4057   33 ...    0    0    0]
 [   1    9   55 ...    0    0    0]
 [   1  178  171 ...    0    0    0]
 ...
 [   1    6  507 ...    0    0    0]
 [   1 1648   60 ...    0    0    0]
 [   1   51 3583 ...    0    0    0]], shape=(64, 16), dtype=int32)
tf.Tensor(
[[   1  939   31  744  109    3    2    0    0    0    0]
 [   1   14   26 3696    3    2    0    0    0    0    0]
 [   1    4  130  123  102    3    2    0    0    0    0]
 [   1  178   25   86   41    3    2    0    0    0    0]
 [   1   13 2740 1280    3    2    0    0    0    0    0]
 [   1    4   38   64   44   10    3    2    0    0    0]
 [   1   14   11 1021    3    2    0    0    0    0    0]
 [   1    5 1235   61  168    3    2    0    0    0    0]
 [   1   32   29    6   76    7    2    0    0    0    0]
 [   1    4   77  665 4379    3    2    0    0    0    0]
 [   1   22    6   63   15   36    7    2    0    0    0]
 [   1  116    5  679   54   17    3    2    0    0    0]
 [   1

In [25]:
embedding_units = 256
units = 1024
input_vocab_size = len(input_tokenizer.word_index) + 1
output_vocab_size = len(output_tokenizer.word_index) + 1

In [26]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, encoding_units, batch_size):
        super(Encoder, self).__init__()
        self.batch_size = batch_size
        self.encoding_units = encoding_units
        self.embedding = keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = keras.layers.GRU(self.encoding_units,
                                    return_sequences=True,
                                    return_state=True,
                                    recurrent_initializer='glorot_uniform')

    def call(self, x, hidden):
        x = self.embedding(x)
        output, state = self.gru(x, initial_state = hidden)
        return output, state

    def initialize_hidden_state(self):
        return tf.zeros((self.batch_size, self.encoding_units))
    
encoder = Encoder(input_vocab_size, embedding_units, units, batch_size)
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_hidden = encoder(x, sample_hidden)

print('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print('Encoder Hidden state shape: (batch size, units) {}'.format(sample_hidden.shape))

Encoder output shape: (batch size, sequence length, units) (64, 16, 1024)
Encoder Hidden state shape: (batch size, units) (64, 1024)


In [27]:
class BahdanauAttention(tf.keras.Model):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, decoder_hidden, encoder_outputs):
        # hidden shape == (batch_size, hidden size)
        # hidden_with_time_axis shape == (batch_size, 1, hidden size)
        # we are doing this to perform addition to calculate the score
        decoder_hidden_with_time_axis = tf.expand_dims(
            decoder_hidden, 1)

        # before V: (batch_size, length, units)
        # after V: (batch_size, length, 1)
        # score shape == (batch_size, max_length, 1)
        # we get 1 at the last axis because we are applying score to self.V
        # the shape of the tensor before applying self.V is (batch_size, max_length, units)
        score = self.V(
            tf.nn.tanh(
                self.W1(encoder_outputs) + self.W2(decoder_hidden_with_time_axis)))

        # attention_weights shape == (batch_size, max_length, 1)
        attention_weights = tf.nn.softmax(score, axis=1)

        # context_vector shape after sum == (batch_size, hidden_size)
        context_vector = attention_weights * encoder_outputs
        context_vector = tf.reduce_sum(context_vector, axis=1)

        return context_vector, attention_weights
    
attention_layer = BahdanauAttention(10)
attention_result, attention_weights = attention_layer(sample_hidden, sample_output)

print("Attention result shape: (batch size, units) {}".format(attention_result.shape))
print("Attention weights shape: (batch_size, sequence_length, 1) {}".format(attention_weights.shape))

Attention result shape: (batch size, units) (64, 1024)
Attention weights shape: (batch_size, sequence_length, 1) (64, 16, 1)


In [28]:
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, decoding_units, batch_size):
        super(Decoder, self).__init__()
        self.batch_size = batch_size
        self.decoding_units = decoding_units
        self.embedding = keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = keras.layers.GRU(self.decoding_units,
                                    return_sequences=True,
                                    return_state=True,
                                    recurrent_initializer='glorot_uniform')
        self.fc = keras.layers.Dense(vocab_size)

        # used for attention
        self.attention = BahdanauAttention(self.decoding_units)

    def call(self, x, hidden, encoding_output):
        # context_vector.shape: (batch_size, units)
        context_vector, attention_weights = self.attention(hidden, encoding_output)

        # before embedding: x.shape: (batch_size, 1)
        # after embedding: x.shape: (batch_size, 1, embedding_units)
        x = self.embedding(x)

        # context_vector.shape: (batch_size, units)
        # x.shape: (batch_size, 1, embedding_units)
        # 两者的维度不一样，需要对 context_vector 进行维度扩展
        # x shape after concatenation：
        # (batch_size, 1, embedding_dim + hidden_size)
        combined_x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

        # output.shape: [batch_size, 1, decoding_units]
        # state.shape: [batch_size, decoding_units]
        output, state = self.gru(combined_x)

        # output.shape: [batch_size, decoding_units]
        output = tf.reshape(output, (-1, output.shape[2]))

        # output shape: [batch_size, vocab_size]
        output = self.fc(output)

        return output, state, attention_weights

decoder = Decoder(output_vocab_size, embedding_units, units, batch_size)
outputs = decoder(tf.random.uniform((batch_size, 1)),
                                      sample_hidden, sample_output)
decoder_output, decoder_hidden, decoder_aw = outputs
print ('Decoder output shape: {}'.format(decoder_output.shape))
print ('Decoder hidden shape: {}'.format(decoder_hidden.shape))
print ('Decoder attention weights shape: {}'.format(decoder_aw.shape))

Decoder output shape: (64, 4935)
Decoder hidden shape: (64, 1024)
Decoder attention weights shape: (64, 16, 1)


In [29]:
optimizer = keras.optimizers.Adam()
# reduction：指明损失函数如何聚合
loss_object = keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
    # logical_not：取反
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    # 由于 loss_  是 float 类型，需要对 mask 做一个类型变换
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_mean(loss_)

checkpoint_dir = './10-1_checkpoints'
if not os.path.exists(checkpoint_dir):
    os.mkdir(checkpoint_dir)
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

In [30]:
@tf.function
def train_step(inp, targ, encoding_hidden):
    loss = 0

    with tf.GradientTape() as tape:
        encoding_outputs, encoding_hidden = encoder(inp, encoding_hidden)

        decoding_hidden = encoding_hidden

        # eg: <start> I am here <end>
        # 1. <start> --> I
        # 2. I --> am
        # 3. am --> here
        # 4. here --> <end>
        # Teacher forcing - feeding the target as the next input
        for t in range(1, targ.shape[1] - 1):
            decoding_input = tf.expand_dims(targ[:, t], 1)

            # passing enc_output to the decoder
            predictions, decoding_hidden, _ = decoder(
                decoding_input, decoding_hidden, encoding_outputs)

            loss += loss_function(targ[:, t + 1], predictions)

    batch_loss = (loss / int(targ.shape[0]))
    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))
    return batch_loss

In [31]:
epochs = 10
steps_per_epoch = len(input_tensor) // batch_size

for epoch in range(epochs):
    start = time.time()

    encoding_hidden = encoder.initialize_hidden_state()
    total_loss = 0

    for (batch, (inp, targ)) in enumerate(train_dataset.take(steps_per_epoch)):
        batch_loss = train_step(inp, targ, encoding_hidden)
        total_loss += batch_loss

        if batch % 100 == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(
                epoch + 1, batch, batch_loss.numpy()))
    # saving (checkpoint) the model every 2 epochs
    if (epoch + 1) % 2 == 0:
        checkpoint.save(file_prefix = checkpoint_prefix)

    print('Epoch {} Loss {:.4f}'.format(epoch + 1, total_loss / steps_per_epoch))
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

Epoch 1 Batch 0 Loss 0.6561
Epoch 1 Batch 100 Loss 0.3151


KeyboardInterrupt: 

In [None]:
def evaluate(input_sentence):
    attention_matrix = np.zeros((max_length_output, max_length_input))
    input_sentence = preprocess_sentence(input_sentence)

    inputs = [input_tokenizer.word_index[i] for i in input_sentence.split(' ')]
    inputs = keras.preprocessing.sequence.pad_sequences(
        [inputs], maxlen=max_length_input, padding='post')
    inputs = tf.convert_to_tensor(inputs)

    result = ''
    encoding_hidden = tf.zeros((1, units))

    encoding_outputs, encoding_hidden = encoder(inputs, encoding_hidden)
    decoding_hidden = encoding_hidden

    # eg: <start> --> A
    # A --> B --> C --> D
    # decoding_input.shape: (1, 1)
    decoding_input = tf.expand_dims([output_tokenizer.word_index['<start>']], 0)
    for t in range(max_length_output):
        predictions, decoding_hidden, attention_weights = decoder(
            decoding_input, decoding_hidden, encoding_outputs)

        # attention_weights.shape: (batch_size, input_length, 1) (1, 16, 1)
        attention_weights = tf.reshape(attention_weights, (-1, ))
        attention_matrix[t] = attention_weights.numpy()

        # predictions.shape: (batch_size, vocab_size) (1, 4935)
        predicted_id = tf.argmax(predictions[0]).numpy()

        result += output_tokenizer.index_word[predicted_id] + ' '

        if output_tokenizer.index_word[predicted_id] == '<end>':
            return result, input_sentence, attention_matrix

        # the predicted ID is fed back into the model
        decoding_input = tf.expand_dims([predicted_id], 0)

    return result, input_sentence, attention_matrix

# function for plotting the attention weights
def plot_attention(attention_matrix, input_sentence, predicted_sentence):
    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot(1, 1, 1)
    ax.matshow(attention_matrix, cmap='viridis')

    fontdict = {'fontsize': 14}

    ax.set_xticklabels([''] + input_sentence, fontdict=fontdict, rotation=90)
    ax.set_yticklabels([''] + predicted_sentence, fontdict=fontdict)

    plt.show()
    
def translate(input_sentence):
    result, input_sentence, attention_matrix = evaluate(input_sentence)

    print('Input: %s' % (input_sentence))
    print('Predicted translation: {}'.format(result))

    attention_matrix = attention_matrix[:len(result.split(' ')), :len(input_sentence.split(' '))]
    plot_attention(attention_matrix, input_sentence.split(' '), result.split(' '))

checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

In [None]:
translate(u'hace mucho frio aqui.')

In [None]:
translate(u'esta es mi vida.')

In [None]:
translate(u'¿todavia estan en casa?')

In [None]:
translate(u'trata de averiguarlo.')