In [1]:
import tensorflow as tf 

import matplotlib.pylab as plt 
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split

import unicodedata
import re 
import numpy as np 
import os 
import io 
import time

In [2]:
# Download the Spanish-English sentence-pair archive and unpack it
# (extract=True). The archive is fetched over HTTPS rather than plain HTTP so
# the file that gets extracted locally cannot be tampered with in transit.
path_to_zip = tf.keras.utils.get_file(
    'spa-eng.zip', origin='https://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip',
    extract=True)

In [3]:
# Locate the sentence-pair text file inside the "spa-eng" folder that sits in
# the same directory as the downloaded zip. os.path.join is used instead of
# hard-coded "/" so the path is built with the platform's own separator.
path_to_file = os.path.join(os.path.dirname(path_to_zip), "spa-eng", "spa.txt")


In [4]:
# Show the resolved location of the sentence-pair file.
path_to_file

'/home/mark/.keras/datasets/spa-eng/spa.txt'

In [5]:
# Strip accents / diacritics from a unicode string
def unicode_to_ascii(s):
    """Return ``s`` with every combining mark removed.

    The string is first decomposed with NFD normalization, which splits an
    accented character into its base character followed by combining marks.
    Characters whose Unicode category is 'Mn' are then dropped. Characters
    that are not combining marks (for example "¿") are kept as they are, so
    the result is not guaranteed to be pure ASCII.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = []
    for ch in decomposed:
        if unicodedata.category(ch) == 'Mn':
            # Combining mark (accent, tilde, ...): skip it.
            continue
        kept.append(ch)
    return ''.join(kept)


def preprocess_sentence(w):
    """Normalize one sentence and wrap it in '<start>' / '<end>' markers.

    Steps: lowercase and trim the text, strip accents via unicode_to_ascii,
    then apply three regex substitutions in order, trim again, and add the
    boundary markers.

    Args:
        w: the raw sentence string.

    Returns:
        The cleaned sentence as '<start> ... <end>'.
    """
    text = unicode_to_ascii(w.lower().strip())

    substitutions = (
        # Put a space on both sides of each punctuation mark,
        # e.g. "he is a boy." => "he is a boy ."
        # Reference: https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
        (r"([?.!,¿])", r" \1 "),
        # Collapse runs of spaces into one space. Note that this character
        # class also matches the double-quote character.
        (r'[" "]+', " "),
        # Replace every run of characters other than
        # (a-z, A-Z, ".", "?", "!", ",", "¿") with a single space.
        (r"[^a-zA-Z?.!,¿]+", " "),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    text = text.strip()

    # Add start and end markers so the model knows when to begin and when to
    # stop predicting.
    return '<start> ' + text + ' <end>'

In [6]:
# Sanity-check the preprocessing on one English and one Spanish sentence.
en_sentence = u"May I borrow this book?"
sp_sentence = u"¿Puedo tomar prestado este libro?"
print(preprocess_sentence(en_sentence))
# Printed as UTF-8 bytes, so the retained "¿" shows up as explicit byte escapes.
print(preprocess_sentence(sp_sentence).encode('utf-8'))

<start> may i borrow this book ? <end>
b'<start> \xc2\xbf puedo tomar prestado este libro ? <end>'


In [7]:
# 1. Remove accents
# 2. Clean the sentences
# 3. Return word pairs in the format: [ENGLISH, SPANISH]
def create_dataset(path, num_examples):
    """Read a tab-separated sentence file and return per-column sentence groups.

    Every line of the file is split on tabs and each field is passed through
    preprocess_sentence. The rows are then transposed, so the result yields
    one tuple per column (all first fields, all second fields, ...).

    Args:
        path: path of the UTF-8 text file, one tab-separated pair per line.
        num_examples: number of leading lines to use; None uses every line.

    Returns:
        A zip iterator over the transposed columns. It yields nothing when no
        lines are selected.
    """
    # The context manager closes the file handle as soon as it has been read,
    # instead of leaving that to the garbage collector.
    with io.open(path, encoding='UTF-8') as f:
        lines = f.read().strip().split('\n')

    word_pairs = [[preprocess_sentence(w) for w in l.split('\t')] for l in lines[:num_examples]]

    # Transpose the rows [[col0, col1], ...] into (col0s, col1s).
    return zip(*word_pairs)

In [8]:
# Load every sentence pair: num_examples=None makes the slice cover the whole file.
en, sp = create_dataset(path_to_file, None)

In [9]:
# Inspect the last preprocessed English sentence.
en[-1]

'<start> if you want to sound like a native speaker , you must be willing to practice saying the same sentence over and over in the same way that banjo players practice the same phrase over and over until they can play it correctly and at the desired tempo . <end>'

In [10]:
# Inspect the matching last preprocessed Spanish sentence.
sp[-1]

'<start> si quieres sonar como un hablante nativo , debes estar dispuesto a practicar diciendo la misma frase una y otra vez de la misma manera en que un musico de banjo practica el mismo fraseo una y otra vez hasta que lo puedan tocar correctamente y en el tiempo esperado . <end>'

In [11]:
def max_length(tensor):
    """Return the length of the longest element in ``tensor``.

    Args:
        tensor: an iterable of sized items (e.g. a list of token-id lists or
            the rows of a 2-D array).

    Returns:
        The largest ``len(item)`` found, or 0 when ``tensor`` has no items.
    """
    # default=0 keeps an empty collection from raising ValueError.
    return max((len(t) for t in tensor), default=0)

In [12]:
def tokenize(lang):
    """Fit a Keras Tokenizer on ``lang`` and convert it to padded id sequences.

    Args:
        lang: a collection of preprocessed sentence strings.

    Returns:
        A ``(tensor, tokenizer)`` pair: the id sequences padded at the end
        (padding='post'), and the tokenizer that was fitted on ``lang``.
    """
    # filters='' passes an empty filter string, so punctuation and the
    # <start>/<end> markers are not filtered out by the tokenizer.
    tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
    tokenizer.fit_on_texts(lang)

    sequences = tokenizer.texts_to_sequences(lang)
    padded = tf.keras.preprocessing.sequence.pad_sequences(sequences, padding='post')

    return padded, tokenizer

In [14]:
def load_dataset(path, num_examples=None):
    """Build padded input/target tensors and their tokenizers from a pair file.

    Args:
        path: path of the tab-separated sentence-pair file.
        num_examples: how many leading pairs to load; None loads all of them.

    Returns:
        ``(input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer)``.
    """
    # The first column returned by create_dataset becomes the target language
    # and the second column becomes the input language.
    target_sentences, input_sentences = create_dataset(path, num_examples)

    # Each language gets its own independently fitted tokenizer.
    target_tensor, targ_lang_tokenizer = tokenize(target_sentences)
    input_tensor, inp_lang_tokenizer = tokenize(input_sentences)

    return input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer

In [15]:
# Number of leading sentence pairs to load from the file.
num_examples = 30000

In [16]:
# Build the padded tensors. Note that inp_lang / targ_lang receive the fitted
# tokenizers (third and fourth return values), not lists of sentences.
input_tensor, target_tensor, inp_lang, targ_lang = load_dataset(path_to_file, num_examples)

In [17]:
# Inspect the padded input id matrix.
input_tensor

array([[   1,  135,    3, ...,    0,    0,    0],
       [   1,  293,    3, ...,    0,    0,    0],
       [   1,  595,    3, ...,    0,    0,    0],
       ...,
       [   1,   18, 9413, ...,    0,    0,    0],
       [   1,   63, 2490, ...,    0,    0,    0],
       [   1,   23, 2175, ...,    0,    0,    0]], dtype=int32)

In [18]:
# Confirm that inp_lang is the tokenizer object for the input language.
inp_lang

<keras_preprocessing.text.Tokenizer at 0x7f692d0080f0>

In [19]:
# Longest row length of the target and of the input tensor.
max_length_targ = max_length(target_tensor)
# The spelling "max_leng_inp" is kept as-is because a later cell reads it.
max_leng_inp = max_length(input_tensor)

In [20]:
# Longest target sequence length, in tokens.
max_length_targ

11

In [21]:
# Longest input sequence length, in tokens.
max_leng_inp

16

In [22]:
# Hold out 20% of the sentence pairs for validation. random_state pins the
# shuffle so that re-running the notebook reproduces the same split.
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(
    input_tensor, target_tensor, test_size=0.2, random_state=42)

In [23]:
# Number of training examples.
len(input_tensor_train)

24000

In [25]:
# Number of validation examples.
len(input_tensor_val)

6000

In [26]:
# Shuffle buffer size, set to the number of training examples.
BUFFER_SIZE = len(input_tensor_train)
BUFFER_SIZE

24000

In [27]:
# Examples per batch.
BATCH_SIZE = 64
# Number of full batches per epoch; floor division discards a partial final batch.
step_per_epoch = BUFFER_SIZE // BATCH_SIZE

In [28]:
# Full batches per epoch.
step_per_epoch

375

In [29]:
# Model size settings.
embedding_dim = 256
units = 1024
# Vocabulary sizes are one larger than the tokenizer's word_index — presumably
# to leave room for id 0, which the padded tensors use as filler (confirm
# against the Keras Tokenizer documentation).
vocab_inp_size = len(inp_lang.word_index) + 1
vocab_tar_size = len(targ_lang.word_index) + 1

In [30]:
# Input-language vocabulary size.
vocab_inp_size

9414

In [31]:
# Target-language vocabulary size.
vocab_tar_size

4935

In [34]:
# Build the training pipeline step by step:
# 1. pair each input row with its target row,
dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train))
# 2. shuffle with a buffer of BUFFER_SIZE elements,
dataset = dataset.shuffle(BUFFER_SIZE)
# 3. group into fixed-size batches, dropping a final partial batch.
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

In [35]:
# Inspect the batched dataset's element shapes and types.
dataset

<BatchDataset shapes: ((64, 16), (64, 11)), types: (tf.int32, tf.int32)>

In [36]:
# Pull a single (input, target) batch from the dataset to inspect its shape.
example_input_batch, example_target_batch = next(iter(dataset))

In [40]:
# Shape of the example input batch.
example_input_batch.shape

TensorShape([64, 16])

In [39]:
# Shape of the example target batch.
example_target_batch.shape

TensorShape([64, 11])