In [2]:
import tensorflow as tf
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

tf.__version__

'2.0.0'

In [3]:
# 1. preprocessing data

import unicodedata

# 去西班牙语重音
def unicode_to_ascii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn'
    )

en_sentence = 'No way!'
spa_sentence = '¡De ningún modo!'



In [4]:
unicode_to_ascii(en_sentence)

'No way!'

In [5]:
unicode_to_ascii(spa_sentence)

'¡De ningun modo!'

In [6]:
import re

def preprocess_sentence(s):
    s = unicode_to_ascii(s.lower().strip())
    # 标点符号前后加空格
    s = re.sub(r'([?.!,¿])', r'\1', s)
    # 多余的空格变成一个空格
    s = re.sub(r'[" "]+', ' ', s)
    # 除标点符号和字母外都是空格
    s = re.sub(r'[^a-zA-Z?.!,¿]', ' ', s)
    # 前后去空格
    s = s.rstrip().strip()

    return '<start>' + s + '<end>'


In [7]:
preprocess_sentence(en_sentence)

'<start>no way!<end>'

In [8]:
preprocess_sentence(spa_sentence)

'<start>de ningun modo!<end>'

In [9]:
def parse_data(filename):
    lines = open(filename, encoding='utf-8').read().strip().split('\n')
    sentence_pairs = [line.split('\t') for line in lines]
    preprocessed_sentence_pairs = [
         (preprocess_sentence(en), preprocess_sentence(sp)) for en,sp in sentence_pairs
    ]
    return zip(*preprocessed_sentence_pairs)

In [10]:
a = [(1, 2), (3, 4), (5, 6)]
c, d = zip(*a)
print(c, d)

(1, 3, 5) (2, 4, 6)


In [11]:
en_dataset,sp_dataset =  parse_data('spa.txt')

In [12]:
en_dataset[-1]

'<start>if you want to sound like a native speaker, you must be willing to practice saying the same sentence over and over in the same way that banjo players practice the same phrase over and over until they can play it correctly and at the desired tempo.<end>'

In [13]:
sp_dataset[-1]

'<start>si quieres sonar como un hablante nativo, debes estar dispuesto a practicar diciendo la misma frase una y otra vez de la misma manera en que un musico de banjo practica el mismo fraseo una y otra vez hasta que lo puedan tocar correctamente y en el tiempo esperado.<end>'

In [14]:
def tokenizer(lang):
    lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(
        num_words=None, filters='', split=' '
    )
    lang_tokenizer.fit_on_texts(lang)
    tensor = lang_tokenizer.texts_to_sequences(lang)
    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='post')
    return tensor, lang_tokenizer

input_tensor, input_tokenizer = tokenizer(sp_dataset[:30000])
output_tensor, output_tokenizer = tokenizer(en_dataset[:30000])

max_length_input = max(len(t) for t in input_tensor)
max_length_output = max(len(t) for t in output_tensor)

print(max_length_input, max_length_output)

12 7


In [15]:
from sklearn.model_selection import train_test_split

input_train, input_val, output_train, output_val = train_test_split(
    input_tensor, output_tensor, test_size = 0.2
)

print(len(input_train), len(output_train))

24000 24000


In [16]:
def make_dataset(input_tensor, output_tensor, batch_size, epochs=1, shuffle=False):
    dataset = tf.data.Dataset.from_tensor_slices(
        (input_tensor, output_tensor)
    )

    if shuffle:
        dataset = dataset.shuffle(30000)
    dataset = dataset.repeat(epochs).batch(batch_size, drop_remainder = True)
    return dataset

batch_size = 64
epochs = 20

train_dataset = make_dataset(
    input_train, output_train, batch_size,epochs, True
)

val_dataset = make_dataset(
    input_val, output_val, batch_size
)

for x, y in train_dataset.take(1):
    print(x.shape, y.shape)
    print(x)
    print(y)


(64, 12) (64, 7)
tf.Tensor(
[[   65  3939   211     0     0     0     0     0     0     0     0     0]
 [  192    10  3325     0     0     0     0     0     0     0     0     0]
 [  228   280  1014     0     0     0     0     0     0     0     0     0]
 [  248     8  5031     0     0     0     0     0     0     0     0     0]
 [   13     9  2624     0     0     0     0     0     0     0     0     0]
 [    4    48   936     0     0     0     0     0     0     0     0     0]
 [   64     8  4280    60  1137     0     0     0     0     0     0     0]
 [    7     2  2236     0     0     0     0     0     0     0     0     0]
 [   43  3176   231  2947     0     0     0     0     0     0     0     0]
 [    7  1582     2   944     0     0     0     0     0     0     0     0]
 [    7   142   633     0     0     0     0     0     0     0     0     0]
 [   95    12  4947     0     0     0     0     0     0     0     0     0]
 [   28   116   713     0     0     0     0     0     0     0     0     

In [17]:
embedding_units = 256
units = 1024
input_vocab_size = len(input_tokenizer.word_index) + 1
output_vocab_size = len(output_tokenizer.word_index) + 1

In [18]:
class Encoder(tf.keras.Model):

    def __init__(self, vocab_size, embedding_units, encoding_units, batch_size):
        super(Encoder, self).__init__()
        self.batch_size = batch_size
        self.encoding_units = encoding_units
        self.embedding_units = embedding_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_units)
        self.gru = tf.keras.layers.GRU(
            encoding_units, return_sequences=True, return_state=True,
            recurrent_initializer='glorot_uniform'
            )
        

    def call(self, x, hidden):
        x = self.embedding(x)
        output, state = self.gru(x, initial_state = hidden)
        return output, state

    def initialize_hidden_state(self):
        return tf.zeros((self.batch_size, self.encoding_units))

encoder = Encoder(input_vocab_size, embedding_units, units, batch_size)

sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_hidden = encoder(x, sample_hidden)

print(sample_hidden.shape, sample_output.shape)

(64, 1024) (64, 12, 1024)


In [20]:
encoder.summary()

Model: "encoder"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        multiple                  3715840   
_________________________________________________________________
gru (GRU)                    multiple                  3938304   
Total params: 7,654,144
Trainable params: 7,654,144
Non-trainable params: 0
_________________________________________________________________


In [None]:
class DecoderLayer(tf.keras.layers.layer):

    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super(DecoderLayer, self).__init__
        self.mha1 = tf.keras.layers.mu

    def call(self,x, encoding_outputs, training, look_ahead_mask, padding_mask):
        pass