In [14]:
import tensorflow as tf

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split

import unicodedata
import re
import numpy as np
import os
import io
import time

#################### Ignore from this part to ####################

# Download the Spanish-English sentence-pair archive and extract it.
# The archive is fetched over HTTPS rather than plain HTTP so the corpus
# cannot be tampered with in transit.
path_to_zip = tf.keras.utils.get_file(
    'spa-eng.zip',
    origin='https://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip',
    extract=True)

# Build the corpus path with os.path.join instead of concatenating a
# hard-coded "/" so the separator is correct on every platform.
path_to_file = os.path.join(os.path.dirname(path_to_zip), "spa-eng", "spa.txt")

def unicode_to_ascii(s):
  """Strip combining accent marks from the string `s`.

  The text is NFD-normalized, which splits accented characters into a base
  character followed by combining marks; every code point whose Unicode
  category is 'Mn' is then dropped. Characters that do not decompose this
  way (for example '¿') are returned unchanged.
  """
  decomposed = unicodedata.normalize('NFD', s)
  kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
  return ''.join(kept)


def preprocess_sentence(w):
  """Normalize one raw sentence and wrap it in <start>/<end> markers.

  Steps, in order:
    1. lowercase, strip surrounding whitespace and remove accents
       (via unicode_to_ascii);
    2. put a space on both sides of each of ? . ! , ¿
       e.g. "he is a boy." => "he is a boy ."
       Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
    3. collapse every run of spaces and double quotes into one space;
    4. replace every run of characters other than a-z, A-Z, ? . ! , ¿
       with one space;
    5. strip again and add the start/end tokens so the model knows when to
       start and stop predicting.

  Returns the cleaned sentence as a single string.
  """
  text = unicode_to_ascii(w.lower().strip())

  # Steps 2 and 3: isolate punctuation, then squeeze spaces / double quotes.
  text = re.sub(r"([?.!,¿])", r" \1 ", text)
  text = re.sub(r'[" "]+', " ", text)

  # Step 4: anything outside the allowed alphabet becomes a single space.
  text = re.sub(r"[^a-zA-Z?.!,¿]+", " ", text)

  return '<start> ' + text.strip() + ' <end>'

def create_dataset(path, num_examples):
  """Read a tab-separated sentence-pair file and clean every sentence.

  Each line of the file is split on tabs and every column is passed through
  preprocess_sentence (accent removal, cleaning, <start>/<end> markers).

  Args:
    path: path of the UTF-8 text file to read.
    num_examples: number of leading lines to use; None uses every line.

  Returns:
    A zip iterator with one tuple per column of the file, i.e. the word
    pairs regrouped in the format [ENGLISH, SPANISH].
  """
  # Use a context manager so the file handle is closed as soon as the text
  # has been read, instead of being left open until garbage collection.
  with io.open(path, encoding='UTF-8') as corpus:
    lines = corpus.read().strip().split('\n')

  word_pairs = [[preprocess_sentence(w) for w in l.split('\t')]
                for l in lines[:num_examples]]

  return zip(*word_pairs)

en, sp = create_dataset(path_to_file, None)

def max_length(tensor):
    """Return the length of the longest element of `tensor`.

    `tensor` is iterated once and `len` is taken of each element; an empty
    `tensor` raises ValueError from `max`.
    """
    row_lengths = [len(row) for row in tensor]
    return max(row_lengths)

def tokenize(lang):
  """Fit a Keras Tokenizer on `lang` and encode `lang` as a padded id matrix.

  Args:
    lang: the texts to fit on and encode. It is consumed twice (once by
      fit_on_texts, once by texts_to_sequences), so it must be re-iterable.

  Returns:
    (tensor, lang_tokenizer): the id sequences padded at the end
    (padding='post') to a common length, and the fitted tokenizer.
  """
  # filters='' overrides the Tokenizer's default character filter.
  lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
  lang_tokenizer.fit_on_texts(lang)

  sequences = lang_tokenizer.texts_to_sequences(lang)
  padded = tf.keras.preprocessing.sequence.pad_sequences(sequences, padding='post')

  return padded, lang_tokenizer

def load_dataset(path, num_examples=None):
    """Load the corpus at `path` and tokenize both languages.

    Args:
        path: path of the sentence-pair file, forwarded to create_dataset.
        num_examples: number of leading lines to use; None uses every line.

    Returns:
        (input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer)
        where the input side is the second column produced by create_dataset
        (Spanish) and the target side is the first column (English).
    """
    # Generate the cleaned pairs; create_dataset yields English first.
    english, spanish = create_dataset(path, num_examples)

    spanish_ids, spanish_tokenizer = tokenize(spanish)
    english_ids, english_tokenizer = tokenize(english)

    return spanish_ids, english_ids, spanish_tokenizer, english_tokenizer

# Experiment with a dataset of this size
num_examples = 30000
input_tensor, target_tensor, inp_lang, targ_lang = load_dataset(path_to_file, num_examples)

# Compute the maximum sequence length of the target and input tensors
max_length_targ, max_length_inp = max_length(target_tensor), max_length(input_tensor)

# Create the training and validation sets with an 80-20 split.
# random_state pins the shuffle so the split, and every output that depends
# on it, is the same on each run of the notebook.
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(
    input_tensor, target_tensor, test_size=0.2, random_state=42)

def convert(lang, tensor):
  """Print an "<id> ----> <word>" line for every non-zero entry of `tensor`.

  Args:
    lang: object whose `index_word` mapping translates an id to its word.
    tensor: iterable of ids; entries equal to 0 are skipped.
  """
  for token_id in tensor:
    if token_id == 0:
      continue
    print("%d ----> %s" % (token_id, lang.index_word[token_id]))

#################### this part. ####################

In [6]:
input_tensor_train.shape, input_tensor_val.shape, target_tensor_train.shape, target_tensor_val.shape

((24000, 16), (6000, 16), (24000, 11), (6000, 11))

In [7]:
input_tensor_train

array([[   1,  715,   10, ...,    0,    0,    0],
       [   1,   39, 2934, ...,    0,    0,    0],
       [   1,    9,    8, ...,    0,    0,    0],
       ...,
       [   1,    6,   11, ...,    0,    0,    0],
       [   1,    8,  312, ...,    0,    0,    0],
       [   1,    4,   91, ...,    0,    0,    0]], dtype=int32)

In [8]:
input_tensor_train[0]

array([  1, 715,  10, 164,   3,   2,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0], dtype=int32)

In [12]:
convert(inp_lang, input_tensor_train[0])

1 ----> <start>
715 ----> limitate
10 ----> a
164 ----> hacerlo
3 ----> .
2 ----> <end>


In [10]:
def convert(lang, tensor):
  """Print an "<id> ----> <word>" line for every non-zero entry of `tensor`.

  Args:
    lang: object whose `index_word` mapping translates an id to its word.
    tensor: iterable of ids; entries equal to 0 are skipped.
  """
  for token_id in tensor:
    if token_id == 0:
      continue
    print("%d ----> %s" % (token_id, lang.index_word[token_id]))

In [115]:
a=tokenize(tuple('<start> madre . <end>'))

Inside tokenize()

('<', 's', 't', 'a', 'r', 't', '>', ' ', 'm', 'a', 'd', 'r', 'e', ' ', '.', ' ', '<', 'e', 'n', 'd', '>')


In [116]:
a

(array([[ 1],
        [ 8],
        [ 2],
        [ 3],
        [ 4],
        [ 2],
        [ 5],
        [ 0],
        [ 9],
        [ 3],
        [ 6],
        [ 4],
        [ 7],
        [ 0],
        [10],
        [ 0],
        [ 1],
        [ 7],
        [11],
        [ 6],
        [ 5]], dtype=int32),
 <keras_preprocessing.text.Tokenizer at 0x7f8250162d10>)

In [92]:
input_tensor

array([[   1,  135,    3, ...,    0,    0,    0],
       [   1,  293,    3, ...,    0,    0,    0],
       [   1,  595,    3, ...,    0,    0,    0],
       ...,
       [   1,   18, 9413, ...,    0,    0,    0],
       [   1,   63, 2490, ...,    0,    0,    0],
       [   1,   23, 2175, ...,    0,    0,    0]], dtype=int32)

In [6]:
input_tensor_val.shape

(6000, 16)

In [7]:
target_tensor_train.shape

(24000, 11)

In [8]:
target_tensor_val.shape

(6000, 11)

In [9]:
input_tensor_train[0]

array([  1,  81, 118,   3,   2,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0], dtype=int32)

In [10]:
len(inp_lang.word_index.keys())


9413

In [93]:
# Create the training and validation sets with an 80-20 split.
# random_state pins the shuffle so re-running this cell reproduces the same
# split instead of silently reshuffling the data.
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(
    input_tensor, target_tensor, test_size=0.2, random_state=42)

# Show the lengths
print(len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val))

24000 24000 6000 6000


In [95]:
input_tensor_train.shape

(24000, 16)

In [99]:
input_tensor_train[0]

array([   1,    4,   17,   16, 2258,   27,    2,    0,    0,    0,    0,
          0,    0,    0,    0,    0], dtype=int32)

In [97]:
target_tensor_val.shape

(6000, 11)

In [90]:
en

('<start> go . <end>',
 '<start> go . <end>',
 '<start> go . <end>',
 '<start> go . <end>',
 '<start> hi . <end>',
 '<start> run ! <end>',
 '<start> run . <end>',
 '<start> who ? <end>',
 '<start> fire ! <end>',
 '<start> fire ! <end>',
 '<start> fire ! <end>',
 '<start> help ! <end>',
 '<start> help ! <end>',
 '<start> help ! <end>',
 '<start> jump ! <end>',
 '<start> jump . <end>',
 '<start> stop ! <end>',
 '<start> stop ! <end>',
 '<start> stop ! <end>',
 '<start> wait ! <end>',
 '<start> wait . <end>',
 '<start> go on . <end>',
 '<start> go on . <end>',
 '<start> hello ! <end>',
 '<start> i ran . <end>',
 '<start> i ran . <end>',
 '<start> i try . <end>',
 '<start> i won ! <end>',
 '<start> oh no ! <end>',
 '<start> relax . <end>',
 '<start> smile . <end>',
 '<start> attack ! <end>',
 '<start> attack ! <end>',
 '<start> get up . <end>',
 '<start> go now . <end>',
 '<start> got it ! <end>',
 '<start> got it ? <end>',
 '<start> got it ? <end>',
 '<start> he ran . <end>',
 '<start> ho

In [16]:
def convert(lang, tensor):
  """Print an "<id> ----> <word>" line for every non-zero entry of `tensor`.

  Args:
    lang: object whose `index_word` mapping translates an id to its word.
    tensor: iterable of ids; entries equal to 0 are skipped.
  """
  for token_id in tensor:
    if token_id == 0:
      continue
    print("%d ----> %s" % (token_id, lang.index_word[token_id]))

In [20]:
input_tensor_train[0]

array([   1,   12,   40, 1068,    3,    2,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0], dtype=int32)

In [87]:
sample_sentence = u'todo sobre mi madre .'

In [81]:
preprocess_sentence(sample_sentence)

'<start> todo sobre mi madre . <end>'

In [88]:
word_pairs = [preprocess_sentence(sample_sentence)]



In [89]:
create_dataset(zip(*word_pairs), None)

TypeError: expected str, bytes or os.PathLike object, not zip

In [84]:
tokenize(zip(*word_pairs))

AttributeError: 'tuple' object has no attribute 'lower'

In [76]:
sample_code = tokenize(tuple('<start> todo sobre mi madre . <end>'))[0]

In [77]:
sample_code = np.squeeze(sample_code)

In [78]:
sample_code

array([ 6,  7,  1,  8,  2,  1,  9,  0,  1,  3,  4,  3,  0,  7,  3, 11,  2,
        5,  0, 10, 12,  0, 10,  8,  4,  2,  5,  0, 13,  0,  6,  5, 14,  4,
        9], dtype=int32)

In [70]:
input_tensor_train[0]

array([   1,   12,   40, 1068,    3,    2,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0], dtype=int32)

In [65]:
convert(inp_lang, sample_code)



6 ----> ¿
1 ----> <start>
2 ----> <end>
1 ----> <start>
7 ----> es
1 ----> <start>
8 ----> no
3 ----> .
4 ----> tom
5 ----> ?
9 ----> el
5 ----> ?
10 ----> a
2 ----> <end>
3 ----> .
4 ----> tom
11 ----> que


In [33]:
print ("Input Language; index to word mapping")
convert(inp_lang, input_tensor_train[0])
print ()
print ("Target Language; index to word mapping")
convert(targ_lang, target_tensor_train[0])

Input Language; index to word mapping
1 ----> <start>
12 ----> me
40 ----> gusta
1068 ----> esquiar
3 ----> .
2 ----> <end>

Target Language; index to word mapping
1 ----> <start>
4 ----> i
35 ----> like
975 ----> skiing
3 ----> .
2 ----> <end>


In [18]:
print ("Input Language; index to word mapping")
convert(inp_lang, sample_sentence)

Input Language; index to word mapping


KeyError: 't'

In [13]:
print ("Input Language; index to word mapping")
convert(inp_lang, input_tensor_train[1])
print ()
print ("Target Language; index to word mapping")
convert(targ_lang, target_tensor_train[0])

Input Language; index to word mapping


AttributeError: 'str' object has no attribute 'index_word'

In [12]:
BUFFER_SIZE

24000

In [123]:
sentence = preprocess_sentence('Mujeres al  de un ataque de nervios')
inputs = [inp_lang.word_index[i] for i in sentence.split(' ')]



In [120]:
inputs = [inp_lang.word_index[i] for i in sentence.split(' ')]


In [124]:
inputs

[1, 581, 70, 14, 15, 3415, 14, 8345, 2]

In [13]:
inp_lang.word_index

{'<start>': 1,
 '<end>': 2,
 '.': 3,
 'tom': 4,
 '?': 5,
 '¿': 6,
 'es': 7,
 'no': 8,
 'el': 9,
 'a': 10,
 'que': 11,
 'me': 12,
 'la': 13,
 'de': 14,
 'un': 15,
 'esta': 16,
 'se': 17,
 'lo': 18,
 'mi': 19,
 'en': 20,
 'una': 21,
 'por': 22,
 'te': 23,
 'estoy': 24,
 'ella': 25,
 'yo': 26,
 '!': 27,
 'eso': 28,
 'le': 29,
 'esto': 30,
 'tu': 31,
 ',': 32,
 'los': 33,
 'aqui': 34,
 'soy': 35,
 'muy': 36,
 'tengo': 37,
 'puedo': 38,
 'las': 39,
 'gusta': 40,
 'mary': 41,
 'tiene': 42,
 'son': 43,
 'con': 44,
 'como': 45,
 'quien': 46,
 'estaba': 47,
 'su': 48,
 'este': 49,
 'favor': 50,
 'estas': 51,
 'eres': 52,
 'quiero': 53,
 'ellos': 54,
 'fue': 55,
 'bien': 56,
 'casa': 57,
 'ahora': 58,
 'tomas': 59,
 'donde': 60,
 'mas': 61,
 'estan': 62,
 'nos': 63,
 'he': 64,
 'solo': 65,
 'puede': 66,
 'ha': 67,
 'era': 68,
 'todos': 69,
 'al': 70,
 'para': 71,
 'ir': 72,
 'tan': 73,
 'todo': 74,
 'estamos': 75,
 'necesito': 76,
 'ya': 77,
 'nadie': 78,
 'puedes': 79,
 'trabajo': 80,
 'voy': 8

In [11]:
# Training hyperparameters.
BATCH_SIZE = 64
BUFFER_SIZE = len(input_tensor_train)  # shuffle buffer spans the whole training set
steps_per_epoch = BUFFER_SIZE // BATCH_SIZE
embedding_dim, units = 256, 1024

# Word indices start at 1 and 0 is the padding value, so each vocabulary
# size is the number of known words plus one.
vocab_inp_size = 1 + len(inp_lang.word_index)
vocab_tar_size = 1 + len(targ_lang.word_index)

# Shuffle the (input, target) pairs, then cut them into full batches only.
dataset = (
    tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train))
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

In [16]:
dataset

<BatchDataset shapes: ((64, 16), (64, 11)), types: (tf.int32, tf.int32)>

In [29]:
input_tensor_train.shape

(24000, 16)

In [31]:
target_tensor_train.shape

(24000, 11)

In [24]:
BUFFER_SIZE

24000

In [23]:
steps_per_epoch

375

In [17]:
class Encoder(tf.keras.Model):
  """Encoder: an embedding layer followed by a single GRU.

  Args:
    vocab_size: number of rows of the embedding table.
    embedding_dim: size of each embedding vector.
    enc_units: number of GRU units (width of the hidden state).
    batch_sz: batch size, used only to build the initial hidden state.
  """
  def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
    super(Encoder, self).__init__()
    self.batch_sz = batch_sz # BATCH_SIZE, 64 in this notebook
    self.enc_units = enc_units # GRU width: `units`, 1024 in this notebook
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim) # (9414, 256) in this notebook
    self.gru = tf.keras.layers.GRU(self.enc_units, # 1024 in this notebook
                                   return_sequences=True, 
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')

  def call(self, x, hidden):
    """Embed the token ids `x` and run the GRU starting from `hidden`.

    Returns (output, state): the GRU output sequence and its final state.
    In this notebook their shapes are (64, 16, 1024) and (64, 1024).
    """
    x = self.embedding(x)
    output, state = self.gru(x, initial_state = hidden)
    return output, state

  def initialize_hidden_state(self):
    """Return an all-zero hidden state of shape (batch_sz, enc_units)."""
    return tf.zeros((self.batch_sz, self.enc_units))

In [18]:
vocab_inp_size

9414

In [25]:
# Build the encoder; the trailing comments give the values used in this notebook.
encoder = Encoder(
    vocab_size=vocab_inp_size,    # 9414
    embedding_dim=embedding_dim,  # 256
    enc_units=units,              # 1024
    batch_sz=BATCH_SIZE,          # 64
)

In [15]:
example_input_batch, example_target_batch = next(iter(dataset))
example_input_batch.shape, example_target_batch.shape

(TensorShape([64, 16]), TensorShape([64, 11]))

In [32]:
example_input_batch.shape

TensorShape([64, 16])

In [33]:
example_target_batch.shape

TensorShape([64, 11])

In [26]:
sample_hidden = encoder.initialize_hidden_state()



In [27]:
sample_hidden.shape

TensorShape([64, 1024])

In [None]:
# Sample input: run one example batch through the encoder and print the
# shapes of the returned output sequence and hidden state.
sample_output, sample_hidden = encoder(example_input_batch, sample_hidden)
print ('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print ('Encoder Hidden state shape: (batch size, units) {}'.format(sample_hidden.shape))

In [28]:
sample_output.shape, sample_hidden.shape

(TensorShape([64, 16, 1024]), TensorShape([64, 1024]))

In [20]:
class BahdanauAttention(tf.keras.layers.Layer):
  """Additive attention: score = V(tanh(W1(values) + W2(query))).

  Args:
    units: output width of the W1 and W2 dense projections.
  """
  def __init__(self, units):
    super(BahdanauAttention, self).__init__()
    self.W1 = tf.keras.layers.Dense(units)
    self.W2 = tf.keras.layers.Dense(units)
    self.V = tf.keras.layers.Dense(1)

  def call(self, query, values):
    """Compute the context vector and attention weights.

    Args:
      query: hidden state to attend with, (batch_size, hidden size).
      values: sequence to attend over, (batch_size, max_length, hidden size).

    Returns:
      (context_vector, attention_weights): the weighted sum of `values`
      over axis 1, and the softmax weights over axis 1.
    """
    # hidden shape == (batch_size, hidden size)
    # hidden_with_time_axis shape == (batch_size, 1, hidden size)
    # the extra axis lets the addition below broadcast along the time axis
    # when computing the score
    hidden_with_time_axis = tf.expand_dims(query, 1)

    # score shape == (batch_size, max_length, 1)
    # the last axis is 1 because the score is produced by self.V
    # the shape of the tensor before applying self.V is (batch_size, max_length, units)
    score = self.V(tf.nn.tanh(
        self.W1(values) + self.W2(hidden_with_time_axis)))

    # attention_weights shape == (batch_size, max_length, 1)
    attention_weights = tf.nn.softmax(score, axis=1)

    # context_vector shape after the sum == (batch_size, hidden_size)
    context_vector = attention_weights * values
    context_vector = tf.reduce_sum(context_vector, axis=1)

    return context_vector, attention_weights

In [21]:
attention_layer = BahdanauAttention(10)
attention_result, attention_weights = attention_layer(sample_hidden, sample_output)

print("Attention result shape: (batch size, units) {}".format(attention_result.shape))
print("Attention weights shape: (batch_size, sequence_length, 1) {}".format(attention_weights.shape))

Attention result shape: (batch size, units) (64, 1024)
Attention weights shape: (batch_size, sequence_length, 1) (64, 16, 1)


In [39]:
attention_weights[0].numpy().sum()

1.0

In [37]:
tf.math.reduce_sum(attention_weights[0])

<tf.Tensor: shape=(), dtype=float32, numpy=1.0>

In [20]:
class Decoder(tf.keras.Model):
  """Decoder: attention over the encoder output, then embedding, GRU and a
  dense projection to vocabulary logits.

  Args:
    vocab_size: rows of the embedding table and width of the output layer.
    embedding_dim: size of each embedding vector.
    dec_units: number of GRU units; also the width of the attention layer.
    batch_sz: batch size (stored but not otherwise used in this class).
  """
  def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
    super(Decoder, self).__init__()
    self.batch_sz = batch_sz
    self.dec_units = dec_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(self.dec_units,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')
    self.fc = tf.keras.layers.Dense(vocab_size)

    # used for attention
    self.attention = BahdanauAttention(self.dec_units)

  def call(self, x, hidden, enc_output):
    """Run one decoding step.

    Args:
      x: decoder input token ids for this step.
      hidden: decoder hidden state, used as the attention query.
      enc_output: encoder output sequence, attended over.

    Returns:
      (x, state, attention_weights): vocabulary logits, the GRU state after
      this step, and the attention weights.
    """
    # enc_output shape == (batch_size, max_length, hidden_size)
    context_vector, attention_weights = self.attention(hidden, # The hidden vector of the decoder.
                                                               # This is the query for computing attention.
                                                       enc_output) # The encoder outputs serve as the keys.
    
    # The attention mechanism scores the query against the keys, and the
    # scores are normalized with a softmax function.

    # x shape after passing through the embedding == (batch_size, 1, embedding_dim)
    x = self.embedding(x)

    # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
    x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

    # pass the concatenated vector to the GRU
    # NOTE(review): the GRU is called without initial_state, so `hidden`
    # only feeds the attention above — confirm this is intentional.
    output, state = self.gru(x)

    # output shape == (batch_size * 1, hidden_size)
    output = tf.reshape(output, (-1, output.shape[2]))

    # output shape == (batch_size, vocab)
    x = self.fc(output)

    return x, state, attention_weights

In [21]:
decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)

sample_decoder_output, _, _ = decoder(tf.random.uniform((64, 1)),
                                      sample_hidden, sample_output)

print ('Decoder output shape: (batch_size, vocab size) {}'.format(sample_decoder_output.shape))

Decoder output shape: (batch_size, vocab size) (64, 4935)


In [42]:
vocab_tar_size

4935

In [43]:
vocab_inp_size

9414

In [22]:
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
  """Cross-entropy between `real` and `pred`, ignoring padded positions.

  The per-element loss from the module-level `loss_object` is multiplied by
  a 0/1 mask that is 0 wherever `real` equals 0, and the mean of the masked
  losses is returned. Masked positions still count in the mean's
  denominator; they simply contribute 0.
  """
  per_element = loss_object(real, pred)

  # 1.0 where `real` holds an actual token, 0.0 where it holds padding (0).
  is_token = tf.math.logical_not(tf.math.equal(real, 0))
  per_element = per_element * tf.cast(is_token, dtype=per_element.dtype)

  return tf.reduce_mean(per_element)

In [23]:
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

In [46]:
for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
    print(inp.shape)
    print(targ.shape)
    print()



(64, 16)
(64, 11)

(64, 16)
(64, 11)

(64, 16)
(64, 11)

(64, 16)
(64, 11)

(64, 16)
(64, 11)

(64, 16)
(64, 11)

(64, 16)
(64, 11)

(64, 16)
(64, 11)

(64, 16)
(64, 11)

(64, 16)
(64, 11)

(64, 16)
(64, 11)

(64, 16)
(64, 11)

(64, 16)
(64, 11)

(64, 16)
(64, 11)

(64, 16)
(64, 11)

(64, 16)
(64, 11)

(64, 16)
(64, 11)

(64, 16)
(64, 11)

(64, 16)
(64, 11)

(64, 16)
(64, 11)

(64, 16)
(64, 11)

(64, 16)
(64, 11)

(64, 16)
(64, 11)

(64, 16)
(64, 11)

(64, 16)
(64, 11)

(64, 16)
(64, 11)

(64, 16)
(64, 11)

(64, 16)
(64, 11)

(64, 16)
(64, 11)

(64, 16)
(64, 11)

(64, 16)
(64, 11)

(64, 16)
(64, 11)

(64, 16)
(64, 11)

(64, 16)
(64, 11)

(64, 16)
(64, 11)

(64, 16)
(64, 11)

(64, 16)
(64, 11)

(64, 16)
(64, 11)

(64, 16)
(64, 11)

(64, 16)
(64, 11)

(64, 16)
(64, 11)

(64, 16)
(64, 11)

(64, 16)
(64, 11)

(64, 16)
(64, 11)

(64, 16)
(64, 11)

(64, 16)
(64, 11)

(64, 16)
(64, 11)

(64, 16)
(64, 11)

(64, 16)
(64, 11)

(64, 16)
(64, 11)

(64, 16)
(64, 11)

(64, 16)
(64, 11)

(64, 16)
(64

In [22]:
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
  """Cross-entropy between `real` and `pred`, ignoring padded positions.

  The per-element loss from the module-level `loss_object` is multiplied by
  a 0/1 mask that is 0 wherever `real` equals 0, and the mean of the masked
  losses is returned. Masked positions still count in the mean's
  denominator; they simply contribute 0.
  """
  per_element = loss_object(real, pred)

  # 1.0 where `real` holds an actual token, 0.0 where it holds padding (0).
  is_token = tf.math.logical_not(tf.math.equal(real, 0))
  per_element = per_element * tf.cast(is_token, dtype=per_element.dtype)

  return tf.reduce_mean(per_element)


In [None]:
@tf.function
def train_step(inp, targ, enc_hidden):
  """Run one teacher-forced optimization step on a single batch.

  Args:
    inp: batch of input token ids, passed to the encoder.
    targ: batch of target token ids; column t is both the label for step t
      and the decoder input for the following step.
    enc_hidden: initial hidden state for the encoder.

  Returns:
    batch_loss: the summed per-step loss divided by targ.shape[1].

  Uses the module-level encoder, decoder, optimizer, targ_lang and
  BATCH_SIZE, and updates the encoder and decoder variables in place.
  """
  loss = 0

  with tf.GradientTape() as tape:
    enc_output, enc_hidden = encoder(inp, enc_hidden)

    # the decoder starts from the encoder's final hidden state
    dec_hidden = enc_hidden

    # first decoder input: one '<start>' id per example, shape (BATCH_SIZE, 1)
    dec_input = tf.expand_dims([targ_lang.word_index['<start>']] * BATCH_SIZE, 1)

    # Teacher forcing - feeding the target as the next input
    # (starts at t=1, so the loop runs targ.shape[1] - 1 times)
    for t in range(1, targ.shape[1]):
      # passing enc_output to the decoder
      predictions, dec_hidden, _ = decoder(dec_input, 
                                           dec_hidden, 
                                           enc_output) # You need encoder outputs to calculate attentions. 

      loss += loss_function(targ[:, t], predictions)

      # using teacher forcing
      dec_input = tf.expand_dims(targ[:, t], 1)

  # reported loss: the sum over steps divided by the full target length
  batch_loss = (loss / int(targ.shape[1]))

  variables = encoder.trainable_variables + decoder.trainable_variables

  # gradients are taken of the summed loss, not of batch_loss
  gradients = tape.gradient(loss, variables)

  optimizer.apply_gradients(zip(gradients, variables))

  return batch_loss
