In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals

import tensorflow as tf

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split

import MeCab
import unicodedata
import re
import numpy as np
from numpy import array
from numpy import asarray
from numpy import zeros
import os
import io
import time

# Turn on eager execution (TF 1.x style call). Later cells use `next(iter(dataset))`
# and `batch_loss.numpy()`, which presumably depend on eager mode being active.
tf.enable_eager_execution()

In [2]:
# load txt file
def load_def(path):
    """Read a UTF-8 text file and return its content as a list of lines.

    Args:
        path: Location of the text file to read.

    Returns:
        list of str: the file content with leading/trailing whitespace
        stripped, split on '\n'.
    """
    # The context manager closes the handle even when read() raises.
    with io.open(path, encoding='UTF-8') as text_file:
        return text_file.read().strip().split('\n')

In [3]:
# create each languages list
def create_lang_list(num_example, path="dataset/raw.txt"):
    """Build per-language sequences of preprocessed sentences.

    Every line of the corpus is split on tabs and each field is run through
    ``preprocess_sentence``.

    Args:
        num_example: Maximum number of corpus lines to use (taken from the top).
        path: Tab-separated corpus file; defaults to the original location.

    Returns:
        A ``zip`` iterator yielding one tuple per tab-separated column. The
        notebook unpacks it as ``en, ja``, i.e. it relies on each line having
        exactly two columns.
    """
    lines = load_def(path)

    word_pairs = [
        [preprocess_sentence(field) for field in line.split('\t')]
        for line in lines[:num_example]
    ]

    # Transpose [[en, ja], [en, ja], ...] into (en, en, ...), (ja, ja, ...)
    return zip(*word_pairs)

# translate English to Japanese

In [4]:
# drop combining marks that survive NFC normalization
def unicode_to_ascii(s):
    """Return ``s`` NFC-normalized with all 'Mn' (non-spacing mark) characters removed.

    Despite the name, non-ASCII characters are kept: only code points whose
    Unicode category is 'Mn' after NFC composition are filtered out.
    """
    composed = unicodedata.normalize('NFC', s)
    kept = [ch for ch in composed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

# preprocess
# Matches any run of Hiragana / Katakana / half-width Katakana / Kanji characters.
# Compiled once here instead of on every call.
_JAPANESE_CHARS = re.compile('[\u3041-\u309F\u30A1-\u30FF\uFF66-\uFF9F\u4E00-\u9FD0\u309B\u3099\uFF9E\u309C\u309A\uFF9F]+')

# MeCab tagger shared by all calls; created lazily by _wakati().
_WAKATI_TAGGER = None


def _wakati(text):
    """Run MeCab in "-Owakati" mode on ``text`` using one shared tagger."""
    global _WAKATI_TAGGER
    if _WAKATI_TAGGER is None:
        # Building a tagger per sentence is wasteful; keep a single instance.
        _WAKATI_TAGGER = MeCab.Tagger("-Owakati")
    return _WAKATI_TAGGER.parse(text)


def preprocess_sentence(w):
    """Normalize one sentence and wrap it in ``<start>`` / ``<end>`` markers.

    Steps:
      1. If the text contains Japanese characters, pass it through MeCab
         ("-Owakati") first.
      2. Lower-case, strip, and apply ``unicode_to_ascii``.
      3. Put spaces around the punctuation ``? ! ¿ . , 。``.
      4. Replace every run of characters outside the allowed set with a
         single space.
      5. Strip the result and add the start/end tokens.

    Args:
        w: Raw sentence (str).

    Returns:
        str: ``"<start> " + cleaned sentence + " <end>"``. Runs of spaces
        inside the sentence are not collapsed.
    """
    # check japanese lang
    if _JAPANESE_CHARS.search(w):
        # Morphological analysis for japanese lang
        w = _wakati(w)
    w = unicode_to_ascii(w.lower().strip())
    # create a space between word and the punctuation
    w = re.sub(r"([?!¿.,。])", r" \1 ", w)
    # Replace everything outside the allowed set (latin letters, Kana, Kanji,
    # "?", ".", "!", ",", "。", "¿", "-") with a space.
    # NOTE(review): the trailing `/ {1,}/` sits inside the character class, so it
    # is read as the literal characters '/', ' ', '{', '1', ',' and '}', which are
    # therefore preserved too. Confirm this is intended before changing it.
    w = re.sub(r"[^a-zA-Z\u3041-\u309F\u30A1-\u30FF\uFF66-\uFF9F\u4E00-\u9FD0\u309B\u3099\uFF9E\u309C\u309A\uFF9F?.!,。¿\-/ {1,}/]+",  " ", w)
    # strip() already trims both ends; the former rstrip().strip() was redundant
    w = w.strip()

    # add a start and end token to the sentence
    # so the model knows where a sentence starts and ends
    return "<start> " + w + " <end>"

In [5]:
# Smoke-test the preprocessing on one English and one Japanese sentence.
en_sentence = u"May I borrow this book?"
ja_sentence = u"プールに行きたい。でも今日は雨."
print(preprocess_sentence(en_sentence),
      preprocess_sentence(ja_sentence),
      sep="\n")

<start> may i borrow this book ? <end>
<start> プール に 行き たい  。  でも 今日 は 雨  . <end>


In [6]:
# Load the first 1000 sentence pairs, then show how many English sentences
# were read and the first ten Japanese ones.
en, ja = create_lang_list(1000)
print("{}\n{}".format(len(en), ja[:10]))

1000
('<start> あなた は 戻っ た の ね ハロルド  ? <end>', '<start> 俺 の 相手 は シャーク だ  。 <end>', '<start> 引き換え だ ある 事 と ある 物 の <end>', '<start> もう いい よ ごちそうさま ううん <end>', '<start> もう 会社 に は 来 ない で くれ 電話 も する な <end>', '<start> きれい だ  。 <end>', '<start> 連れ て 行け 殺し そう だ わかっ た か  ? <end>', '<start> 殺し た の か  ! <end>', '<start> わぁ   !  いつも すみません  。  いい の よ   。 <end>', '<start> カンパニー の 元 社員 が <end>')


In [7]:
def max_length(tensor):
    """Return the length of the longest element of ``tensor``.

    Args:
        tensor: Iterable of sized items (e.g. rows of an id matrix).

    Raises:
        ValueError: If ``tensor`` is empty (propagated from ``max``).
    """
    longest = max(map(len, tensor))
    return longest

In [8]:
def tokenize(lang):
    """Fit a Keras tokenizer on ``lang`` and encode it as a padded id matrix.

    Args:
        lang: Iterable of sentences (strings).

    Returns:
        tuple: ``(tensor, lang_tokenizer)`` where ``tensor`` is the result of
        ``pad_sequences(..., padding='post')`` applied to the encoded
        sentences and ``lang_tokenizer`` is the fitted ``Tokenizer``.
    """
    # Only the space character is listed in `filters`.
    lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters=' ')

    # Build the vocabulary, e.g. "this place is good" -> {this: 1, place: 2, ...}
    lang_tokenizer.fit_on_texts(lang)

    # Sentences -> lists of integer ids (lengths differ per sentence)
    sequences = lang_tokenizer.texts_to_sequences(lang)

    # Pad at the end of each sequence so all rows share one length
    padded = tf.keras.preprocessing.sequence.pad_sequences(sequences, padding='post')

    return padded, lang_tokenizer

In [9]:
# example: encode three short sentences; the cell displays the returned
# (padded id array, fitted tokenizer) pair
tokenize(['this place is good', "hello world", "today is so cold"])

(array([[2, 3, 1, 4],
        [5, 6, 0, 0],
        [7, 1, 8, 9]], dtype=int32),
 <keras_preprocessing.text.Tokenizer at 0x1a3fccc690>)

In [10]:
# create a clean input, output pairs
def load_dataset(num_example):
    """Load the corpus and turn both languages into padded id tensors.

    Args:
        num_example: Maximum number of corpus lines to use.

    Returns:
        tuple: ``(input_tensor, target_tensor, input_lang_tokenize,
        target_lang_tokenize)`` - the two padded id matrices followed by the
        tokenizers fitted on the input and target sentences.
    """
    input_lang, target_lang = create_lang_list(num_example)

    input_tensor, input_lang_tokenize = tokenize(input_lang)
    # Diagnostic output: id assigned to "hello" in the input vocabulary.
    # .get() prints None instead of raising KeyError when the selected slice
    # of the corpus does not contain that word.
    print(input_lang_tokenize.word_index.get("hello"))

    target_tensor, target_lang_tokenize = tokenize(target_lang)

    return input_tensor, target_tensor, input_lang_tokenize, target_lang_tokenize

In [11]:
# Cap the number of sentence pairs used for this experiment
num_example = 30000

# Padded id matrices plus the fitted tokenizers for both languages
input_tensor, target_tensor, input_lang, target_lang = load_dataset(num_example)

# Longest row of each tensor (target first, then input)
max_length_target = max_length(target_tensor)
max_length_input = max_length(input_tensor)

397


In [12]:
# create training set and validation set (test_size=0.2, fixed random_state)
(input_tensor_train, input_tensor_val,
 target_tensor_train, target_tensor_val) = train_test_split(
    input_tensor, target_tensor, test_size=0.2, random_state=42)

# show length of each split
print(*(len(part) for part in (input_tensor_train, input_tensor_val,
                               target_tensor_train, target_tensor_val)))

24000 6000 24000 6000


In [13]:
def convert(lang, tensor):
    """Print ``<id>-----><word>`` for every non-zero id in ``tensor``.

    Args:
        lang: Object exposing an ``index_word`` mapping from id to word.
        tensor: Iterable of integer ids; zeros are skipped.
    """
    for token_id in tensor:
        if token_id == 0:
            # zero entries carry no word, skip them
            continue
        # Index number assigned to each word
        print("%d----->%s" % (token_id, lang.index_word[token_id]))

In [14]:
# Show the id -> word mapping for the first training pair; convert() skips
# zero ids.
print("input lang: index to word mapping")
convert(input_lang, input_tensor_train[0])
print("output lang: index to word mapping")
convert(target_lang, target_tensor_train[0])

input lang: index to word mapping
1-----><start>
28----->he
75----->did
12----->it
38----->with
84----->his
243----->own
63----->will
16----->!
2-----><end>
output lang: index to word mapping
1-----><start>
7283----->茂
9----->が
865----->勝手
5----->に
905----->落ち
6----->た
3----->の
2-----><end>


In [15]:
# create a dataset
BUFFER_SIZE = len(input_tensor_train)
BATCH_SIZE = 64
# Number of *batches* per epoch (not samples). Floor division matches
# drop_remainder=True below, which discards the final partial batch.
steps_per_epoch = len(input_tensor_train) // BATCH_SIZE
# 100 matches the dimensionality of the glove.6B.100d vectors loaded later
embedding_dim = 100
units = 1024
# +1 because word ids start at 1; id 0 is the padding value
vocab_inp_size = len(input_lang.word_index) + 1
print('Total unique words in the input: %s' % len(input_lang.word_index))
vocab_tar_size = len(target_lang.word_index) + 1
# Shuffle over the whole training set, then group into fixed-size batches
dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
print(dataset)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)
print(dataset)

Total unique words in the input: 16351
<DatasetV1Adapter shapes: ((65,), (51,)), types: (tf.int32, tf.int32)>
<DatasetV1Adapter shapes: ((64, 65), (64, 51)), types: (tf.int32, tf.int32)>


In [16]:
# Pull a single batch from the dataset and display the shapes of its two parts
example_input_batch, example_target_batch =  next(iter(dataset))
example_input_batch.shape, example_target_batch.shape

(TensorShape([Dimension(64), Dimension(65)]),
 TensorShape([Dimension(64), Dimension(51)]))

# encoder and decoder model

In [17]:
def load_glove(glove_path="dataset/glove.6B/glove.6B.100d.txt"):
    """Load GloVe vectors and build the embedding matrix for the input vocabulary.

    Reads the module-level ``input_lang``, ``vocab_inp_size`` and
    ``embedding_dim``.

    Args:
        glove_path: Text file with one ``word v1 v2 ...`` record per line;
            defaults to the original location.

    Returns:
        tuple: ``(embeddings_dictionary, embedding_matrix)`` where
        ``embeddings_dictionary`` maps each word in the file to a float32
        vector and ``embedding_matrix`` has shape
        ``(vocab_inp_size, embedding_dim)``. Row ``i`` holds the vector of the
        word with id ``i``; rows of words missing from the file stay zero.
    """
    embeddings_dictionary = {}
    # `with` guarantees the file is closed even if parsing fails midway
    with open(glove_path, encoding="utf-8") as glove_file:
        for line in glove_file:
            records = line.split()
            if not records:
                # tolerate blank lines instead of failing on records[0]
                continue
            # correspond word and vector
            embeddings_dictionary[records[0]] = asarray(records[1:], dtype="float32")

    # One row per token id so the matrix lines up with an embedding table of
    # vocab_inp_size rows. Capping the row count by the number of examples
    # would make word ids overflow the matrix on small corpora.
    embedding_matrix = zeros((vocab_inp_size, embedding_dim))
    for word, index in input_lang.word_index.items():
        embedding_vector = embeddings_dictionary.get(word)
        if embedding_vector is not None:
            embedding_matrix[index] = embedding_vector
    return embeddings_dictionary, embedding_matrix

In [18]:
# Read the GloVe file and build the embedding matrix for the input vocabulary
embeddings_dictionary, embedding_matrix = load_glove()

In [19]:
# check vector "hello" → the GloVe vector and its embedding-matrix row should be almost the same
print(embeddings_dictionary["hello"])
# Look the id up instead of hard-coding the number printed by an earlier cell,
# so the check stays valid when the corpus or num_example changes.
print(embedding_matrix[input_lang.word_index["hello"]])

[ 0.26688    0.39632    0.6169    -0.77451   -0.1039     0.26697
  0.2788     0.30992    0.0054685 -0.085256   0.73602   -0.098432
  0.5479    -0.030305   0.33479    0.14094   -0.0070003  0.32569
  0.22902    0.46557   -0.19531    0.37491   -0.7139    -0.51775
  0.77039    1.0881    -0.66011   -0.16234    0.9119     0.21046
  0.047494   1.0019     1.1133     0.70094   -0.08696    0.47571
  0.1636    -0.44469    0.4469    -0.93817    0.013101   0.085964
 -0.67456    0.49662   -0.037827  -0.11038   -0.28612    0.074606
 -0.31527   -0.093774  -0.57069    0.66865    0.45307   -0.34154
 -0.7166    -0.75273    0.075212   0.57903   -0.1191    -0.11379
 -0.10026    0.71341   -1.1574    -0.74026    0.40452    0.18023
  0.21449    0.37638    0.11239   -0.53639   -0.025092   0.31886
 -0.25013   -0.63283   -0.011843   1.377      0.86013    0.20476
 -0.36815   -0.68874    0.53512   -0.46556    0.27389    0.4118
 -0.854     -0.046288   0.11304   -0.27326    0.15636   -0.20334
  0.53586    0.59784   

In [20]:
class Encoder(tf.keras.Model):
    """GRU encoder whose embedding table starts from a supplied matrix."""

    def __init__(self, vocab_size, embedding_dim, enc_units, batch_size, embedding_matrix):
        """Create the embedding and GRU layers.

        Args:
            vocab_size: Number of rows of the embedding table.
            embedding_dim: Size of each embedding vector.
            enc_units: Number of GRU units.
            batch_size: Batch size used by ``initialize_hidden_state``.
            embedding_matrix: Values used as the constant initializer of the
                embedding weights; presumably shaped
                ``(vocab_size, embedding_dim)`` - confirm against the caller.
        """
        super(Encoder, self).__init__()
        self.batch_size = batch_size
        self.enc_units = enc_units
        # Size the table from the constructor argument rather than from the
        # module-level `vocab_inp_size`, so the class honours what it is given.
        self.embedding = tf.keras.layers.Embedding(
            vocab_size, embedding_dim,
            embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix))
        self.gru = tf.keras.layers.GRU(self.enc_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')

    def call(self, x, hidden):
        """Embed ``x`` and run the GRU starting from ``hidden``.

        Returns:
            tuple: ``(output, state)`` - the full output sequence and the
            final GRU state.
        """
        x = self.embedding(x)
        output, state = self.gru(x, initial_state=hidden)
        return output, state

    def initialize_hidden_state(self):
        """Return an all-zero state of shape ``(batch_size, enc_units)``."""
        return tf.zeros((self.batch_size, self.enc_units))

In [21]:
# Build the encoder with the GloVe-based embedding matrix
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE, embedding_matrix)

# sample input
# `sample_hidden` starts as the zero state and is then rebound to the state
# returned by the encoder, so the order of the next two lines matters.
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_hidden = encoder(example_input_batch, sample_hidden)

print ('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print ('Encoder Hidden state shape: (batch size, units) {}'.format(sample_hidden.shape))

Encoder output shape: (batch size, sequence length, units) (64, 65, 1024)
Encoder Hidden state shape: (batch size, units) (64, 1024)


In [22]:
class BahdanauAttention(tf.keras.layers.Layer):
    """Additive attention: score = V(tanh(W1(values) + W2(query)))."""

    def __init__(self, nuits):
        """Create the three dense layers used to score encoder outputs.

        Args:
            nuits: Width of the ``W1`` / ``W2`` projections. The misspelled
                parameter name is kept so existing keyword callers keep working.
        """
        super(BahdanauAttention, self).__init__()
        # Use the constructor argument; reading the module-level `units` here
        # would silently ignore whatever the caller passed in.
        self.W1 = tf.keras.layers.Dense(nuits)
        self.W2 = tf.keras.layers.Dense(nuits)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        """Compute the context vector and attention weights.

        Args:
            query: Tensor that gets a new axis inserted at position 1 so it
                can be added to the projected ``values``.
            values: Tensor that is weighted and summed over axis 1.

        Returns:
            tuple: ``(context_vector, attention_weights)``.
        """
        # hidden_with_time_axis shape == (batch_size, 1, hidden size)
        # we are doing this to perform addition to calculate the score
        hidden_with_time_axis = tf.expand_dims(query, 1)

        # we get 1 at the last axis because we are applying score to self.V
        # the shape of the tensor before applying self.V is (batch_size, max_length, units)
        score = self.V(tf.nn.tanh(
            self.W1(values) + self.W2(hidden_with_time_axis)))

        # attention_weights shape == (batch_size, max_length, 1)
        attention_weights = tf.nn.softmax(score, axis=1)

        # context_vector shape after sum == (batch_size, hidden_size)
        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)

        return context_vector, attention_weights

In [23]:
# Smoke-test the attention layer: the encoder's final state is the query and
# the encoder output sequence provides the values.
attention_layer = BahdanauAttention(10)
attention_result , attention_weights = attention_layer(sample_hidden, sample_output)

print("attention result shape: (batch size, units) {}".format(attention_result.shape))
print("attention weight shape:(batch size, sequence_length, 1) {}".format(attention_weights.shape))

attention result shape: (batch size, units) (64, 1024)
attention weight shape:(batch size, sequence_length, 1) (64, 65, 1)


In [24]:
class Decoder(tf.keras.Model):
    """Single-step GRU decoder that attends over the encoder outputs."""

    def __init__(self, vocab_size, embedding_dim, dec_units, batch_size):
        """Create the embedding, GRU, output projection and attention layers.

        Args:
            vocab_size: Size of the target vocabulary (embedding rows and
                width of the output projection).
            embedding_dim: Size of each embedding vector.
            dec_units: Number of GRU units; also passed to the attention layer.
            batch_size: Stored on the instance.
        """
        super(Decoder, self).__init__()
        self.batch_size = batch_size
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(
            self.dec_units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer='glorot_uniform',
        )
        self.fc = tf.keras.layers.Dense(vocab_size)

        self.attention = BahdanauAttention(self.dec_units)

    def call(self, x, hidden, enc_output):
        """Run one decoding step.

        Args:
            x: Current decoder input ids.
            hidden: Query for the attention layer. Note that it is not passed
                to the GRU as an initial state.
            enc_output: Encoder output sequence that is attended over.

        Returns:
            tuple: ``(logits, state, attention_weights)``.
        """
        # enc_output shape == (batch_size, max_length, hidden_size)
        context_vector, attention_weights = self.attention(hidden, enc_output)

        # shape after embedding == (batch_size, 1, embedding_dim)
        embedded = self.embedding(x)

        # shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
        context_step = tf.expand_dims(context_vector, 1)
        gru_input = tf.concat([context_step, embedded], axis=-1)

        # feed the concatenated vector to the GRU
        output, state = self.gru(gru_input)

        # flatten to (batch_size * 1, hidden_size)
        flat_output = tf.reshape(output, (-1, output.shape[2]))

        # project to (batch_size, vocab)
        logits = self.fc(flat_output)

        return logits, state, attention_weights

In [25]:
decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)

# Smoke-test one decoding step with random input; only the first return value is kept
sample_decoder_output, _, _ = decoder(
    tf.random.uniform((64, 1)),
    sample_hidden,
    sample_output,
)
print("Decoder output  shape:(batch_size, vocab size) {}".format(sample_decoder_output.shape))

Decoder output  shape:(batch_size, vocab size) (64, 20070)


# Define the optimizer and the loss function

In [26]:
# Adam with its default settings
optimizer = tf.keras.optimizers.Adam()
# reduction='none' keeps one loss value per element so loss_function can
# apply its own mask before averaging; from_logits=True because the decoder's
# last layer is a Dense layer without an activation.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
    """Cross-entropy with positions whose target id is 0 zeroed out.

    Args:
        real: Target ids; entries equal to 0 contribute zero loss.
        pred: Predictions passed to ``loss_object`` together with ``real``.

    Returns:
        Mean of the masked per-element losses. The mean is taken over all
        elements, including the zeroed ones.
    """
    per_element_loss = loss_object(real, pred)

    # 1 where the target is a real token, 0 where it equals 0
    keep = tf.math.not_equal(real, 0)
    keep = tf.cast(keep, dtype=per_element_loss.dtype)

    masked_loss = per_element_loss * keep
    return tf.reduce_mean(masked_loss)

# Checkpoints

In [27]:
# Keep checkpoints next to the notebook. The former '/training_checkpoints'
# pointed at the filesystem root, which is normally not writable and ties the
# notebook to one machine layout.
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, 'ckpt')
# Track the optimizer and both models so they are saved and restored together
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

# train

In [28]:
@tf.function
def train_step(inp, target, enc_hidden):
    """Run one optimisation step on a batch using teacher forcing.

    Uses the module-level ``encoder``, ``decoder``, ``optimizer``,
    ``target_lang``, ``loss_function`` and ``BATCH_SIZE``.

    Args:
        inp: Batch of input id sequences.
        target: Batch of target id sequences; column 0 is skipped as a
            prediction target.
        enc_hidden: Initial encoder state.

    Returns:
        The summed per-step loss divided by ``target.shape[1]``.
    """
    loss = 0

    # Only the forward pass is recorded on the tape.
    with tf.GradientTape() as tape:
        enc_output, enc_hidden = encoder(inp, enc_hidden)

        dec_hidden = enc_hidden
        dec_input = tf.expand_dims([target_lang.word_index['<start>']] * BATCH_SIZE, 1)

        # teacher forcing - feeding the target as a next input
        for t in range(1, target.shape[1]):
            # passing enc_output to the decoder
            predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)

            loss += loss_function(target[:, t], predictions)

            # using teacher forcing
            dec_input = tf.expand_dims(target[:, t], 1)

    # Gradient computation and the optimizer update happen outside the tape
    # context so these operations are not themselves recorded.
    batch_loss = (loss / int(target.shape[1]))
    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))

    return batch_loss

In [None]:
EPOCHS = 10
for epochs in range(EPOCHS):
    start = time.time()

    enc_hidden = encoder.initialize_hidden_state()
    total_loss = 0
    # Batches actually processed this epoch; used for the epoch average so the
    # figure is correct whatever value steps_per_epoch holds.
    num_batches = 0

    for (batch, (inp, target)) in enumerate(dataset.take(steps_per_epoch)):
        batch_loss = train_step(inp, target, enc_hidden)
        total_loss += batch_loss
        num_batches += 1

        # progress report every 100 batches
        if batch % 100 == 0:
            print("epoch {} batch {} loss  {: .4f}".format(epochs + 1,
                                                           batch,
                                                           batch_loss.numpy()))

    # Everything below runs once per epoch, after the batch loop has finished
    # (it used to be indented into the loop and ran on every batch).

    # save checkpoint every 2 epochs
    if (epochs + 1) % 2 == 0:
        checkpoint.save(file_prefix = checkpoint_prefix)

    # max(..., 1) avoids a division by zero if the dataset yielded no batch
    print("epoch {} loss {: .4f}".format(epochs + 1,
                                         total_loss / max(num_batches, 1)))
    print("time taken for 1 epoch {} sec\n".format(time.time() - start))