In [1]:
import tensorflow as tf
import subprocess
import os
import pandas as pd
import random
import shutil
import numpy as np
import typing

from typing import Any, Tuple
from google.colab import drive #if use colab
from tensorflow.nn import relu, tanh, softmax
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# if use colab
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
#if use colab
git_dir = "/content/IOH-Chat-App"
git_url = "https://github.com/bangkit-team/IOH-chat-app.git"

if not os.path.exists(git_dir):
  subprocess.call(["git", "clone", git_url])

In [4]:
filedir = "/content/IOH-chat-app/Machine Learning/datasets/translation/result/eng-ind.csv" # #if use colab
# filedir = "../..//datasets/translate sentence/result/eng-ind.csv" #if use local env

In [5]:
start_mark = "<START>"
end_mark = "<END>"

In [368]:
class TranslatorDataset:
  def __init__(self, filedir):
    self.filedir = filedir
    self.input_tokenizer = None
    self.target_tokenizer = None
    self._load_data_from_file()

  def _load_data_from_file(self):
    df = pd.read_csv(self.filedir)

    input_lang = df.English.values
    target_lang = df.Indonesia.values

    return input_lang, target_lang

  def _normalize_and_preprocess(self, text, use_mark=False):
    punctuation = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
    
    if use_mark:
      text = text.lower().strip()
      text = text.replace(punctuation, "")
      text = start_mark + " " + text
      text = text + " " + end_mark
    else:
      text = text.lower().strip()
      text = text.replace(punctuation, "")

    return text

  def _tokenize(self, sentences, num_words, maxlen):
    tokenizer = layers.TextVectorization(
        max_tokens=num_words, output_sequence_length=maxlen)
    
    tokenizer.adapt(sentences)

    return tokenizer

  def _create_dataset(self):
    input_lang, target_lang = self._load_data_from_file()

    input_lang = np.array(list(map(lambda x: self._normalize_and_preprocess(x, False), input_lang)))
    target_lang = np.array(list(map(lambda y: self._normalize_and_preprocess(y, True), target_lang)))
    
    return input_lang, target_lang

  def _load_dataset(self, num_words):
    input_lang, target_lang = self._create_dataset()

    self.maxlen = max([len(i)for i in input_lang]) // 5
    self.buffer_size = len(input_lang)

    input_tokenizer = self._tokenize(
        input_lang, num_words, self.maxlen)
    
    target_tokenizer = self._tokenize(
        target_lang, num_words, self.maxlen,)

    return (input_lang, input_tokenizer), (target_lang, target_tokenizer)
  
  def get(self, num_words, batch_size):
    input, target = self._load_dataset(num_words)

    input_sentence, self.input_tokenizer = input
    target_sentence, self.target_tokenizer = target

    dataset = tf.data.Dataset.from_tensor_slices((input_sentence, target_sentence))
    dataset = dataset.shuffle(self.buffer_size).batch(batch_size, drop_remainder=True)

    return self.input_tokenizer, self.target_tokenizer, dataset

In [369]:
num_words = 6000
batch_size = 128

In [370]:
translator_dataset = TranslatorDataset(filedir)
input_tokenizer, target_tokenizer, dataset = translator_dataset.get(num_words, batch_size)

In [371]:
input_batch, target_batch = next(iter(dataset))

In [372]:
input_batch.shape, target_batch.shape

(TensorShape([128]), TensorShape([128]))

In [374]:
input_vocab_size = len(input_tokenizer.get_vocabulary()) + 1
target_vocab_size = len(target_tokenizer.get_vocabulary()) + 1

input_vocab_size, target_vocab_size

(4097, 4979)

In [405]:
input_example = input_batch[-1].numpy().decode()
input_example

'everybody knows them.'

In [407]:
target_example = target_batch[-1].numpy().decode()
target_example

'<START> semua orang tahu mereka. <END>'

In [408]:
embed_dims = 64
units = 512

In [409]:
class Encoder():
  def __init__(self, input_vocab_size, embedding_dims, units):
    self.units = units
    self.input_vocab_size = input_vocab_size
    self.embedding_dims = embedding_dims

    self.embedding = layers.Embedding(self.input_vocab_size, self.embedding_dims)
    self.gru_layer = layers.GRU(self.units,
                                    return_sequences=True,
                                    return_state=True,
                                    recurrent_initializer='glorot_uniform')

  def call(self, inputs):
    embedding = self.embedding(inputs)
    encoder = self.gru_layer(embedding)

    return encoder

In [439]:
class BahdanauAttention(layers.Layer):
  def __init__(self, units):
    super(BahdanauAttention, self).__init__()
    self.w1 = layers.Dense(units, use_bias=True) 
    self.w2 = layers.Dense(units, use_bias=True) 
    self.fd = layers.Dense(1)

  def call(self, query, values):
    query_with_time_axis = tf.expand_dims(query, 1)
    
    score = self.fd(tf.nn.tanh(
        self.w1(query_with_time_axis) + self.w2(values)))

    attention_weights = softmax(score, axis=1)

    context_vector = attention_weights * values
    context_vector = tf.reduce_sum(context_vector, axis=1)

    return context_vector, attention_weights

In [440]:
class Decoder():
  def __init__(self, output_vocab_size, embedding_dims, units):
    self.units = units
    self.output_vocab_size = output_vocab_size
    self.embedding_dims = embedding_dims

    self.embedding = layers.Embedding(self.output_vocab_size, self.embedding_dims)
    self.gru_layer = layers.GRU(self.units,
                                    return_sequences=True,
                                    return_state=True,
                                    recurrent_initializer='glorot_uniform')
    self.attention = BahdanauAttention(self.units)
    self.dense1 = layers.Dense(self.units, activation=tanh)
    self.dense2 = layers.Dense(output_vocab_size)

  def call(self, inputs, en_outpus, state):
    embedding = self.embedding(inputs)
    dec_outputs, dec_state = self.gru_layer(embedding, initial_state=state)
    context_vector, attention_weights = self.attention.call(
        query=dec_outputs, values=en_outpus)
    
    context_and_rnn_output = tf.concat([context_vector, dec_outputs], axis=-1)

    attention_vector = self.dense1(context_and_rnn_output)
    outputs = self.dense2(attention_vector)

    return outputs

In [443]:
inp_exp_sequences = input_tokenizer(input_batch)

encoder = Encoder(input_vocab_size, embed_dims, units)
en_outputs, en_states = encoder.call(inp_exp_sequences)

en_outputs.shape, en_states.shape

(TensorShape([128, 32, 512]), TensorShape([128, 512]))

In [444]:
targ_exp_sequences = target_tokenizer(target_batch)

decoder = Decoder(target_vocab_size, embed_dims, units)
dec_outputs= decoder.call(targ_exp_sequences, en_outputs, en_states)

dec_outputs.shape

TensorShape([128, 32, 4979])

In [445]:
lr = 0.001
epochs = 10

optimizer = tf.keras.optimizers.RMSprop(learning_rate=lr)
loss = tf.keras.losses.SparseCategoricalCrossentropy(
       from_logits=True, reduction='none')

In [446]:
class TranslatorModel():
  def __init__(self, input_vocab_size, 
               target_vocab_size, 
               embed_dims, 
               units):
    self.input_vocab_size = input_vocab_size
    self.target_vocab_size = target_vocab_size
    self.embed_dims = embed_dims
    self.units = units

    self.encoder = Encoder(self.input_vocab_size, self.embed_dims, self.units)
    self.decoder = Decoder(self.target_vocab_size, self.embed_dims, self.units)
  
  def build_model(self):
    en_inputs = layers.Input(shape=(None,))

    en_output, en_state = self.encoder.call(en_inputs)

    dec_outputs = self.decoder.call(en_inputs, en_output, en_state)

    model = Model(inputs=[en_inputs], 
                  outputs=[dec_outputs])
    return model

In [447]:
model = TranslatorModel(
    input_maxlen,
    target_maxlen,
    embed_dims,
    units,
).build_model()

model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=["accuracy"]
)

In [365]:
checkpoint_path = "/content/drive/MyDrive/translate/checkpoint/cp-{epoch:04d}.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)

callback_early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='loss',
    patience=3, 
    verbose=1)

callback_checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path, 
    monitor='loss', 
    verbose=1, 
    save_weights_only=True, 
    save_best_only=True)

callbacks = [callback_early_stopping,
             callback_checkpoint]

model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=["accuracy"]
)

model.summary()

Model: "model_5"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_7 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 embedding_116 (Embedding)      (None, None, 64)     2048        ['input_7[0][0]']                
                                                                                                  
 embedding_117 (Embedding)      (None, None, 64)     2048        ['input_7[0][0]']                
                                                                                                  
 gru_116 (GRU)                  [(None, None, 512),  887808      ['embedding_116[0][0]']          
                                 (None, 512)]                                               

In [366]:
model.fit(dataset,
          epochs=epochs,
          callbacks=callbacks,
          verbose=1)

Epoch 1/10


InvalidArgumentError: ignored

In [312]:
# if use colab
saved_model_path = "/content/drive/MyDrive/translate/saved_model/translate.h5"

# if use local env
saved_model_path = "code/translate sentence/saved_model/translate.h5"
saved_model_dir = os.path.dirname(saved_model_path)

if os.path.exists(saved_model_dir):
  shutil.rmtree(saved_model_dir)
model.save(saved_model_path)



AttributeError: ignored

In [48]:
class Translator(tf.Module):
  def __init__(self, encoder, decoder, input_tokenizer, target_tokenizer, maxlen):
    self.encoder = encoder
    self.decoder = decoder
    self.input_tokenizer = input_tokenizer
    self.target_tokenizer = target_tokenizer
    self.maxlen = maxlen

    self.start_token = input_tokenizer.word_index["<START>"]
    self.end_token = target_tokenizer.word_index["<END>"]

  def _normalize_and_preprocess(self, text):
    punctuation = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
    
    text = text.lower().strip()
    text = text.replace(punctuation, "")
    text = start_mark + " " + text
    text = text + " " + end_mark

    return text

  def translate(self, sentence):
    batch_size = tf.shape(sentence)
    print('test')

    input_sequences = self.input_tokenizer.texts_to_sequences([sentence])
    en_output, en_state = self.encoder.call(input_sequences)

    dec_state = en_state
    new_sequences = tf.fill([batch_size, 1], self.start_token)

    result_sequences = list()
    attention = list()
    done = tf.zeros([batch_size, 1], dtype=tf.bool)


    for _ in range(self.maxlen):
      (dec_outputs, dec_state), attention_weights = self.decoder.call(
          new_sequences, en_output, dec_state)

      attention.append(attention_weights)

      new_sequences = self.sample(dec_outputs, 1.0)

      done = done | (new_sequences == self.end_token)
      new_sequences = tf.where(done, tf.constant(0, dtype=tf.int64), new_sequences)

      result_sequences.append(new_sequences)


      if tf.executing_eagerly() and tf.reduce_all(done):
        break
    result_sequences = tf.concat(result_sequences, axis=-1)
    result_text = target_tokenizer.sequences_to_texts([result_sequences])

    return {'text': result_text}

In [49]:
translator = Translator(
    translator_model.get_encoder(),
    translator_model.get_decoder(),
    input_tokenizer, 
    target_tokenizer,
    input_maxlen,
)

translator.translate("run")

NameError: ignored

In [None]:
translator = Translator(model, input_tokenizer, target_tokenizer, input_maxlen)
translator.translate("run")