In [5]:
import tensorflow as tf
import subprocess
import os
import pandas as pd
import random
import shutil
import numpy as np
import typing

from typing import Any, Tuple
from google.colab import drive #if use colab
from tensorflow.nn import relu, tanh, softmax
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [6]:
# if use colab
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
#if use colab
git_dir = "/content/IOH-Chat-App"
git_url = "https://github.com/Bangkit-Capstone-Team/IOH-Chat-App.git"

if not os.path.exists(git_dir):
  subprocess.call(["git", "clone", git_url])

In [8]:
filedir = "/content/IOH-Chat-App/Machine Learning/datasets/translate sentence/result/eng-ind.csv" # #if use colab
# filedir = "../..//datasets/translate sentence/result/eng-ind.csv" #if use local env

In [9]:
start_mark = "<START>"
end_mark = "<END>"

In [10]:
class TranslatorDataset:
  def __init__(self, filedir):
    self.filedir = filedir
    self.input_tokenizer = None
    self.target_tokenizer = None
    self._load_data_from_file()

  def _load_data_from_file(self):
    df = pd.read_csv(self.filedir)

    input_lang = df.English.values
    target_lang = df.Indonesia.values

    return input_lang, target_lang

  def _normalize_and_preprocess(self, text):
    punctuation = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
    
    text = text.lower().strip()
    text = text.replace(punctuation, "")
    text = start_mark + " " + text
    text = text + " " + end_mark

    return text

  def _tokenize(self, sentences, num_words, maxlen, padding, truncating):
    tokenizer = Tokenizer(num_words=num_words, filters="", lower=False)
    tokenizer.fit_on_texts(sentences)

    sequences = tokenizer.texts_to_sequences(sentences)
    sequences = pad_sequences(sequences, 
                              maxlen, 
                              padding=padding, 
                              truncating=truncating)

    return sequences, tokenizer

  def _create_dataset(self):
    input_lang, target_lang = self._load_data_from_file()

    input_lang = np.array(list(map(self._normalize_and_preprocess, input_lang)))
    target_lang = np.array(list(map(self._normalize_and_preprocess, target_lang)))
    
    return input_lang, target_lang

  def _load_dataset(self, num_words):
    input_lang, target_lang = self._create_dataset()

    self.maxlen = max([len(i)for i in input_lang]) // 5

    input_sequences, input_tokenizer = self._tokenize(input_lang, 
                                                      num_words, 
                                                      self.maxlen,
                                                      "pre", "pre")
    target_sequences, target_tokenizer = self._tokenize(target_lang, 
                                                        num_words, 
                                                        self.maxlen,
                                                        "post", "post")

    return (input_sequences, input_tokenizer), (target_sequences, target_tokenizer)
  
  def get(self, num_words):
    input, target = self._load_dataset(num_words)

    input_sequences, self.input_tokenizer = input
    target_sequences, self.target_tokenizer = target

    dataset = (input_sequences, target_sequences)

    return self.input_tokenizer, self.target_tokenizer, dataset

In [11]:
num_words = 6000

In [12]:
translator_dataset = TranslatorDataset(filedir)
input_tokenizer, target_tokenizer, (input_sequences, target_sequences) = translator_dataset.get(num_words)

In [13]:
buffer_size = len(input_sequences)
batch_size = 64
steps_per_epoch = buffer_size // batch_size

In [14]:
dataset = tf.data.Dataset.from_tensor_slices((input_sequences, target_sequences))
dataset = dataset.shuffle(buffer_size).batch(batch_size, drop_remainder=True)

In [15]:
input_batch, target_batch = next(iter(dataset))

In [16]:
input_batch.shape, target_batch.shape

(TensorShape([64, 35]), TensorShape([64, 35]))

In [17]:
input_vocab_size = len(input_tokenizer.index_word) + 1
target_vocab_size = len(target_tokenizer.index_word) + 1
input_maxlen = input_sequences.shape[1]
target_maxlen = target_sequences.shape[1]

input_maxlen, target_maxlen, input_vocab_size, target_vocab_size

(35, 35, 6139, 6865)

In [18]:
input_example = input_sequences[-1]
input_example

array([   0,    0,    0,    0,    0,    1,   66,    9,  859,   42,   28,
         61,    9, 1018,    5,   32, 1013,   76,    4,   73,  120,   74,
        120, 2949,    5,   26,  225,    5, 1475, 1960, 2983,   14,   21,
       1373,    2], dtype=int32)

In [19]:
target_example = target_sequences[-1]
target_example

array([   1,  103,  291,    5,   19, 1785,   67,    6,  135, 2853,  841,
        552,  528,   71,   12,   17, 1936, 1798,   45,   67,  431,    2,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0], dtype=int32)

In [20]:
input_sens_example = input_tokenizer.sequences_to_texts([input_example])
input_sens_example

["<START> if a person has not had a chance to his language by the time he's an he's unlikely to be able to reach native speaker in that language. <END>"]

In [21]:
target_sens_example = target_tokenizer.sequences_to_texts([target_example])
target_sens_example

['<START> jika seseorang tidak untuk menguasai bahasa yang ketika dewasa, maka kecil kemungkinan ia akan bisa mencapai asli dalam bahasa tersebut. <END>']

In [22]:
embed_dims = 64
epochs = 50
units = 512

In [23]:
class Encoder():
  def __init__(self, input_vocab_size, embedding_dims, units):
    self.units = units

    self.embedding = layers.Embedding(input_vocab_size,embedding_dims)
    self.gru_layer = layers.GRU(self.units,
                                    return_sequences=True,
                                    return_state=True,
                                    recurrent_initializer='glorot_uniform')

  def call(self, inputs):
    embedding = self.embedding(inputs)
    encoder = self.gru_layer(embedding)

    return encoder

In [47]:
class BahdanauAttention(layers.Layer):
  def __init__(self, units):
    super(BahdanauAttention, self).__init__()
    self.w1 = layers.Dense(units, use_bias=True) 
    self.w2 = layers.Dense(units, use_bias=True) 
    self.fd = layers.Dense(1)

  def call(self, query, values):
    query_with_time_axis = tf.expand_dims(query, 1)
    score = self.fd(tf.nn.tanh(
        self.w1(query_with_time_axis) + self.w2(values)))

    attention_weights = softmax(score, axis=1)

    context_vector = attention_weights * values
    context_vector = tf.reduce_sum(context_vector, axis=1)

    return context_vector, attention_weights

In [48]:
class Decoder():
  def __init__(self, output_vocab_size, embedding_dims, units):
    self.units = units

    self.embedding = layers.Embedding(output_vocab_size, embedding_dims)
    self.gru_layer = layers.GRU(self.units,
                                    return_sequences=True,
                                    return_state=True,
                                    recurrent_initializer='glorot_uniform')
    self.attention = BahdanauAttention(self.units)
    self.dense1 = layers.Dense(self.units, activation=tanh)
    self.dense2 = layers.Dense(output_vocab_size)

  def call(self, inputs, en_outpus, state):
    embedding = self.embedding(inputs)
    dec_outputs, dec_state = self.gru_layer(embedding, initial_state=state)
    context_vector, attention_weights = self.attention(
        query=dec_outputs, values=en_outpus)
    
    context_and_rnn_output = tf.concat([context_vector, dec_outputs], axis=-1)

    attention_vector = self.dense1(context_and_rnn_output)
    outputs = self.dense2(attention_vector)

    return outputs

In [49]:
encoder = Encoder(input_vocab_size, embed_dims, units)
en_outputs, en_states = encoder.call(input_batch)

en_outputs.shape, en_states.shape

(TensorShape([64, 35, 512]), TensorShape([64, 512]))

In [50]:
decoder = Decoder(target_vocab_size, embed_dims, units)
dec_outputs = decoder.call(target_batch, en_outputs, en_states)

dec_outputs.shape

TensorShape([64, 35, 6865])

In [54]:
lr = 1e-4

optimizer = tf.keras.optimizers.RMSprop(learning_rate=lr)
loss = tf.keras.losses.SparseCategoricalCrossentropy(
       from_logits=True, reduction='none')

In [55]:
class TranslatorModel():
  def __init__(self, input_vocab_size, 
               target_vocab_size, 
               embed_dims, 
               units):
    self.input_vocab_size = input_vocab_size
    self.target_vocab_size = target_vocab_size
    self.embed_dims = embed_dims
    self.units = units

    self.encoder = Encoder(self.input_vocab_size, self.embed_dims, self.units)
    self.decoder = Decoder(self.target_vocab_size, self.embed_dims, self.units)
  
  def build_model(self):
    en_inputs = layers.Input(shape=(None,))

    en_output, en_state = self.encoder.call(en_inputs)

    dec_outputs = self.decoder.call(en_inputs, en_output, en_state)

    model = Model(inputs=[en_inputs], 
                  outputs=[dec_outputs])
    return model

In [56]:
model = TranslatorModel(
    input_vocab_size, 
    target_vocab_size, 
    embed_dims, 
    units, 
).build_model()

model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=["accuracy"]
)
model.summary()

Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_6 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 embedding_14 (Embedding)       (None, None, 64)     392896      ['input_6[0][0]']                
                                                                                                  
 embedding_15 (Embedding)       (None, None, 64)     439360      ['input_6[0][0]']                
                                                                                                  
 gru_14 (GRU)                   [(None, None, 512),  887808      ['embedding_14[0][0]']           
                                 (None, 512)]                                               

In [57]:
checkpoint_path = "/content/drive/MyDrive/translate/checkpoint/cp-{epoch:04d}.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)

callback_early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=3, 
    verbose=1)

callback_checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path, 
    monitor='val_loss', 
    verbose=1, 
    save_weights_only=True, 
    save_best_only=True)

callbacks = [callback_early_stopping,
             callback_checkpoint]

In [None]:
model.fit(dataset,
          epochs=epochs,
          steps_per_epoch=steps_per_epoch,
          callbacks=callbacks,
          verbose=1)

Epoch 1/50
Epoch 2/50
  2/137 [..............................] - ETA: 4:58 - loss: 1.1646 - accuracy: 0.8283

In [None]:
# # if use colab
# saved_model_path = "/content/drive/MyDrive/translate/saved_model/translate.h5"

# # if use local env
# # saved_model_path = "code/translate sentence/saved_model/translate.h5"
# saved_model_dir = os.path.dirname(saved_model_path)

# if os.path.exists(saved_model_dir):
#   shutil.rmtree(saved_model_dir)
# model.save(saved_model_path)

In [None]:
class Translator:
  def __init__(self, modelpath, input_tokenizer, target_tokenizer, maxlen):
    self.saved_model_path = modelpath
    self.input_tokenizer = input_tokenizer
    self.target_tokenizer = target_tokenizer
    self.maxlen = maxlen
    self._load_model()

  def _load_model(self):
    self.model = tf.keras.models.load_model(self.saved_model_path)

  def translate(self, sentence):
    my_model = self.model

    target_index_to_word = self.target_tokenizer.index_word
    target_index_to_word[0] = '<OOV>'

    sequences = self.input_tokenizer.texts_to_sequences([sentence])
    sequences = pad_sequences(sequences, maxlen=self.maxlen, padding='post')

    predictions = my_model.predict(sequences)

    result = self.target_tokenizer.sequences_to_texts(predictions[0])
    return result