In [None]:
# !pip install tensorflow-addons

In [None]:
import tensorflow as tf
import tensorflow_addons as tfa
import subprocess
import os
import pandas as pd
import random
import shutil
import numpy as np

# from google.colab import drive #if use colab
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# # if use colab
# drive.mount('/content/drive')

In [None]:
# #if use colab
# git_dir = "/content/IOH-Chat-App"
# git_url = "https://github.com/Bangkit-Capstone-Team/IOH-Chat-App.git"

# if not os.path.exists(git_dir):
#   subprocess.call(["git", "clone", git_url])

In [None]:
# Location of the sentence-pair CSV handed to TranslatorDataset.
# Colab alternative (uncomment when working from the cloned repository):
# filedir = "/content/IOH-Chat-App/Machine Learning/datasets/translate sentence/result/eng-ind.csv"
filedir = "../..//datasets/translate sentence/result/eng-ind.csv" # local environment (relative path)

In [None]:
class TranslatorDataset:
  """Turns a CSV of sentence pairs into a batched `tf.data.Dataset`.

  The CSV must contain an `English` column (model input) and an `Indonesia`
  column (model target). Sentences are lower-cased, tokenized with a Keras
  `Tokenizer` and post-padded to one shared length.
  """

  def __init__(self, filedir):
    """Read the sentence pairs stored at `filedir`.

    Args:
      filedir: Path of the CSV file to load.
    """
    self.filedir = filedir
    self.input_tokenizer = None
    self.target_tokenizer = None
    self._load_data_from_file()

  def _load_data_from_file(self):
    """Fill `input_lang` / `target_lang` with the raw sentences from the CSV."""
    df = pd.read_csv(self.filedir)

    # Empty cells are parsed as float NaN, which has no `.lower()`; drop the
    # incomplete pairs instead of crashing during normalization.
    df = df.dropna(subset=["English", "Indonesia"])

    self.input_lang = df.English.tolist()
    self.target_lang = df.Indonesia.values.tolist()

  def normalize_and_preprocess(self, text):
    """Lower-case `text`, trim it and turn tabs/newlines into spaces.

    Args:
      text: One sentence as a string.

    Returns:
      The normalized sentence.
    """
    text = text.lower().strip()
    # Replace each character on its own: replace("\t\n", ...) only matched a
    # tab that is immediately followed by a newline. A space is used so the
    # neighbouring words are not glued together.
    for whitespace in ("\t", "\n"):
      text = text.replace(whitespace, " ")

    return text

  def _create_dataset(self):
    """Normalize every stored sentence and return both corpora as arrays."""
    self.input_lang = np.array(list(map(self.normalize_and_preprocess, self.input_lang)))
    self.target_lang = np.array(list(map(self.normalize_and_preprocess, self.target_lang)))

    return self.input_lang, self.target_lang

  def _fit_tokenizer(self, sentence, num_words):
    """Fit a tokenizer on `sentence` and return (unpadded id lists, tokenizer)."""
    tokenizer = Tokenizer(num_words=num_words)
    tokenizer.fit_on_texts(sentence)

    return tokenizer.texts_to_sequences(sentence), tokenizer

  def _tokenize(self, sentence, num_words, maxlen):
    """Tokenize `sentence` and post-pad every sequence to `maxlen` tokens.

    Args:
      sentence: Iterable of sentences.
      num_words: Forwarded to `Tokenizer(num_words=...)`.
      maxlen: Target sequence length, in tokens.

    Returns:
      Tuple `(padded_sequences, tokenizer)`.
    """
    sequences, tokenizer = self._fit_tokenizer(sentence, num_words)
    sequences = pad_sequences(sequences, maxlen=maxlen, padding="post")

    return sequences, tokenizer

  def _load_dataset(self, num_words):
    """Tokenize both corpora and pad them to one shared length.

    Sets `self.maxlen` to that shared length.

    Args:
      num_words: Forwarded to `Tokenizer(num_words=...)`.

    Returns:
      `((input_sequences, input_tokenizer), (target_sequences, target_tokenizer))`.
    """
    input_lang, target_lang = self._create_dataset()

    input_ids, input_tokenizer = self._fit_tokenizer(input_lang, num_words)
    target_ids, target_tokenizer = self._fit_tokenizer(target_lang, num_words)

    # pad_sequences counts tokens, so the length has to come from the
    # tokenized sequences, not from the character length of the sentences.
    # Taking the longest sequence of either language keeps both sides the same
    # length without truncating anything.
    self.maxlen = max(len(ids) for ids in input_ids + target_ids)

    input_sequences = pad_sequences(input_ids, maxlen=self.maxlen, padding="post")
    target_sequences = pad_sequences(target_ids, maxlen=self.maxlen, padding="post")

    return (input_sequences, input_tokenizer), (target_sequences, target_tokenizer)

  def call(self, num_words, batch_size, buffer_size):
    """Build the shuffled, batched training dataset.

    Args:
      num_words: Forwarded to `Tokenizer(num_words=...)`.
      batch_size: Batch size; incomplete final batches are dropped.
      buffer_size: Buffer size passed to `Dataset.shuffle`.

    Returns:
      Tuple `(input_tokenizer, target_tokenizer, dataset)`.
    """
    input_data, target_data = self._load_dataset(num_words)

    input_sequences, self.input_tokenizer = input_data
    target_sequences, self.target_tokenizer = target_data

    dataset = tf.data.Dataset.from_tensor_slices((input_sequences, target_sequences))
    dataset = dataset.shuffle(buffer_size).batch(batch_size, drop_remainder=True)

    return self.input_tokenizer, self.target_tokenizer, dataset


In [None]:
# Input-pipeline settings.
buffer_size = 8000  # forwarded to Dataset.shuffle
batch_size = 128    # sentence pairs per batch
num_words = 500     # forwarded to Tokenizer(num_words=...)

translator_dataset = TranslatorDataset(filedir)
input_tokenizer, target_tokenizer, dataset = translator_dataset.call(
    num_words=num_words,
    batch_size=batch_size,
    buffer_size=buffer_size,
)

# Pull a single batch so the tensor shapes can be inspected below.
input_batch, target_batch = next(iter(dataset))
input_batch.shape, target_batch.shape

In [None]:
# One id per known word plus one extra slot (presumably for id 0, the value
# pad_sequences fills with by default).
input_vocab_size, target_vocab_size = (
    len(tokenizer.word_index) + 1
    for tokenizer in (input_tokenizer, target_tokenizer)
)

# Second axis of a batch is the padded sequence length.
input_maxlen, target_maxlen = input_batch.shape[1], target_batch.shape[1]

input_maxlen, target_maxlen, input_vocab_size, target_vocab_size

In [None]:
class Seq2Seq:
  """GRU encoder/decoder building blocks for a sequence-to-sequence model.

  All layers are created once in `__init__`, so repeated calls to `encoder`
  and `decoder` reuse the same weights.
  """

  def __init__(self, input_vocab_size, output_vocab_size, embedding_dim, units, batch_size, maxlen):
    """Create the embedding, GRU and output-head layers.

    Args:
      input_vocab_size: Number of ids the encoder embedding must cover.
      output_vocab_size: Number of ids the decoder embedding and the final
        softmax layer must cover.
      embedding_dim: Size of both embeddings.
      units: Number of GRU units in encoder and decoder.
      batch_size: Stored on the instance; not used by the layers.
      maxlen: Stored on the instance; not used by the layers.
    """
    self.input_vocab_size = input_vocab_size
    self.output_vocab_size = output_vocab_size
    self.embedding_dim = embedding_dim
    self.maxlen = maxlen
    self.batch_size = batch_size
    self.units = units
    self.en_embedding = layers.Embedding(self.input_vocab_size, embedding_dim)
    # The decoder embeds target-language ids, so it is sized by the output
    # vocabulary; sizing it by the input vocabulary leaves target ids out of
    # range whenever the target vocabulary is the larger one.
    self.dec_embedding = layers.Embedding(self.output_vocab_size, embedding_dim)
    self.en_gru_layer = layers.GRU(self.units,
                                    return_sequences=True,
                                    return_state=True,
                                    recurrent_initializer='glorot_uniform')
    self.dec_gru_layer = layers.GRU(self.units,
                                    return_sequences=True,
                                    return_state=True)
    # Output head, built once so every decoder() call shares its weights
    # instead of creating a fresh, untrained stack per call.
    self.dense_layers = [
        layers.Dense(512, activation=tf.nn.relu),
        layers.Dropout(.5),
        layers.Dense(1024, activation=tf.nn.relu),
        layers.Dropout(.5),
        layers.TimeDistributed(layers.Dense(self.output_vocab_size, activation=tf.nn.softmax)),
    ]

  def _create_dense(self, input):
    """Apply the output head to `input`.

    Returns:
      Per-timestep softmax probabilities over `output_vocab_size` classes.
    """
    x = input
    for layer in self.dense_layers:
      x = layer(x)
    return x

  def encoder(self, input):
    """Embed `input` and run it through the encoder GRU.

    Args:
      input: Tensor of token ids.

    Returns:
      Tuple `(output, state)`: the GRU's per-timestep outputs and final state.
    """
    embedding = self.en_embedding(input)
    output, state = self.en_gru_layer(embedding)

    return output, state

  def decoder(self, input, encoder_state):
    """Embed `input`, decode it from `encoder_state` and apply the output head.

    Args:
      input: Tensor of token ids fed to the decoder.
      encoder_state: Initial state of the decoder GRU.

    Returns:
      Per-timestep softmax probabilities over the output vocabulary.
    """
    embedding = self.dec_embedding(input)
    # Only the per-timestep outputs are needed; the final state is discarded.
    outputs, _ = self.dec_gru_layer(embedding,
                                    initial_state=encoder_state)
    outputs = self._create_dense(outputs)

    return outputs


In [None]:
# Model / training hyper-parameters.
embed_dims = 256
epochs = 10
units = 512
lr = 1e-4

optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
# Seq2Seq's final layer already applies softmax, so the model emits
# probabilities, not logits. from_logits=True would run softmax a second time
# inside the loss and distort it.
# NOTE(review): reduction='none' is kept as written; confirm it is intended
# when the loss is used through model.compile/fit.
loss = tf.keras.losses.SparseCategoricalCrossentropy(
       from_logits=False, reduction='none')

In [None]:
# Create the shared encoder/decoder layers with the sizes measured from the data.
seq2seq = Seq2Seq(
    input_vocab_size=input_vocab_size,
    output_vocab_size=target_vocab_size,
    embedding_dim=embed_dims,
    units=units,
    batch_size=batch_size,
    maxlen=input_maxlen,
)

In [None]:
# Run the encoder on the sample batch and show the shapes of both results.
en_outputs, en_state = seq2seq.encoder(input_batch)

print(en_outputs.shape, en_state.shape, sep="\n")

In [None]:
# Run the decoder on the sample target batch, seeded with the encoder state.
dec_outputs = seq2seq.decoder(input=target_batch, encoder_state=en_state)

print(dec_outputs.shape)


In [None]:
def build_model(seq2seq, batch_size, shape):
  """Wire the encoder and decoder of `seq2seq` into a compiled Keras model.

  Args:
    seq2seq: Object exposing `encoder(input)` -> `(outputs, state)` and
      `decoder(input, encoder_state)`.
    batch_size: Not used by this function.
    shape: Shape of a token batch; `shape[1]` becomes the input length.

  Returns:
    A `Model` mapping one token-id sequence to the decoder output, compiled
    with the module-level `optimizer` and `loss` and an accuracy metric.

  NOTE(review): the decoder receives the same tensor as the encoder
  (`en_inputs`) rather than a separate target sequence - confirm this is
  intended.
  """
  sequence_length = shape[1]
  en_inputs = layers.Input(shape=(sequence_length,))

  # Only the encoder's final state is passed on; its per-step outputs are unused.
  _, en_state = seq2seq.encoder(en_inputs)
  dec_outputs = seq2seq.decoder(en_inputs, en_state)

  model = Model(inputs=en_inputs, outputs=dec_outputs)
  model.compile(optimizer=optimizer, loss=loss, metrics=["accuracy"])

  return model

In [None]:
# if use colab
# checkpoint_path = "/content/IOH-Chat-App/Machine Learning/code/translate sentence/training_checkpoints/cp-{epoch:04d}.ckpt"

# if use local env
checkpoint_path = "training_checkpoints/cp-{epoch:04d}.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)

# Make sure the target directory exists before anything is written to it.
os.makedirs(checkpoint_dir, exist_ok=True)

cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    # fit() is called without validation data, so the default monitor
    # ("val_loss") is never reported and save_best_only would skip every
    # save. Track the training loss instead.
    monitor="loss",
    save_weights_only=True,
    save_best_only=True,
    save_freq=10,  # an integer save_freq counts batches, not epochs
    verbose=1,
)
model = build_model(
    seq2seq,
    batch_size,
    input_batch.shape
)

model.summary()

# Store the untrained weights as checkpoint 0.
model.save_weights(checkpoint_path.format(epoch=0))


In [None]:
# Train on the batched dataset; cp_callback writes checkpoints along the way.
model.fit(
    x=dataset,
    epochs=epochs,
    callbacks=[cp_callback],
    verbose=1,
)

In [None]:
# Colab alternative:
# saved_model_path = "/content/drive/MyDrive/saved_model/transelate/translate.h5"

# Local environment:
saved_model_path = "code/translate sentence/saved_model/translate.h5"
saved_model_dir = os.path.split(saved_model_path)[0]

# Any previous export is wiped first. Note that this removes the whole
# directory, not just the model file.
directory_already_there = os.path.exists(saved_model_dir)
if directory_already_there:
  shutil.rmtree(saved_model_dir)

model.save(saved_model_path)

In [None]:
class Translator(TranslatorDataset):
  """Translate sentences with the Keras model saved by this notebook.

  The saved model takes one padded token-id sequence and returns, for every
  position, a probability distribution over the target vocabulary. A
  translation is therefore a single `predict` call followed by a per-position
  argmax; no separate encoder/decoder models are rebuilt.
  """

  def __init__(self, modelpath, input_tokenizer=None, target_tokenizer=None, maxlen=None):
    """Load the saved model and remember how to encode/decode text.

    Args:
      modelpath: Path of the model file written by `model.save`.
      input_tokenizer: Fitted tokenizer of the source language. May also be
        assigned to the attribute of the same name after construction.
      target_tokenizer: Fitted tokenizer of the target language. May also be
        assigned after construction.
      maxlen: Length inputs are padded to. Defaults to the sequence length of
        the loaded model's input.
    """
    # TranslatorDataset.__init__ is deliberately not called: it requires a CSV
    # path and would reload the whole corpus. The attributes translate() needs
    # are set here instead.
    self.saved_model_path = modelpath
    self.input_tokenizer = input_tokenizer
    self.target_tokenizer = target_tokenizer
    self.maxlen = maxlen
    self._load_seq2seq()

  def _load_seq2seq(self):
    """Load the saved model for inference and derive `maxlen` if unset."""
    # compile=False: only predict() is used, so the training loss and
    # optimizer do not have to be restored.
    self.model = tf.keras.models.load_model(self.saved_model_path, compile=False)

    if self.maxlen is None:
      self.maxlen = self.model.input_shape[1]

  def translate(self, text):
    """Translate one sentence.

    Args:
      text: Sentence in the source language.

    Returns:
      The predicted words joined by single spaces.

    Raises:
      ValueError: If the tokenizers have not been provided.
    """
    if self.input_tokenizer is None or self.target_tokenizer is None:
      raise ValueError("input_tokenizer and target_tokenizer must be set before calling translate()")

    # Same normalization the training sentences went through.
    text = self.normalize_and_preprocess(text)

    sequences = self.input_tokenizer.texts_to_sequences([text])
    sequences = pad_sequences(sequences, self.maxlen, padding="post")

    probabilities = self.model.predict(sequences)
    # One sentence was passed in, so take row 0 and pick the most likely id
    # at every position.
    token_ids = np.argmax(probabilities[0], axis=-1)

    index_word = self.target_tokenizer.index_word
    # Ids without an entry (such as the padding id 0) produce no word.
    words = [index_word[int(token_id)] for token_id in token_ids if int(token_id) in index_word]

    sentence = " ".join(words)
    return sentence


In [None]:
# Translator requires the path of the saved model.
translator = Translator(saved_model_path)

# Translator does not run TranslatorDataset.__init__, so hand over the fitted
# tokenizers and the padding length used for training.
translator.input_tokenizer = input_tokenizer
translator.target_tokenizer = target_tokenizer
translator.maxlen = input_maxlen

translator.translate("Good bye!")