In [3]:
import tensorflow as tf
import subprocess
import os
import pandas as pd
import random
import shutil
import numpy as np

from google.colab import drive
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [5]:
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
git_dir = "/content/IOH-Chat-App"
git_url = "https://github.com/Bangkit-Capstone-Team/IOH-Chat-App.git"

if not os.path.exists(git_dir):
  subprocess.call(["git", "clone", git_url])

In [34]:
class TranslatorDataset:
  
  def __init__(self):
    self.filedir = "/content/IOH-Chat-App/Machine Learning/datasets/translate sentence/result/eng-ind.csv"
    self.input_tokenizer = None
    self.target_tokenizer = None
    self._load_data_from_file()

  def _load_data_from_file(self):
    df = pd.read_csv(self.filedir)

    self.input_lang = df.English.tolist()
    self.target_lang = df.Indonesia.values.tolist()

  def normalize_and_preprocess(self, text):
    text = text.lower().strip()
    text = text.replace("\t\n", "")

    return text

  def _create_dataset(self):
    self.input_lang = np.array(list(map(self.normalize_and_preprocess, self.input_lang)))
    self.target_lang = np.array(list(map(self.normalize_and_preprocess, self.target_lang)))
    
    return self.input_lang, self.target_lang

  def _tokenize(self, sentence, num_words, maxlen):
    tokenizer = Tokenizer(num_words=num_words)
    tokenizer.fit_on_texts(sentence)

    sequences = tokenizer.texts_to_sequences(sentence)
    sequences = pad_sequences(sequences, maxlen, padding="post")

    return sequences, tokenizer

  def _load_dataset(self, num_words):
    input_lang, target_lang = self._create_dataset()

    maxlen = max([len(i)for i in input_lang])

    input_sequences, input_tokenizer = self._tokenize(input_lang, num_words, maxlen)
    target_sequences, target_tokenizer = self._tokenize(target_lang, num_words, maxlen)

    return (input_sequences, input_tokenizer), (target_sequences, target_tokenizer)
  
  def call(self, num_words, batch_size, buffer_size):
    input, target = self._load_dataset(num_words)

    input_sequences, self.input_tokenizer = input
    target_sequences, self.target_tokenizer = target

    dataset = tf.data.Dataset.from_tensor_slices((input_sequences, target_sequences))
    dataset = dataset.shuffle(buffer_size).batch(batch_size, drop_remainder=True)

    return self.input_tokenizer, self.target_tokenizer, dataset

In [35]:
buffer_size = 8000
batch_size = 128
num_words = 500

translator_dataset = TranslatorDataset()
input_tokenizer, target_tokenizer, dataset = translator_dataset.call(num_words, 
                                                                     batch_size, 
                                                                     buffer_size)

input_batch, target_batch = next(iter(dataset))
input_batch.shape, target_batch.shape

(TensorShape([128, 163]), TensorShape([128, 163]))

In [36]:
input_vocab_size = len(input_tokenizer.word_index) + 1
target_vocab_size = len(target_tokenizer.word_index) + 1
input_maxlen = input_batch.shape[1]
target_maxlen = target_batch.shape[1]

input_maxlen, target_maxlen, input_vocab_size, target_vocab_size

(163, 163, 4091, 4874)

In [37]:
class Seq2Seq():

  def __init__(self, input_vocab_size, output_vocab_size, embedding_dim, units, batch_size, maxlen):
    self.input_vocab_size = input_vocab_size
    self.output_vocab_size = output_vocab_size
    self.embedding_dim = embedding_dim
    self.maxlen = maxlen
    self.batch_size = batch_size
    self.units = units
    self.en_embedding = layers.Embedding(self.input_vocab_size, embedding_dim)
    self.dec_embedding = layers.Embedding(self.input_vocab_size, embedding_dim)
    self.en_lstm_layer = layers.LSTM(self.units,
                                    return_sequences=True,
                                    return_state=True,
                                    recurrent_initializer='glorot_uniform')
    self.dec_lstm_layer = layers.LSTM(self.units,
                                    return_sequences=True,
                                    return_state=True)

  def _create_dense(self, input):
    x = layers.Dense(512, activation=tf.nn.relu)(input)
    x = layers.Dropout(.5)(x)
    x = layers.Dense(1024, activation=tf.nn.relu)(x)
    x = layers.Dropout(.5)(x)
    outputs = layers.TimeDistributed(layers.Dense(self.output_vocab_size, activation=tf.nn.softmax))(x)
    return outputs

  def encoder(self, input):
    embedding = self.en_embedding(input)
    output, h, c = self.en_lstm_layer(embedding)

    return output, h, c

  def decoder(self, input, encoder_state):
    embedding = self.dec_embedding(input)
    outputs, _, _ = self.dec_lstm_layer(embedding, 
                                        initial_state=encoder_state)
    outputs = self._create_dense(outputs)

    return outputs

In [51]:
embed_dims = 256
epochs = 10
units = 512
lr = 1e-4

optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
loss = tf.keras.losses.SparseCategoricalCrossentropy(
       from_logits=True, reduction='none')

In [39]:
seq2seq = Seq2Seq(input_vocab_size, 
                  target_vocab_size, 
                  embed_dims, 
                  units, 
                  batch_size, 
                  input_maxlen)

In [40]:
en_outputs, en_h_state, en_c_state = seq2seq.encoder(input_batch)

print(en_outputs.shape)
print(en_h_state.shape)
print(en_c_state.shape)

(128, 163, 512)
(128, 512)
(128, 512)


In [41]:
dec_outputs = seq2seq.decoder(target_batch, [en_h_state, en_c_state])

print(dec_outputs.shape)

(128, 163, 4874)


In [56]:
def build_model(seq2seq, batch_size, shape):
  en_inputs = layers.Input(shape=(shape[1],))
  en_outputs, en_h_state, en_c_state = seq2seq.encoder(en_inputs)
  dec_outputs = seq2seq.decoder(en_inputs, [en_h_state, en_c_state])

  model = Model(en_inputs, dec_outputs)

  model.compile(
      optimizer=optimizer,
      loss=loss,
      metrics=["accuracy"]
  )

  return model

In [57]:
checkpoint_path = "/content/IOH-Chat-App/Machine Learning/code/translate sentence/training_checkpoints/cp-{epoch:04d}.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)

cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path, 
    save_weights_only=True,
    save_best_only=True,
    save_freq=10,
    verbose=1, 
)
model = build_model(
    seq2seq, 
    batch_size, 
    input_batch.shape
)

model.summary()

model.save_weights(checkpoint_path.format(epoch=0))

Model: "model_11"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_12 (InputLayer)          [(None, 163)]        0           []                               
                                                                                                  
 embedding_2 (Embedding)        (None, 163, 256)     1047296     ['input_12[0][0]']               
                                                                                                  
 embedding_3 (Embedding)        (None, 163, 256)     1047296     ['input_12[0][0]']               
                                                                                                  
 lstm_2 (LSTM)                  [(None, 163, 512),   1574912     ['embedding_2[5][0]']            
                                 (None, 512),                                              

In [54]:
model.fit(dataset,
          epochs=epochs,
          callbacks=[cp_callback],
          verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fab7793ed10>

In [55]:
saved_model_path = "/content/drive/MyDrive/saved_model/transelate/translate.h5"
saved_model_dir = os.path.dirname(saved_model_path)

if os.path.exists(saved_model_dir):
  shutil.rmtree(saved_model_dir)
else:
  model.save(saved_model_path)

In [58]:
class Translator(TranslatorDataset):
  def __init__(self):
    self.saved_model_path = "/content/drive/MyDrive/saved_model/transelate/translate.h5"
    self._load_seq2seq()

  def _load_seq2seq(self):
    model = tf.keras.models.load_model(self.saved_model_path)

    enc_outputs, state_h_enc, state_c_enc = model.layers[3].output
    self.enc_model = Model(model.input[0], [state_h_enc, state_c_enc])

    dec_h_input = layers.Input(shape=self.units)
    dec_c_input = layers.Input(shape=self.units)
    dec_inputs = [dec_h_input, dec_c_input]

    dec_lstm = model.layers[4]
    dec_outputs, dec_h, dec_c = dec_lstm(model.input[0],
                                          initial_state=dec_inputs)
    dec_states = [dec_h, dec_c]
    x = model.layers[5](dec_outputs)
    x = model.layers[6](x)
    x = model.layers[7](x)
    x = model.layers[8](x)
    dense = model.layers[9](x)

    self.dec_model = Model([model.input[0]] + dec_inputs,
                            [dense] + dec_states)

  def translate(self, text):
    tokens = list()

    sequences = self.input_tokenizer.texts_to_sequences([text])
    sequences = tf.convert_to_tensor(pad_sequences(
        sequences, self.maxlen, padding="post"))
    input = self.enc_model.predict(sequences)
    target_seq = np.zeros((1, 1))

    for i in sequences:
        output_chars, h, c = self.dec_model.predict([target_seq] + input)
        char_index = np.argmax(output_chars)
        text_char = self.target_tokenizer.index_word[char_index]
        tokens.append(text_char)

        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = char_index
        states_value = [h, c]

    sentence = " ".join(tokens)
    return sentence


In [60]:
translator = Translator()
translator.translate("Goodbye.")

ValueError: ignored