In [3]:
import tensorflow as tf
import subprocess
import os
import pandas as pd
import random
import shutil
import numpy as np
import json
import zipfile
import re

from sklearn.model_selection import train_test_split
from google.colab import drive, files #if use colab
from tensorflow.nn import relu, tanh, softmax
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [4]:
# if use colab
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
#if use colab
git_dir = "/content/IOH-Chat-App"
git_url = "https://github.com/bangkit-team/IOH-chat-app.git"

if not os.path.exists(git_dir):
  subprocess.call(["git", "clone", git_url])

In [6]:
filedir1 = "/content/IOH-chat-app/MachineLearning/datasets/translation/result/eng-ind.csv" # #if use colab
filedir2 = "/content/IOH-chat-app/MachineLearning/datasets/spam/emails.csv" # #if use colab
# filedir1 = "../../datasets/translate sentence/result/eng-ind.csv" #if use local env
# filedir2 = "../../datasets/spam/emails.csv" #if use local env

In [7]:
df1 = pd.read_csv(filedir1)
df1

Unnamed: 0,English,Indonesia
0,Run!,Lari!
1,Who?,Siapa?
2,Wow!,Wow!
3,Help!,Tolong!
4,Jump!,Lompat!
...,...,...
8814,Every student who has graduated from our unive...,Semua mahasiswa yang telah menyelesaikan studi...
8815,"If you don't want to put on sunscreen, that's ...","Kalau kamu tidak mau pakai tabir surya, ya, te..."
8816,"When she was finished ironing, Mary switched o...","Ketika dia sudah selesai menyetrika, Mary mema..."
8817,"Irene Pepperberg, a researcher at Northwestern...","Irene Pepperberg, seorang peneliti di Universi..."


In [8]:
df2 = pd.read_csv(filedir2)
df2 = df2.rename(columns={"text": "English", "teks": "Indonesia"})
df2 = df2.drop("spam", axis=1)
df2

Unnamed: 0,English,Indonesia
0,naturally irresistible your corporate identity...,Secara alami tak tertahankan identitas perusah...
1,the stock trading gunslinger fanny is merrill...,Fanny Gunslinger Perdagangan Saham adalah Merr...
2,unbelievable new homes made easy im wanting t...,Rumah Baru yang Luar Biasa Menjadi Mudah Saya ...
3,4 color printing special request additional i...,4 PERMINTAAN PERMINTAAN KHUSUS INFORMASI KHUSU...
4,"do not have money , get software cds from here...","Jangan punya uang, dapatkan CD perangkat lunak..."
...,...,...
5723,research and development charges to gpg here ...,Biaya penelitian dan pengembangan ke GPG di si...
5724,"receipts from visit jim , thanks again for t...","Tanda terima dari kunjungan Jim, terima kasih ..."
5725,enron case study update wow ! all on the same...,Pembaruan Studi Kasus Enron Wow! Semua pada ha...
5726,"interest david , please , call shirley crens...","Bunga David, tolong, hubungi Shirley Crenshaw ..."


In [9]:
df2_len = len(df2)
df = pd.concat([df1, df2])
df

Unnamed: 0,English,Indonesia
0,Run!,Lari!
1,Who?,Siapa?
2,Wow!,Wow!
3,Help!,Tolong!
4,Jump!,Lompat!
...,...,...
5723,research and development charges to gpg here ...,Biaya penelitian dan pengembangan ke GPG di si...
5724,"receipts from visit jim , thanks again for t...","Tanda terima dari kunjungan Jim, terima kasih ..."
5725,enron case study update wow ! all on the same...,Pembaruan Studi Kasus Enron Wow! Semua pada ha...
5726,"interest david , please , call shirley crens...","Bunga David, tolong, hubungi Shirley Crenshaw ..."


In [10]:
start_mark = '<start>'
end_mark = '<end>'

In [11]:
class TranslatorDataset:
  def __init__(self, dataframe):
    self.dataframe = dataframe
    self.input_tokenizer = None
    self.target_tokenizer = None
    self._load_data_from_file()

  def _load_data_from_file(self):
    df = self.dataframe

    input_lang = df.English.values
    target_lang = df.Indonesia.values

    return input_lang, target_lang

  def _normalize_and_preprocess(self, text, use_mark=False):
    if use_mark:
      text = text.lower().strip()
      text = " ".join([start_mark, text, end_mark])
    else:
      text = text.lower().strip()

    return text

  def _tokenize(self, sentences, num_words, maxlen):
    punctuation = '!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n'

    tokenizer = Tokenizer(num_words=num_words, filters=punctuation, lower=False)
    tokenizer.fit_on_texts(sentences)

    sequences = tokenizer.texts_to_sequences(sentences)
    sequences = pad_sequences(
      sequences, maxlen=maxlen, padding="post", truncating="post")

    return sequences, tokenizer

  def _create_dataset(self):
    input_lang, target_lang = self._load_data_from_file()

    input_sentence = np.array(
        list(map(lambda x: self._normalize_and_preprocess(x, False), input_lang)))
    
    target_sentence = np.array(
        list(map(lambda y: self._normalize_and_preprocess(y, True), target_lang)))
    
    return input_sentence, target_sentence

  def _load_dataset(self, num_words):
    input_lang, target_lang = self._create_dataset()

    self.maxlen = max([len(i)for i in input_lang]) // 1000
    self.buffer_size = len(input_lang)

    input_sequences, input_tokenizer = self._tokenize(
        input_lang, num_words, self.maxlen)
    
    target_sequences, target_tokenizer = self._tokenize(
        target_lang, num_words, self.maxlen,)

    return (input_sequences, input_tokenizer), (target_sequences, target_tokenizer)
  
  def get(self, num_words, batch_size):
    input, target = self._load_dataset(num_words)

    input_sequences, self.input_tokenizer = input
    target_sequences, self.target_tokenizer = target

    dataset = tf.data.Dataset.from_tensor_slices((input_sequences, target_sequences))
    dataset = dataset.shuffle(self.buffer_size).batch(batch_size, drop_remainder=True)
    dataset = dataset.cache().prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

    return self.input_tokenizer, self.target_tokenizer, dataset

In [12]:
num_words = 15000
batch_size = 64

In [13]:
translator_dataset = TranslatorDataset(df)
input_tokenizer, target_tokenizer, dataset = translator_dataset.get(num_words, batch_size)

In [14]:
input_batch, target_batch = next(iter(dataset))

In [15]:
input_batch.shape, target_batch.shape

(TensorShape([64, 43]), TensorShape([64, 43]))

In [16]:
input_vocab_size = len(input_tokenizer.index_word) + 1
target_vocab_size = len(target_tokenizer.index_word) + 1
input_maxlen = input_batch.shape[1]
target_maxlen = target_batch.shape[1]

input_maxlen, target_maxlen, input_vocab_size, target_vocab_size

(43, 43, 38091, 38392)

In [17]:
input_example = input_batch[-1]
input_example

<tf.Tensor: shape=(43,), dtype=int32, numpy=
array([ 1740,   602,   260,   240,  1051,     8,    88,    95,  1061,
        1844,    11,     9,     1,   273,     4,  2483,  4986,   373,
          18,     3,    44,  1728,     4,  1740,    26,    95,   575,
          21,    23,    88,     2, 11151,   236,   224,   159,     2,
          18,    73,    80,     2,   223,   415,     3], dtype=int32)>

In [18]:
target_example = target_batch[-1]
target_example

<tf.Tensor: shape=(43,), dtype=int32, numpy=
array([   8, 2675,  405,  137,  472,  589,  589,    5, 1749, 1772, 1841,
        230,   19,  264,  323, 1191,  282,    4,    2,  359, 2913,   11,
         16,  971,  586,   60,  166,  239,  244,   20,  359,    4,    1,
         29,   74,    2,    5, 1666,   31,   59,   48, 3344, 1713],
      dtype=int32)>

In [19]:
input_sentence = input_tokenizer.sequences_to_texts([input_example.numpy()])
input_sentence

['exotica yet again hi guys i need some advice sharad is in the process of finding differences between your and our versions of exotica at some point we will need to migrate london office over to your more up to date version and']

In [20]:
target_sentence = target_tokenizer.sequences_to_texts([target_example.numpy()])
target_sentence

['<start> exotica sekali lagi hai teman teman saya butuh nasihat sharad sedang dalam proses menemukan perbedaan antara anda dan versi eksotika kami pada titik tertentu kita perlu kantor london ke versi anda yang lebih baik dan saya khawatir bahwa mungkin ada implikasi gaya']

In [21]:
embed_dims = 256
units = 1024

In [22]:
class Encoder():
  def __init__(self, input_vocab_size, embedding_dims, units):
    self.units = units
    self.input_vocab_size = input_vocab_size
    self.embedding_dims = embedding_dims

    self.embedding = layers.Embedding(self.input_vocab_size, self.embedding_dims)
    self.lstm_layer = layers.LSTM(self.units,
                                 return_sequences=True,
                                 return_state=True,
                                 recurrent_initializer='glorot_uniform')

  def call(self, inputs):
    embedding = self.embedding(inputs)
    encoder = self.lstm_layer(embedding, initial_state=None)

    return encoder

In [23]:
# class BahdanauAttention(layers.Layer):
#   def __init__(self, units):
#     super(BahdanauAttention, self).__init__()
#     self.w1 = layers.Dense(units, use_bias=True) 
#     self.w2 = layers.Dense(units, use_bias=True) 
#     self.fd = layers.Dense(1)

#   def call(self, query, values):
#     query_with_time_axis = tf.expand_dims(query, 1)
    
#     score = self.fd(tf.nn.tanh(
#         self.w1(query_with_time_axis) + self.w2(values)))

#     attention_weights = softmax(score, axis=1)

#     context_vector = attention_weights * values
#     context_vector = tf.reduce_sum(context_vector, axis=1)

#     return context_vector, attention_weights

In [24]:
class Decoder():
  def __init__(self, output_vocab_size, embedding_dims, units):
    self.units = units
    self.output_vocab_size = output_vocab_size
    self.embedding_dims = embedding_dims

    self.embedding = layers.Embedding(self.output_vocab_size, self.embedding_dims)
    self.lstm_layer = layers.LSTM(self.units,
                                  return_sequences=True,
                                  return_state=True,
                                  recurrent_initializer='glorot_uniform')
    self.attention = layers.AdditiveAttention()
    self.dense1 = layers.Dense(self.units, activation=tanh)
    self.dense2 = layers.Dense(self.output_vocab_size)

  def call(self, inputs, en_outputs, state):
    embedding = self.embedding(inputs)
    dec_outputs, dec_h_state, dec_c_state = self.lstm_layer(
        embedding, initial_state=state)
    
    context_vector = self.attention([dec_outputs, en_outputs])
    
    context_and_rnn_output = tf.concat([context_vector, dec_outputs], axis=-1)

    attention_vector = self.dense1(dec_outputs)
    outputs = self.dense2(attention_vector)

    return outputs

In [25]:
encoder = Encoder(input_vocab_size, embed_dims, units)
en_outputs, en_h_state, en_c_state = encoder.call(input_batch)

en_outputs.shape, en_h_state.shape, en_c_state.shape

(TensorShape([64, 43, 1024]), TensorShape([64, 1024]), TensorShape([64, 1024]))

In [26]:
decoder = Decoder(target_vocab_size, embed_dims, units)
dec_outputs= decoder.call(target_batch, en_outputs, [en_h_state, en_c_state])

dec_outputs.shape

TensorShape([64, 43, 38392])

In [27]:
lr = 0.001
epochs = 30

optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
loss = tf.keras.losses.SparseCategoricalCrossentropy(
       from_logits=True, reduction='none')

In [28]:
class TranslatorModel:
  def __init__(self, input_vocab_size, 
               target_vocab_size, 
               embed_dims, 
               units):
    self.input_vocab_size = input_vocab_size
    self.target_vocab_size = target_vocab_size
    self.embed_dims = embed_dims
    self.units = units

    self.encoder = Encoder(self.input_vocab_size, self.embed_dims, self.units)
    self.decoder = Decoder(self.target_vocab_size, self.embed_dims, self.units)
  
  def build_model(self):
    en_inputs = layers.Input(shape=(None,))
    en_output, en_h_state, en_c_state = self.encoder.call(en_inputs)

    dec_outputs = self.decoder.call(en_inputs, en_output, [en_h_state, en_c_state])

    model = Model(inputs=[en_inputs], 
                  outputs=[dec_outputs])
    return model

In [29]:
translator_model = TranslatorModel(
    input_vocab_size,
    target_vocab_size,
    embed_dims,
    units,
)
model = translator_model.build_model()

model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=["accuracy"]
)

In [30]:
checkpoint_path = "/content/drive/MyDrive/translate/checkpoint/cp-{epoch:04d}.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)

callback_early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='loss',
    patience=3, 
    verbose=1)

callback_checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path, 
    monitor='loss', 
    verbose=1, 
    save_weights_only=True, 
    save_best_only=True)

callbacks = [callback_early_stopping,
             callback_checkpoint]

model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 embedding_2 (Embedding)        (None, None, 256)    9751296     ['input_1[0][0]']                
                                                                                                  
 embedding_3 (Embedding)        (None, None, 256)    9828352     ['input_1[0][0]']                
                                                                                                  
 lstm_2 (LSTM)                  [(None, None, 1024)  5246976     ['embedding_2[0][0]']            
                                , (None, 1024),                                               

In [31]:
model.fit(dataset,
          epochs=epochs,
          callbacks=callbacks,
          verbose=1)

Epoch 1/30
Epoch 1: loss improved from inf to 3.59139, saving model to /content/drive/MyDrive/translate/checkpoint/cp-0001.ckpt
Epoch 2/30
Epoch 2: loss improved from 3.59139 to 2.95033, saving model to /content/drive/MyDrive/translate/checkpoint/cp-0002.ckpt
Epoch 3/30
Epoch 3: loss improved from 2.95033 to 2.72330, saving model to /content/drive/MyDrive/translate/checkpoint/cp-0003.ckpt
Epoch 4/30
Epoch 4: loss improved from 2.72330 to 2.50896, saving model to /content/drive/MyDrive/translate/checkpoint/cp-0004.ckpt
Epoch 5/30
Epoch 5: loss improved from 2.50896 to 2.30182, saving model to /content/drive/MyDrive/translate/checkpoint/cp-0005.ckpt
Epoch 6/30
Epoch 6: loss improved from 2.30182 to 2.10127, saving model to /content/drive/MyDrive/translate/checkpoint/cp-0006.ckpt
Epoch 7/30
Epoch 7: loss improved from 2.10127 to 1.91352, saving model to /content/drive/MyDrive/translate/checkpoint/cp-0007.ckpt
Epoch 8/30
Epoch 8: loss improved from 1.91352 to 1.74769, saving model to /cont

<keras.callbacks.History at 0x7f15705eafd0>

In [32]:
# if use colab
saved_model_path = "saved_model/translation.h5"

# if use local env
# saved_model_path = "code/translate sentence/saved_model"
saved_model_dir = os.path.dirname(saved_model_path)

if os.path.exists(saved_model_dir):
  shutil.rmtree(saved_model_dir)
  
model.save(saved_model_path)

In [33]:
converter = tf.lite.TFLiteConverter.from_keras_model(model)
converter.target_spec.supported_ops = [
  tf.lite.OpsSet.TFLITE_BUILTINS,
  tf.lite.OpsSet.SELECT_TF_OPS
]
converter.experimental_new_converter = True

tflite_model = converter.convert()
open("translation.tflite", "wb").write(tflite_model)



INFO:tensorflow:Assets written to: /tmp/tmpt0l277ii/assets


INFO:tensorflow:Assets written to: /tmp/tmpt0l277ii/assets


281951664

In [34]:
# class Translator:
#   def __init__(self, model_path, input_tokenizer, target_tokenizer, maxlen):
#     self.input_tokenizer = input_tokenizer
#     self.target_tokenizer = target_tokenizer
#     self.maxlen = maxlen
#     self.model_path = model_path

#     self._load_model()

#   def _load_model(self):
#     self.model = tf.keras.models.load_model(self.model_path, compile=True)

#   def _normalize_and_preprocess(self, text):
#     punctuation = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
    
#     text = text.lower().strip()
#     text = text.replace(punctuation, "")

#     return text
    
#   def translate(self, sentence):
#     index_prediction = list()

#     normalize_sentence = self._normalize_and_preprocess(sentence)

#     sequences = input_tokenizer.texts_to_sequences([normalize_sentence])
#     sequences = pad_sequences(
#         sequences, maxlen=self.maxlen, padding="post", truncating="post")
    
#     predictions = self.model.predict(sequences)

#     for i in predictions[0]:
#       index_prediction.append(np.argmax(i))

#     marks = [start_mark, end_mark]
#     result = target_tokenizer.sequences_to_texts([index_prediction])[0]
#     result = " ".join([word for word in result.split(" ") if word not in marks])

#     return result

In [35]:
# translator = Translator(
#     saved_model_path,
#     input_tokenizer, 
#     target_tokenizer,
#     input_maxlen,
# )

In [36]:
# translate = translator.translate("i'm joking")
# translate