In [1]:
!pip install tensorflow_text

Collecting tensorflow_text
  Downloading tensorflow_text-2.9.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.6 MB)
[K     |████████████████████████████████| 4.6 MB 5.1 MB/s 
[?25hCollecting tensorflow<2.10,>=2.9.0
  Downloading tensorflow-2.9.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (511.7 MB)
[K     |████████████████████████████████| 511.7 MB 3.9 kB/s 
Collecting gast<=0.4.0,>=0.2.1
  Downloading gast-0.4.0-py3-none-any.whl (9.8 kB)
Collecting keras<2.10.0,>=2.9.0rc0
  Downloading keras-2.9.0-py2.py3-none-any.whl (1.6 MB)
[K     |████████████████████████████████| 1.6 MB 39.1 MB/s 
Collecting tensorflow-estimator<2.10.0,>=2.9.0rc0
  Downloading tensorflow_estimator-2.9.0-py2.py3-none-any.whl (438 kB)
[K     |████████████████████████████████| 438 kB 70.3 MB/s 
Collecting tensorboard<2.10,>=2.9
  Downloading tensorboard-2.9.0-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 47.0 MB/s 
Collecting flatbuffers<2,>=1.12
  Down

In [2]:
import tensorflow as tf
import tensorflow_text as tf_text
import subprocess
import os
import pandas as pd
import random
import numpy as np

from tensorflow.keras import layers
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [3]:
git_dir = "/content/IOH-Chat-App"
git_url = "https://github.com/Bangkit-Capstone-Team/IOH-Chat-App.git"

if not os.path.exists(git_dir):
  subprocess.call(["git", "clone", git_url])

In [4]:
def load_data(filedir):
  df = pd.read_csv(filedir)

  return df.English.tolist(), df.Indonesia.values.tolist()

In [5]:
dataset_dir = "/content/IOH-Chat-App/Machine Learning/datasets/translate sentence/result/eng-ind.csv"

eng_sentences, ind_sentences = load_data(dataset_dir)

print(f"Num of english sentence: {len(eng_sentences)}")
print(f"Num of indonesia sentence: {len(ind_sentences)}")
print()
print(f"English example: {eng_sentences[-1]}")
print(f"Indonesia example: {ind_sentences[-1]}")

Num of english sentence: 8819
Num of indonesia sentence: 8819

English example: If a person has not had a chance to acquire his target language by the time he's an adult, he's unlikely to be able to reach native speaker level in that language.
Indonesia example: Jika seseorang tidak berkesempatan untuk menguasai bahasa yang diinginkannya ketika menginjak dewasa, maka kecil kemungkinan ia akan bisa mencapai tingkatan penutur asli dalam bahasa tersebut.


In [6]:
def normalize_and_preprocess(text):
  text = tf_text.normalize_utf8(text).numpy().decode()
  text = text.lower().strip()
  text = text.replace("\t\n", "")

  return text

In [7]:
eng_sentences = np.array(list(map(normalize_and_preprocess, eng_sentences)))
ind_sentences = np.array(list(map(normalize_and_preprocess, ind_sentences)))

In [8]:
def tokenizer(sentence, max_vocab):
  tokenizer = Tokenizer(num_words=max_vocab)
  tokenizer.fit_on_texts(sentence)

  return tokenizer

In [9]:
def pad_seqs(tokenizer, maxlen=None):
  return pad_sequences(tokenizer, maxlen=maxlen, padding="post", truncating="post")

In [10]:
max_vocab = 8000

eng_tokenizer = tokenizer(eng_sentences, max_vocab)
ind_tokenizer = tokenizer(ind_sentences, max_vocab)

eng_tokenizer.fit_on_texts(eng_sentences)
ind_tokenizer.fit_on_texts(ind_sentences)

eng_encode_example = eng_tokenizer.texts_to_sequences(eng_sentences)
ind_encode_example = ind_tokenizer.texts_to_sequences(ind_sentences)

eng_vocab = eng_tokenizer.index_word
ind_vocab = ind_tokenizer.index_word

eng_decode_example = eng_tokenizer.sequences_to_texts(eng_encode_example)
ind_decode_example = ind_tokenizer.sequences_to_texts(ind_encode_example)

eng_maxlen = max([len(i)for i in eng_decode_example])
ind_maxlen = max([len(i)for i in ind_decode_example])


print(f"English sentence: {eng_decode_example[-1]}")
print(f"English sequences: {eng_encode_example[-1]}")
print()
print(f"Indonesia sentence: {ind_decode_example[-1]}")
print(f"Indonesia sequences: {ind_encode_example[-1]}")

English sentence: if a person has not had a chance to acquire his target language by the time he's an adult he's unlikely to be able to reach native speaker level in that language
English sequences: [70, 7, 448, 42, 28, 63, 7, 692, 5, 4087, 32, 4088, 660, 78, 3, 43, 127, 77, 4089, 127, 2249, 5, 25, 258, 5, 1290, 1641, 1623, 4090, 15, 11, 660]

Indonesia sentence: jika seseorang tidak berkesempatan untuk menguasai bahasa yang diinginkannya ketika menginjak dewasa maka kecil kemungkinan ia akan bisa mencapai tingkatan penutur asli dalam bahasa tersebut
Indonesia sequences: [119, 290, 3, 4869, 17, 1607, 71, 4, 4870, 158, 4871, 1165, 823, 242, 534, 80, 12, 15, 1719, 4872, 4873, 1160, 46, 71, 248]


In [11]:
eng_pad_seqs = tf.convert_to_tensor(pad_seqs(eng_encode_example, maxlen=eng_maxlen))
ind_pad_seqs = tf.convert_to_tensor(pad_seqs(ind_encode_example, maxlen=eng_maxlen))

# eng_pad_seqs = tf.reshape(eng_pad_seqs, (*eng_pad_seqs.shape, 1))
# ind_pad_seqs = tf.reshape(ind_pad_seqs, (*eng_pad_seqs.shape[:-1], 1))

print(f"English sentence: {eng_decode_example[-1]}")
print(f"English sequences: {eng_pad_seqs[-1]}")
print()
print(f"Indonesia sentence: {ind_decode_example[-1]}")
print(f"Indonesia sequences: {ind_pad_seqs[-1]}")

English sentence: if a person has not had a chance to acquire his target language by the time he's an adult he's unlikely to be able to reach native speaker level in that language
English sequences: [  70    7  448   42   28   63    7  692    5 4087   32 4088  660   78
    3   43  127   77 4089  127 2249    5   25  258    5 1290 1641 1623
 4090   15   11  660    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0

In [12]:
embed_dims = 128
lr = 1e-4
epochs = 30
batch_size = 256

optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
loss = tf.keras.losses.sparse_categorical_crossentropy

In [13]:
def build_model(input_len, output_len, embed_dims, maxlen):
  # Model Architecure 1

  en_inputs = layers.Input(shape=(maxlen, 1))
  en_embedding = layers.Embedding(input_len, embed_dims)(en_inputs)
  
  en_lstm = layers.LSTM(512, return_state=True)
  en_outputs, state_h, state_c = en_lstm(en_inputs)
  en_states = [state_h, state_c]

  dec_inputs = layers.Input(shape=(maxlen, 1))
  dec_embedding = layers.Embedding(output_len, embed_dims)(dec_inputs)
  
  dec_lstm = layers.LSTM(512, return_sequences=True, return_state=True)
  dec_outputs, _, _ = dec_lstm(dec_inputs, initial_state=en_states)
  x = layers.Dense(1024, activation=tf.nn.relu)(dec_outputs)
  x = layers.Dropout(.5)(x)
  outputs = layers.Dense(output_len, activation=tf.nn.softmax)(x)

  model = Model([en_inputs, dec_inputs], outputs)

  # Model Architecure 2

  # en_inputs = layers.Input(shape=(maxlen, 1))
  # en_embedding = layers.Embedding(input_len, embed_dims)(en_inputs)
  
  # en_lstm = layers.LSTM(512, return_state=True)
  # en_outputs, state_h, state_c = en_lstm(en_inputs)
  # en_states = [state_h, state_c]

  # dec_inputs = layers.Input(shape=(maxlen, 1))
  # dec_embedding = layers.Embedding(output_len, embed_dims)(dec_inputs)
  
  # dec_lstm = layers.LSTM(512, return_sequences=True, return_state=True)
  # dec_outputs, _, _ = dec_lstm(dec_inputs, initial_state=en_states)
  # x = layers.Dense(512, activation=tf.nn.relu)(dec_outputs)
  # x = layers.Dropout(.5)(x)
  # x = layers.Dense(1024, activation=tf.nn.relu)(dec_outputs)
  # x = layers.Dropout(.5)(x)
  # outputs = layers.Dense(output_len, activation=tf.nn.softmax)(x)

  # model = Model([en_inputs, dec_inputs], outputs)

  # Model Architecure 3

  # model.add(layers.Embedding(input_len, embed_dims, input_length=input_length, mask_zero=True))
  # model.add(layers.Dropout(0.5))
  # model.add(layers.Bidirectional(layers.GRU(512, return_sequences=True)))
  # model.add(layers.GRU(512, return_sequences=True))
  # model.add(layers.Dense(512, activation=tf.nn.relu))
  # model.add(layers.Dropout(0.5))
  # model.add(layers.Dense(1024, activation=tf.nn.relu))
  # model.add(layers.Dropout(0.5))
  # model.add(layers.Dense(output_len, activation=tf.nn.softmax))

  model.compile(
      optimizer=optimizer,
      loss=loss,
      metrics=["accuracy"]
  )

  model.summary()

  return model

In [None]:
checkpoint_path = "/content/IOH-Chat-App/Machine Learning/code/translate sentence/training_checkpoints/cp-{epoch:04d}.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)

cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path, 
    save_weights_only=True,
    save_best_only=True,
    save_freq=5,
    verbose=1, 
)

model = build_model(
    len(eng_vocab) + 1, 
    len(ind_vocab) + 1, 
    embed_dims,
    eng_maxlen,
)

model.save_weights(checkpoint_path.format(epoch=0))

history=model.fit([eng_pad_seqs, ind_pad_seqs], 
                  ind_pad_seqs,
                  epochs=epochs,
                  batch_size=batch_size,
                  callbacks=[cp_callback],
                  validation_split=0.2,
                  verbose=1)

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 161, 1)]     0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 161, 1)]     0           []                               
                                                                                                  
 lstm (LSTM)                    [(None, 512),        1052672     ['input_1[0][0]']                
                                 (None, 512),                                                     
                                 (None, 512)]                                                     
                                                                                              

In [None]:
saved_model_dir = "/content/IOH-Chat-App/Machine Learning/code/translate sentence/saved_model"

model.save(saved_model_dir)

In [None]:
latest = tf.train.latest_checkpoint(checkpoint_dir)
latest