In [None]:
# !pip install tensorflow_text

In [None]:
import tensorflow as tf
import tensorflow_text as tf_text
import subprocess
import os
import pandas as pd
import random
import numpy as np

from tensorflow.keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Local checkout location and remote URL of the project repository; the
# dataset and checkpoint paths used later live under GIT_DIR.
GIT_DIR = "/content/IOH-Chat-App"
GIT_URL = "https://github.com/Bangkit-Capstone-Team/IOH-Chat-App.git"

# Clone only once. The destination is passed explicitly so the checkout lands
# in GIT_DIR regardless of the current working directory, and check_call
# raises CalledProcessError instead of silently continuing when the clone fails.
if not os.path.exists(GIT_DIR):
  subprocess.check_call(["git", "clone", GIT_URL, GIT_DIR])

In [None]:
def load_data(filedir):
  """Read the parallel-corpus CSV and return its two text columns as lists.

  Args:
    filedir: Path of a CSV file that has `English` and `Indonesia` columns.

  Returns:
    A `(english, indonesia)` tuple of Python lists with one entry per CSV row.
  """
  frame = pd.read_csv(filedir)
  english = frame["English"].tolist()
  indonesia = frame["Indonesia"].values.tolist()

  return english, indonesia

In [None]:
# CSV with the English/Indonesian sentence pairs inside the cloned repository.
DATASET_DIR = "/content/IOH-Chat-App/Machine Learning/datasets/translate sentence/result/eng-ind.csv"

eng_sentences, ind_sentences = load_data(DATASET_DIR)

# Report the corpus sizes and show the last sentence pair as a sanity check.
for report_line in (
    f"Num of english sentence: {len(eng_sentences)}",
    f"Num of indonesia sentence: {len(ind_sentences)}",
    "",
    f"English example: {eng_sentences[-1]}",
    f"Indonesia example: {ind_sentences[-1]}",
):
  print(report_line)

In [None]:
def normalize_and_preprocess(text):
  """Return a Unicode-normalized, lower-cased, whitespace-cleaned sentence.

  Args:
    text: A single sentence as a Python string.

  Returns:
    The sentence after `tf_text.normalize_utf8`, lower-casing, and whitespace
    clean-up: leading/trailing whitespace is removed and every internal run of
    whitespace (spaces, tabs, newlines) becomes a single space.
  """
  text = tf_text.normalize_utf8(text).numpy().decode()

  # str.split() without arguments splits on any whitespace run and ignores
  # leading/trailing whitespace, so re-joining with a space removes stray tabs
  # and newlines without gluing neighbouring words together. (Replacing the
  # literal "\t\n" only matched that exact two-character sequence.)
  return " ".join(text.lower().split())

In [None]:
# Clean every sentence and keep each corpus as a NumPy array.
eng_sentences = np.array([normalize_and_preprocess(sentence) for sentence in eng_sentences])
ind_sentences = np.array([normalize_and_preprocess(sentence) for sentence in ind_sentences])

In [None]:
def tokenizer(sentence, max_vocab):
  """Create a Keras `Tokenizer` with `num_words=max_vocab` and fit it.

  Args:
    sentence: The texts handed to `Tokenizer.fit_on_texts`.
    max_vocab: Value forwarded to `Tokenizer` as `num_words`.

  Returns:
    The fitted `Tokenizer` instance.
  """
  fitted = Tokenizer(num_words=max_vocab)
  fitted.fit_on_texts(sentence)
  return fitted

In [None]:
def pad_seqs(tokenizer, maxlen=None):
  """Pad and truncate sequences at their end using Keras `pad_sequences`.

  Args:
    tokenizer: The sequences to pad. Despite its name, this value is forwarded
      unchanged as the first argument of `pad_sequences`.
    maxlen: Target length forwarded to `pad_sequences`; `None` is passed
      through as-is.

  Returns:
    Whatever `pad_sequences` returns for these arguments.
  """
  options = {"maxlen": maxlen, "padding": "post", "truncating": "post"}
  return pad_sequences(tokenizer, **options)

In [None]:
# Passed to both tokenizers as `num_words`.
max_vocab = 8000

# tokenizer() already fits on the corpus it is given, so no additional
# fit_on_texts() call is needed here (a second call only re-counts the same
# sentences).
eng_tokenizer = tokenizer(eng_sentences, max_vocab)
ind_tokenizer = tokenizer(ind_sentences, max_vocab)

# Integer-encode every sentence of both corpora.
eng_encode_example = eng_tokenizer.texts_to_sequences(eng_sentences)
ind_encode_example = ind_tokenizer.texts_to_sequences(ind_sentences)

# id -> word lookup tables of the fitted tokenizers.
eng_vocab = eng_tokenizer.index_word
ind_vocab = ind_tokenizer.index_word

# Round-trip the encoded sequences back to text for inspection.
eng_decode_example = eng_tokenizer.sequences_to_texts(eng_encode_example)
ind_decode_example = ind_tokenizer.sequences_to_texts(ind_encode_example)

# Longest sentence per language measured in TOKENS. The lengths are taken from
# the encoded id sequences; the decoded examples are strings, so len() on them
# would count characters and inflate the padded length.
# NOTE(review): later cells pad both languages to eng_maxlen, so Indonesian
# sequences with more tokens than that are truncated there.
eng_maxlen = max(len(seq) for seq in eng_encode_example)
ind_maxlen = max(len(seq) for seq in ind_encode_example)


print(f"English sentence: {eng_decode_example[-1]}")
print(f"English sequences: {eng_encode_example[-1]}")
print()
print(f"Indonesia sentence: {ind_decode_example[-1]}")
print(f"Indonesia sequences: {ind_encode_example[-1]}")

In [None]:
# Pad/truncate both corpora to eng_maxlen time steps and convert them to tensors.
# NOTE(review): the Indonesian side also uses eng_maxlen, matching the single
# maxlen value that is passed to build_model().
eng_pad_seqs, ind_pad_seqs = [
    tf.convert_to_tensor(pad_seqs(encoded, maxlen=eng_maxlen))
    for encoded in (eng_encode_example, ind_encode_example)
]

# Show the last sentence pair next to its padded id sequence.
for report_line in (
    f"English sentence: {eng_decode_example[-1]}",
    f"English sequences: {eng_pad_seqs[-1]}",
    "",
    f"Indonesia sentence: {ind_decode_example[-1]}",
    f"Indonesia sequences: {ind_pad_seqs[-1]}",
):
  print(report_line)

In [None]:
# Training hyperparameters.
embed_dims, epochs, batch_size = 128, 30, 256
lr = 1e-4

# Loss function and optimizer instance; build_model() reads both from module scope.
loss = tf.keras.losses.sparse_categorical_crossentropy
optimizer = tf.keras.optimizers.Adam(learning_rate=lr)

In [None]:
def build_model(input_len, output_len, embed_dims, maxlen):
  """Build and compile the LSTM encoder-decoder translation model.

  The encoder embeds the source token ids and runs them through an LSTM whose
  final hidden and cell states initialise the decoder LSTM. The decoder embeds
  its own token ids, and every decoder time step is projected onto the target
  vocabulary with a softmax.

  Args:
    input_len: Source vocabulary size (`input_dim` of the encoder embedding).
    output_len: Target vocabulary size (`input_dim` of the decoder embedding
      and width of the softmax output layer).
    embed_dims: Output dimension of both embedding layers.
    maxlen: Number of time steps of both the encoder and the decoder input.

  Returns:
    The compiled `tf.keras.Model` that takes `[encoder_ids, decoder_ids]`,
    each of shape `(batch, maxlen)`, and outputs `(batch, maxlen, output_len)`
    probabilities.

  Note:
    The model is compiled with the module-level `optimizer` and `loss`, and
    its summary is printed as a side effect.
  """
  # Token ids enter as a flat (maxlen,) vector so that Embedding produces the
  # 3-D (batch, maxlen, embed_dims) tensor the LSTM layers expect.
  en_inputs = layers.Input(shape=(maxlen,))
  en_embedding = layers.Embedding(input_len, embed_dims)(en_inputs)

  # The encoder LSTM consumes the embedded tokens (not the raw ids); only its
  # final states are kept, they seed the decoder.
  en_lstm = layers.LSTM(512, return_state=True)
  _, state_h, state_c = en_lstm(en_embedding)
  en_states = [state_h, state_c]

  dec_inputs = layers.Input(shape=(maxlen,))
  dec_embedding = layers.Embedding(output_len, embed_dims)(dec_inputs)

  # The decoder likewise reads its embedded tokens and returns one output per
  # time step.
  dec_lstm = layers.LSTM(512, return_sequences=True, return_state=True)
  dec_outputs, _, _ = dec_lstm(dec_embedding, initial_state=en_states)
  x = layers.Dense(1024, activation=tf.nn.relu)(dec_outputs)
  x = layers.Dropout(.5)(x)
  outputs = layers.Dense(output_len, activation=tf.nn.softmax)(x)

  model = tf.keras.Model([en_inputs, dec_inputs], outputs)

  model.compile(
      optimizer=optimizer,
      loss=loss,
      metrics=["accuracy"]
  )

  model.summary()

  return model

In [None]:
# Checkpoint file pattern; "{epoch:04d}" is filled in with the epoch number.
CHECKPOINT_PATH = "/content/IOH-Chat-App/Machine Learning/code/translate sentence/training_checkpoints/cp-{epoch:04d}.ckpt"
CHECKPOINT_DIR = os.path.dirname(CHECKPOINT_PATH)

# Make sure the checkpoint directory exists before anything is written to it.
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

# save_best_only compares the monitored validation loss, which is only
# available at the end of an epoch. Checkpointing is therefore done per epoch;
# an integer save_freq counts batches and would fire before any validation
# metric exists.
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=CHECKPOINT_PATH,
    monitor="val_loss",
    save_weights_only=True,
    save_best_only=True,
    save_freq="epoch",
    verbose=1,
)

model = build_model(
    len(eng_vocab) + 1,
    len(ind_vocab) + 1,
    embed_dims,
    eng_maxlen,
)

# Store the untrained weights as the epoch-0 checkpoint.
model.save_weights(CHECKPOINT_PATH.format(epoch=0))

# Teacher forcing: the decoder input is the target shifted right by one step
# (id 0 acts as the start token), so at step t the decoder only sees target
# tokens before t. Feeding the unshifted target would hand the decoder the
# very token it is asked to predict.
dec_input_seqs = tf.pad(ind_pad_seqs[:, :-1], [[0, 0], [1, 0]])

history = model.fit(
    [eng_pad_seqs, dec_input_seqs],
    ind_pad_seqs,
    epochs=epochs,
    batch_size=batch_size,
    callbacks=[cp_callback],
    validation_split=0.2,
    verbose=1,
)