In [1]:
# !pip install tensorflow_text

In [2]:
import tensorflow as tf
import tensorflow_text as tf_text
import subprocess
import os
import pandas as pd
import random
import numpy as np

from tensorflow.keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [3]:
GIT_DIR = "/content/IOH-Chat-App"
GIT_URL = "https://github.com/Bangkit-Capstone-Team/IOH-Chat-App.git"

if not os.path.exists(GIT_DIR):
  subprocess.call(["git", "clone", GIT_URL])

In [36]:
def load_data(filedir):
  df = pd.read_csv(filedir)

  return df.English.tolist(), df.Indonesia.values.tolist()

In [37]:
DATASET_DIR = "/content/IOH-Chat-App/Machine Learning/datasets/translate sentence/result/eng-ind.csv"

eng_sentences, ind_sentences = load_data(DATASET_DIR)

print(f"Num of english sentence: {len(eng_sentences)}")
print(f"Num of indonesia sentence: {len(ind_sentences)}")
print()
print(f"English example: {eng_sentences[-1]}")
print(f"Indonesia example: {ind_sentences[-1]}")

Num of english sentence: 8819
Num of indonesia sentence: 8819

English example: If a person has not had a chance to acquire his target language by the time he's an adult, he's unlikely to be able to reach native speaker level in that language.
Indonesia example: Jika seseorang tidak berkesempatan untuk menguasai bahasa yang diinginkannya ketika menginjak dewasa, maka kecil kemungkinan ia akan bisa mencapai tingkatan penutur asli dalam bahasa tersebut.


In [38]:
def normalize_and_preprocess(text):
  text = tf_text.normalize_utf8(text).numpy().decode()
  text = text.lower().strip()
  text = text.replace("\t\n", "")

  return text

In [40]:
eng_sentences = np.array(list(map(normalize_and_preprocess, eng_sentences)))
ind_sentences = np.array(list(map(normalize_and_preprocess, ind_sentences)))

run!


In [69]:
def tokenizer(sentence, max_vocab):
  tokenizer = Tokenizer(num_words=max_vocab)
  tokenizer.fit_on_texts(sentence)

  return tokenizer

In [70]:
def pad_seqs(tokenizer, maxlen=None):
  return pad_sequences(tokenizer, maxlen=maxlen, padding="post", truncating="post")

In [93]:
max_vocab = 8000

eng_tokenizer = tokenizer(eng_sentences, max_vocab)
ind_tokenizer = tokenizer(ind_sentences, max_vocab)

eng_tokenizer.fit_on_texts(eng_sentences)
ind_tokenizer.fit_on_texts(ind_sentences)

eng_encode_example = eng_tokenizer.texts_to_sequences(eng_sentences)
ind_encode_example = ind_tokenizer.texts_to_sequences(ind_sentences)

eng_decode_example = eng_tokenizer.sequences_to_texts(eng_encode_example)
ind_decode_example = ind_tokenizer.sequences_to_texts(ind_encode_example)

maxlen = max([len(i)for i in eng_decode_example])

print(f"English sentence: {eng_decode_example[-1]}")
print(f"English sequences: {eng_encode_example[-1]}")
print()
print(f"Indonesia sentence: {ind_decode_example[-1]}")
print(f"Indonesia sequences: {ind_encode_example[-1]}")

English sentence: if a person has not had a chance to acquire his target language by the time he's an adult he's unlikely to be able to reach native speaker level in that language
English sequences: [70, 7, 448, 42, 28, 63, 7, 692, 5, 4087, 32, 4088, 660, 78, 3, 43, 127, 77, 4089, 127, 2249, 5, 25, 258, 5, 1290, 1641, 1623, 4090, 15, 11, 660]

Indonesia sentence: jika seseorang tidak berkesempatan untuk menguasai bahasa yang diinginkannya ketika menginjak dewasa maka kecil kemungkinan ia akan bisa mencapai tingkatan penutur asli dalam bahasa tersebut
Indonesia sequences: [119, 290, 3, 4869, 17, 1607, 71, 4, 4870, 158, 4871, 1165, 823, 242, 534, 80, 12, 15, 1719, 4872, 4873, 1160, 46, 71, 248]


In [103]:
eng_pad_seqs = tf.convert_to_tensor(pad_seqs(eng_encode_example, maxlen=maxlen))
ind_pad_seqs = tf.convert_to_tensor(pad_seqs(ind_encode_example, maxlen=maxlen))

print(f"English sentence: {eng_decode_example[-1]}")
print(f"English sequences: {eng_pad_seqs[-1]}")
print()
print(f"Indonesia sentence: {ind_decode_example[-1]}")
print(f"Indonesia sequences: {ind_pad_seqs[-1]}")

English sentence: if a person has not had a chance to acquire his target language by the time he's an adult he's unlikely to be able to reach native speaker level in that language
English sequences: [  70    7  448   42   28   63    7  692    5 4087   32 4088  660   78
    3   43  127   77 4089  127 2249    5   25  258    5 1290 1641 1623
 4090   15   11  660    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0

In [None]:
EMBED_DIMS = 256
LR = 1e-4
EPOCHS = 200
BATCH_SIZE = 256

OPTIMIZER = tf.keras.optimizers.Adam(learning_rate=LR)
LOSS = tf.keras.losses.sparse_categorical_crossentropy

In [None]:
def build_model(input_len, output_len, embed_dims, input_length):

  model=tf.keras.Sequential()

  model.add(layers.Embedding(input_len, embed_dims, input_length=input_length, mask_zero=True))
  model.add(layers.Dropout(0.5))
  model.add(layers.Bidirectional(layers.GRU(512, return_sequences=True)))
  model.add(layers.GRU(512, return_sequences=True))
  model.add(layers.Dense(512, activation=tf.nn.relu))
  model.add(layers.Dropout(0.5))
  model.add(layers.Dense(1024, activation=tf.nn.relu))
  model.add(layers.Dropout(0.5))
  model.add(layers.Dense(output_len, activation=tf.nn.softmax))

  model.compile(
      optimizer=OPTIMIZER,
      loss=LOSS,
      metrics=["accuracy"]
  )

  model.summary()

  return model

In [None]:
CHECKPOINT_PATH = "/content/IOH-Chat-App/Machine Learning/code/translate sentence/training_checkpoints/cp-{epoch:04d}.ckpt"
CHECKPOINT_DIR = os.path.dirname(CHECKPOINT_PATH)

cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=CHECKPOINT_PATH, 
    save_weights_only=True,
    save_best_only=True,
    verbose=1, 
)

model = build_model(
    eng_total_words, 
    ind_total_words, 
    EMBED_DIMS,
    maxlen,
)

model.save_weights(CHECKPOINT_PATH.format(epoch=0))

history=model.fit(eng_pad_seqs, 
                  ind_pad_seqs,
                  epochs=EPOCHS,
                  batch_size=BATCH_SIZE,
                  callbacks=[cp_callback],
                  verbose=1)