In [28]:
!pip install tensorflow_text



In [102]:
import tensorflow as tf
import tensorflow_text as tf_text
import subprocess
import os
import pandas as pd
import random
import shutil
import numpy as np

from tensorflow.keras import layers
from tensorflow.python.ops import math_ops
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [30]:
git_dir = "/content/IOH-Chat-App"
git_url = "https://github.com/Bangkit-Capstone-Team/IOH-Chat-App.git"

if not os.path.exists(git_dir):
  subprocess.call(["git", "clone", git_url])

In [31]:
def load_data(filedir):
  df = pd.read_csv(filedir)

  return df.English.tolist(), df.Indonesia.values.tolist()

In [32]:
dataset_dir = "/content/IOH-Chat-App/Machine Learning/datasets/translate sentence/result/eng-ind.csv"

eng_sentences, ind_sentences = load_data(dataset_dir)

print(f"Num of english sentence: {len(eng_sentences)}")
print(f"Num of indonesia sentence: {len(ind_sentences)}")
print()
print(f"English example: {eng_sentences[-1]}")
print(f"Indonesia example: {ind_sentences[-1]}")

Num of english sentence: 8819
Num of indonesia sentence: 8819

English example: If a person has not had a chance to acquire his target language by the time he's an adult, he's unlikely to be able to reach native speaker level in that language.
Indonesia example: Jika seseorang tidak berkesempatan untuk menguasai bahasa yang diinginkannya ketika menginjak dewasa, maka kecil kemungkinan ia akan bisa mencapai tingkatan penutur asli dalam bahasa tersebut.


In [33]:
def normalize_and_preprocess(text):
  text = tf_text.normalize_utf8(text).numpy().decode()
  text = text.lower().strip()
  text = text.replace("\t\n", "")

  return text

In [34]:
eng_sentences = np.array(list(map(normalize_and_preprocess, eng_sentences)))
ind_sentences = np.array(list(map(normalize_and_preprocess, ind_sentences)))

In [35]:
def tokenizer(sentence, max_vocab):
  tokenizer = Tokenizer(num_words=max_vocab)
  tokenizer.fit_on_texts(sentence)

  return tokenizer

In [36]:
def pad_seqs(tokenizer, maxlen=None):
  return pad_sequences(tokenizer, maxlen=maxlen, padding="post", truncating="post")

In [155]:
max_vocab = 8000

eng_tokenizer = tokenizer(eng_sentences, max_vocab)
ind_tokenizer = tokenizer(ind_sentences, max_vocab)

eng_tokenizer.fit_on_texts(eng_sentences)
ind_tokenizer.fit_on_texts(ind_sentences)

eng_encode_example = eng_tokenizer.texts_to_sequences(eng_sentences)
ind_encode_example = ind_tokenizer.texts_to_sequences(ind_sentences)

eng_vocab = eng_tokenizer.index_word
ind_vocab = ind_tokenizer.index_word

eng_decode_example = eng_tokenizer.sequences_to_texts(eng_encode_example)
ind_decode_example = ind_tokenizer.sequences_to_texts(ind_encode_example)

eng_maxlen = max([len(i)for i in eng_decode_example])
ind_maxlen = max([len(i)for i in ind_decode_example])

print(f"English sentence: {eng_decode_example[-1]}")
print(f"English sequences: {eng_encode_example[-1]}")
print()
print(f"Indonesia sentence: {ind_decode_example[-1]}")
print(f"Indonesia sequences: {ind_encode_example[-1]}")

English sentence: if a person has not had a chance to acquire his target language by the time he's an adult he's unlikely to be able to reach native speaker level in that language
English sequences: [70, 7, 448, 42, 28, 63, 7, 692, 5, 4087, 32, 4088, 660, 78, 3, 43, 127, 77, 4089, 127, 2249, 5, 25, 258, 5, 1290, 1641, 1623, 4090, 15, 11, 660]

Indonesia sentence: jika seseorang tidak berkesempatan untuk menguasai bahasa yang diinginkannya ketika menginjak dewasa maka kecil kemungkinan ia akan bisa mencapai tingkatan penutur asli dalam bahasa tersebut
Indonesia sequences: [119, 290, 3, 4869, 17, 1607, 71, 4, 4870, 158, 4871, 1165, 823, 242, 534, 80, 12, 15, 1719, 4872, 4873, 1160, 46, 71, 248]


In [156]:
batch_size = 128
buffer_size = len(eng_sentences)

eng_pad_seqs = tf.convert_to_tensor(pad_seqs(eng_encode_example, maxlen=eng_maxlen))
ind_pad_seqs = tf.convert_to_tensor(pad_seqs(ind_encode_example, maxlen=eng_maxlen))

eng_pad_seqs = tf.data.Dataset.from_tensor_slices((eng_pad_seqs, ind_pad_seqs)).shuffle(buffer_size).batch(batch_size)

# eng_pad_seqs = tf.reshape(eng_pad_seqs, (*eng_pad_seqs.shape, 1))
# ind_pad_seqs = tf.reshape(ind_pad_seqs, (*eng_pad_seqs.shape[:-1], 1))

for input, target in dataset.take(1):
  print(f"English sentence: {eng_decode_example[-1]}")
  print(f"English sequences: {input}")
  print()
  print(f"Indonesia sentence: {ind_decode_example[-1]}")
  print(f"Indonesia sequences: {target}")

English sentence: if a person has not had a chance to acquire his target language by the time he's an adult he's unlikely to be able to reach native speaker level in that language
English sequences: [[ 70 266   7 ...   0   0   0]
 [  2  30   5 ...   0   0   0]
 [215   5   3 ...   0   0   0]
 ...
 [ 53 135 117 ...   0   0   0]
 [110 442 727 ...   0   0   0]
 [ 41   6   7 ...   0   0   0]]

Indonesia sentence: jika seseorang tidak berkesempatan untuk menguasai bahasa yang diinginkannya ketika menginjak dewasa maka kecil kemungkinan ia akan bisa mencapai tingkatan penutur asli dalam bahasa tersebut
Indonesia sequences: [[119  19 880 ...   0   0   0]
 [  1  21 168 ...   0   0   0]
 [747  16 958 ...   0   0   0]
 ...
 [ 63 112   4 ...   0   0   0]
 [734   2 491 ...   0   0   0]
 [ 11  57   4 ...   0   0   0]]


In [157]:
embed_dims = 256
epochs = 5

optimizer = tf.keras.optimizers.Adam()
loss = tf.keras.losses.SparseCategoricalCrossentropy(
       from_logits=True, reduction='none')

In [158]:
def build_model(input_len, output_len, embed_dims, maxlen):
  # # Model Architecure 1
  # en_inputs = layers.Input(shape=(maxlen, 1))
  # en_embedding = layers.Embedding(input_len, embed_dims)(en_inputs)
  
  # en_lstm = layers.LSTM(512, return_state=True)
  # en_outputs, state_h, state_c = en_lstm(en_inputs)
  # en_states = [state_h, state_c]

  # dec_inputs = layers.Input(shape=(maxlen, 1))
  # dec_embedding = layers.Embedding(output_len, embed_dims)(dec_inputs)
  
  # dec_lstm = layers.LSTM(512, return_sequences=True, return_state=True)
  # dec_outputs, _, _ = dec_lstm(dec_inputs, initial_state=en_states)
  # x = layers.Dense(1024, activation=tf.nn.relu)(dec_outputs)
  # x = layers.Dropout(.5)(x)
  # outputs = layers.Dense(output_len, activation=tf.nn.softmax)(x)

  # model = Model([en_inputs, dec_inputs], outputs)

  # Model Architecure 2
  en_inputs = layers.Input(shape=(maxlen,))
  en_embedding = layers.Embedding(input_len, embed_dims)(en_inputs)
  
  en_gru = layers.GRU(512, return_sequences=True, return_state=True,)
  en_outputs, en_state = en_gru(en_embedding)

  dec_embedding = layers.Embedding(output_len, embed_dims)(en_inputs)
  
  dec_gru = layers.GRU(512, return_sequences=True, return_state=True)
  dec_outputs, dec_state = dec_gru(dec_embedding, initial_state=en_state)

  x = layers.Dense(512, activation=tf.nn.relu)(dec_outputs)
  x = layers.Dropout(.5)(x)
  x = layers.Dense(1024, activation=tf.nn.relu)(dec_outputs)
  x = layers.Dropout(.5)(x)
  outputs = layers.Dense(output_len, activation=tf.nn.softmax)(x)

  model = Model(en_inputs, outputs)

  model.compile(
      optimizer=optimizer,
      loss=loss,
      metrics=["accuracy"]
  )

  return model

In [159]:
checkpoint_path = "/content/IOH-Chat-App/Machine Learning/code/translate sentence/training_checkpoints/cp-{epoch:04d}.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)

cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path, 
    save_weights_only=True,
    save_best_only=True,
    save_freq=10,
    verbose=1, 
)

model = build_model(
    len(eng_vocab) + 1, 
    len(ind_vocab) + 1, 
    embed_dims,
    eng_maxlen,
)

model.summary()

model.save_weights(checkpoint_path.format(epoch=0))

Model: "model_7"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_29 (InputLayer)          [(None, 161)]        0           []                               
                                                                                                  
 embedding_60 (Embedding)       (None, 161, 256)     1047296     ['input_29[0][0]']               
                                                                                                  
 embedding_61 (Embedding)       (None, 161, 256)     1247744     ['input_29[0][0]']               
                                                                                                  
 gru_56 (GRU)                   [(None, 161, 512),   1182720     ['embedding_60[0][0]']           
                                 (None, 512)]                                               

In [None]:
model.fit(dataset,
          epochs=epochs,
          callbacks=[cp_callback],
          verbose=1)

Epoch 1/5
Epoch 2/5

In [None]:
saved_model_path = "/content/IOH-Chat-App/Machine Learning/code/translate sentence/saved_model/model.h5"
saved_model_dir = os.path.dirname(saved_model_path)

if os.path.exists(saved_model_dir):
  shutil.rmtree(saved_model_dir)

model.save(saved_model_path)

In [None]:
latest = tf.train.latest_checkpoint(checkpoint_dir)
latest

In [None]:
def load_model(model_path):
  model = tf.keras.models.load_model(model_path)

  en_outputs, en_state = model.layers[2].output
  en_model = Model(model.input[0], en_state)

  dec_state = layers.Input(shape=(512,))
  dec_inputs = dec_state

  dec_gru = model.layers[3]
  dec_outputs, dec_state= dec_gru(model.input[0], initial_state=dec_inputs)
  
  dec_dense1 = model.layers[4](dec_outputs)
  dec_dropout1 = model.layers[5](dec_dense1)
  dec_dense2 = model.layers[6](dec_dropout1)
  dec_dropout2 = model.layers[7](dec_dense2)

  dec_model = Model(model.input[0] + dec_inputs, 
                    [dec_dropout2] + dec_state)
  
  return en_model, dec_model

In [None]:
en_model, dec_model = load_model(saved_model_path)

In [None]:
def translate(text):
  tokens = list()

  sequences = eng_tokenizer.texts_to_sequences([text])
  sequences = tf.convert_to_tensor(pad_seqs(sequences))

  input = en_model.predict(sequences)
  target_seq = np.zeros((1, 1))
  
  for i in sequences:
    output_chars, h, c = dec_model.predict([target_seq] + input)
    char_index = np.argmax(output_chars)
    text_char = ind_tokenizer.index_word[char_index]
    tokens.append(text_char)

    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = char_index
    states_value = [h, c]
  
  sentence = " ".join(tokens)
  return sentence

In [None]:
translate("if a person has not had a chance to acquire his target language by the time")