In [8]:
!pip install tensorflow_text

Collecting tensorflow_text
  Using cached tensorflow_text-2.9.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.6 MB)
Collecting tensorflow<2.10,>=2.9.0
  Downloading tensorflow-2.9.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (511.7 MB)
[K     |████████████████████████████████| 511.7 MB 4.0 kB/s 
[?25hCollecting tensorflow-estimator<2.10.0,>=2.9.0rc0
  Downloading tensorflow_estimator-2.9.0-py2.py3-none-any.whl (438 kB)
[K     |████████████████████████████████| 438 kB 56.7 MB/s 
Collecting flatbuffers<2,>=1.12
  Downloading flatbuffers-1.12-py2.py3-none-any.whl (15 kB)
Collecting tensorboard<2.10,>=2.9
  Downloading tensorboard-2.9.0-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 44.5 MB/s 
Collecting keras<2.10.0,>=2.9.0rc0
  Downloading keras-2.9.0-py2.py3-none-any.whl (1.6 MB)
[K     |████████████████████████████████| 1.6 MB 49.9 MB/s 
Collecting gast<=0.4.0,>=0.2.1
  Downloading gast-0.4.0-py3-none-any.whl (9.8 kB)
Insta

In [63]:
import tensorflow as tf
import tensorflow_text as tf_text
import subprocess
import os
import pandas as pd
import random
import shutil
import numpy as np

from tensorflow.keras import layers
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [64]:
git_dir = "/content/IOH-Chat-App"
git_url = "https://github.com/Bangkit-Capstone-Team/IOH-Chat-App.git"

if not os.path.exists(git_dir):
  subprocess.call(["git", "clone", git_url])

In [107]:
class TranslatorDataset:
  
  def __init__(self, filedir):
    self.filedir = filedir
    self.input_tokenizer = None
    self.target_tokenizer = None
    self._load_data_from_file()

  def _load_data_from_file(self):
    df = pd.read_csv(self.filedir)

    self.input_lang = df.English.tolist()
    self.target_lang = df.Indonesia.values.tolist()

  def _normalize_and_preprocess(self, text):
    text = tf_text.normalize_utf8(text).numpy().decode()
    text = text.lower().strip()
    text = text.replace("\t\n", "")

    return text

  def _create_dataset(self):
    self.input_lang = np.array(list(map(self._normalize_and_preprocess, self.input_lang)))
    self.target_lang = np.array(list(map(self._normalize_and_preprocess, self.target_lang)))

    return self.input_lang, self.target_lang

  def _tokenize(self, sentence, num_words):
    tokenizer = Tokenizer(num_words=num_words)
    tokenizer.fit_on_texts(sentence)

    sequences = tokenizer.texts_to_sequences(sentence)

    maxlen = max([len(i) for i in sentence])
    sequences = pad_sequences(sequences, maxlen, padding="post")

    return sequences, tokenizer

  def _load_dataset(self, num_words):
    input_lang, target_lang = self._create_dataset()

    input_sequences, input_tokenizer = self._tokenize(input_lang, num_words)
    target_sequences, target_tokenizer = self._tokenize(target_lang, num_words)

    return (input_sequences, input_tokenizer), (target_sequences, target_tokenizer)
  
  def call(self, num_words, batch_size, buffer_size):
    input, target = self._load_dataset(num_words)

    input_sequences, self.input_tokenizer = input
    target_sequences, self.target_tokenizer = target

    dataset = tf.data.Dataset.from_tensor_slices((eng_pad_seqs, ind_pad_seqs))
    dataset = dataset.shuffle(buffer_size).batch(batch_size, drop_remainder=True)

    return self.input_tokenizer, self.target_tokenizer, dataset

In [108]:
buffer_size = 8000
batch_size = 128
num_words = 500

dataset_dir = "/content/IOH-Chat-App/Machine Learning/datasets/translate sentence/result/eng-ind.csv"

translator_dataset = TranslatorDataset(dataset_dir)
input_tokenizer, target_tokenizer, dataset = translator_dataset.call(num_words, 
                                                                     batch_size, 
                                                                     buffer_size)

input_batch, target_batch = next(iter(dataset))
input_batch.shape, target_batch.shape

(TensorShape([128, 161]), TensorShape([128, 161]))

In [111]:
input_vocab_size = len(input_tokenizer.word_index) + 1
target_vocab_size = len(target_tokenizer.word_index) + 1
input_maxlen = input_batch.shape[1]
output_maxlen = target_batch.shape[1]

input_maxlen, output_maxlen, input_vocab_size, target_vocab_size

(161, 161, 4091, 4874)

In [60]:
example_input_batch, example_target_batch = next(iter(dataset))
example_input_batch.shape, example_target_batch.shape

(TensorShape([128, 161]), TensorShape([128, 161]))

In [11]:
embed_dims = 256
epochs = 5

optimizer = tf.keras.optimizers.Adam()
loss = tf.keras.losses.SparseCategoricalCrossentropy(
       from_logits=True, reduction='none')

In [58]:
def build_model(input_len, output_len, embed_dims, maxlen):
  # # Model Architecure 1
  # en_inputs = layers.Input(shape=(maxlen, 1))
  # en_embedding = layers.Embedding(input_len, embed_dims)(en_inputs)
  
  # en_lstm = layers.LSTM(512, return_state=True)
  # en_outputs, state_h, state_c = en_lstm(en_inputs)
  # en_states = [state_h, state_c]

  # dec_inputs = layers.Input(shape=(maxlen, 1))
  # dec_embedding = layers.Embedding(output_len, embed_dims)(dec_inputs)
  
  # dec_lstm = layers.LSTM(512, return_sequences=True, return_state=True)
  # dec_outputs, _, _ = dec_lstm(dec_inputs, initial_state=en_states)
  # x = layers.Dense(1024, activation=tf.nn.relu)(dec_outputs)
  # x = layers.Dropout(.5)(x)
  # outputs = layers.Dense(output_len, activation=tf.nn.softmax)(x)

  # model = Model([en_inputs, dec_inputs], outputs)

  # Model Architecure 2
  en_inputs = layers.Input(shape=(maxlen,))
  en_embedding = layers.Embedding(input_len, embed_dims)(en_inputs)
  
  en_lstm = layers.LSTM(512, return_sequences=True, return_state=True, 
                        recurrent_initializer='glorot_uniform')
  en_outputs, en_h_state, en_c_state = en_lstm(en_embedding)

  dec_inputs = layers.Input(shape=(maxlen,))
  dec_embedding = layers.Embedding(output_len, embed_dims)(dec_inputs)
  
  dec_gru = layers.GRU(512, return_sequences=True, return_state=True)
  dec_outputs, dec_state = dec_gru(dec_embedding, initial_state=en_state)

  attention_layer = layers.Attention()
  attention_out, attention_states = attention_layer([en_outputs, en_state],
                                                   [dec_outputs, dec_state])
  
  rnn_output = layers.concatenate([dec_outputs, attention_out])

  x = layers.Dense(512, activation=tf.nn.relu)(rnn_output)
  x = layers.Dropout(.5)(x)
  x = layers.Dense(1024, activation=tf.nn.relu)(x)
  x = layers.Dropout(.5)(x)
  outputs = layers.Dense(output_len, activation=tf.nn.softmax)(x)

  model = Model(en_inputs, outputs)

  # Model Architecure 3
  # model = tf.keras.Sequential()
  
  # model.add(layers.Input(shape=(maxlen,)))
  # model.add(layers.Embedding(input_len, embed_dims))
  # model.add(layers.GRU(512, return_sequences=True, return_state=True))
  # model.add(layers.Embedding(output_len, embed_dims))
  # model.add(layers.GRU(512, return_sequences=True, return_state=True))  
  # model.add(layers.Dense(512, activation=tf.nn.relu))
  # model.add(layers.Dropout(.5))
  # model.add(layers.Dense(1024, activation=tf.nn.relu))
  # model.add(layers.Dropout(.5))
  # model.add(layers.Dense(output_len, activation=tf.nn.softmax))

  model.compile(
      optimizer=optimizer,
      loss=loss,
      metrics=["accuracy"]
  )

  return model

In [59]:
checkpoint_path = "/content/IOH-Chat-App/Machine Learning/code/translate sentence/training_checkpoints/cp-{epoch:04d}.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)

cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path, 
    save_weights_only=True,
    save_best_only=True,
    save_freq=10,
    verbose=1, 
)

model = build_model(
    len(eng_vocab) + 1, 
    len(ind_vocab) + 1, 
    embed_dims,
    eng_maxlen,
)

model.summary()

model.save_weights(checkpoint_path.format(epoch=0))

TypeError: ignored

In [40]:
model.fit(dataset,
          epochs=epochs,
          callbacks=[cp_callback],
          verbose=1)

Epoch 1/5
12/69 [====>.........................] - ETA: 25:22 - loss: 7.9313 - accuracy: 0.8862

KeyboardInterrupt: ignored

In [2]:
saved_model_path = "/content/IOH-Chat-App/Machine Learning/code/translate sentence/saved_model/model.h5"
saved_model_dir = os.path.dirname(saved_model_path)

if os.path.exists(saved_model_dir):
  shutil.rmtree(saved_model_dir)
else:
  model.save(saved_model_path)

NameError: ignored

In [3]:
latest = tf.train.latest_checkpoint(checkpoint_dir)
latest

NameError: ignored

In [None]:
def load_model(model_path):
  model = tf.keras.models.load_model(model_path)

  en_outputs, en_state = model.layers[3].output
  en_model = Model(model.input[0], en_state)

  dec_state = layers.Input(shape=(512,))
  dec_inputs = dec_state

  dec_gru = model.layers[4]
  dec_outputs, dec_state= dec_gru(model.input[0], initial_state=dec_inputs)
  
  dec_dense1 = model.layers[5](dec_outputs)
  dec_dropout1 = model.layers[6](dec_dense1)
  dec_dense2 = model.layers[7](dec_dropout1)
  dec_dropout2 = model.layers[8](dec_dense2)
  output = model.layers[9](dec_dense2)

  dec_model = Model(model.input[0] + dec_inputs, 
                    [output] + dec_state)
  
  return en_model, dec_model

In [None]:
en_model, dec_model = load_model(saved_model_path)

In [None]:
def translate(text):
  tokens = list()

  sequences = eng_tokenizer.texts_to_sequences([text])
  sequences = tf.convert_to_tensor(pad_seqs(sequences))

  input = en_model.predict(sequences)
  target_seq = np.zeros((1, 1))
  
  for i in sequences:
    output_chars, h, c = dec_model.predict([target_seq] + input)
    char_index = np.argmax(output_chars)
    text_char = ind_tokenizer.index_word[char_index]
    tokens.append(text_char)

    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = char_index
    states_value = [h, c]
  
  sentence = " ".join(tokens)
  return sentence

In [None]:
translate("if a person has not had a chance to acquire his target language by the time")