In [100]:
import tensorflow as tf
import subprocess
import os
import pandas as pd
import random
import numpy as np

from tensorflow.keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
GIT_DIR = "/content/IOH-Chat-App"
GIT_URL = "https://github.com/Bangkit-Capstone-Team/IOH-Chat-App.git"

if not os.path.exists(GIT_DIR):
  subprocess.call(["git", "clone", GIT_URL])

In [34]:
DATASET_DIR = "/content/IOH-Chat-App/Machine Learning/datasets/translate sentence/result/eng-ind.csv"

df = pd.read_csv(DATASET_DIR)

english_sentences = df.English.values
indonesia_sentences = df.Indonesia.values

print(f"Num of english sentence: {len(english_sentences)}")
print(f"Num of indonesia sentence: {len(indonesia_sentences)}")
print()
print(f"English example: {english_sentences[0]}")
print(f"Indonesia example: {indonesia_sentences[0]}")

Num of english sentence: 8819
Num of indonesia sentence: 8819

English example: Run!
Indonesia example: Lari!


In [42]:
def tokenizer(sentence, num_words):
  tokenizer = Tokenizer(num_words)
  tokenizer.fit_on_texts(sentence)

  return tokenizer

In [128]:
RANDOM_IDX = random.randint(0, 8000)

eng_tokenizer = tokenizer(english_sentences, 8000)
ind_tokenizer = tokenizer(indonesia_sentences, 8000)

eng_index_words = eng_tokenizer.index_word
ind_index_words = ind_tokenizer.index_word

eng_sequences = eng_tokenizer.texts_to_sequences(english_sentences)
ind_sequences = ind_tokenizer.texts_to_sequences(indonesia_sentences)

eng_maxlen = max([len(i) for i in english_sentences])
ind_maxlen = max([len(i) for i in indonesia_sentences])

eng_total_words = len(eng_index_words)
ind_total_words = len(ind_index_words)

print(f"English sentence: {english_sentences[RANDOM_IDX]}")
print(f"English sequences: {eng_sequences[RANDOM_IDX]}")
print()
print(f"Indonesia sentence: {indonesia_sentences[RANDOM_IDX]}")
print(f"Indonesia sequences: {ind_sequences[RANDOM_IDX]}")

English sentence: What is this?
English sequences: [17, 6, 12]

Indonesia sentence: Ini apa?
Indonesia sequences: [8, 9]


In [129]:
def pad_seqs(sequences, maxlen=None):
  return pad_sequences(sequences, maxlen=maxlen, padding='post', truncating="post")

In [130]:
eng_pad_seqs = pad_seqs(eng_sequences)
ind_pad_seqs = pad_seqs(ind_sequences)

ind_pad_seqs = ind_pad_seqs.reshape(*ind_pad_seqs.shape, 1)

print(f"English sentence: {english_sentences[RANDOM_IDX]}")
print(f"English sequences: {eng_pad_seqs[RANDOM_IDX]}")
print()
print(f"Indonesia sentence: {indonesia_sentences[RANDOM_IDX]}")
print(f"Indonesia sequences: {ind_pad_seqs[RANDOM_IDX]}")

English sentence: What is this?
English sequences: [17  6 12  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0]

Indonesia sentence: Ini apa?
Indonesia sequences: [[8]
 [9]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]]


In [131]:
EMBED_DIMS = 256
LR = 0.001
EPOCHS = 15

OPTIMIZER = tf.keras.optimizers.Adam(learning_rate=LR)
LOSS = tf.keras.losses.sparse_categorical_crossentropy

In [132]:
def build_model(input_shape, embed_dims, eng_total_words, ind_total_words):
  model = tf.keras.Sequential()

  model.add(layers.Embedding(eng_total_words, embed_dims, input_length=input_shape[1], input_shape=input_shape[1:]))
  model.add(layers.Bidirectional(layers.GRU(512)))
  model.add(layers.Dense(1024, activation=tf.nn.relu))
  model.add(layers.Dropout(0.3))
  model.add(layers.Dense(ind_total_words, activation=tf.nn.softmax))

  model.compile(
      optimizer=OPTIMIZER,
      loss=LOSS,
      metrics=["accuracy"]
  )

  model.summary()

  return model

In [133]:
x = pad_seqs(eng_pad_seqs, maxlen=ind_pad_seqs.shape[1])
x = x.reshape((-1, ind_pad_seqs.shape[-2], 1))

model = build_model(
    x.shape, 
    EMBED_DIMS, 
    eng_total_words + 1,
    ind_total_words + 1,
)

model.fit(
    x,
    ind_pad_seqs,
    batch_size=128, 
    epochs=EPOCHS, 
    validation_split=0.2,
    verbose=1,
)

ValueError: ignored