In [71]:
import tensorflow as tf
import numpy as np 
import pandas as pd
import os
import subprocess
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [3]:
GIT_DIR = "IOH-Chat-App/"
GIT_URL = "https://github.com/Bangkit-Capstone-Team/IOH-Chat-App.git"

if not os.path.exists(GIT_DIR):
  subprocess.call(["git", "clone", GIT_URL])

In [10]:
def read_data(file_dir):
  sentences = list()
  with open(file_dir) as f:
    for sentence in f.readlines():
      sentences.append(sentence.replace("\n", ""))

  return sentences

In [53]:
SENTENCE_DIR = "/content/IOH-Chat-App/Machine Learning/datasets/text generation/sentence.txt"

corpus = read_data(SENTENCE_DIR)

print(f"Sentence example: {corpus[0]}")
print(f"Num of data: {len(corpus)}")

Sentence example: Aku bilang aku akan bicara pada Tom tentang hal itu.
Num of data: 588


In [54]:
NUM_WORDS = 2500

tokenizer = Tokenizer(num_words=NUM_WORDS)
tokenizer.fit_on_texts(corpus)
total_words = len(tokenizer.index_word) + 1

print(f"Max Length: {total_words}")
print(tokenizer.index_word)

Max Length: 1761
{1: 'saya', 2: 'yang', 3: 'tidak', 4: 'di', 5: 'dan', 6: 'untuk', 7: 'ini', 8: 'dengan', 9: 'aku', 10: 'tom', 11: 'aplikasi', 12: 'itu', 13: 'akan', 14: 'bisa', 15: 'apa', 16: 'ada', 17: 'kamu', 18: 'anda', 19: 'dia', 20: 'lebih', 21: 'tetapi', 22: 'adalah', 23: 'banyak', 24: 'ke', 25: 'dari', 26: 'pada', 27: 'sangat', 28: 'tahu', 29: 'benar', 30: 'dapat', 31: 'telah', 32: 'orang', 33: 'baru', 34: 'dalam', 35: 'masalah', 36: 'bahwa', 37: 'bagus', 38: 'hal', 39: 'memiliki', 40: 'beberapa', 41: 'saat', 42: 'waktu', 43: 'suka', 44: 'baik', 45: 'ketika', 46: 'satu', 47: 'kalau', 48: 'kami', 49: 'jika', 50: 'kita', 51: 'tapi', 52: 'teman', 53: 'ia', 54: 'mary', 55: 'sudah', 56: 'karena', 57: 'pergi', 58: 'setiap', 59: 'anak', 60: 'harus', 61: 'pesan', 62: 'kau', 63: 'hari', 64: 'setelah', 65: 'melakukan', 66: 'melihat', 67: 'fitur', 68: 'panggilan', 69: 'semua', 70: 'mungkin', 71: 'atau', 72: 'tahun', 73: 'seperti', 74: 'membuat', 75: 'ingin', 76: 'selama', 77: 'pernah', 78

In [55]:
def n_gram_sequences(corpus, tokenizer):
  sequences = list()

  for l in corpus:
    tokens = tokenizer.texts_to_sequences([l])[0]
    for i in range(2, len(tokens)):
      n_grams = tokens[:i+1]
      sequences.append(n_grams)

  return sequences

In [56]:
sequences = n_gram_sequences(corpus, tokenizer)
maxlen = max([len(i) for i in sequences])

print(f"Max word length: {maxlen}")

Max word length: 23


In [57]:
pad_seqs = np.array(pad_sequences(sequences, maxlen=maxlen))
print(pad_seqs[0])

[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   9 157   9]


In [65]:
def get_features_and_labels(sequences, n_classes):
  features = sequences[:, :-1]
  labels = sequences[:, -1]
  one_hot_labels = tf.keras.utils.to_categorical(labels, num_classes=n_classes)

  return features, one_hot_labels

In [70]:
features, labels = get_features_and_labels(pad_seqs, total_words)

print(features[0:5])

[[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   9 157]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   9 157   9]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    9 157   9  13]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   9
  157   9  13 299]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   9 157
    9  13 299  26]]


In [72]:
def build_model(n_classes, embedd_dims, maxlen):
  model = tf.keras.Sequential()

  model.add(layers.Embedding(n_classes, embedd_dims, input_length=maxlen-1))
  model.add(layers.Dropout(0.3))
  model.add(layers.Bidirectional(layers.GRU(128, return_sequences=True)))
  model.add(layers.Bidirectional(layers.GRU(256, return_sequences=True)))
  model.add(layers.Dropout(0.3))
  model.add(layers.GlobalMaxPooling1D())
  model.add(layers.Dense(1024, activation=tf.nn.relu))
  model.add(layers.Dense(n_classes, activation=tf.nn.softmax))

  return model

In [73]:
EPOCHS = 150
EMBED_DIMS = 64
LR = 0.001

OPTIMIZER = tf.keras.optimizers.Adam(learning_rate=LR)
LOSS = tf.keras.losses.categorical_crossentropy

CHECKPOINT_DIR = "/content/IOH-Chat-App/Machine Learning/datasets/text generation/training_checkpoints"
checkpoint_callback = model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=CHECKPOINT_DIR,
    save_weights_only=True,
    monitor='val_accuracy',
    mode='max',
    save_freq=10,
    save_best_only=True)

In [None]:
model = build_model(
    total_words, 
    EMBED_DIMS, 
    maxlen
)

model.compile(
    optimizer=OPTIMIZER,
    loss=LOSS,
    metrics=["accuracy"],
)

model.fit(
    features,
    labels,
    epochs=EPOCHS,
    callbacks=[checkpoint_callback],
    verbose=1,
)

model.load_weights(CHECKPOINT_DIR)

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150