In [None]:
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import os
import subprocess
import shutil

from tensorflow.keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
GIT_URL = "https://github.com/Bangkit-Capstone-Team/IOH-Chat-App.git"
# Absolute path, matching the "/content/IOH-Chat-App/..." paths used by the
# dataset and checkpoint cells of this notebook.
GIT_DIR = "/content/IOH-Chat-App"

# Remove any previous checkout so re-running this cell starts from a clean clone
# instead of failing because the target directory already exists.
if os.path.exists(GIT_DIR):
  shutil.rmtree(GIT_DIR)

# Clone explicitly into GIT_DIR (not into whatever the current working
# directory happens to be). check_call raises CalledProcessError if git fails,
# so a broken clone is reported here rather than as a missing file later on.
subprocess.check_call(["git", "clone", GIT_URL, GIT_DIR])

In [None]:
def load_dataset(file_dir):
  """Read a text file and return its lines as a list of strings.

  Args:
    file_dir: Path of the text file to read (one sentence per line).

  Returns:
    A list with one entry per line of the file, in file order, with the
    line terminator removed. Blank lines are kept as empty strings.

  Raises:
    OSError: If the file cannot be opened.
    UnicodeDecodeError: If the file is not valid UTF-8.
  """
  # Decode explicitly as UTF-8 so the result does not depend on the
  # platform's default locale encoding.
  with open(file_dir, encoding="utf-8") as f:
    # Iterating the handle streams line by line; each line carries at most
    # one trailing "\n" in text mode, which is all that needs stripping.
    return [sentence.rstrip("\n") for sentence in f]

In [None]:
# Location of the sentence corpus inside the cloned repository
# (despite the name, this is the path of a file, not of a directory).
DATASET_DIR = (
    "/content/IOH-Chat-App/Machine Learning/"
    "datasets/text generation/sentence.txt"
)

# Corpus as a list of sentences, one per line of the file.
ds = load_dataset(DATASET_DIR)

In [None]:
# Value handed to the Tokenizer as its `num_words` setting.
NUM_WORDS = 1000

# Fit the tokenizer's vocabulary on the whole corpus.
tokenizer = Tokenizer(num_words=NUM_WORDS)
tokenizer.fit_on_texts(ds)

# Mapping from integer index to word, as learned from the corpus.
index_to_word = tokenizer.index_word

# Number of learned entries plus one; used later as the number of classes.
# NOTE(review): the extra slot presumably accounts for index 0, which is not
# assigned to any word - confirm against the Tokenizer documentation.
total_words = 1 + len(index_to_word)

print(f"Data length: {total_words}")
print(f"Example: {index_to_word}")

In [None]:
def n_gram_sequences(sentence):
  """Expand every text into all of its token prefixes of length >= 2.

  Each item of `sentence` is converted to token ids with the notebook-level
  `tokenizer`; for a token list of length n this yields the prefixes of
  length 2, 3, ..., n. Texts with fewer than two tokens contribute nothing.

  Args:
    sentence: Iterable of texts (despite the singular name).

  Returns:
    A flat list of token-id lists, ordered by text and then by prefix length.
  """
  return [
      encoded[:end]
      for text in sentence
      # texts_to_sequences returns one sequence per input text; a single
      # text is passed, so this inner loop runs once per text.
      for encoded in tokenizer.texts_to_sequences([text])
      for end in range(2, len(encoded) + 1)
  ]

In [None]:
# All n-gram prefixes of the corpus, as lists of token ids.
sequences = n_gram_sequences(ds)
# Length of the longest n-gram; used below as the padded sequence length.
maxlen = max(map(len, sequences))

print(f"Sequences length: {len(sequences)}")
print(f"Max words length: {maxlen}")

In [None]:
def get_features_and_labels(sequences, n_classes):
  """Split 2-D token sequences into model inputs and one-hot targets.

  Args:
    sequences: 2-D array indexed as [row, position]; the last column of
      every row is taken as the label, everything before it as features.
    n_classes: Number of classes used for the one-hot encoding.

  Returns:
    A tuple `(features, one_hot_labels)`: all columns but the last, and the
    last column encoded with `tf.keras.utils.to_categorical`.
  """
  features, labels = sequences[:, :-1], sequences[:, -1]
  return features, tf.keras.utils.to_categorical(labels, num_classes=n_classes)

In [None]:
# Pad every n-gram to the same length so they stack into one 2-D array.
padded = pad_sequences(sequences, maxlen=maxlen)
sequences = np.array(padded)

# Last column becomes the (one-hot) label, the rest the input features.
features, labels = get_features_and_labels(sequences, total_words)

print(f"Features: {features[:5]}")

In [None]:
def build_model(n_classes, embed_dims, maxlen):
  """Assemble the (uncompiled) next-word prediction network.

  Architecture, in order: embedding, dropout, two stacked bidirectional GRUs
  that both return full sequences, dropout, global max pooling over the
  sequence axis, a 1024-unit ReLU dense layer, dropout, and a softmax output
  with one unit per class.

  Args:
    n_classes: Size of the embedding's input vocabulary and of the output layer.
    embed_dims: Dimensionality of the embedding vectors.
    maxlen: Sequence length including the label position; the embedding is
      declared with an input length of `maxlen - 1`.

  Returns:
    A `tf.keras.Sequential` model that has not been compiled.
  """
  input_steps = maxlen - 1

  return tf.keras.Sequential([
      layers.Embedding(n_classes, embed_dims, input_length=input_steps),
      layers.Dropout(0.3),
      layers.Bidirectional(layers.GRU(128, return_sequences=True)),
      layers.Bidirectional(layers.GRU(256, return_sequences=True)),
      layers.Dropout(0.3),
      layers.GlobalMaxPooling1D(),
      layers.Dense(1024, activation=tf.nn.relu),
      layers.Dropout(0.3),
      layers.Dense(n_classes, activation=tf.nn.softmax),
  ])

In [None]:
# Checkpoint file pattern; "{epoch:04d}" is filled in when a checkpoint is written.
CP_PATH = (
    "/content/IOH-Chat-App/Machine Learning/datasets/text generation/"
    "training_checkpoints/cp-{epoch:04d}.ckpt"
)
# Directory that holds the checkpoint files.
CP_DIR = os.path.dirname(CP_PATH)

# Training hyperparameters.
EMBED_DIMS = 64
EPOCHS = 150
LR = 1e-4

# Labels are one-hot encoded, hence categorical cross-entropy.
LOSS = tf.keras.losses.categorical_crossentropy
OPTIMIZER = tf.keras.optimizers.Adam(learning_rate=LR)

In [None]:
# Create the checkpoint directory up front. The directory has to be passed
# explicitly (a bare os.makedirs() raises TypeError), and exist_ok=True makes
# the cell safe to re-run.
os.makedirs(CP_DIR, exist_ok=True)

cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=CP_PATH,
    save_weights_only=True,
    # An integer save_freq counts batches, not epochs. Saving at epoch
    # boundaries keeps the "{epoch}" field of CP_PATH meaningful and compares
    # end-of-epoch metrics rather than partial running averages.
    save_freq="epoch",
    # model.fit in this notebook receives no validation data, so the default
    # "val_loss" monitor is never available and save_best_only would skip
    # every save. Track the training loss instead.
    monitor="loss",
    save_best_only=True,
)

In [None]:
model = build_model(total_words, EMBED_DIMS, maxlen)

# Store the untrained weights under the epoch-0 checkpoint name before fitting.
initial_checkpoint = CP_PATH.format(epoch=0)
model.save_weights(initial_checkpoint)

model.compile(optimizer=OPTIMIZER, loss=LOSS, metrics=["accuracy"])

# Train on the full set of n-gram features; checkpoints are handled by cp_callback.
model.fit(
    x=features,
    y=labels,
    epochs=EPOCHS,
    callbacks=[cp_callback],
    verbose=1,
)