In [None]:
# ============================================================
# Chatbot Seq2Seq — LSTM (Keras, Colab-ready)
# Dataset: /content/drive/MyDrive/Semester 7/NLP/intents.json
#  - Encoder: LSTM
#  - Decoder: LSTM (teacher forcing)
#  - Evaluasi: Exact Match, BLEU-1/2/4
# ============================================================

import os, re, json, random, pickle, numpy as np
from pathlib import Path

# ---- 0) Colab: Mount Google Drive ----
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# (opsional) BLEU evaluator
!pip -q install nltk
import nltk
nltk.download('punkt', quiet=True)

# ---- 1) Konfigurasi umum ----
SEED = 42
random.seed(SEED); np.random.seed(SEED)

DATA_PATH    = "/content/drive/MyDrive/Semester 7/NLP/train-SQuAD-id.json"
ARTIFACT_DIR = "/content/artifacts_seq2seq"
CKPT_DIR     = "/content/checkpoints_seq2seq"
os.makedirs(ARTIFACT_DIR, exist_ok=True)
os.makedirs(CKPT_DIR, exist_ok=True)

In [None]:
# ---- 2) TensorFlow & Keras (pakai versi yang ada di Colab) ----
import tensorflow as tf
tf.random.set_seed(SEED)
print("TF version:", tf.__version__)

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Reshape
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

TF version: 2.19.0


In [None]:
# ============================================================
# 3) Load dataset intents.json -> Buat pasangan (src, tgt)
#     src = pattern, tgt = response
# ============================================================
with open(DATA_PATH, "r", encoding="utf-8") as f:
    data_json = json.load(f)

pairs = []   # (src_text (question), tgt_text (answer))

def clean_text(s: str) -> str:
    s = str(s).lower()
    # izinkan karakter basic + tanda baca, hapus back-to-back whitespace
    s = re.sub(r"[^a-z0-9\sÀ-ÿ',?.!/-]", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

# iterate through SQuAD-like structure
for doc in data_json.get("data", []):
    for para in doc.get("paragraphs", []):
        context = para.get("context", "")
        # iterate qas
        for qa in para.get("qas", []):
            question = qa.get("question", "").strip()
            answers = qa.get("answers", []) or []
            if not question or not answers:
                continue
            # ambil jawaban pertama (bisa juga random atau semua, tapi kita pilih pertama)
            answer_text = answers[0].get("text", "").strip()
            if not answer_text:
                continue
            # clean question & answer
            q_clean = clean_text(question)
            a_clean = clean_text(answer_text)
            if q_clean == "" or a_clean == "":
                continue
            pairs.append((q_clean, a_clean))

if not pairs:
    raise ValueError("Tidak ditemukan pasangan question-answer dari file JSON. Cek struktur file.")

print(f"Total QA pairs: {len(pairs)}  (contoh 5 teratas)")
for i in range(min(5, len(pairs))):
    print("  Q:", pairs[i][0])
    print("  A:", pairs[i][1])

Total QA pairs: 76427  (contoh 5 teratas)
  Q: kapan beyonce mulai populer?
  A: pada akhir 1990-an
  Q: di bidang apa beyonce bersaing ketika dia tumbuh dewasa?
  A: menyanyi dan menari
  Q: kapan beyonce meninggalkan destiny's child dan menjadi penyanyi solo?
  A: 2003
  Q: di kota dan negara bagian manakah beyonce tumbuh?
  A: houston, texas
  Q: pada dekade berapa beyonce menjadi terkenal?
  A: akhir 1990-an


In [None]:
# ============================================================
# 4) Tambahkan token khusus untuk target: <sos> dan <eos>
# ============================================================
SOS_TOKEN = "<sos>"
EOS_TOKEN = "<eos>"

src_texts = [src for src, tgt in pairs]
tgt_texts = [f"{SOS_TOKEN} {tgt} {EOS_TOKEN}" for _, tgt in pairs]   # decoder inputs akan mulai dari <sos>

In [None]:
# ============================================================
# 5) Tokenizer untuk SRC dan TGT (dipisah)
# ============================================================
MAX_VOCAB_SRC = 20000
MAX_VOCAB_TGT = 20000

src_tok = Tokenizer(num_words=MAX_VOCAB_SRC, oov_token="<oov>", filters='')
tgt_tok = Tokenizer(num_words=MAX_VOCAB_TGT, oov_token="<oov>", filters='')

src_tok.fit_on_texts(src_texts)
tgt_tok.fit_on_texts(tgt_texts)

src_seqs = src_tok.texts_to_sequences(src_texts)
tgt_seqs = tgt_tok.texts_to_sequences(tgt_texts)

# Panjang maksimal (heuristik: persentil 95)
src_lens = [len(s) for s in src_seqs]
tgt_lens = [len(s) for s in tgt_seqs]
MAX_LEN_SRC = max(5, int(np.percentile(src_lens, 95)))
MAX_LEN_TGT = max(7, int(np.percentile(tgt_lens, 95)))  # target biasanya sedikit lebih panjang karena <sos>/<eos>

X_enc = pad_sequences(src_seqs, maxlen=MAX_LEN_SRC, padding="post", truncating="post")
Y_dec_in = pad_sequences(tgt_seqs, maxlen=MAX_LEN_TGT, padding="post", truncating="post")

# Decoder target (teacher forcing) = shift left (tanpa <sos>)
Y_shift = np.concatenate([Y_dec_in[:,1:], np.zeros((Y_dec_in.shape[0],1), dtype=int)], axis=1)

vocab_size_src = min(MAX_VOCAB_SRC, len(src_tok.word_index) + 1)
vocab_size_tgt = min(MAX_VOCAB_TGT, len(tgt_tok.word_index) + 1)

print("MAX_LEN_SRC =", MAX_LEN_SRC, "| MAX_LEN_TGT =", MAX_LEN_TGT)
print("vocab_size_src =", vocab_size_src, "| vocab_size_tgt =", vocab_size_tgt)

MAX_LEN_SRC = 15 | MAX_LEN_TGT = 10
vocab_size_src = 10000 | vocab_size_tgt = 10000


In [None]:
# ============================================================
# 6) Split Train/Val (random)
# ============================================================
from sklearn.model_selection import train_test_split

X_enc_train, X_enc_val, Y_dec_in_train, Y_dec_in_val, Y_shift_train, Y_shift_val = train_test_split(
    X_enc, Y_dec_in, Y_shift, test_size=0.15, random_state=SEED
)

In [None]:
# ============================================================
# 7) Bangun Model Seq2Seq (LSTM Encoder–Decoder)
# ============================================================
EMB_DIM   = 128
RNN_UNITS = 256   # Naikkan sedikit biar decoder lebih stabil
LR         = 1e-3

# Encoder
enc_inputs = Input(shape=(MAX_LEN_SRC,), name="enc_inputs")
enc_emb = Embedding(input_dim=vocab_size_src, output_dim=EMB_DIM, name="enc_emb")(enc_inputs)
_, state_h, state_c = LSTM(RNN_UNITS, return_state=True, name="enc_lstm")(enc_emb)
enc_states = [state_h, state_c]

# Decoder
dec_inputs = Input(shape=(MAX_LEN_TGT,), name="dec_inputs")
dec_emb = Embedding(input_dim=vocab_size_tgt, output_dim=EMB_DIM, name="dec_emb")(dec_inputs)
dec_outputs, _, _ = LSTM(RNN_UNITS, return_sequences=True, return_state=True, name="dec_lstm")(dec_emb, initial_state=enc_states)
dec_logits = Dense(vocab_size_tgt, activation="softmax", name="dec_out")(dec_outputs)

seq2seq = Model([enc_inputs, dec_inputs], dec_logits)
seq2seq.compile(optimizer="adam", loss="sparse_categorical_crossentropy")
seq2seq.summary()

# Target untuk sparse_categorical_crossentropy harus shape (batch, time, 1) atau int tanpa onehot
Y_target_train = np.expand_dims(Y_shift_train, axis=-1)
Y_target_val   = np.expand_dims(Y_shift_val, axis=-1)

In [None]:
# ============================================================
# 8) Training
# ============================================================
ckpt_path = f"{CKPT_DIR}/best_seq2seq_lstm.keras"
callbacks = [
    EarlyStopping(monitor="val_loss", patience=5, restore_best_weights=True),
    ModelCheckpoint(ckpt_path, monitor="val_loss", save_best_only=True)
]

history = seq2seq.fit(
    [X_enc_train, Y_dec_in_train], Y_target_train,
    validation_data=([X_enc_val, Y_dec_in_val], Y_target_val),
    epochs=30,
    batch_size=32,
    callbacks=callbacks,
    verbose=1
)

# (opsional) load best checkpoint
seq2seq = tf.keras.models.load_model(ckpt_path)

Epoch 1/30
[1m2031/2031[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1746s[0m 851ms/step - loss: 2.4354 - val_loss: 1.9692
Epoch 2/30
[1m2031/2031[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1935s[0m 953ms/step - loss: 1.9069 - val_loss: 1.8616
Epoch 3/30
[1m2031/2031[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1916s[0m 940ms/step - loss: 1.7679 - val_loss: 1.8024
Epoch 4/30
[1m2031/2031[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1912s[0m 942ms/step - loss: 1.6501 - val_loss: 1.7826
Epoch 5/30
[1m2031/2031[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1880s[0m 925ms/step - loss: 1.5357 - val_loss: 1.7922
Epoch 6/30
[1m2031/2031[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1876s[0m 923ms/step - loss: 1.4236 - val_loss: 1.8154
Epoch 7/30
[1m2031/2031[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1837s[0m 900ms/step - loss: 1.3168 - val_loss: 1.8556
Epoch 8/30
[1m2031/2031[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1773s[0m 873ms/step - loss: 1.2189 - v

In [None]:
# ============================================================
# 9) Simpan artefak
# ============================================================
with open(f"{ARTIFACT_DIR}/src_tokenizer.pkl", "wb") as f:
    pickle.dump(src_tok, f)
with open(f"{ARTIFACT_DIR}/tgt_tokenizer.pkl", "wb") as f:
    pickle.dump(tgt_tok, f)

meta = dict(
    MAX_LEN_SRC=MAX_LEN_SRC,
    MAX_LEN_TGT=MAX_LEN_TGT,
    SOS_TOKEN=SOS_TOKEN,
    EOS_TOKEN=EOS_TOKEN,
    vocab_size_src=vocab_size_src,
    vocab_size_tgt=vocab_size_tgt,
    RNN_UNITS=RNN_UNITS,
    EMB_DIM=EMB_DIM
)
with open(f"{ARTIFACT_DIR}/meta.json", "w") as f:
    json.dump(meta, f, indent=2)

seq2seq.save(f"{ARTIFACT_DIR}/seq2seq_lstm.keras")
print("Saved artifacts to:", ARTIFACT_DIR)

Saved artifacts to: /content/artifacts_seq2seq


In [None]:
# ============================================================
# 10) Inference: build encoder_model & decoder_model (greedy)
# ============================================================
# 10.1 Encoder inference (input -> [h,c])
encoder_model = Model(enc_inputs, enc_states)

# 10.2 Decoder 1-step (share weights dgn decoder training)
dec_state_input_h = Input(shape=(RNN_UNITS,), name="dec_state_input_h")
dec_state_input_c = Input(shape=(RNN_UNITS,), name="dec_state_input_c")
dec_states_inputs = [dec_state_input_h, dec_state_input_c]
dec_inp_step = Input(shape=(1,), name="dec_inp_step")           # 1 token tiap langkah

# Reuse embedding & dense layer dari model training
dec_emb_layer = seq2seq.get_layer("dec_emb")
dec_lstm_layer = seq2seq.get_layer("dec_lstm")
dec_out_layer = seq2seq.get_layer("dec_out")

# Buat LSTM 1-step untuk inference dan salin bobotnya
dec_lstm_step = LSTM(RNN_UNITS, return_state=True, name="dec_lstm_step")
# Build agar bisa set_weights (input bentuk (batch, time=1, EMB_DIM))
dec_lstm_step.build((None, 1, dec_emb_layer.output_dim))
dec_lstm_step.set_weights(dec_lstm_layer.get_weights())

# Alur 1-langkah:
dec_x = dec_emb_layer(dec_inp_step)                                  # (batch, 1, EMB_DIM)
dec_y, dec_h, dec_c = dec_lstm_step(dec_x, initial_state=dec_states_inputs)  # (batch, RNN_UNITS)
dec_y_time = Reshape((1, RNN_UNITS))(dec_y)                           # (batch, 1, RNN_UNITS)
dec_logits_step = dec_out_layer(dec_y_time)                           # (batch, 1, vocab_tgt)

decoder_model = Model([dec_inp_step] + dec_states_inputs, [dec_logits_step, dec_h, dec_c])


# 10.3 Helper id token & mapping
SOS_ID = tgt_tok.word_index.get(SOS_TOKEN, None)
EOS_ID = tgt_tok.word_index.get(EOS_TOKEN, None)
if SOS_ID is None or EOS_ID is None:
    raise ValueError("Token <sos>/<eos> tidak ada di tokenizer target. Cek preprocessing.")

# Mapping id->kata
index2word_tgt = tgt_tok.index_word

# 10.4 Greedy decoder
def greedy_decode(input_text: str, max_len=None):
    if max_len is None:
        max_len = MAX_LEN_TGT

    # Encode sumber -> [h,c]
    x = src_tok.texts_to_sequences([clean_text(input_text)])
    x = pad_sequences(x, maxlen=MAX_LEN_SRC, padding="post", truncating="post")

    states = encoder_model.predict(x, verbose=0)   # returns [h, c]

    # Mulai dari <sos>
    cur_token = np.array([[SOS_ID]], dtype="int32")
    result_ids = []

    for _ in range(max_len):
        outputs = decoder_model.predict([cur_token] + states, verbose=0)
        logits = outputs[0]        # (batch, 1, vocab)
        h = outputs[1]
        c = outputs[2]

        next_id = int(np.argmax(logits[0, 0, :]))

        # stop kalau padding/unknown/eos
        if next_id == 0 or next_id == EOS_ID:
            break

        result_ids.append(next_id)
        cur_token = np.array([[next_id]], dtype="int32")
        states = [h, c]

    words = [index2word_tgt.get(idx, "<unk>") for idx in result_ids]
    return " ".join(words).strip()

In [None]:
# ============================================================
# 11) Evaluasi (Exact Match & BLEU)
# ============================================================
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
smooth = SmoothingFunction().method1

def evaluate(dataset_pairs, n_samples_eval=200):
    refs, hyps = [], []
    subset = random.sample(dataset_pairs, k=min(n_samples_eval, len(dataset_pairs)))
    for src, tgt in subset:
        hyp = greedy_decode(src)
        refs.append([tgt.split()])  # list of references (1 ref), tokenized
        hyps.append(hyp.split())

    exact = sum(1 for r, h in zip(refs, hyps) if " ".join(r[0]) == " ".join(h))
    exact_acc = exact / len(subset)

    bleu1 = np.mean([sentence_bleu(r, h, weights=(1,0,0,0), smoothing_function=smooth) for r,h in zip(refs, hyps)])
    bleu2 = np.mean([sentence_bleu(r, h, weights=(0.5,0.5,0,0), smoothing_function=smooth) for r,h in zip(refs, hyps)])
    bleu4 = np.mean([sentence_bleu(r, h, weights=(0.25,0.25,0.25,0.25), smoothing_function=smooth) for r,h in zip(refs, hyps)])

    return dict(exact_match=exact_acc, bleu1=bleu1, bleu2=bleu2, bleu4=bleu4)

pairs_clean_tgt = [(src, clean_text(tgt)) for (src, tgt) in [(p[0], p[1]) for p in pairs]]

metrics = evaluate(pairs_clean_tgt, n_samples_eval=200)
print("Eval (subset):", metrics)

Eval (subset): {'exact_match': 0.0, 'bleu1': np.float64(0.0037912956053197627), 'bleu2': np.float64(0.0013228353251744907), 'bleu4': np.float64(0.0009567653754246697)}


In [None]:
# ============================================================
# 12) Demo cepat
# ============================================================
tests = [
    "kapan beyonce mulai terkenal?",
    "siapa manajer destiny's child?",
    "apa album solo pertama beyonce?",
    "siapa suami beyonce?",
]
for q in tests:
    ans = greedy_decode(q)
    print(f"Q: {q}\nA: {ans}\n")

Q: kapan beyonce mulai terkenal?
A: <oov>

Q: siapa manajer destiny's child?
A: <oov> <oov>

Q: apa album solo pertama beyonce?
A: <oov> <oov>

Q: siapa suami beyonce?
A: <oov> <oov>



In [None]:
# ================================
# UI Chat Box (Seq2Seq - ipywidgets)
# ================================

# 1) Pastikan widget manager aktif
try:
    from google.colab import output as colab_output
    colab_output.enable_custom_widget_manager()
except Exception:
    pass

try:
    import ipywidgets as widgets
    from IPython.display import display, HTML, clear_output
except Exception:
    !pip -q install ipywidgets==8.1.1
    import ipywidgets as widgets
    from IPython.display import display, HTML, clear_output
    from google.colab import output as colab_output
    colab_output.enable_custom_widget_manager()

# 2) Komponen UI
input_box = widgets.Text(
    placeholder='Ketik pertanyaanmu di sini...',
    description='User:',
    layout=widgets.Layout(width='100%')
)
send_btn = widgets.Button(
    description='Kirim',
    button_style='primary',
    tooltip='Kirim pesan',
    layout=widgets.Layout(width='120px')
)
clear_btn = widgets.Button(
    description='Bersihkan',
    tooltip='Hapus riwayat',
    layout=widgets.Layout(width='120px')
)
status_lbl = widgets.Label(value='Siap ✅')

chat_output = widgets.Output(layout=widgets.Layout(
    border='1px solid #ddd', padding='10px', height='350px', overflow='auto'
))

controls = widgets.HBox([send_btn, clear_btn, status_lbl])
ui = widgets.VBox([input_box, controls, chat_output])

# 3) State riwayat percakapan (tanpa intent/conf)
history = []  # list of (role, text)

def render_history():
    with chat_output:
        clear_output()
        html = ['<div style="font-family: Inter, system-ui, Arial; font-size:14px">']
        for role, text in history:
            if role == 'user':
                html.append(f'''
                <div style="margin:8px 0;">
                    <div style="font-weight:600;color:#1a73e8">Kamu:</div>
                    <div style="white-space:pre-wrap">{text}</div>
                </div>''')
            else:
                html.append(f'''
                <div style="margin:8px 0;">
                    <div style="font-weight:600;color:#34a853">Bot:</div>
                    <div style="white-space:pre-wrap">{text}</div>
                </div>''')
        html.append('</div>')
        display(HTML(''.join(html)))

# 4) Handler kirim (pakai greedy_decode dari model Seq2Seq)
def handle_send(_=None):
    msg = input_box.value.strip()
    if not msg:
        status_lbl.value = "Ketik sesuatu dulu…"
        return
    status_lbl.value = "Memproses… ⏳"
    history.append(('user', msg))
    try:
        ans = greedy_decode(msg)  # <— PENTING: pakai seq2seq
        if not ans:
            ans = "(maaf, belum bisa menjawab)"
        history.append(('bot', ans))
    except Exception as e:
        history.append(('bot', f"Terjadi error: {e}"))
    render_history()
    input_box.value = ''
    status_lbl.value = "Siap ✅"

# 5) Handler clear
def handle_clear(_=None):
    history.clear()
    render_history()
    status_lbl.value = "Riwayat dibersihkan 🧹"

# 6) Event bindings
send_btn.on_click(handle_send)
clear_btn.on_click(handle_clear)
input_box.on_submit(handle_send)  # tekan Enter untuk kirim

# 7) Tampilkan UI
render_history()
display(ui)

VBox(children=(Text(value='', description='User:', layout=Layout(width='100%'), placeholder='Ketik pertanyaanm…