## Model Seq2Seq

In [5]:
# ============================================================
# Chatbot Seq2Seq — SimpleRNN (Keras, Colab-ready)
# Dataset: /content/drive/MyDrive/Semester 7/NLP/train-SQuAD-id.json
#  - Encoder: SimpleRNN
#  - Decoder: SimpleRNN (teacher forcing)
#  - Evaluasi: Exact Match, BLEU-1/2/4
# ============================================================

import os, re, json, random, pickle, numpy as np
from pathlib import Path

# ---- 0) Colab: Mount Google Drive ----
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
# (opsional) BLEU evaluator
!pip -q install nltk
import nltk
nltk.download('punkt', quiet=True)

# ---- 1) Konfigurasi umum ----
SEED = 42
random.seed(SEED); np.random.seed(SEED)

DATA_PATH    = "/content/drive/MyDrive/Semester 7/NLP/train-SQuAD-id.json"
ARTIFACT_DIR = "/content/artifacts_seq2seq"
CKPT_DIR     = "/content/checkpoints_seq2seq"
os.makedirs(ARTIFACT_DIR, exist_ok=True)
os.makedirs(CKPT_DIR, exist_ok=True)

In [7]:
# ---- 2) TensorFlow & Keras (pakai versi yang ada di Colab) ----
import tensorflow as tf
tf.random.set_seed(SEED)
print("TF version:", tf.__version__)

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, SimpleRNN, Dense
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

TF version: 2.19.0


In [8]:
# ============================================================
# 3) Load dataset SQuAD-ID -> Buat pasangan (src, tgt)
#     src = question (ID), tgt = answer_text (ID)
# ============================================================
with open(DATA_PATH, "r", encoding="utf-8") as f:
    raw = json.load(f)

pairs = []   # (src_text, tgt_text)

def clean_text(s: str) -> str:
    s = s.lower()
    s = re.sub(r"[^a-z0-9\sÀ-ÿ',?.!/-]", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

for art in raw.get("data", []):
    for para in art.get("paragraphs", []):
        for qa in para.get("qas", []):
            # buang QA yang impossible atau tanpa jawaban
            if qa.get("is_impossible", False):
                continue
            answers = qa.get("answers", [])
            if not answers:
                continue
            q_text = qa.get("question", "").strip()
            a_text = answers[0].get("text", "").strip()  # ambil jawaban pertama

            q_clean = clean_text(q_text)
            a_clean = clean_text(a_text)
            if q_clean and a_clean:
                pairs.append((q_clean, a_clean))

if not pairs:
    raise ValueError("Tidak ada pasangan (question, answer) yang valid dari SQuAD-ID.")

print(f"Total pairs (SQuAD-ID): {len(pairs)}  (contoh 3 teratas)")
for i in range(min(3, len(pairs))):
    print("  Q :", pairs[i][0])
    print("  A :", pairs[i][1])

Total pairs (SQuAD-ID): 76427  (contoh 3 teratas)
  Q : kapan beyonce mulai populer?
  A : pada akhir 1990-an
  Q : di bidang apa beyonce bersaing ketika dia tumbuh dewasa?
  A : menyanyi dan menari
  Q : kapan beyonce meninggalkan destiny's child dan menjadi penyanyi solo?
  A : 2003


In [9]:
# ============================================================
# 4) Tambahkan token khusus untuk target: <sos> dan <eos>
# ============================================================
SOS_TOKEN = "<sos>"
EOS_TOKEN = "<eos>"

src_texts = [src for src, tgt in pairs]
tgt_texts = [f"{SOS_TOKEN} {tgt} {EOS_TOKEN}" for _, tgt in pairs]   # decoder inputs akan mulai dari <sos>

In [10]:
# ============================================================
# 5) Tokenizer untuk SRC dan TGT (dipisah) + cap panjang
# ============================================================
MAX_VOCAB_SRC = 10000
MAX_VOCAB_TGT = 10000

src_tok = Tokenizer(num_words=MAX_VOCAB_SRC, oov_token="<oov>", filters='')
tgt_tok = Tokenizer(num_words=MAX_VOCAB_TGT, oov_token="<oov>", filters='')

src_tok.fit_on_texts([src for src, _ in pairs])
tgt_tok.fit_on_texts([f"<sos> {tgt} <eos>" for _, tgt in pairs])

src_seqs = src_tok.texts_to_sequences([src for src, _ in pairs])
tgt_seqs = tgt_tok.texts_to_sequences([f"<sos> {tgt} <eos>" for _, tgt in pairs])

src_lens = [len(s) for s in src_seqs]
tgt_lens = [len(s) for s in tgt_seqs]
# cap panjang agar training stabil (Q biasanya > A)
MAX_LEN_SRC = min(64, max(5, int(np.percentile(src_lens, 95))))
MAX_LEN_TGT = min(24, max(7, int(np.percentile(tgt_lens, 95))))

X_enc   = pad_sequences(src_seqs, maxlen=MAX_LEN_SRC, padding="post", truncating="post")
Y_dec_in = pad_sequences(tgt_seqs, maxlen=MAX_LEN_TGT, padding="post", truncating="post")
Y_shift  = np.concatenate([Y_dec_in[:,1:], np.zeros((Y_dec_in.shape[0],1), dtype=int)], axis=1)

vocab_size_src = min(MAX_VOCAB_SRC, len(src_tok.word_index) + 1)
vocab_size_tgt = min(MAX_VOCAB_TGT, len(tgt_tok.word_index) + 1)

print("MAX_LEN_SRC =", MAX_LEN_SRC, "| MAX_LEN_TGT =", MAX_LEN_TGT)
print("vocab_size_src =", vocab_size_src, "| vocab_size_tgt =", vocab_size_tgt)

MAX_LEN_SRC = 15 | MAX_LEN_TGT = 10
vocab_size_src = 10000 | vocab_size_tgt = 10000


In [11]:
# ============================================================
# 6) Split Train/Val (random)
# ============================================================
from sklearn.model_selection import train_test_split

X_enc_train, X_enc_val, Y_dec_in_train, Y_dec_in_val, Y_shift_train, Y_shift_val = train_test_split(
    X_enc, Y_dec_in, Y_shift, test_size=0.15, random_state=SEED
)

In [12]:
# ============================================================
# 7) Bangun Model Seq2Seq (SimpleRNN Encoder–Decoder) + dropout
# ============================================================
EMB_DIM   = 128
RNN_UNITS = 256
LR        = 5e-4  # sedikit lebih kecil agar stabil

# Encoder
enc_inputs = Input(shape=(MAX_LEN_SRC,), name="enc_inputs")
enc_emb = Embedding(input_dim=vocab_size_src, output_dim=EMB_DIM, name="enc_emb")(enc_inputs)
_, enc_state = SimpleRNN(RNN_UNITS, return_state=True, dropout=0.1, name="enc_rnn")(enc_emb)

# Decoder
dec_inputs = Input(shape=(MAX_LEN_TGT,), name="dec_inputs")
dec_emb = Embedding(input_dim=vocab_size_tgt, output_dim=EMB_DIM, name="dec_emb")(dec_inputs)
dec_outputs = SimpleRNN(RNN_UNITS, return_sequences=True, dropout=0.1, name="dec_rnn")(dec_emb, initial_state=enc_state)
dec_logits = Dense(vocab_size_tgt, activation="softmax", name="dec_out")(dec_outputs)

seq2seq = Model([enc_inputs, dec_inputs], dec_logits)
seq2seq.compile(optimizer=tf.keras.optimizers.Adam(LR), loss="sparse_categorical_crossentropy")
seq2seq.summary()

Y_target_train = np.expand_dims(Y_shift_train, axis=-1)
Y_target_val   = np.expand_dims(Y_shift_val, axis=-1)

In [13]:
# ============================================================
# 8) Training (tanpa reload, pakai restore_best_weights)
# ============================================================
ckpt_path = f"{CKPT_DIR}/best_seq2seq_simplernn.keras"
callbacks = [
    EarlyStopping(monitor="val_loss", patience=5, restore_best_weights=True),
    ModelCheckpoint(ckpt_path, monitor="val_loss", save_best_only=True)
]

history = seq2seq.fit(
    [X_enc_train, Y_dec_in_train], Y_target_train,
    validation_data=([X_enc_val, Y_dec_in_val], Y_target_val),
    epochs=30,
    batch_size=64,   # naikkan sedikit
    callbacks=callbacks,
    verbose=1
)

# (TIDAK ADA) seq2seq = tf.keras.models.load_model(ckpt_path)

Epoch 1/30
[1m1016/1016[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1376s[0m 1s/step - loss: 2.8198 - val_loss: 2.0202
Epoch 2/30
[1m1016/1016[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1296s[0m 1s/step - loss: 1.9942 - val_loss: 1.9929
Epoch 3/30
[1m1016/1016[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1339s[0m 1s/step - loss: 1.9283 - val_loss: 1.9716
Epoch 4/30
[1m1016/1016[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1297s[0m 1s/step - loss: 1.8672 - val_loss: 1.9272
Epoch 5/30
[1m1016/1016[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1327s[0m 1s/step - loss: 1.7907 - val_loss: 1.8834
Epoch 6/30
[1m1016/1016[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1304s[0m 1s/step - loss: 1.7158 - val_loss: 1.8702
Epoch 7/30
[1m1016/1016[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1335s[0m 1s/step - loss: 1.6625 - val_loss: 1.8641
Epoch 8/30
[1m1016/1016[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1357s[0m 1s/step - loss: 1.6193 - val_loss: 1.8638
Epoch 9/

In [14]:
# ============================================================
# 9) Simpan artefak
# ============================================================
with open(f"{ARTIFACT_DIR}/src_tokenizer.pkl", "wb") as f:
    pickle.dump(src_tok, f)
with open(f"{ARTIFACT_DIR}/tgt_tokenizer.pkl", "wb") as f:
    pickle.dump(tgt_tok, f)

meta = dict(
    MAX_LEN_SRC=MAX_LEN_SRC,
    MAX_LEN_TGT=MAX_LEN_TGT,
    SOS_TOKEN=SOS_TOKEN,
    EOS_TOKEN=EOS_TOKEN,
    vocab_size_src=vocab_size_src,
    vocab_size_tgt=vocab_size_tgt,
    RNN_UNITS=RNN_UNITS,
    EMB_DIM=EMB_DIM
)
with open(f"{ARTIFACT_DIR}/meta.json", "w") as f:
    json.dump(meta, f, indent=2)

seq2seq.save(f"{ARTIFACT_DIR}/seq2seq_simplernn.keras")
print("Saved artifacts to:", ARTIFACT_DIR)

Saved artifacts to: /content/artifacts_seq2seq


In [15]:
# ============================================================
# 10) Inference (OPSIONAL - Opsi B) — bangun encoder dari layer di dalam seq2seq yang SUDAH di-load
# PAKAI INI JIKA kamu melakukan: seq2seq = tf.keras.models.load_model(ckpt_path)
# ============================================================
from tensorflow.keras.layers import Reshape, Input

# Ambil layer dari seq2seq yang sudah di-load
enc_emb_layer = seq2seq.get_layer("enc_emb")
enc_rnn_layer = seq2seq.get_layer("enc_rnn")
dec_emb_layer = seq2seq.get_layer("dec_emb")
dec_rnn_layer = seq2seq.get_layer("dec_rnn")
dec_out_layer = seq2seq.get_layer("dec_out")

# Encoder inference (input → state)
enc_inputs_inf = Input(shape=(MAX_LEN_SRC,), name="enc_inputs_inf")
enc_x = enc_emb_layer(enc_inputs_inf)
_, enc_state_inf = enc_rnn_layer(enc_x)
encoder_model = Model(enc_inputs_inf, enc_state_inf)

# Decoder 1-step (copy bobot dari dec_rnn_layer)
RNN_UNITS = dec_rnn_layer.units  # pastikan konsisten
dec_state_in = Input(shape=(RNN_UNITS,), name="dec_state_in")
dec_inp_step = Input(shape=(1,), name="dec_inp_step")

dec_rnn_step = SimpleRNN(RNN_UNITS, return_state=True, name="dec_rnn_step")
dec_rnn_step.build((None, 1, dec_emb_layer.output_dim))
dec_rnn_step.set_weights(dec_rnn_layer.get_weights())

dec_x = dec_emb_layer(dec_inp_step)
dec_y, dec_state_out = dec_rnn_step(dec_x, initial_state=dec_state_in)
dec_y_time = Reshape((1, RNN_UNITS))(dec_y)
dec_logits_step = dec_out_layer(dec_y_time)

decoder_model = Model([dec_inp_step, dec_state_in], [dec_logits_step, dec_state_out])

# 10.4 Greedy decoder (stop juga bila memprediksi <sos>)
SOS_TOKEN = "<sos>"
EOS_TOKEN = "<eos>"
SOS_ID = tgt_tok.word_index.get(SOS_TOKEN, None)
EOS_ID = tgt_tok.word_index.get(EOS_TOKEN, None)
if SOS_ID is None or EOS_ID is None:
    raise ValueError("Token <sos>/<eos> tidak ada di tokenizer target. Cek preprocessing.")

index2word_tgt = tgt_tok.index_word

def greedy_decode(input_text: str, max_len=None):
    if max_len is None:
        max_len = MAX_LEN_TGT

    x = src_tok.texts_to_sequences([clean_text(input_text)])
    x = pad_sequences(x, maxlen=MAX_LEN_SRC, padding="post", truncating="post")
    state = encoder_model.predict(x, verbose=0)

    cur_token = np.array([[SOS_ID]], dtype="int32")
    result_ids = []

    for _ in range(max_len):
        logits, state = decoder_model.predict([cur_token, state], verbose=0)
        next_id = int(np.argmax(logits[0, 0, :]))

        if next_id in (0, EOS_ID, SOS_ID):  # <- tambahan SOS_ID guard
            break

        result_ids.append(next_id)
        cur_token = np.array([[next_id]], dtype="int32")

    words = [index2word_tgt.get(idx, "<unk>") for idx in result_ids]
    return " ".join(words).strip()

In [16]:
# ============================================================
# 11) Evaluasi (Exact Match & BLEU) — target bersih sederhana
# ============================================================
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
smooth = SmoothingFunction().method1

def evaluate(dataset_pairs, n_samples_eval=200):
    refs, hyps = [], []
    subset = random.sample(dataset_pairs, k=min(n_samples_eval, len(dataset_pairs)))
    for src, tgt in subset:
        hyp = greedy_decode(src)
        refs.append([tgt.split()])
        hyps.append(hyp.split())

    exact = sum(1 for r, h in zip(refs, hyps) if " ".join(r[0]) == " ".join(h))
    exact_acc = exact / len(subset)

    bleu1 = np.mean([sentence_bleu(r, h, weights=(1,0,0,0), smoothing_function=smooth) for r,h in zip(refs, hyps)])
    bleu2 = np.mean([sentence_bleu(r, h, weights=(0.5,0.5,0,0), smoothing_function=smooth) for r,h in zip(refs, hyps)])
    bleu4 = np.mean([sentence_bleu(r, h, weights=(0.25,0.25,0.25,0.25), smoothing_function=smooth) for r,h in zip(refs, hyps)])

    return dict(exact_match=exact_acc, bleu1=bleu1, bleu2=bleu2, bleu4=bleu4)

# target evaluasi pakai pasangan asli (tanpa <sos>/<eos>)
pairs_clean_tgt = [(src, tgt) for (src, tgt) in pairs]
metrics = evaluate(pairs_clean_tgt, n_samples_eval=200)
print("Eval (subset):", metrics)

Eval (subset): {'exact_match': 0.0, 'bleu1': np.float64(0.0), 'bleu2': np.float64(0.0), 'bleu4': np.float64(0.0)}


In [17]:
# ============================================================
# 12) Demo cepat
# ============================================================
tests = [
    "kapan beyonce mulai terkenal?",
    "siapa manajer destiny's child?",
    "apa album solo pertama beyonce?",
    "siapa suami beyonce?",
]
for q in tests:
    ans = greedy_decode(q)
    print(f"Q: {q}\nA: {ans}\n")

Q: kapan beyonce mulai terkenal?
A: <oov>

Q: siapa manajer destiny's child?
A: <oov>

Q: apa album solo pertama beyonce?
A: <oov>

Q: siapa suami beyonce?
A: <oov>



In [18]:
# ================================
# UI Chat Box (Seq2Seq - ipywidgets)
# ================================

# 1) Pastikan widget manager aktif
try:
    from google.colab import output as colab_output
    colab_output.enable_custom_widget_manager()
except Exception:
    pass

try:
    import ipywidgets as widgets
    from IPython.display import display, HTML, clear_output
except Exception:
    !pip -q install ipywidgets==8.1.1
    import ipywidgets as widgets
    from IPython.display import display, HTML, clear_output
    from google.colab import output as colab_output
    colab_output.enable_custom_widget_manager()

# 2) Komponen UI
input_box = widgets.Text(
    placeholder='Ketik pertanyaanmu di sini...',
    description='User:',
    layout=widgets.Layout(width='100%')
)
send_btn = widgets.Button(
    description='Kirim',
    button_style='primary',
    tooltip='Kirim pesan',
    layout=widgets.Layout(width='120px')
)
clear_btn = widgets.Button(
    description='Bersihkan',
    tooltip='Hapus riwayat',
    layout=widgets.Layout(width='120px')
)
status_lbl = widgets.Label(value='Siap ✅')

chat_output = widgets.Output(layout=widgets.Layout(
    border='1px solid #ddd', padding='10px', height='350px', overflow='auto'
))

controls = widgets.HBox([send_btn, clear_btn, status_lbl])
ui = widgets.VBox([input_box, controls, chat_output])

# 3) State riwayat percakapan (tanpa intent/conf)
history = []  # list of (role, text)

def render_history():
    with chat_output:
        clear_output()
        html = ['<div style="font-family: Inter, system-ui, Arial; font-size:14px">']
        for role, text in history:
            if role == 'user':
                html.append(f'''
                <div style="margin:8px 0;">
                    <div style="font-weight:600;color:#1a73e8">Kamu:</div>
                    <div style="white-space:pre-wrap">{text}</div>
                </div>''')
            else:
                html.append(f'''
                <div style="margin:8px 0;">
                    <div style="font-weight:600;color:#34a853">Bot:</div>
                    <div style="white-space:pre-wrap">{text}</div>
                </div>''')
        html.append('</div>')
        display(HTML(''.join(html)))

# 4) Handler kirim (pakai greedy_decode dari model Seq2Seq)
def handle_send(_=None):
    msg = input_box.value.strip()
    if not msg:
        status_lbl.value = "Ketik sesuatu dulu…"
        return
    status_lbl.value = "Memproses… ⏳"
    history.append(('user', msg))
    try:
        ans = greedy_decode(msg)  # <— PENTING: pakai seq2seq
        if not ans:
            ans = "(maaf, belum bisa menjawab)"
        history.append(('bot', ans))
    except Exception as e:
        history.append(('bot', f"Terjadi error: {e}"))
    render_history()
    input_box.value = ''
    status_lbl.value = "Siap ✅"

# 5) Handler clear
def handle_clear(_=None):
    history.clear()
    render_history()
    status_lbl.value = "Riwayat dibersihkan 🧹"

# 6) Event bindings
send_btn.on_click(handle_send)
clear_btn.on_click(handle_clear)
input_box.on_submit(handle_send)  # tekan Enter untuk kirim

# 7) Tampilkan UI
render_history()
display(ui)

VBox(children=(Text(value='', description='User:', layout=Layout(width='100%'), placeholder='Ketik pertanyaanm…