In [None]:
!pip install -U indobenchmark-toolkit evaluate sacrebleu rouge-score

In [1]:
!pip install -U transformers accelerate evaluate sacrebleu rouge-score sentencepiece


Collecting transformers
  Downloading transformers-4.57.3-py3-none-any.whl.metadata (43 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m44.0/44.0 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
Collecting accelerate
  Downloading accelerate-1.12.0-py3-none-any.whl.metadata (19 kB)
Collecting sentencepiece
  Downloading sentencepiece-0.2.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (10 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers)
  Downloading tokenizers-0.22.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Downloading transformers-4.57.3-py3-none-any.whl (12.0 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m12.0/12.0 MB[0m [31m100.5 MB/s[0m eta [36m0:00:00[0m00:01[0m:01[0m
[?25hDownloading accelerate-1.12.0-py3-non

In [None]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

import os, re, unicodedata
import numpy as np
import pandas as pd

from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback
)

import evaluate
print("evaluate version:", evaluate.__version__)

# >>> ADDED: tokenizer khusus IndoBenchmark
from indobenchmark import IndoNLGTokenizer

In [2]:
import pandas as pd
import re
import unicodedata
import numpy as np

IN_PATH  = "/kaggle/input/inmad-dataset/INMAD Dataset.csv"
OUT_PATH = "inmad_clean_v2.csv"

def fix_mojibake(s: str) -> str:
    """Best-effort, lossless repair of UTF-8 text that was decoded with the wrong codec.

    Args:
        s: Input value. ``None`` becomes ``""``; any other non-string is
            converted with ``str()``.

    Returns:
        The repaired string when re-encoding with latin-1 or cp1252 and
        decoding as UTF-8 round-trips cleanly and yields non-blank text;
        otherwise the (string-coerced) input unchanged.
    """
    if not isinstance(s, str):
        s = "" if s is None else str(s)

    # Simple heuristic: only attempt a repair when a typical mojibake marker is present.
    # NOTE(review): the first three literals look mis-encoded in this copy of the
    # notebook - confirm they are the intended marker characters.
    if any(ch in s for ch in ["√É", "√Ç", "ÔøΩ", "\uFFFD"]):
        for src_enc in ("latin-1", "cp1252"):
            try:
                # Strict round-trip: the previous errors="ignore" silently deleted every
                # character/byte that did not fit, destroying legitimate text.
                repaired = s.encode(src_enc).decode("utf-8")
            except (UnicodeEncodeError, UnicodeDecodeError):
                # Not cleanly reversible with this codec; try the next one.
                # (U+FFFD is not encodable in either codec, so such text is left as-is.)
                continue
            if repaired.strip():
                return repaired
    return s

def normalize_text(s: str) -> str:
    """Normalize one sentence: encoding repair, NFKC, punctuation and whitespace tidy-up.

    The steps run in a fixed order (later regexes assume whitespace has already
    been collapsed).

    Args:
        s: Raw text; passed through ``fix_mojibake`` first.

    Returns:
        The normalized string (may be empty).
    """
    s = fix_mojibake(s)
    s = unicodedata.normalize("NFKC", s)

    # Replace control characters (U+0000-U+001F, U+007F-U+009F) with a space
    s = re.sub(r"[\u0000-\u001F\u007F-\u009F]", " ", s)
    # Zero-width space and BOM also become plain spaces
    s = s.replace("\u200b", " ").replace("\ufeff", " ")

    # Normalize quotes/apostrophes to ASCII ' and "
    # NOTE(review): the non-ASCII literals below look mis-encoded in this copy of
    # the notebook (presumably typographic quotes) - confirm against the original file.
    s = (s.replace("‚Äô","'").replace("‚Äò","'").replace("¬¥","'").replace("`","'")
           .replace("‚Äú",'"').replace("‚Äù",'"'))

    # Normalize dashes to "-" and the ellipsis to "..."
    # NOTE(review): same caveat - literals presumably en/em dash, minus sign, ellipsis.
    s = s.replace("‚Äì","-").replace("‚Äî","-").replace("‚àí","-")
    s = s.replace("‚Ä¶","...")

    # Collapse whitespace runs and trim
    s = re.sub(r"\s+", " ", s).strip()

    # Remove whitespace before punctuation: " ," -> ",", " ." -> ".", etc.
    s = re.sub(r"\s+([,.;:!?])", r"\1", s)

    # Ensure a space after punctuation that is immediately followed by a letter/digit
    s = re.sub(r"([,;:!?])([A-Za-z0-9])", r"\1 \2", s)
    # For "." only letters trigger the space, so "3.5" is left alone: ".word" -> ". word"
    s = re.sub(r"(\.)([A-Za-z])", r"\1 \2", s)  # ".kata" -> ". kata"

    # Strip padding just inside parentheses/brackets
    s = re.sub(r"\(\s+", "(", s)
    s = re.sub(r"\s+\)", ")", s)
    s = re.sub(r"\[\s+", "[", s)
    s = re.sub(r"\s+\]", "]", s)

    # Collapse repeated "!"/"?" to a single mark and 4+ dots to "..."
    s = re.sub(r"([!?])\1{1,}", r"\1", s)
    s = re.sub(r"\.{4,}", "...", s)

    return s

def tok_len(s: str) -> int:
    """Return the number of whitespace-delimited tokens in ``s`` (coerced via ``str``)."""
    text = str(s)
    return sum(1 for _ in re.finditer(r"\S+", text))

# ===== Load =====
raw = pd.read_csv(IN_PATH)

# Rename the two language columns to short names and normalize their text.
# Other columns are carried along; only "id"/"mad" are written out at the end.
df = raw.rename(columns={"Indonesia":"id", "Madura":"mad"}).copy()
df = df.assign(
    id=df["id"].astype(str).map(normalize_text),
    mad=df["mad"].astype(str).map(normalize_text),
)

# Drop pairs where either side is empty, then de-duplicate on the pair
df = (
    df[df["id"].ne("") & df["mad"].ne("")]
    .drop_duplicates(subset=["id","mad"])
    .reset_index(drop=True)
)

# ===== Quality filter (keeps the corpus from being too noisy) =====
id_len  = df["id"].map(tok_len)
mad_len = df["mad"].map(tok_len)
# +1 smoothing on both sides of the token-count ratio
ratio   = (id_len + 1) / (mad_len + 1)

# Conservative bounds (adjust as needed); `between` is inclusive on both ends
keep = (
    id_len.between(3, 200)
    & mad_len.between(3, 220)
    & ratio.between(0.5, 2.0)
)

df_clean = df.loc[keep].reset_index(drop=True)

print("Raw rows:", len(raw))
print("After basic clean:", len(df))
print("After filter:", len(df_clean))

# ===== Save =====
df_clean.to_csv(OUT_PATH, columns=["id","mad"], index=False)
print("Saved:", OUT_PATH)


Raw rows: 23098
After basic clean: 23032
After filter: 21389
Saved: inmad_clean_v2.csv


In [None]:
# MODEL
# Hub identifier of the pretrained checkpoint; passed to
# IndoNLGTokenizer.from_pretrained in the tokenizer cell.
MODEL_NAME = "indobenchmark/indobart-v2"


Isi BASE_PATH:
['valid.csv', 'test.csv', 'train.csv', 'madurese.csv']


# Preprocessing

In [3]:
def standardize_cols(df: pd.DataFrame) -> pd.DataFrame:
    """Return a two-column copy of ``df`` with the columns renamed to ``id`` and ``mad``.

    Column names are matched case-insensitively against a fixed list of aliases
    for the Indonesian and the Madurese side; the first alias present wins.

    Raises:
        ValueError: if either side cannot be located.
    """
    by_lower = {c.lower(): c for c in df.columns}

    def _first_present(aliases):
        # First alias that exists among the (lower-cased) column names, else None.
        for alias in aliases:
            found = by_lower.get(alias)
            if found:
                return found
        return None

    id_col = _first_present(("indonesian", "id", "indo", "source"))
    mad_col = _first_present(("madurese", "mad", "madura", "target"))

    if id_col is None or mad_col is None:
        raise ValueError(f"Kolom id/mad tidak ketemu. Kolom yang ada: {list(df.columns)}")

    pair = df[[id_col, mad_col]].copy()
    pair.columns = ["id", "mad"]
    return pair

def fix_mojibake(s: str) -> str:
    """Best-effort, lossless repair of UTF-8 text that was decoded with the wrong codec.

    Args:
        s: Input value. ``None`` becomes ``""``; any other non-string is
            converted with ``str()``.

    Returns:
        The repaired string when re-encoding with latin-1 or cp1252 and
        decoding as UTF-8 round-trips cleanly and yields non-blank text;
        otherwise the (string-coerced) input unchanged.
    """
    if not isinstance(s, str):
        s = "" if s is None else str(s)

    # Simple heuristic: only attempt a repair when a typical mojibake marker is present.
    # NOTE(review): the first three literals look mis-encoded in this copy of the
    # notebook - confirm they are the intended marker characters.
    if any(ch in s for ch in ["√É", "√Ç", "ÔøΩ", "\uFFFD"]):
        for src_enc in ("latin-1", "cp1252"):
            try:
                # Strict round-trip: the previous errors="ignore" silently deleted every
                # character/byte that did not fit, destroying legitimate text.
                repaired = s.encode(src_enc).decode("utf-8")
            except (UnicodeEncodeError, UnicodeDecodeError):
                # Not cleanly reversible with this codec; try the next one.
                # (U+FFFD is not encodable in either codec, so such text is left as-is.)
                continue
            if repaired.strip():
                return repaired
    return s
def clean_text(s: str) -> str:
    """Light clean-up: mojibake repair, invisible characters to spaces, whitespace collapse."""
    repaired = fix_mojibake(s)
    # Zero-width space and BOM are turned into ordinary spaces before collapsing
    for invisible in ("\u200b", "\ufeff"):
        repaired = repaired.replace(invisible, " ")
    return re.sub(r"\s+", " ", repaired).strip()

def clean_df(df: pd.DataFrame) -> pd.DataFrame:
    """Clean a parallel ``id``/``mad`` frame without modifying the input.

    Rows with a missing value on either side are dropped first; otherwise the
    string coercion inside ``clean_text`` would turn NaN into the literal text
    "nan" and keep it as a sentence pair. Both columns are then passed through
    ``clean_text``, pairs with an empty side are removed, and exact duplicate
    pairs are collapsed.

    Args:
        df: Frame with at least the columns ``id`` and ``mad``.

    Returns:
        A new frame with a fresh 0..n-1 index.
    """
    df = df.dropna(subset=["id", "mad"]).copy()
    df["id"]  = df["id"].map(clean_text)
    df["mad"] = df["mad"].map(clean_text)
    df = df[(df["id"] != "") & (df["mad"] != "")]
    df = df.drop_duplicates(subset=["id","mad"]).reset_index(drop=True)
    return df

def drop_unnamed_cols(df: pd.DataFrame) -> pd.DataFrame:
    """Drop every column whose name starts with "unnamed" (case-insensitive).

    Returns ``df`` itself when there is nothing to drop, otherwise a new frame.
    """
    to_drop = []
    for column in df.columns:
        if str(column).lower().startswith("unnamed"):
            to_drop.append(column)

    if not to_drop:
        return df
    return df.drop(columns=to_drop)

def guess_column(df: pd.DataFrame, candidates):
    """Return the actual column name matching the first candidate found, else ``None``.

    Matching is case-insensitive; candidates are tried in the given order.
    """
    lookup = {c.lower(): c for c in df.columns}
    wanted = (cand.lower() for cand in candidates)
    return next((lookup[key] for key in wanted if key in lookup), None)

# Import data

In [4]:

# Load the three NusaX splits; each goes through the same
# read -> standardize_cols -> clean_df pipeline.
nusax_train, nusax_valid, nusax_test = (
    clean_df(standardize_cols(pd.read_csv(csv_path)))
    for csv_path in (
        "/kaggle/input/nusaxdata/train.csv",
        "/kaggle/input/nusaxdata/valid.csv",
        "/kaggle/input/nusaxdata/test (1).csv",  # adjust to your test file name
    )
)

print(len(nusax_train), len(nusax_valid), len(nusax_test))


500 100 400


In [5]:
# Lexicon file, brought to the same (id, mad) schema and cleaned like the corpora.
lex = clean_df(standardize_cols(pd.read_csv("/kaggle/input/nusaxdata/madurese.csv")))

# Placeholder "canonical form" table: every Madurese form currently maps to itself.
# No merging of spelling variants happens here.
mad2canon = {form: form for form in lex["mad"]}

# Mapping spelling variants of a word onto a single form would need extra rules.
# Safe version for now: character-level normalization plus mojibake repair only.
def normalize_madurese_with_lexicon(text: str) -> str:
    """Normalize Madurese text; currently just delegates to ``clean_text``."""
    normalized = clean_text(text)
    return normalized


In [6]:
# Read the cleaned InMad corpus and run the Madurese side through the
# (currently conservative) lexicon normalizer.
inmad = pd.read_csv("inmad_clean_v2.csv").assign(
    mad=lambda frame: frame["mad"].map(normalize_madurese_with_lexicon)
)

print("inmad:", len(inmad))


inmad: 21389


In [7]:
# Seeded shuffle of InMad row positions so the train/valid split is reproducible.
rng = np.random.default_rng(42)
idx = np.arange(len(inmad))
rng.shuffle(idx)

valid_frac = 0.05   # 5% of InMad is held out for validation
n_valid = max(1, int(len(inmad) * valid_frac))  # always at least one validation row

# The first n_valid shuffled positions become validation, the rest training.
inmad_valid = inmad.iloc[idx[:n_valid]].reset_index(drop=True)
inmad_train = inmad.iloc[idx[n_valid:]].reset_index(drop=True)

# (optional) if InMad is much larger, cap the ratio so NusaX is not drowned out
max_ratio = 3  # InMad train is at most 3x NusaX train
target_inmad = min(len(inmad_train), max_ratio * len(nusax_train))
inmad_train = inmad_train.sample(n=target_inmad, random_state=42).reset_index(drop=True)

# Tag the source corpus (optional, but useful for domain control).
# Note: this adds a "src" column to the existing nusax_* frames in place.
nusax_train["src"] = "nusax"
nusax_valid["src"] = "nusax"
inmad_train["src"] = "inmad"
inmad_valid["src"] = "inmad"

# NusaX rows come first in both mixes, followed by the InMad rows.
train_mix = pd.concat([nusax_train, inmad_train], ignore_index=True)
valid_mix = pd.concat([nusax_valid, inmad_valid], ignore_index=True)

print("train_mix:", len(train_mix), "valid_mix:", len(valid_mix))


train_mix: 2000 valid_mix: 1169


In [None]:
def build_bidirectional(df: pd.DataFrame) -> Dataset:
    """Expand each (id, mad) pair into two prompted examples, one per direction.

    For every row the id->mad example is emitted first, then the mad->id one.
    Each example has the keys ``direction``, ``source`` (task prefix + text)
    and ``target``.
    """
    records = []
    for id_text, mad_text in zip(df["id"], df["mad"]):
        records.extend((
            {
                "direction": "id2mad",
                "source": "translate Indonesian to Madurese: " + id_text,
                "target": mad_text,
            },
            {
                "direction": "mad2id",
                "source": "translate Madurese to Indonesian: " + mad_text,
                "target": id_text,
            },
        ))
    return Dataset.from_pandas(pd.DataFrame(records))

# Bidirectional datasets: train/valid come from the NusaX+InMad mixes,
# test from NusaX only.
train_ds = build_bidirectional(train_mix)
valid_ds = build_bidirectional(valid_mix)
test_ds  = build_bidirectional(nusax_test)

# Display the first pair of examples (both directions of the first row).
train_ds[0], train_ds[1]


# Model

In [35]:
from indobenchmark import IndoNLGTokenizer

# 1) RESET the tokenizer (fresh instance)
tokenizer = IndoNLGTokenizer.from_pretrained(MODEL_NAME)

# 2) Re-run-safe pad patch: take `pad` from the CLASS, not from the instance
_base_pad = tokenizer.__class__.pad  # always the class-level pad, never an already-patched instance attribute

def pad_compat(encoded_inputs, *args, **kwargs):
    """Call the class-level ``pad`` after discarding ``padding_side`` and ``return_tensors``.

    NOTE(review): presumably these two keyword arguments are not accepted by
    IndoNLGTokenizer.pad under the installed transformers version - confirm.
    """
    kwargs.pop("padding_side", None)
    kwargs.pop("return_tensors", None)
    # _base_pad is the unbound class function, so the instance is passed explicitly.
    return _base_pad(tokenizer, encoded_inputs, *args, **kwargs)

# Instance-level override: only this tokenizer object is affected.
tokenizer.pad = pad_compat


In [36]:
# Truncation limits (in tokens) for the prompt side and the target side.
MAX_LEN_SRC = 128
MAX_LEN_TGT = 128

def tokenize_batch(batch):
    """Tokenize a batch of ``source``/``target`` strings for seq2seq training.

    Returns the tokenizer output for the sources with the target token ids
    attached under ``labels``. No padding is applied here.
    """
    model_inputs = tokenizer(batch["source"], max_length=MAX_LEN_SRC, truncation=True)
    target_encoding = tokenizer(batch["target"], max_length=MAX_LEN_TGT, truncation=True)
    model_inputs["labels"] = target_encoding["input_ids"]
    return model_inputs


In [37]:
# Tokenize every split in batches; the original text columns are removed so only
# the tokenizer outputs (plus labels) remain.
train_tok, valid_tok, test_tok = (
    split.map(tokenize_batch, batched=True, remove_columns=split.column_names)
    for split in (train_ds, valid_ds, test_ds)
)


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

In [48]:
# Make all three tokenized splits return torch tensors for the model columns.
for tokenized_split in (train_tok, valid_tok, test_tok):
    tokenized_split.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])


In [49]:
# Sanity check: show the first tokenized training example.
print(train_tok[0])

{'input_ids': tensor([23353, 13807,  1358,  1922,   310,  5988, 39969,  7841, 14917,   609,
        39991,  1025,  1301,  1159,   365,  4067,  1574,  1896,   887,  3364,
          354,  1835,  3549,  9864, 39981]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1]), 'labels': tensor([ 7841, 19380,  3704,   609, 39991,   656,   352,  1301,  2103,   262,
         7205,   390,  6181,  2030,  1523,  4376, 34897,  2828,   887,  3364,
        11509,  1835,  3549,  9864, 39981])}


In [51]:
import torch
from torch.nn.utils.rnn import pad_sequence

class TorchPadSeq2SeqCollator:
    """Collate variable-length seq2seq features by right-padding with ``pad_sequence``.

    Args:
        pad_token_id: Fill value for ``input_ids``.
        label_pad_token_id: Fill value for ``labels`` (default ``-100``).
    """

    def __init__(self, pad_token_id, label_pad_token_id=-100):
        self.pad_token_id = pad_token_id
        self.label_pad_token_id = label_pad_token_id

    def __call__(self, features):
        """Pad a list of feature dicts (1-D torch tensors) into a batch dict.

        ``attention_mask`` is padded with 0; the result has the keys
        ``input_ids``, ``attention_mask`` and ``labels``, each batch-first.
        """
        fill_values = {
            "input_ids": self.pad_token_id,
            "attention_mask": 0,
            "labels": self.label_pad_token_id,
        }
        return {
            key: pad_sequence(
                [example[key] for example in features],
                batch_first=True,
                padding_value=fill,
            )
            for key, fill in fill_values.items()
        }

# Use this as the Trainer's data_collator: inputs are padded with the
# tokenizer's pad id, labels with -100.
data_collator = TorchPadSeq2SeqCollator(
    pad_token_id=tokenizer.pad_token_id,
    label_pad_token_id=-100
)


In [52]:
# Metric objects from the `evaluate` library, used by compute_metrics below.
bleu = evaluate.load("sacrebleu")
rouge = evaluate.load("rouge")

def compute_metrics(eval_preds):
    """Compute BLEU / ROUGE-1 / ROUGE-L for a Trainer evaluation pass.

    Args:
        eval_preds: ``(predictions, label_ids)`` pair. ``predictions`` may be a
            tuple (first element is used), a 3-D logits array (reduced with
            argmax over the last axis), or a 2-D array of token ids.

    Returns:
        dict with the keys ``bleu``, ``rouge1`` and ``rougeL``.

    Note:
        When logits are passed, the argmax is a teacher-forced prediction, not
        free generation, so scores are not comparable to the generate()-based
        evaluation cells.
    """
    preds, labels = eval_preds

    # predictions are sometimes a tuple (e.g. logits plus extra model outputs)
    if isinstance(preds, tuple):
        preds = preds[0]

    preds = np.asarray(preds)
    if preds.ndim == 3:
        # (batch, seq, vocab) logits -> (batch, seq) token ids
        preds = preds.argmax(axis=-1)

    # -100 is padding/ignore-index, not a real token id: map it to the pad id
    # on both sides before decoding.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    # Decode row by row: later cells in this notebook note that batch_decode is
    # not compatible with IndoNLGTokenizer.
    pred_texts = [tokenizer.decode(p, skip_special_tokens=True) for p in preds]
    ref_texts = [tokenizer.decode(l, skip_special_tokens=True) for l in labels]

    bleu_score = bleu.compute(predictions=pred_texts, references=[[r] for r in ref_texts])["score"]
    rouge_score = rouge.compute(predictions=pred_texts, references=ref_texts)

    return {
        "bleu": bleu_score,
        "rouge1": rouge_score["rouge1"],
        "rougeL": rouge_score["rougeL"]
    }


In [53]:
import transformers, sys
print("transformers version:", transformers.__version__)
print("python:", sys.version)

# Output directory for the Trainer.
OUTPUT_DIR = "./indobenchmark-indobart-v2"

# First version of the training configuration. The training cell further down
# assigns `training_args` again (there with prediction_loss_only=False).
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    save_steps=10**9,        # effectively never saves a checkpoint
    save_total_limit=1,
    logging_steps=100,
    learning_rate=3e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=1,
    num_train_epochs=10,
    fp16=True,
    report_to="none",
    prediction_loss_only=True,
)

transformers version: 4.57.3
python: 3.11.13 (main, Jun  4 2025, 08:57:29) [GCC 11.4.0]


In [None]:
import transformers, sys
print("transformers version:", transformers.__version__)
print("python:", sys.version)

# Output directory for the Trainer and for the final save_model call.
OUTPUT_DIR = "./indobenchmark-indobart-v2"

# ======================
# 1) PATCH: prevent tokenizer auto-save (IndoNLGTokenizer does not support save_vocabulary)
# ======================
def _noop_save_pretrained(*args, **kwargs):
    """Stand-in for ``save_pretrained``: accepts anything, saves nothing, returns ``()``."""
    return ()

# Instance-level overrides: from here on, saving this tokenizer object is a silent no-op.
tokenizer.save_pretrained = _noop_save_pretrained
tokenizer.save_vocabulary = lambda *args, **kwargs: ()

# ======================
# 2) TrainingArguments (safe across versions)
# ======================
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    report_to="none",

    # --- optimisation ---
    learning_rate=3e-5,
    num_train_epochs=10,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=1,
    fp16=True,

    # --- logging / checkpointing ---
    logging_steps=100,
    save_steps=10**9,             # effectively never saves a checkpoint
    save_total_limit=1,

    # --- evaluation ---
    # `evaluation_strategy` is deliberately not passed (the keyword belongs to
    # older transformers versions).
    prediction_loss_only=False,   # must be False so predictions reach compute_metrics
)

def compute_metrics(eval_preds):
    """Compute BLEU / ROUGE-1 / ROUGE-L for a Trainer evaluation pass.

    Args:
        eval_preds: ``(predictions, label_ids)`` pair. ``predictions`` may be a
            tuple (first element is used), a 3-D logits array (reduced with
            argmax over the last axis), or a 2-D array of token ids.

    Returns:
        dict with the keys ``bleu``, ``rouge1`` and ``rougeL``.

    Note:
        When logits are passed, the argmax is a teacher-forced prediction, not
        free generation, so scores are not comparable to the generate()-based
        BLEU callbacks and evaluation cells.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    preds = np.asarray(preds)
    if preds.ndim == 3:
        # (batch, seq, vocab) logits -> (batch, seq) token ids; decoding raw
        # logits row by row would fail.
        preds = preds.argmax(axis=-1)

    # -100 is padding/ignore-index, not a real token id: map it to the pad id
    # on both sides before decoding.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    # IndoNLGTokenizer-safe decoding (avoid batch_decode)
    pred_texts = [tokenizer.decode(p, skip_special_tokens=True) for p in preds]
    ref_texts = [tokenizer.decode(l, skip_special_tokens=True) for l in labels]

    bleu_score = bleu.compute(predictions=pred_texts, references=[[r] for r in ref_texts])["score"]
    rouge_score = rouge.compute(predictions=pred_texts, references=ref_texts)

    return {
        "bleu": bleu_score,
        "rouge1": rouge_score["rouge1"],
        "rougeL": rouge_score["rougeL"],
    }

# ======================
# 3) BLEU per epoch callback + AVG + summary
# ======================
import torch
import sacrebleu
from transformers import TrainerCallback

BLEU_LOG = {}  # {epoch_int: {"id2mad":..., "mad2id":..., "avg":...}}

def _epoch_key(state):
    if state.epoch is None:
        return int(getattr(state, "global_step", 0))
    return int(state.epoch)

class BleuEachEpochCallback(TrainerCallback):
    """Report corpus BLEU on a small validation sample at the end of every epoch.

    The first ``n_samples`` rows of ``valid_df`` are translated with beam search
    in the configured direction, and the score is stored in the module-level
    ``BLEU_LOG`` under the current epoch key. When both directions have been
    recorded for an epoch, their mean is stored as ``"avg"`` and printed.

    Args:
        tokenizer: Tokenizer used for encoding prompts and decoding outputs.
        valid_df: DataFrame with the columns ``id`` and ``mad``.
        direction: ``"id2mad"``; any other value is treated as Madurese -> Indonesian.
        n_samples: Number of leading rows of ``valid_df`` to evaluate.
        max_len_src: Truncation length for the encoded prompts.
        max_new_tok: ``max_new_tokens`` passed to ``generate``.
        batch_size: Number of prompts per ``generate`` call.
        num_beams: Beam width passed to ``generate``.
    """

    def __init__(self, tokenizer, valid_df, direction="id2mad", n_samples=100,
                 max_len_src=128, max_new_tok=128, batch_size=8, num_beams=4):
        self.tokenizer = tokenizer
        self.valid_df = valid_df
        self.direction = direction
        self.n_samples = n_samples
        self.max_len_src = max_len_src
        self.max_new_tok = max_new_tok
        self.batch_size = batch_size
        self.num_beams = num_beams

    def _build_pairs(self):
        """Return (prompted sources, references) for the configured direction."""
        df = self.valid_df.head(self.n_samples)
        if self.direction == "id2mad":
            sources = ["translate Indonesian to Madurese: " + x for x in df["id"].tolist()]
            refs = df["mad"].tolist()
        else:
            sources = ["translate Madurese to Indonesian: " + x for x in df["mad"].tolist()]
            refs = df["id"].tolist()
        return sources, refs

    def _translate(self, model, sources):
        """Generate one translation per source, in batches, on the model's own device."""
        # Use the device the model actually lives on rather than assuming
        # "cuda whenever available".
        device = next(model.parameters()).device
        preds = []
        with torch.no_grad():
            for i in range(0, len(sources), self.batch_size):
                batch = sources[i:i+self.batch_size]
                enc = self.tokenizer(
                    batch,
                    return_tensors="pt",
                    padding=True,
                    truncation=True,
                    max_length=self.max_len_src
                )
                enc = {k: v.to(device) for k, v in enc.items()}
                out = model.generate(
                    **enc,
                    max_new_tokens=self.max_new_tok,
                    num_beams=self.num_beams,
                    # OPTIONAL anti-loop settings (enable if outputs repeat a lot)
                    # no_repeat_ngram_size=3,
                    # repetition_penalty=1.1,
                    # early_stopping=True,
                )
                # Decode one sequence at a time (batch_decode is avoided for IndoNLGTokenizer).
                preds.extend([self.tokenizer.decode(o, skip_special_tokens=True) for o in out])
        return preds

    def on_epoch_end(self, args, state, control, **kwargs):
        """Score the sample, record the result in ``BLEU_LOG`` and print a summary line."""
        model = kwargs["model"]

        sources, refs = self._build_pairs()
        if not sources:
            # Nothing to score (empty validation frame or n_samples == 0).
            return control

        # Remember the current mode and restore it even if generation fails, so a
        # callback error never leaves the model stuck in eval mode.
        was_training = model.training
        model.eval()
        try:
            preds = self._translate(model, sources)
        finally:
            model.train(was_training)

        bleu_val = sacrebleu.corpus_bleu(preds, [refs]).score

        ep = _epoch_key(state)
        BLEU_LOG.setdefault(ep, {})
        BLEU_LOG[ep][self.direction] = bleu_val

        if "id2mad" in BLEU_LOG[ep] and "mad2id" in BLEU_LOG[ep]:
            avg_bleu = (BLEU_LOG[ep]["id2mad"] + BLEU_LOG[ep]["mad2id"]) / 2.0
            BLEU_LOG[ep]["avg"] = avg_bleu
            print(
                f"\nüèÜ Epoch {ep} | ID2MAD BLEU@{self.n_samples}: {BLEU_LOG[ep]['id2mad']:.2f} | "
                f"MAD2ID BLEU@{self.n_samples}: {BLEU_LOG[ep]['mad2id']:.2f} | "
                f"AVG: {avg_bleu:.2f}\n"
            )
        else:
            print(f"\nüèÜ Epoch {ep} | {self.direction.upper()} BLEU@{self.n_samples}: {bleu_val:.2f}\n")

        return control

class BleuAvgSummaryCallback(TrainerCallback):
    """Print a per-epoch BLEU table from ``BLEU_LOG`` once training ends."""

    def on_train_end(self, args, state, control, **kwargs):
        """Print each epoch's id2mad / mad2id / avg BLEU and the mean of the per-epoch averages."""
        if not BLEU_LOG:
            print("\n‚ö†Ô∏è BLEU_LOG kosong (tidak ada BLEU yang tercatat)\n")
            return control

        ordered = sorted(BLEU_LOG.items())
        missing = float("nan")  # shown for any value that was never recorded

        print("\n==============================")
        print("üìå RINGKASAN BLEU PER EPOCH (AVG)")
        print("==============================")
        for ep, rec in ordered:
            id2 = rec.get("id2mad", missing)
            m2i = rec.get("mad2id", missing)
            avg = rec.get("avg", missing)
            print(f"Epoch {ep}: ID2MAD={id2:.2f} | MAD2ID={m2i:.2f} | AVG={avg:.2f}")

        # Only epochs where both directions were scored carry an "avg" entry.
        avgs = [rec["avg"] for _, rec in ordered if "avg" in rec]
        if avgs:
            overall = sum(avgs) / len(avgs)
            print("------------------------------")
            print(f"‚úÖ Overall AVG BLEU across epochs: {overall:.2f}")
        print("==============================\n")
        return control

# One BLEU callback per translation direction (identical settings otherwise),
# plus the end-of-training summary.
bleu_cb_id2mad, bleu_cb_mad2id = (
    BleuEachEpochCallback(
        tokenizer=tokenizer,
        valid_df=valid_mix,
        direction=direction,
        n_samples=100,
        max_len_src=128,
        max_new_tok=128,
        batch_size=8,
        num_beams=4,
    )
    for direction in ("id2mad", "mad2id")
)

bleu_summary = BleuAvgSummaryCallback()

# ======================
# 4) Trainer + train
# ======================
# No cell in this notebook assigns `model`, so on a fresh kernel (Restart & Run
# All) the Trainer call below raised NameError. Load the pretrained checkpoint
# if it is missing; an existing `model` is left untouched.
# NOTE(review): assumes AutoModelForSeq2SeqLM resolves the right architecture
# for MODEL_NAME - confirm against how the model was originally loaded.
if "model" not in globals():
    from transformers import AutoModelForSeq2SeqLM
    model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tok,
    eval_dataset=valid_tok,
    data_collator=data_collator,
    compute_metrics=compute_metrics,  # used when an evaluation pass is run
    callbacks=[bleu_cb_id2mad, bleu_cb_mad2id, bleu_summary],
)

trainer.train()

# ======================
# 5) Save model (the tokenizer is not saved)
# ======================
trainer.save_model(OUTPUT_DIR)
print("‚úÖ Model disimpan ke:", OUTPUT_DIR)
print("‚ÑπÔ∏è Tokenizer tidak disimpan (pakai tokenizer bawaan indobenchmark/indobart-v2).")

# Both values are read defensively; they print as None when the trainer state
# has not recorded a best checkpoint.
best_ckpt = getattr(trainer.state, "best_model_checkpoint", None)
best_metric = getattr(trainer.state, "best_metric", None)
print("Best checkpoint:", best_ckpt)
print("Best metric:", best_metric)


You are adding a <class '__main__.BleuEachEpochCallback'> to the callbacks of this Trainer, but there is already one. The currentlist of callbacks is
:DefaultFlowCallback
BleuEachEpochCallback


transformers version: 4.57.3
python: 3.11.13 (main, Jun  4 2025, 08:57:29) [GCC 11.4.0]




Step,Training Loss
100,0.8996
200,0.6901
300,0.5455
400,0.4503
500,0.3676
600,0.2894
700,0.2541
800,0.2238
900,0.2011
1000,0.2079



üèÜ Epoch 1 | ID2MAD BLEU@100: 15.75


üèÜ Epoch 1 | ID2MAD BLEU@100: 15.75 | MAD2ID BLEU@100: 21.33 | AVG: 18.54


üèÜ Epoch 2 | ID2MAD BLEU@100: 16.47


üèÜ Epoch 2 | ID2MAD BLEU@100: 16.47 | MAD2ID BLEU@100: 18.76 | AVG: 17.61


üèÜ Epoch 3 | ID2MAD BLEU@100: 12.50


üèÜ Epoch 3 | ID2MAD BLEU@100: 12.50 | MAD2ID BLEU@100: 16.31 | AVG: 14.40


üèÜ Epoch 4 | ID2MAD BLEU@100: 15.13


üèÜ Epoch 4 | ID2MAD BLEU@100: 15.13 | MAD2ID BLEU@100: 18.61 | AVG: 16.87


üèÜ Epoch 5 | ID2MAD BLEU@100: 15.35


üèÜ Epoch 5 | ID2MAD BLEU@100: 15.35 | MAD2ID BLEU@100: 19.74 | AVG: 17.55


üèÜ Epoch 6 | ID2MAD BLEU@100: 14.73


üèÜ Epoch 6 | ID2MAD BLEU@100: 14.73 | MAD2ID BLEU@100: 19.48 | AVG: 17.11


üèÜ Epoch 7 | ID2MAD BLEU@100: 14.91


üèÜ Epoch 7 | ID2MAD BLEU@100: 14.91 | MAD2ID BLEU@100: 20.72 | AVG: 17.81


üèÜ Epoch 8 | ID2MAD BLEU@100: 14.17


üèÜ Epoch 8 | ID2MAD BLEU@100: 14.17 | MAD2ID BLEU@100: 21.62 | AVG: 17.90


üèÜ Epoch 9 | ID2MAD BLEU@100: 14.96


üèÜ Epoch 9 | 

In [57]:
# Best-effort tokenizer save; a failure is reported but does not stop the run.
# Note: if the training cell has run, tokenizer.save_pretrained was replaced by
# a no-op there, so this call writes nothing.
try:
    tokenizer.save_pretrained(OUTPUT_DIR)
except Exception as e:
    print("Tokenizer tidak bisa disave dengan save_pretrained (aman di-skip):", repr(e))

In [None]:
# ======================
# CUSTOM EVALUATION (PRINT + BLEU + EXAMPLES) - INDONESIAN -> MADURESE
# ======================

from tqdm.auto import tqdm
import sacrebleu
import torch

print("üìÇ Memuat model...")
device = "cuda" if torch.cuda.is_available() else "cpu"
model = trainer.model.to(device)
model.eval()

TEST_PATH = "/kaggle/input/nusaxdata/test (1).csv"

print(f"üìÇ Membaca data tes: {TEST_PATH}")
test_df = pd.read_csv(TEST_PATH)
test_df = drop_unnamed_cols(test_df)

id_col = guess_column(test_df, ["ind", "id", "indo", "indonesian"])
mad_col = guess_column(test_df, ["mad", "madurese", "madura"])
# guess_column returns None when nothing matches; fail with a clear message
# instead of an opaque KeyError from the column selection below.
if id_col is None or mad_col is None:
    raise ValueError(f"Kolom id/mad tidak ketemu. Kolom yang ada: {list(test_df.columns)}")
test_df = test_df[[id_col, mad_col]].rename(columns={id_col: "id", mad_col: "mad"})

test_df["id"]  = test_df["id"].apply(clean_text)
test_df["mad"] = test_df["mad"].apply(clean_text)

# Only the first N sentences are scored here (quick check, not the full test set).
N = 100
test_df = test_df.head(N)
print(f"‚úÖ Menguji pada {len(test_df)} kalimat pertama.")
print("üöÄ Mulai Menerjemahkan...")

sources = ["translate Indonesian to Madurese: " + x for x in test_df["id"].tolist()]
refs    = test_df["mad"].tolist()

preds = []
batch_size = 8

for i in tqdm(range(0, len(sources), batch_size)):
    batch = sources[i:i+batch_size]
    enc = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=128)
    enc = {k: v.to(device) for k, v in enc.items()}
    with torch.no_grad():
        out = model.generate(**enc, max_new_tokens=128, num_beams=4)

    # Decode one sequence at a time (batch_decode is avoided for IndoNLGTokenizer).
    preds.extend([tokenizer.decode(o, skip_special_tokens=True) for o in out])

# Stored under its own name: assigning the float to `bleu` would overwrite the
# `evaluate` metric object that compute_metrics() and score() call `.compute` on.
bleu_score = sacrebleu.corpus_bleu(preds, [refs]).score

print("\n==============================")
print(f"üèÜ REAL BLEU SCORE: {bleu_score:.2f}")
print("==============================\n")

print("üîç 5 CONTOH HASIL:")
for i in range(min(5, len(test_df))):
    print(f"üáÆüá© Indo  : {test_df.iloc[i]['id']}")
    print(f"ü§ñ Model : {preds[i]}")
    print(f"üîë Kunci : {refs[i]}")
    print("-" * 20)


üìÇ Memuat model...
üìÇ Membaca data tes: /kaggle/working/nusax/test.csv
‚úÖ Menguji pada 100 kalimat pertama.
üöÄ Mulai Menerjemahkan...


  0%|          | 0/13 [00:00<?, ?it/s]


üèÜ REAL BLEU SCORE: 15.50

üîç 5 CONTOH HASIL:
üáÆüá© Indo  : Dekat dengan hotel saya menginap, hanya ditempuh jalan kaki, di sini banyak sekali pilihan makanannya, tempat yang luas, dan menyenangkan
ü§ñ Model :  parjelenan bik hotel engkok nginep, pera' ditempuh jelen kaki, e diye bennyak sarah pelean kakananna, kennengngan se leber, ben masenneng sarah. eman ongghu. ban kaso'on.'. bannya' sarah.', bennya'.''.''
üîë Kunci : Semmak bik hotel engkok nginep, pera' ejeleni ajelen soko, ediye bennyak sarah pelean kakananna, kenengngan se leber, ben masenneng
--------------------
üáÆüá© Indo  : Iya benar, dia sedang jaga warung.
ü§ñ Model :  iye bhender, engkok bik selaen jaga warung. bhenderre'. bhendinga'.
üîë Kunci : Iye bhender, rua ajege berung.
--------------------
üáÆüá© Indo  : Kangkungnya lumayan tapi kepiting saus padangnya mengecewakan kami dikasih kepiting yang kopong akhir kami tidak makan keptingnya dan dikembalikan.
ü§ñ Model :  kangkungnga pendhenan tape kepit

In [None]:
import torch
# Run evaluation on GPU when available; the trained model is taken from the Trainer.
device = "cuda" if torch.cuda.is_available() else "cpu"

model = trainer.model.to(device)
model.eval()

# Prompt truncation length and generation budget used by generate_batch.
MAX_LEN_SRC = 128
MAX_NEW_TOK = 128

def generate_batch(sources, batch_size=8):
    """Translate a list of prompted source strings with beam search (4 beams).

    Args:
        sources: Prompt strings (task prefix already included).
        batch_size: Number of prompts per ``generate`` call.

    Returns:
        List of decoded strings, one per source, in input order.
    """
    translations = []
    for start in range(0, len(sources), batch_size):
        chunk = sources[start:start + batch_size]
        encoded = tokenizer(
            chunk,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=MAX_LEN_SRC,
        )
        encoded = {name: tensor.to(device) for name, tensor in encoded.items()}
        with torch.no_grad():
            generated = model.generate(**encoded, max_new_tokens=MAX_NEW_TOK, num_beams=4)

        # Decode sequence by sequence: batch_decode is avoided for IndoNLGTokenizer
        for sequence in generated:
            translations.append(tokenizer.decode(sequence, skip_special_tokens=True))

    return translations

import evaluate
# (Re)load the metric objects used by score(); this rebinds `bleu` and `rouge`.
bleu = evaluate.load("sacrebleu")
rouge = evaluate.load("rouge")

def score(preds, refs):
    """Return BLEU, ROUGE-1 and ROUGE-L for parallel lists of predictions and references."""
    wrapped_refs = [[reference] for reference in refs]  # the BLEU metric takes a list of references per item
    bleu_result = bleu.compute(predictions=preds, references=wrapped_refs)
    rouge_result = rouge.compute(predictions=preds, references=refs)
    return {
        "BLEU": bleu_result["score"],
        "ROUGE-1": rouge_result["rouge1"],
        "ROUGE-L": rouge_result["rougeL"],
    }

# VALID: score both directions on the full mixed validation frame (NusaX + InMad).
src_id2mad = ["translate Indonesian to Madurese: " + x for x in valid_mix["id"].tolist()]
ref_id2mad = valid_mix["mad"].tolist()
pred_id2mad = generate_batch(src_id2mad)
print("VALID ID ‚Üí MAD:", score(pred_id2mad, ref_id2mad))

src_mad2id = ["translate Madurese to Indonesian: " + x for x in valid_mix["mad"].tolist()]
ref_mad2id = valid_mix["id"].tolist()
pred_mad2id = generate_batch(src_mad2id)
print("VALID MAD ‚Üí ID:", score(pred_mad2id, ref_mad2id))

# TEST: same procedure on the NusaX test split. The src_/ref_/pred_ variables are
# reused, so after this cell they hold the TEST values.
src_id2mad = ["translate Indonesian to Madurese: " + x for x in nusax_test["id"].tolist()]
ref_id2mad = nusax_test["mad"].tolist()
pred_id2mad = generate_batch(src_id2mad)
print("TEST ID ‚Üí MAD:", score(pred_id2mad, ref_id2mad))

src_mad2id = ["translate Madurese to Indonesian: " + x for x in nusax_test["mad"].tolist()]
ref_mad2id = nusax_test["id"].tolist()
pred_mad2id = generate_batch(src_mad2id)
print("TEST MAD ‚Üí ID:", score(pred_mad2id, ref_mad2id))


VALID ID ‚Üí MAD: {'BLEU': 14.746241869737535, 'ROUGE-1': 0.4462242276523025, 'ROUGE-L': 0.4331282451227373}
VALID MAD ‚Üí ID: {'BLEU': 20.93414376974587, 'ROUGE-1': 0.5262434375864424, 'ROUGE-L': 0.5129215980981093}
TEST ID ‚Üí MAD: {'BLEU': 15.713515275134364, 'ROUGE-1': 0.46672209513455976, 'ROUGE-L': 0.4532801193530669}
TEST MAD ‚Üí ID: {'BLEU': 22.071250626656674, 'ROUGE-1': 0.5451958332518148, 'ROUGE-L': 0.5312517830491592}


In [62]:
# ======================
# CUSTOM EVALUATION (PRINT + BLEU + EXAMPLES) - MADURESE -> INDONESIAN
# ======================

from tqdm.auto import tqdm
import sacrebleu
import torch
import pandas as pd

print("üìÇ Memuat model...")
device = "cuda" if torch.cuda.is_available() else "cpu"
model = trainer.model.to(device)
model.eval()

# TEST_PATH is defined in the Indonesian -> Madurese evaluation cell.
print(f"üìÇ Membaca data tes: {TEST_PATH}")
test_df = pd.read_csv(TEST_PATH)
test_df = drop_unnamed_cols(test_df)

id_col  = guess_column(test_df, ["ind", "id", "indo", "indonesian"])
mad_col = guess_column(test_df, ["mad", "madurese", "madura"])
# guess_column returns None when nothing matches; fail with a clear message
# instead of an opaque KeyError from the column selection below.
if id_col is None or mad_col is None:
    raise ValueError(f"Kolom id/mad tidak ketemu. Kolom yang ada: {list(test_df.columns)}")

test_df = test_df[[id_col, mad_col]].rename(columns={id_col: "id", mad_col: "mad"})

test_df["id"]  = test_df["id"].apply(clean_text)
test_df["mad"] = test_df["mad"].apply(clean_text)

# Only the first N sentences are scored here (quick check, not the full test set).
N = 100
test_df = test_df.head(N)
print(f"‚úÖ Menguji pada {len(test_df)} kalimat pertama.")
print("üöÄ Mulai Menerjemahkan...")

sources = ["translate Madurese to Indonesian: " + x for x in test_df["mad"].tolist()]
refs    = test_df["id"].tolist()

preds = []
batch_size = 8

for i in tqdm(range(0, len(sources), batch_size)):
    batch = sources[i:i+batch_size]
    enc = tokenizer(
        batch,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=128
    )
    enc = {k: v.to(device) for k, v in enc.items()}

    with torch.no_grad():
        out = model.generate(
            **enc,
            max_new_tokens=128,
            num_beams=4
        )

    # IndoNLGTokenizer is not compatible with batch_decode
    # (clean_up_tokenization_spaces), so decode one sequence at a time.
    preds.extend([tokenizer.decode(o, skip_special_tokens=True) for o in out])

# Stored under its own name: assigning the float to `bleu` would overwrite the
# `evaluate` metric object that compute_metrics() and score() call `.compute` on.
bleu_score = sacrebleu.corpus_bleu(preds, [refs]).score

print("\n==============================")
print(f"üèÜ REAL BLEU SCORE (MAD‚ÜíID): {bleu_score:.2f}")
print("==============================\n")

print("üîç 5 CONTOH HASIL:")
for i in range(min(5, len(test_df))):
    print(f"üü´ Madura : {test_df.iloc[i]['mad']}")
    print(f"ü§ñ Model  : {preds[i]}")
    print(f"üîë Kunci  : {refs[i]}")
    print("-" * 20)


üìÇ Memuat model...
üìÇ Membaca data tes: /kaggle/working/nusax/test.csv
‚úÖ Menguji pada 100 kalimat pertama.
üöÄ Mulai Menerjemahkan...


  0%|          | 0/13 [00:00<?, ?it/s]


üèÜ REAL BLEU SCORE (MAD‚ÜíID): 23.21

üîç 5 CONTOH HASIL:
üü´ Madura : Semmak bik hotel engkok nginep, pera' ejeleni ajelen soko, ediye bennyak sarah pelean kakananna, kenengngan se leber, ben masenneng
ü§ñ Model  :  semmak dengan hotel saya menginap, hanya dengan alas daun, di sini banyak sekali pilihan makanannya, tempat yang luas, dan sangat menyenangkan. sangat direkomendasikan.
üîë Kunci  : Dekat dengan hotel saya menginap, hanya ditempuh jalan kaki, di sini banyak sekali pilihan makanannya, tempat yang luas, dan menyenangkan
--------------------
üü´ Madura : Iye bhender, rua ajege berung.
ü§ñ Model  :  iye bhender, itu ayam berung. hehehe.
üîë Kunci  : Iya benar, dia sedang jaga warung.
--------------------
üü´ Madura : Kangkongnga pendhanan tape kopeteng saos padangnga ma kocaba, engko' bi' laenna e bharri' kopeteng se kopong akherra engko' bi' laenna ta' ngakan kopeteng ban e pabali.
ü§ñ Model  :  makanannya lumayan tapi nasi goreng saos padangnya mengecewakan, saya

In [None]:
import torch
import evaluate
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# This cell relies on the preprocessing helpers clean_text / fix_mojibake already being defined.
# A NameError mentioning either of them means the preprocessing cell (tS4yoEHnq713) has not been run yet.

# Pick the GPU when torch can see one, otherwise stay on the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

# Reuse the `model` / `tokenizer` already living in the kernel (expected to come from
# cell uzoYf3xlOElU) when BOTH exist; otherwise load them from the checkpoint path below.
# Note that MODEL_DIR is (re)assigned here, overriding any value set by an earlier cell.
# NOTE(review): the nested "checkpoint-2500/checkpoint-3750" path looks unusual - confirm it exists.
if not ('model' in globals() and 'tokenizer' in globals()):
    print("Loading model and tokenizer from MODEL_DIR...")
    MODEL_DIR = "./cendol_mt5_id_mad/checkpoint-2500/checkpoint-3750"
    tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
    model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_DIR)
    model = model.to(device)
model.eval()

# Token budgets: source truncation length and cap on newly generated tokens.
MAX_LEN_SRC = MAX_NEW_TOK = 128

# Metric objects used by score_single_translation.
rouge_metric = evaluate.load("rouge")
bleu_metric = evaluate.load("sacrebleu")

def generate_single_text(raw_text: str, direction: str, current_model, current_tokenizer):
    """Translate a single sentence with the supplied model/tokenizer pair.

    The text is passed through the notebook-level ``clean_text`` helper, prefixed
    with the task prompt for ``direction``, tokenized (truncated to ``MAX_LEN_SRC``),
    moved to ``current_model.device`` and fed to ``current_model.generate`` with
    ``max_new_tokens=MAX_NEW_TOK``.

    Args:
        raw_text: Sentence to translate.
        direction: ``"id2mad"`` (Indonesian -> Madurese) or ``"mad2id"``
            (Madurese -> Indonesian).
        current_model: Object exposing ``.device`` and ``.generate``.
        current_tokenizer: Callable tokenizer that also exposes ``.decode``.

    Returns:
        The first generated sequence, decoded with special tokens skipped.

    Raises:
        NameError: ``clean_text`` is not defined in the module globals.
        ValueError: ``direction`` is not one of the two supported values.
    """
    if 'clean_text' not in globals():
        raise NameError("clean_text function is not defined. Please run cell tS4yoEHnq713.")

    # Cleaning happens before the direction is validated, exactly as the prompt is built.
    normalized = clean_text(raw_text)

    task_prefixes = {
        "id2mad": "translate Indonesian to Madurese: ",
        "mad2id": "translate Madurese to Indonesian: ",
    }
    try:
        task_prefix = task_prefixes[direction]
    except (KeyError, TypeError):
        raise ValueError("Invalid direction for translation.") from None

    prompt = task_prefix + normalized
    model_inputs = current_tokenizer(prompt, return_tensors="pt", truncation=True, max_length=MAX_LEN_SRC)
    model_inputs = model_inputs.to(current_model.device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        generated = current_model.generate(**model_inputs, max_new_tokens=MAX_NEW_TOK)

    first_sequence = generated[0]
    return current_tokenizer.decode(first_sequence, skip_special_tokens=True)

def score_single_translation(prediction: str, reference: str):
    """Score one prediction against one reference with the notebook's metric objects.

    Args:
        prediction: Model output to evaluate.
        reference: Gold translation to compare against.

    Returns:
        A dict with three entries: ``"BLEU"`` (the ``"score"`` field returned by
        ``bleu_metric``), ``"ROUGE-1"`` and ``"ROUGE-L"`` (the ``"rouge1"`` and
        ``"rougeL"`` fields returned by ``rouge_metric``).
    """
    # ROUGE receives one flat reference per prediction ...
    rouge_result = rouge_metric.compute(predictions=[prediction], references=[reference])
    # ... while the BLEU metric is given a nested list of references per prediction.
    bleu_result = bleu_metric.compute(predictions=[prediction], references=[[reference]])

    summary = {"BLEU": bleu_result["score"]}
    summary["ROUGE-1"] = rouge_result["rouge1"]
    summary["ROUGE-L"] = rouge_result["rougeL"]
    return summary

print("\n--- Analysis of Test Data Translations ---")

# Per-sample report: translate the first 30 rows of `nusax_test` in both directions and
# print BLEU / ROUGE for every prediction.
# The guard checks `nusax_test` because that is the frame the loops actually read. It used to
# check `test_clean`, so a missing `nusax_test` slipped through and raised NameError below,
# while a missing `test_clean` blocked a run that would otherwise have worked.
if 'nusax_test' not in globals():
    print("Error: 'nusax_test' DataFrame not found. Please ensure the cell that defines it has been run.")
else:
    sample_rows = nusax_test.head(30) # Limit to 30 samples

    print("\nIndonesian -> Madurese Translations:")
    # enumerate() numbers the samples by position, so the labels stay 1..N even when the
    # frame's index is not the default 0..N-1 range (index labels were used before).
    for i, (id_text, mad_ref) in enumerate(zip(sample_rows["id"], sample_rows["mad"])):
        mad_pred = generate_single_text(id_text, "id2mad", model, tokenizer)
        scores = score_single_translation(mad_pred, mad_ref)

        print(f"--- Sample {i+1} (ID -> MAD) ---")
        print(f"Source (ID):     {id_text}")
        print(f"Reference (MAD): {mad_ref}")
        print(f"Prediction (MAD):{mad_pred}")
        print(f"BLEU Score:      {scores['BLEU']:.4f}")
        print(f"ROUGE-1 Score:   {scores['ROUGE-1']:.4f}")
        print(f"ROUGE-L Score:   {scores['ROUGE-L']:.4f}\n")

    print("\nMadurese -> Indonesian Translations:")
    # Same rows, opposite direction: the Madurese column is now the source.
    for i, (mad_text, id_ref) in enumerate(zip(sample_rows["mad"], sample_rows["id"])):
        id_pred = generate_single_text(mad_text, "mad2id", model, tokenizer)
        scores = score_single_translation(id_pred, id_ref)

        print(f"--- Sample {i+1} (MAD -> ID) ---")
        print(f"Source (MAD):     {mad_text}")
        print(f"Reference (ID): {id_ref}")
        print(f"Prediction (ID):{id_pred}")
        print(f"BLEU Score:      {scores['BLEU']:.4f}")
        print(f"ROUGE-1 Score:   {scores['ROUGE-1']:.4f}")
        print(f"ROUGE-L Score:   {scores['ROUGE-L']:.4f}\n")


--- Analysis of Test Data Translations ---

Indonesian -> Madurese Translations:
--- Sample 1 (ID -> MAD) ---
Source (ID):     Dekat dengan hotel saya menginap, hanya ditempuh jalan kaki, di sini banyak sekali pilihan makanannya, tempat yang luas, dan menyenangkan
Reference (MAD): Semmak bik hotel engkok nginep, pera' ejeleni ajelen soko, ediye bennyak sarah pelean kakananna, kenengngan se leber, ben masenneng
Prediction (MAD): parjelenan bik hotel engkok nginep, pera' ditempuh jelen kaki, e diye bennyak sarah pelean kakananna, kennengngan se leber, ben masenneng sarah. eman ongghu. ban kaso'on.'. bannya' sarah.', bennya'.''.''
BLEU Score:      26.2503
ROUGE-1 Score:   0.5417
ROUGE-L Score:   0.5417

--- Sample 2 (ID -> MAD) ---
Source (ID):     Iya benar, dia sedang jaga warung.
Reference (MAD): Iye bhender, rua ajege berung.
Prediction (MAD): iye bhender, engkok bik selaen jaga warung. bhenderre'. bhendinga'.
BLEU Score:      6.8372
ROUGE-1 Score:   0.2857
ROUGE-L Score:   0.2857

-

In [64]:
def translate(text: str, direction="id2mad"):
    """Translate ``text`` between Indonesian and Madurese with the notebook-level model.

    Uses the module-level ``model`` and ``tokenizer`` together with the
    ``MAX_LEN_SRC`` / ``MAX_NEW_TOK`` limits, decoding with 4-beam search.

    Args:
        text: Sentence to translate; surrounding whitespace is stripped.
        direction: ``"id2mad"`` (Indonesian -> Madurese, the default) or
            ``"mad2id"`` (Madurese -> Indonesian).

    Returns:
        The first generated sequence, decoded with special tokens skipped.

    Raises:
        ValueError: If ``direction`` is not one of the two supported values.
    """
    text = text.strip()
    if direction == "id2mad":
        src = "translate Indonesian to Madurese: " + text
    elif direction == "mad2id":
        src = "translate Madurese to Indonesian: " + text
    else:
        raise ValueError("direction harus 'id2mad' atau 'mad2id'")

    # Send the inputs to the device the model actually lives on instead of the notebook-wide
    # `device` string: the two disagree whenever `model` is reloaded without `.to(device)`,
    # and generate() then fails with a device-mismatch error.
    enc = tokenizer(src, return_tensors="pt", truncation=True, max_length=MAX_LEN_SRC).to(model.device)
    with torch.no_grad():
        out = model.generate(**enc, max_new_tokens=MAX_NEW_TOK, num_beams=4)
    return tokenizer.decode(out[0], skip_special_tokens=True)

# Interactive translation prompt: keeps asking for a direction and a sentence until the user
# types "q". The direction is lower-cased and validated BEFORE the text is requested, so a typo
# or an upper-case entry leads to a re-prompt instead of a ValueError from translate() that
# would end the session after the user has already typed the sentence.
while True:
    direction = input("\nPilih arah (id2mad / mad2id) atau ketik q: ").strip().lower()
    if direction == "q":
        break
    if direction not in ("id2mad", "mad2id"):
        print("Arah tidak dikenal, gunakan 'id2mad' atau 'mad2id'.")
        continue
    text = input("Masukkan teks: ").strip()
    print("Hasil:", translate(text, direction=direction))



Pilih arah (id2mad / mad2id) atau ketik q:  q


In [65]:
# Write the current model and tokenizer side by side into one directory so they can be
# reloaded later with from_pretrained.
# NOTE(review): a later cell reloads the model from the relative path "./saved_model"; that
# only resolves to this directory when the working directory is /kaggle/working - confirm.
model.save_pretrained("/kaggle/working/saved_model")
tokenizer.save_pretrained("/kaggle/working/saved_model")

()

In [67]:
from transformers import AutoModelForSeq2SeqLM
from indobenchmark import IndoNLGTokenizer

# Load the fine-tuned model from the directory written by save_pretrained.
# NOTE(review): the model is not moved to any device here, so it stays wherever from_pretrained
# places it; code that sends inputs to a separately chosen device may no longer match - confirm.
model = AutoModelForSeq2SeqLM.from_pretrained("./saved_model")

# Load the tokenizer from the ORIGINAL base model (NOT from saved_model).
tokenizer = IndoNLGTokenizer.from_pretrained("indobenchmark/indobart-v2")
