In [None]:
!pip install -U indobenchmark-toolkit evaluate sacrebleu rouge-score

In [None]:
!pip install -U transformers accelerate evaluate sacrebleu rouge-score sentencepiece


In [1]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

import os, re, unicodedata
import numpy as np
import pandas as pd

from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback
)

import evaluate
print("evaluate version:", evaluate.__version__)

# >>> ADDED: tokenizer khusus IndoBenchmark
from indobenchmark import IndoNLGTokenizer

2025-12-13 23:01:44.049637: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1765666904.071119     363 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1765666904.077576     363 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

evaluate version: 0.4.6


In [1]:
import pandas as pd
import re
import unicodedata
import numpy as np

# Raw INMAD CSV; read once below with pd.read_csv.
IN_PATH  = "/kaggle/input/inmad-dataset/INMAD Dataset.csv"
# Destination of the cleaned id/mad pairs (relative path, written with to_csv below).
OUT_PATH = "inmad_clean_v2.csv"

def fix_mojibake(s: str) -> str:
    """Repair text whose UTF-8 bytes were mis-decoded as Latin-1 / Windows-1252.

    The repair is a strict round trip: the string is re-encoded with the
    suspected wrong codec and decoded as UTF-8. It is accepted only when that
    round trip succeeds without dropping a single character; otherwise the
    input is returned unchanged. (Using ``errors="ignore"`` here would silently
    delete legitimate accented letters such as the Madurese "â".)

    Args:
        s: Value to repair. ``None`` and float NaN (the pandas missing-value
            marker) yield ``""``; any other non-string is converted with ``str``.

    Returns:
        The repaired string, or the (stringified) input when no lossless
        repair is possible.
    """
    # Missing values must not turn into the literal text "nan" / "None".
    if s is None or (isinstance(s, float) and s != s):
        return ""
    if not isinstance(s, str):
        s = str(s)

    # Cheap heuristic: U+00C3 / U+00C2 are the lead characters produced when
    # multi-byte UTF-8 is read as a single-byte codec. Written as escapes so the
    # check does not depend on how this file itself is encoded.
    # U+FFFD is deliberately not a trigger: it cannot be encoded in either
    # codec, so a lossless repair is impossible and the text is left as-is.
    markers = ("\u00C3", "\u00C2")
    if not any(marker in s for marker in markers):
        return s

    for src_enc in ("latin-1", "cp1252"):
        try:
            repaired = s.encode(src_enc).decode("utf-8")
        except (UnicodeEncodeError, UnicodeDecodeError):
            # Not mojibake under this codec (or the text is legitimately accented).
            continue
        if repaired.strip():
            return repaired
    return s

def normalize_text(s: str) -> str:
    """Normalize one sentence: encoding repair, Unicode form, punctuation, spacing.

    Steps, in order:
      1. ``fix_mojibake``.
      2. Acute accent (U+00B4) -> apostrophe, then NFKC normalization.
      3. Control and zero-width characters -> space.
      4. Typographic quotes, dashes and ellipsis -> ASCII equivalents.
      5. Whitespace collapse; no space before ``,.;:!?``; one space after
         punctuation that is glued to the next word.
      6. Padding inside ``()`` / ``[]`` removed; repeated ``!``/``?`` and runs
         of four or more dots collapsed.

    Args:
        s: Raw text (anything ``fix_mojibake`` accepts).

    Returns:
        The normalized string (possibly empty).
    """
    s = fix_mojibake(s)

    # Must happen BEFORE NFKC: NFKC decomposes U+00B4 into space + combining
    # acute (U+0020 U+0301), after which it can no longer be mapped to "'".
    s = s.replace("\u00B4", "'")
    s = unicodedata.normalize("NFKC", s)

    # Remove control characters and zero-width space / BOM.
    s = re.sub(r"[\u0000-\u001F\u007F-\u009F]", " ", s)
    s = s.replace("\u200b", " ").replace("\ufeff", " ")

    # Typographic punctuation -> ASCII. Escapes keep the table independent of
    # the encoding of this file.
    punct_map = str.maketrans({
        "\u2019": "'", "\u2018": "'", "`": "'",   # single quotes / backtick
        "\u201C": '"', "\u201D": '"',             # double quotes
        "\u2013": "-", "\u2014": "-", "\u2212": "-",  # en dash, em dash, minus
        "\u2026": "...",                          # ellipsis
    })
    s = s.translate(punct_map)

    # Collapse whitespace.
    s = re.sub(r"\s+", " ", s).strip()

    # No space before punctuation: " ," -> ",".
    s = re.sub(r"\s+([,.;:!?])", r"\1", s)

    # Ensure a space after punctuation glued to the next word or number, but
    # leave digit-punct-digit alone so "3,5" and "10:30" are not split.
    s = re.sub(r"([,;:!?])(?=[A-Za-z])", r"\1 ", s)
    s = re.sub(r"(?<![0-9])([,;:!?])(?=[0-9])", r"\1 ", s)
    s = re.sub(r"(\.)([A-Za-z])", r"\1 \2", s)  # ".word" -> ". word"

    # Tidy parentheses / brackets.
    s = re.sub(r"\(\s+", "(", s)
    s = re.sub(r"\s+\)", ")", s)
    s = re.sub(r"\[\s+", "[", s)
    s = re.sub(r"\s+\]", "]", s)

    # Collapse repeated punctuation.
    s = re.sub(r"([!?])\1{1,}", r"\1", s)
    s = re.sub(r"\.{4,}", "...", s)

    return s

def tok_len(s: str) -> int:
    """Return the number of whitespace-delimited tokens in ``str(s)``."""
    text = str(s)
    return sum(1 for _ in re.finditer(r"\S+", text))

# ===== Load =====
raw = pd.read_csv(IN_PATH)

# Keep only the columns we need: Indonesian & Madurese (English is dropped here).
df = raw.rename(columns={"Indonesia": "id", "Madura": "mad"})[["id", "mad"]]

# Drop missing cells BEFORE the str cast: astype(str) would turn NaN into the
# literal text "nan", which is non-empty and would pass the emptiness filter.
df = df.dropna(subset=["id", "mad"]).copy()
df["id"]  = df["id"].astype(str).map(normalize_text)
df["mad"] = df["mad"].astype(str).map(normalize_text)

# Drop empty rows + exact duplicate pairs.
df = df[(df["id"] != "") & (df["mad"] != "")]
df = df.drop_duplicates(subset=["id", "mad"]).reset_index(drop=True)

# ===== Quality filter (keeps the corpus from being overly noisy) =====
id_len  = df["id"].map(tok_len)
mad_len = df["mad"].map(tok_len)
# +1 smoothing keeps the ratio finite and less jumpy for very short sentences.
ratio   = (id_len + 1) / (mad_len + 1)

# Conservative bounds (adjust as needed): token-count limits per side and a
# source/target length ratio between 0.5 and 2.0.
keep = (
    (id_len  >= 3)  & (mad_len >= 3) &
    (id_len  <= 200) & (mad_len <= 220) &
    (ratio >= 0.5) & (ratio <= 2.0)
)

df_clean = df[keep].reset_index(drop=True)

print("Raw rows:", len(raw))
print("After basic clean:", len(df))
print("After filter:", len(df_clean))

# ===== Save =====
df_clean[["id", "mad"]].to_csv(OUT_PATH, index=False)
print("Saved:", OUT_PATH)


Raw rows: 23098
After basic clean: 23032
After filter: 21389
Saved: inmad_clean_v2.csv


In [3]:
# MODEL
# Checkpoint identifier passed to from_pretrained() for both the tokenizer and the model.
MODEL_NAME = "indobenchmark/indobart-v2"


# Preprocessing helpers

In [2]:
def standardize_cols(df: pd.DataFrame) -> pd.DataFrame:
    """Return a two-column copy of ``df`` with the columns renamed to ``id`` / ``mad``.

    Column names are matched case-insensitively. The Indonesian column is the
    first match among ``indonesian, id, indo, source``; the Madurese column is
    the first match among ``madurese, mad, madura, target``.

    Raises:
        ValueError: if either column cannot be found.
    """
    by_lower = {name.lower(): name for name in df.columns}

    def _first_present(aliases):
        # First alias that maps to a (truthy) original column name, else None.
        for alias in aliases:
            found = by_lower.get(alias)
            if found:
                return found
        return None

    id_col = _first_present(("indonesian", "id", "indo", "source"))
    mad_col = _first_present(("madurese", "mad", "madura", "target"))
    if id_col is None or mad_col is None:
        raise ValueError(f"Kolom id/mad tidak ketemu. Kolom yang ada: {list(df.columns)}")

    out = df[[id_col, mad_col]].copy()
    out.columns = ["id", "mad"]
    return out

def fix_mojibake(s: str) -> str:
    """Repair text whose UTF-8 bytes were mis-decoded as Latin-1 / Windows-1252.

    The repair is a strict round trip: the string is re-encoded with the
    suspected wrong codec and decoded as UTF-8. It is accepted only when that
    round trip succeeds without dropping a single character; otherwise the
    input is returned unchanged. (Using ``errors="ignore"`` here would silently
    delete legitimate accented letters such as the Madurese "â".)

    Args:
        s: Value to repair. ``None`` and float NaN (the pandas missing-value
            marker) yield ``""``; any other non-string is converted with ``str``.

    Returns:
        The repaired string, or the (stringified) input when no lossless
        repair is possible.
    """
    # Missing values must not turn into the literal text "nan" / "None".
    if s is None or (isinstance(s, float) and s != s):
        return ""
    if not isinstance(s, str):
        s = str(s)

    # Cheap heuristic: U+00C3 / U+00C2 are the lead characters produced when
    # multi-byte UTF-8 is read as a single-byte codec. Written as escapes so the
    # check does not depend on how this file itself is encoded.
    # U+FFFD is deliberately not a trigger: it cannot be encoded in either
    # codec, so a lossless repair is impossible and the text is left as-is.
    markers = ("\u00C3", "\u00C2")
    if not any(marker in s for marker in markers):
        return s

    for src_enc in ("latin-1", "cp1252"):
        try:
            repaired = s.encode(src_enc).decode("utf-8")
        except (UnicodeEncodeError, UnicodeDecodeError):
            # Not mojibake under this codec (or the text is legitimately accented).
            continue
        if repaired.strip():
            return repaired
    return s
def clean_text(s: str) -> str:
    """Light cleanup: ``fix_mojibake``, zero-width space / BOM -> space, whitespace collapse and trim."""
    text = fix_mojibake(s)
    for invisible in ("\u200b", "\ufeff"):
        text = text.replace(invisible, " ")
    return re.sub(r"\s+", " ", text).strip()

def clean_df(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of ``df``.

    Applies ``clean_text`` to the ``id`` and ``mad`` columns, drops rows where
    either is the empty string, removes duplicate (id, mad) pairs and resets
    the index. The input frame is not modified.
    """
    cleaned = df.copy()
    for col in ("id", "mad"):
        cleaned[col] = cleaned[col].map(clean_text)

    non_empty = (cleaned["id"] != "") & (cleaned["mad"] != "")
    return (
        cleaned[non_empty]
        .drop_duplicates(subset=["id", "mad"])
        .reset_index(drop=True)
    )

def drop_unnamed_cols(df: pd.DataFrame) -> pd.DataFrame:
    """Drop columns whose name starts with "unnamed" (case-insensitive).

    Returns ``df`` itself when there is nothing to drop, otherwise a new frame.
    """
    junk = [col for col in df.columns if str(col).lower().startswith("unnamed")]
    return df.drop(columns=junk) if junk else df

def guess_column(df: pd.DataFrame, candidates):
    """Return the original name of the first column matching any candidate.

    Matching is case-insensitive and candidates are tried in the given order.
    Returns ``None`` when no candidate matches.
    """
    lookup = {col.lower(): col for col in df.columns}
    wanted = (cand.lower() for cand in candidates)
    return next((lookup[key] for key in wanted if key in lookup), None)

# import data

In [4]:

# NusaX splits: read CSV -> standardize column names -> clean.
nusax_train, nusax_valid, nusax_test = (
    clean_df(standardize_cols(pd.read_csv(csv_path)))
    for csv_path in (
        "/kaggle/input/nusaxdata/train.csv",
        "/kaggle/input/nusaxdata/valid.csv",
        "/kaggle/input/nusaxdata/test (1).csv",  # adjust to the name of your test file
    )
)

print(len(nusax_train), len(nusax_valid), len(nusax_test))


500 100 400


In [5]:
# Lexicon file -> columns id, mad -> cleaned.
lex = clean_df(standardize_cols(pd.read_csv("/kaggle/input/nusaxdata/madurese.csv")))

# Placeholder "canonical form" table: every distinct Madurese lexicon entry
# maps to itself (first occurrence wins). No variant folding happens here.
mad2canon = {entry: entry for entry in lex["mad"]}

# Folding spelling variants into one form (e.g. bulâ vs bula') would need extra rules.
# Safe version: only repair the encoding and tidy whitespace.
def normalize_madurese_with_lexicon(text: str) -> str:
    """Return ``clean_text(text)``; the lexicon is not consulted."""
    normalized = clean_text(text)
    return normalized


In [6]:
# Load the cleaned INMAD pairs and apply the safe Madurese normalization.
inmad = pd.read_csv("inmad_clean_v2.csv").assign(
    mad=lambda frame: frame["mad"].map(normalize_madurese_with_lexicon)
)

print("inmad:", len(inmad))


inmad: 21389


In [7]:
# Seeded shuffle of row positions, so the split is reproducible.
rng = np.random.default_rng(42)
idx = np.arange(len(inmad))
rng.shuffle(idx)

valid_frac = 0.05   # 5% of InMad goes to validation
n_valid = max(1, int(len(inmad) * valid_frac))

inmad_valid = inmad.iloc[idx[:n_valid]].reset_index(drop=True)
inmad_train = inmad.iloc[idx[n_valid:]].reset_index(drop=True)

# (optional) if InMad is much larger, cap it so NusaX is not drowned out
max_ratio = 3  # InMad train is at most 3x NusaX train
target_inmad = min(len(inmad_train), max_ratio * len(nusax_train))
inmad_train = inmad_train.sample(n=target_inmad, random_state=42).reset_index(drop=True)

# Tag each frame with its source corpus (optional, useful for domain control).
for _frame, _origin in (
    (nusax_train, "nusax"),
    (nusax_valid, "nusax"),
    (inmad_train, "inmad"),
    (inmad_valid, "inmad"),
):
    _frame["src"] = _origin

train_mix = pd.concat([nusax_train, inmad_train], ignore_index=True)
valid_mix = pd.concat([nusax_valid, inmad_valid], ignore_index=True)

print("train_mix:", len(train_mix), "valid_mix:", len(valid_mix))


train_mix: 2000 valid_mix: 1169


In [9]:
pip install dataset

Collecting dataset
  Downloading dataset-1.6.2-py2.py3-none-any.whl.metadata (1.9 kB)
Collecting sqlalchemy<2.0.0,>=1.3.2 (from dataset)
  Downloading SQLAlchemy-1.4.54-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting banal>=1.0.1 (from dataset)
  Downloading banal-1.0.6-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading dataset-1.6.2-py2.py3-none-any.whl (18 kB)
Downloading banal-1.0.6-py2.py3-none-any.whl (6.1 kB)
Downloading SQLAlchemy-1.4.54-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.6 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m1.6/1.6 MB[0m [31m18.9 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25hInstalling collected packages: banal, sqlalchemy, dataset
  Attempting uninstall: sqlalchemy
    Found existing installation: SQLAlchemy 2.0.41
  

In [None]:
def build_bidirectional(df: pd.DataFrame) -> Dataset:
    """Turn each (id, mad) pair into two prompt/target examples, one per direction.

    For every row the output contains an ``id2mad`` example followed by a
    ``mad2id`` example, each with ``direction``, ``source`` (task prefix +
    input sentence) and ``target`` fields.
    """
    records = []
    for indo, madurese in zip(df["id"], df["mad"]):
        records += [
            {
                "direction": "id2mad",
                "source": "translate Indonesian to Madurese: " + indo,
                "target": madurese,
            },
            {
                "direction": "mad2id",
                "source": "translate Madurese to Indonesian: " + madurese,
                "target": indo,
            },
        ]
    return Dataset.from_pandas(pd.DataFrame(records))

# Bidirectional datasets for the mixed train/valid sets and the NusaX test set.
train_ds, valid_ds, test_ds = (
    build_bidirectional(frame) for frame in (train_mix, valid_mix, nusax_test)
)

# Show the first pair of examples (one per direction).
train_ds[0], train_ds[1]


# model

In [16]:
from indobenchmark import IndoNLGTokenizer
# Tokenizer and seq2seq model are both loaded from the MODEL_NAME checkpoint.
tokenizer = IndoNLGTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)


In [17]:
# >>> ADDED: patch so IndoNLGTokenizer.pad() tolerates the keyword arguments
# passed by newer transformers versions.
# Guarded by a marker attribute so re-running this cell does not wrap the
# wrapper again.
if not getattr(tokenizer.pad, "_pad_compat_patched", False):
    _orig_pad = tokenizer.pad

    def _pad_compat(encoded_inputs, *args, **kwargs):
        """Forward to the original ``pad`` after discarding unsupported kwargs."""
        # Per the original author's note these arguments crash the older
        # IndoNLGTokenizer.pad implementation.
        kwargs.pop("padding_side", None)
        kwargs.pop("return_tensor", None)  # legacy / misspelled variant seen in some callers
        return _orig_pad(encoded_inputs, *args, **kwargs)

    _pad_compat._pad_compat_patched = True
    tokenizer.pad = _pad_compat


In [18]:
# Truncation limits (in tokens) for the prompt and the reference translation.
MAX_LEN_SRC = 128
MAX_LEN_TGT = 128

def tokenize_batch(batch):
    """Tokenize a batch of ``source`` / ``target`` strings for seq2seq training.

    Returns the tokenizer output for the sources with an extra ``labels`` entry
    holding the ``input_ids`` of the tokenized targets. Padding is not applied
    here.

    NOTE(review): the sample generations shown later in this notebook keep
    producing text after the translation is complete. It may be worth checking
    whether IndoNLGTokenizer needs a dedicated target/label encoding so that
    labels end with the end-of-sequence token — TODO confirm against the
    indobenchmark toolkit documentation.
    """
    encoded = tokenizer(batch["source"], truncation=True, max_length=MAX_LEN_SRC)
    target_ids = tokenizer(
        batch["target"], truncation=True, max_length=MAX_LEN_TGT
    )["input_ids"]
    encoded["labels"] = target_ids
    return encoded

# Tokenize every split, keeping only the tokenizer outputs (original text columns are removed).
train_tok, valid_tok, test_tok = (
    split.map(tokenize_batch, batched=True, remove_columns=split.column_names)
    for split in (train_ds, valid_ds, test_ds)
)

# Pads per batch; label padding uses -100.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, label_pad_token_id=-100)


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

In [19]:
bleu = evaluate.load("sacrebleu")
rouge = evaluate.load("rouge")

def compute_metrics(eval_preds):
    """Compute BLEU, ROUGE-1 and ROUGE-L from a ``(predictions, labels)`` pair.

    Args:
        eval_preds: Two-item sequence ``(preds, labels)``. ``preds`` may be a
            tuple (its first element is used) and may be either token ids or
            rank-3 scores, in which case the arg-max over the last axis is taken.
            ``-100`` entries in either array are treated as padding.

    Returns:
        dict with keys ``bleu``, ``rouge1`` and ``rougeL``.
    """
    preds, labels = eval_preds

    # >>> ADDED: preds is sometimes a tuple
    if isinstance(preds, tuple):
        preds = preds[0]

    preds = np.asarray(preds)
    if preds.ndim == 3:
        # Rank-3 input is assumed to be per-token vocabulary scores — reduce to ids.
        preds = preds.argmax(axis=-1)

    pad_id = tokenizer.pad_token_id
    # -100 is not a valid token id; swap it for the pad id before decoding.
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    # Decode sequence by sequence: the other cells of this notebook avoid
    # batch_decode because IndoNLGTokenizer.decode rejects the kwargs it forwards.
    pred_texts = [tokenizer.decode(ids.tolist(), skip_special_tokens=True) for ids in preds]
    ref_texts = [tokenizer.decode(ids.tolist(), skip_special_tokens=True) for ids in labels]

    bleu_score = bleu.compute(predictions=pred_texts, references=[[r] for r in ref_texts])["score"]
    rouge_score = rouge.compute(predictions=pred_texts, references=ref_texts)

    return {
        "bleu": bleu_score,
        "rouge1": rouge_score["rouge1"],
        "rougeL": rouge_score["rougeL"]
    }


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

In [24]:
import transformers, sys
# Record the library and interpreter versions next to the run.
print("transformers version:", transformers.__version__)
print("python:", sys.version)

# Directory passed to TrainingArguments as output_dir.
OUTPUT_DIR = "./indobenchmark-indobart-v2"

# NOTE(review): a later cell in this notebook rebuilds `training_args` with the same values.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    save_steps=10**9,        # effectively never saves a checkpoint
    save_total_limit=1,
    logging_steps=100,
    learning_rate=3e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=1,
    num_train_epochs=10,
    fp16=True,
    report_to="none",
    prediction_loss_only=True,
)

transformers version: 4.57.3
python: 3.11.13 (main, Jun  4 2025, 08:57:29) [GCC 11.4.0]


In [None]:
import transformers, sys
# Record the library and interpreter versions next to the run.
print("transformers version:", transformers.__version__)
print("python:", sys.version)

# Directory passed to TrainingArguments and trainer.save_model below.
OUTPUT_DIR = "./indobenchmark-indobart-v2"

# ======================
# 1) PATCH: stop the tokenizer from being auto-saved
#    (per the original note, IndoNLGTokenizer does not support save_vocabulary)
# ======================
def _noop_save_pretrained(*args, **kwargs):
    """Accept any arguments, do nothing, and return an empty tuple."""
    return tuple()

# Replace both save hooks on this tokenizer instance with do-nothing callables.
for _hook_name, _hook in (
    ("save_pretrained", _noop_save_pretrained),
    ("save_vocabulary", lambda *args, **kwargs: ()),
):
    setattr(tokenizer, _hook_name, _hook)

# ======================
# 2) TrainingArguments (safe across versions)
# ======================
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    save_steps=10**9,         # effectively never saves a checkpoint
    save_total_limit=1,
    logging_steps=100,

    learning_rate=3e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=1,
    num_train_epochs=10,

    fp16=True,
    report_to="none",
    prediction_loss_only=True,
)

# ======================
# 3) BLEU per epoch callback + AVG + summary
# ======================
import torch
import sacrebleu
from transformers import TrainerCallback

# Shared log: written by BleuEachEpochCallback.on_epoch_end, read by BleuAvgSummaryCallback.on_train_end.
BLEU_LOG = {}  # {epoch_int: {"id2mad":..., "mad2id":..., "avg":...}}

def _epoch_key(state):
    if state.epoch is None:
        return int(getattr(state, "global_step", 0))
    return int(state.epoch)

class BleuEachEpochCallback(TrainerCallback):
    """Print corpus BLEU for one translation direction at the end of every epoch.

    The first ``n_samples`` rows of ``valid_df`` are translated with beam
    search and scored with ``sacrebleu.corpus_bleu``. Scores are stored in the
    module-level ``BLEU_LOG`` under the epoch key; once both directions are
    present for an epoch their average is stored and printed too.

    Args:
        tokenizer: Tokenizer used to encode prompts and decode generations.
        valid_df: DataFrame with ``id`` and ``mad`` columns.
        direction: ``"id2mad"``; any other value is treated as Madurese -> Indonesian.
        n_samples: Number of leading rows of ``valid_df`` to evaluate.
        max_len_src: Truncation length for the encoded prompt.
        max_new_tok: ``max_new_tokens`` passed to ``generate``.
        batch_size: Number of prompts per ``generate`` call.
        num_beams: Beam width passed to ``generate``.
    """

    def __init__(self, tokenizer, valid_df, direction="id2mad", n_samples=100,
                 max_len_src=128, max_new_tok=128, batch_size=8, num_beams=4):
        self.tokenizer = tokenizer
        self.valid_df = valid_df
        self.direction = direction
        self.n_samples = n_samples
        self.max_len_src = max_len_src
        self.max_new_tok = max_new_tok
        self.batch_size = batch_size
        self.num_beams = num_beams

    def on_epoch_end(self, args, state, control, **kwargs):
        """Generate translations for the sample, score them and log the BLEU."""
        model = kwargs["model"]
        # Send inputs wherever the model actually lives, rather than assuming
        # CUDA just because it is available.
        device = next(model.parameters()).device
        was_training = model.training

        df = self.valid_df.head(self.n_samples)

        if self.direction == "id2mad":
            sources = ["translate Indonesian to Madurese: " + x for x in df["id"].tolist()]
            refs = df["mad"].tolist()
        else:
            sources = ["translate Madurese to Indonesian: " + x for x in df["mad"].tolist()]
            refs = df["id"].tolist()

        # Nothing to score (empty frame or n_samples == 0).
        if not sources:
            return control

        model.eval()
        preds = []
        try:
            with torch.no_grad():
                for i in range(0, len(sources), self.batch_size):
                    batch = sources[i:i+self.batch_size]
                    enc = self.tokenizer(
                        batch,
                        return_tensors="pt",
                        padding=True,
                        truncation=True,
                        max_length=self.max_len_src
                    )
                    enc = {k: v.to(device) for k, v in enc.items()}
                    out = model.generate(**enc, max_new_tokens=self.max_new_tok, num_beams=self.num_beams)

                    # FIX: IndoNLGTokenizer.decode does not accept the kwargs
                    # forwarded by batch_decode, so decode one sequence at a time.
                    preds.extend([self.tokenizer.decode(o, skip_special_tokens=True) for o in out])
        finally:
            # Restore whatever mode the model was in, even if generation failed.
            model.train(was_training)

        bleu = sacrebleu.corpus_bleu(preds, [refs]).score

        ep = _epoch_key(state)
        BLEU_LOG.setdefault(ep, {})
        BLEU_LOG[ep][self.direction] = bleu

        # Print the average once both directions have been logged for this epoch.
        if "id2mad" in BLEU_LOG[ep] and "mad2id" in BLEU_LOG[ep]:
            avg_bleu = (BLEU_LOG[ep]["id2mad"] + BLEU_LOG[ep]["mad2id"]) / 2.0
            BLEU_LOG[ep]["avg"] = avg_bleu
            print(
                f"\nüèÜ Epoch {ep} | ID2MAD BLEU@{self.n_samples}: {BLEU_LOG[ep]['id2mad']:.2f} | "
                f"MAD2ID BLEU@{self.n_samples}: {BLEU_LOG[ep]['mad2id']:.2f} | "
                f"AVG: {avg_bleu:.2f}\n"
            )
        else:
            print(f"\nüèÜ Epoch {ep} | {self.direction.upper()} BLEU@{self.n_samples}: {bleu:.2f}\n")

        return control

class BleuAvgSummaryCallback(TrainerCallback):
    """At the end of training, print the per-epoch BLEU table collected in ``BLEU_LOG``."""

    def on_train_end(self, args, state, control, **kwargs):
        """Print one line per logged epoch plus the mean of the per-epoch averages."""
        if not BLEU_LOG:
            print("\n‚ö†Ô∏è BLEU_LOG kosong (tidak ada BLEU yang tercatat)\n")
            return control

        missing = float("nan")
        epochs = sorted(BLEU_LOG)

        print("\n==============================")
        print("üìå RINGKASAN BLEU PER EPOCH (AVG)")
        print("==============================")
        for ep in epochs:
            rec = BLEU_LOG[ep]
            # Directions that were never logged for this epoch print as nan.
            id2, m2i, avg = (rec.get(field, missing) for field in ("id2mad", "mad2id", "avg"))
            print(f"Epoch {ep}: ID2MAD={id2:.2f} | MAD2ID={m2i:.2f} | AVG={avg:.2f}")

        avgs = [BLEU_LOG[ep]["avg"] for ep in epochs if "avg" in BLEU_LOG[ep]]
        if avgs:
            overall = sum(avgs) / len(avgs)
            print("------------------------------")
            print(f"‚úÖ Overall AVG BLEU across epochs: {overall:.2f}")
        print("==============================\n")
        return control

# Build one callback per direction plus the end-of-training summary.
_bleu_cb_kwargs = dict(
    tokenizer=tokenizer,
    valid_df=valid_mix,
    n_samples=100,
    max_len_src=128,
    max_new_tok=128,
    batch_size=8,
    num_beams=4,
)

bleu_cb_id2mad = BleuEachEpochCallback(direction="id2mad", **_bleu_cb_kwargs)
bleu_cb_mad2id = BleuEachEpochCallback(direction="mad2id", **_bleu_cb_kwargs)

bleu_summary = BleuAvgSummaryCallback()

# ======================
# 4) Trainer + train
# ======================
# The two BLEU callbacks run at every epoch end; the summary callback runs once at train end.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tok,
    eval_dataset=valid_tok,
    data_collator=data_collator,
    callbacks=[bleu_cb_id2mad, bleu_cb_mad2id, bleu_summary]
)

trainer.train()

# ======================
# 5) Save model (the tokenizer is not saved)
# ======================
trainer.save_model(OUTPUT_DIR)
print("‚úÖ Model disimpan ke:", OUTPUT_DIR)
print("‚ÑπÔ∏è Tokenizer tidak disimpan (pakai tokenizer bawaan indobenchmark/indobart-v2).")

# Read defensively with getattr; either value prints as None when the attribute is absent or unset.
best_ckpt = getattr(trainer.state, "best_model_checkpoint", None)
best_metric = getattr(trainer.state, "best_metric", None)
print("Best checkpoint:", best_ckpt)
print("Best metric:", best_metric)


You are adding a <class '__main__.BleuEachEpochCallback'> to the callbacks of this Trainer, but there is already one. The currentlist of callbacks is
:DefaultFlowCallback
BleuEachEpochCallback


transformers version: 4.57.3
python: 3.11.13 (main, Jun  4 2025, 08:57:29) [GCC 11.4.0]




Step,Training Loss
100,0.0185
200,0.032
300,0.0334
400,0.0316
500,0.0262
600,0.0291
700,0.0243
800,0.0216
900,0.0212
1000,0.0212



üèÜ Epoch 1 | ID2MAD BLEU@100: 8.08


üèÜ Epoch 1 | ID2MAD BLEU@100: 8.08 | MAD2ID BLEU@100: 17.94 | AVG: 13.01


üèÜ Epoch 2 | ID2MAD BLEU@100: 8.50


üèÜ Epoch 2 | ID2MAD BLEU@100: 8.50 | MAD2ID BLEU@100: 16.89 | AVG: 12.70


üèÜ Epoch 3 | ID2MAD BLEU@100: 7.57


üèÜ Epoch 3 | ID2MAD BLEU@100: 7.57 | MAD2ID BLEU@100: 14.34 | AVG: 10.96


üèÜ Epoch 4 | ID2MAD BLEU@100: 8.79


üèÜ Epoch 4 | ID2MAD BLEU@100: 8.79 | MAD2ID BLEU@100: 15.56 | AVG: 12.18


üèÜ Epoch 5 | ID2MAD BLEU@100: 8.64


üèÜ Epoch 5 | ID2MAD BLEU@100: 8.64 | MAD2ID BLEU@100: 14.98 | AVG: 11.81


üèÜ Epoch 6 | ID2MAD BLEU@100: 8.65


üèÜ Epoch 6 | ID2MAD BLEU@100: 8.65 | MAD2ID BLEU@100: 14.83 | AVG: 11.74


üèÜ Epoch 7 | ID2MAD BLEU@100: 8.08


üèÜ Epoch 7 | ID2MAD BLEU@100: 8.08 | MAD2ID BLEU@100: 14.21 | AVG: 11.15


üèÜ Epoch 8 | ID2MAD BLEU@100: 9.01


üèÜ Epoch 8 | ID2MAD BLEU@100: 9.01 | MAD2ID BLEU@100: 13.65 | AVG: 11.33


üèÜ Epoch 9 | ID2MAD BLEU@100: 8.61


üèÜ Epoch 9 | ID2MAD BLEU@100: 

In [31]:
# Best-effort tokenizer save: any exception is printed and otherwise ignored.
# NOTE(review): the training cell replaces tokenizer.save_pretrained with a no-op,
# so once that cell has run this call neither saves anything nor raises.
try:
    tokenizer.save_pretrained(OUTPUT_DIR)
except Exception as e:
    print("Tokenizer tidak bisa disave dengan save_pretrained (aman di-skip):", repr(e))

In [None]:
# ======================
# CUSTOM EVALUATION (PRINT + BLEU + EXAMPLES)
# ======================

from tqdm.auto import tqdm
import sacrebleu
import torch

print("üìÇ Memuat model...")
device = "cuda" if torch.cuda.is_available() else "cpu"
model = trainer.model.to(device)
model.eval()

TEST_PATH = "/kaggle/input/nusaxdata/test (1).csv"

print(f"üìÇ Membaca data tes: {TEST_PATH}")
test_df = pd.read_csv(TEST_PATH)
test_df = drop_unnamed_cols(test_df)

id_col = guess_column(test_df, ["ind", "id", "indo", "indonesian"])
mad_col = guess_column(test_df, ["mad", "madurese", "madura"])
# guess_column returns None when nothing matches; fail with a readable
# message instead of a KeyError on [None, None].
if id_col is None or mad_col is None:
    raise ValueError(
        f"Could not find the Indonesian/Madurese columns in {list(test_df.columns)}"
    )
test_df = test_df[[id_col, mad_col]].rename(columns={id_col: "id", mad_col: "mad"})

test_df["id"]  = test_df["id"].apply(clean_text)
test_df["mad"] = test_df["mad"].apply(clean_text)

# Evaluate only the first N sentences to keep the cell fast.
N = 100
test_df = test_df.head(N)
print(f"‚úÖ Menguji pada {len(test_df)} kalimat pertama.")
print("üöÄ Mulai Menerjemahkan...")

sources = ["translate Indonesian to Madurese: " + x for x in test_df["id"].tolist()]
refs    = test_df["mad"].tolist()

preds = []
batch_size = 8

for i in tqdm(range(0, len(sources), batch_size)):
    batch = sources[i:i+batch_size]
    enc = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=128)
    enc = {k: v.to(device) for k, v in enc.items()}
    with torch.no_grad():
        out = model.generate(**enc, max_new_tokens=128, num_beams=4)

    # Decode one sequence at a time (batch_decode is avoided for IndoNLGTokenizer).
    preds.extend([tokenizer.decode(o, skip_special_tokens=True) for o in out])

# Stored under its own name so the global `bleu` metric object loaded earlier
# (and used by compute_metrics) is not overwritten with a float.
test_bleu = sacrebleu.corpus_bleu(preds, [refs]).score

print("\n==============================")
print(f"üèÜ REAL BLEU SCORE: {test_bleu:.2f}")
print("==============================\n")

print("üîç 5 CONTOH HASIL:")
for i in range(min(5, len(test_df))):
    print(f"üáÆüá© Indo  : {test_df.iloc[i]['id']}")
    print(f"ü§ñ Model : {preds[i]}")
    print(f"üîë Kunci : {refs[i]}")
    print("-" * 20)


üìÇ Memuat model...
üìÇ Membaca data tes: /kaggle/working/nusax/test.csv
‚úÖ Menguji pada 100 kalimat pertama.
üöÄ Mulai Menerjemahkan...


  0%|          | 0/13 [00:00<?, ?it/s]


üèÜ REAL BLEU SCORE: 8.81

üîç 5 CONTOH HASIL:
üáÆüá© Indo  : Dekat dengan hotel saya menginap, hanya ditempuh jalan kaki, di sini banyak sekali pilihan makanannya, tempat yang luas, dan menyenangkan
ü§ñ Model :  semma' bik hotel engkok nginep, pera' ditempuh jelen kaki, e diye bennyak sarah pelean kakananna, kennengngan se leber, ben masenneng sarah. aminnih. entara bik hotel riya. bhuktena se lebbi mude... tapegha. bannya' sarah.. bhaghus.. ban mapegghel.. bhekal..ghau..ha..ah..adeh..kadherse.adek..egghu.regghu bik hotel.raddhuk
üîë Kunci : Semmak bik hotel engkok nginep, pera' ejeleni ajelen soko, ediye bennyak sarah pelean kakananna, kenengngan se leber, ben masenneng
--------------------
üáÆüá© Indo  : Iya benar, dia sedang jaga warung.
ü§ñ Model :  iye bhender, engkok bik selaen jaga warung. bhenderre' sarah, bekna bherseh.'. bhuktena bhender.'egus..'raddhis.' bhuk.'ade'.. bhekallis.'adeh..raddhuk. bhegusse..regghu..ajherse.ajhek.ajraddhik.. deddhi bhegghik.ajik.dhik en

In [None]:
import torch
# Use the GPU when one is available, otherwise the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Evaluate the model held by the trainer, in eval mode.
model = trainer.model.to(device)
model.eval()

# Prompt truncation length and generation budget used by generate_batch / translate.
MAX_LEN_SRC = 128
MAX_NEW_TOK = 128

def generate_batch(sources, batch_size=8):
    """Translate ``sources`` in chunks of ``batch_size`` and return the decoded strings.

    Uses the module-level ``tokenizer``, ``model`` and ``device``; prompts are
    truncated to ``MAX_LEN_SRC`` tokens and generation uses beam search with
    4 beams and at most ``MAX_NEW_TOK`` new tokens.
    """
    preds = []
    for start in range(0, len(sources), batch_size):
        chunk = sources[start:start + batch_size]
        enc = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True, max_length=MAX_LEN_SRC)
        enc = {name: tensor.to(device) for name, tensor in enc.items()}
        with torch.no_grad():
            generated = model.generate(**enc, max_new_tokens=MAX_NEW_TOK, num_beams=4)

        # Decode per sequence; batch_decode is avoided for IndoNLGTokenizer.
        for seq in generated:
            preds.append(tokenizer.decode(seq, skip_special_tokens=True))

    return preds

import evaluate
bleu = evaluate.load("sacrebleu")
rouge = evaluate.load("rouge")

def score(preds, refs):
    """Return BLEU, ROUGE-1 and ROUGE-L for ``preds`` against single references ``refs``."""
    wrapped_refs = [[ref] for ref in refs]  # the BLEU metric takes a list of references per prediction
    bleu_value = bleu.compute(predictions=preds, references=wrapped_refs)["score"]
    rouge_values = rouge.compute(predictions=preds, references=refs)
    return {
        "BLEU": bleu_value,
        "ROUGE-1": rouge_values["rouge1"],
        "ROUGE-L": rouge_values["rougeL"],
    }

# Sources and references are taken from the SAME frame for each split.
# (`valid_clean` / `test_clean` are not defined anywhere in this notebook, so
# the cell failed on a fresh kernel and could pair sources with the wrong refs.)

# VALID
src_id2mad = ["translate Indonesian to Madurese: " + x for x in valid_mix["id"].tolist()]
ref_id2mad = valid_mix["mad"].tolist()
pred_id2mad = generate_batch(src_id2mad)
print("VALID ID ‚Üí MAD:", score(pred_id2mad, ref_id2mad))

src_mad2id = ["translate Madurese to Indonesian: " + x for x in valid_mix["mad"].tolist()]
ref_mad2id = valid_mix["id"].tolist()
pred_mad2id = generate_batch(src_mad2id)
print("VALID MAD ‚Üí ID:", score(pred_mad2id, ref_mad2id))

# TEST
src_id2mad = ["translate Indonesian to Madurese: " + x for x in nusax_test["id"].tolist()]
ref_id2mad = nusax_test["mad"].tolist()
pred_id2mad = generate_batch(src_id2mad)
print("TEST ID ‚Üí MAD:", score(pred_id2mad, ref_id2mad))

src_mad2id = ["translate Madurese to Indonesian: " + x for x in nusax_test["mad"].tolist()]
ref_mad2id = nusax_test["id"].tolist()
pred_mad2id = generate_batch(src_mad2id)
print("TEST MAD ‚Üí ID:", score(pred_mad2id, ref_mad2id))


VALID ID ‚Üí MAD: {'BLEU': 8.52114696833872, 'ROUGE-1': 0.3320086378972633, 'ROUGE-L': 0.3141487554458656}
VALID MAD ‚Üí ID: {'BLEU': 14.100656809525388, 'ROUGE-1': 0.45444854182740113, 'ROUGE-L': 0.4439560797590716}
TEST ID ‚Üí MAD: {'BLEU': 9.082167730131204, 'ROUGE-1': 0.3410521121564547, 'ROUGE-L': 0.32573339839176685}
TEST MAD ‚Üí ID: {'BLEU': 14.444095656747404, 'ROUGE-1': 0.4696922027914887, 'ROUGE-L': 0.45547109612726866}


In [37]:
def translate(text: str, direction="id2mad"):
    """Translate one sentence with the module-level model.

    Args:
        text: Sentence to translate; surrounding whitespace is stripped.
        direction: ``"id2mad"`` or ``"mad2id"``.

    Returns:
        The decoded first generated sequence.

    Raises:
        ValueError: if ``direction`` is neither of the two supported values.
    """
    text = text.strip()

    prefixes = (
        ("id2mad", "translate Indonesian to Madurese: "),
        ("mad2id", "translate Madurese to Indonesian: "),
    )
    prefix = next((p for key, p in prefixes if direction == key), None)
    if prefix is None:
        raise ValueError("direction harus 'id2mad' atau 'mad2id'")
    src = prefix + text

    enc = tokenizer(src, return_tensors="pt", truncation=True, max_length=MAX_LEN_SRC).to(device)
    with torch.no_grad():
        generated = model.generate(**enc, max_new_tokens=MAX_NEW_TOK, num_beams=4)
    return tokenizer.decode(generated[0], skip_special_tokens=True)

# Interactive demo. Type "q" (or send EOF / interrupt) to stop.
while True:
    try:
        # Lower-cased so "ID2MAD" or "Q" are accepted as well.
        direction = input("\nPilih arah (id2mad / mad2id) atau ketik q: ").strip().lower()
        if direction == "q":
            break
        # Validate before asking for the text, and re-prompt instead of
        # letting translate() raise and end the session.
        if direction not in ("id2mad", "mad2id"):
            print("direction harus 'id2mad' atau 'mad2id'")
            continue
        text = input("Masukkan teks: ").strip()
    except (EOFError, KeyboardInterrupt):
        # No interactive stdin (e.g. a batch run) or the user interrupted.
        break
    print("Hasil:", translate(text, direction=direction))



Pilih arah (id2mad / mad2id) atau ketik q:  id2mad
Masukkan teks:  aku mau makan nasi goreng


Hasil:  engkok terro ngakan nasek ghuring ghuringnga. bhuktena engkok bhender.gggh.gghu.gha. bhallis.bbhina.bhina engkok.ggha.bherse.bhei.beh.begghu engkok bik bekna.bik.gik.regghu bi' bi'.reghi engkok ghik.bi'.bi...bigus..regik.. engkokregghuk.. lebbi engkok lebbi nyaman..raddhuk.



Pilih arah (id2mad / mad2id) atau ketik q:  mad2id
Masukkan teks:  engkok terro ngakan nasek ghuring


Hasil:  saya ingin makan nasi goreng pedas. tidak rekomendasi. tidak direkomendasikan. tidakenak.



Pilih arah (id2mad / mad2id) atau ketik q:  q
