In [1]:
!pip install -U transformers accelerate evaluate sacrebleu rouge-score sentencepiece


Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading col

In [22]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"


# import

In [None]:
import os, re, unicodedata
import numpy as np
import pandas as pd

from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback
)
import evaluate





# prepro

In [1]:
import pandas as pd
import re
import unicodedata
import numpy as np

IN_PATH  = "/kaggle/input/inmad-dataset/INMAD Dataset.csv"
OUT_PATH = "inmad_clean_v2.csv"

def fix_mojibake(s: str) -> str:
    if not isinstance(s, str):
        s = "" if s is None else str(s)

    # heuristik sederhana: kalau ada Ã/Â/� biasanya mojibake
    if any(ch in s for ch in ["Ã", "Â", "�", "\uFFFD"]):
        for src_enc in ["latin-1", "cp1252"]:
            try:
                s2 = s.encode(src_enc, errors="ignore").decode("utf-8", errors="ignore")
                if len(s2.strip()) > 0:
                    s = s2
                    break
            except Exception:
                pass
    return s

def normalize_text(s: str) -> str:
    s = fix_mojibake(s)
    s = unicodedata.normalize("NFKC", s)

    # hapus control chars
    s = re.sub(r"[\u0000-\u001F\u007F-\u009F]", " ", s)
    s = s.replace("\u200b", " ").replace("\ufeff", " ")

    # normalisasi kutip/apostrof
    s = (s.replace("’","'").replace("‘","'").replace("´","'").replace("`","'")
           .replace("“",'"').replace("”",'"'))

    # normalisasi dash dan ellipsis
    s = s.replace("–","-").replace("—","-").replace("−","-")
    s = s.replace("…","...")

    # rapikan spasi
    s = re.sub(r"\s+", " ", s).strip()

    # hilangkan spasi sebelum tanda baca: " ,", " .", dst
    s = re.sub(r"\s+([,.;:!?])", r"\1", s)

    # pastikan ada spasi setelah tanda baca jika langsung diikuti huruf/angka
    s = re.sub(r"([,;:!?])([A-Za-z0-9])", r"\1 \2", s)
    s = re.sub(r"(\.)([A-Za-z])", r"\1 \2", s)  # ".kata" -> ". kata"

    # rapikan kurung/bracket
    s = re.sub(r"\(\s+", "(", s)
    s = re.sub(r"\s+\)", ")", s)
    s = re.sub(r"\[\s+", "[", s)
    s = re.sub(r"\s+\]", "]", s)

    # collapse multi punctuation
    s = re.sub(r"([!?])\1{1,}", r"\1", s)
    s = re.sub(r"\.{4,}", "...", s)

    return s

def tok_len(s: str) -> int:
    return len(re.findall(r"\S+", str(s)))

# ===== Load =====
raw = pd.read_csv(IN_PATH)

# Ambil kolom yang kita butuhkan: Indonesia & Madura (buang English)
df = raw.rename(columns={"Indonesia":"id", "Madura":"mad"}).copy()
df["id"]  = df["id"].astype(str).map(normalize_text)
df["mad"] = df["mad"].astype(str).map(normalize_text)

# drop kosong + dedup
df = df[(df["id"] != "") & (df["mad"] != "")]
df = df.drop_duplicates(subset=["id","mad"]).reset_index(drop=True)

# ===== Filter kualitas (biar tidak over-noisy) =====
id_len  = df["id"].map(tok_len)
mad_len = df["mad"].map(tok_len)
ratio   = (id_len + 1) / (mad_len + 1)

# batas aman (kamu bisa adjust)
keep = (
    (id_len  >= 3)  & (mad_len >= 3) &
    (id_len  <= 200) & (mad_len <= 220) &
    (ratio >= 0.5) & (ratio <= 2.0)
)

df_clean = df[keep].reset_index(drop=True)

print("Raw rows:", len(raw))
print("After basic clean:", len(df))
print("After filter:", len(df_clean))

# ===== Save =====
df_clean[["id","mad"]].to_csv(OUT_PATH, index=False)
print("Saved:", OUT_PATH)


Raw rows: 23098
After basic clean: 23032
After filter: 21389
Saved: inmad_clean_v2.csv


## prepro all

In [3]:
def standardize_cols(df: pd.DataFrame) -> pd.DataFrame:
    cols = {c.lower(): c for c in df.columns}
    id_col  = cols.get("indonesian") or cols.get("id") or cols.get("indo") or cols.get("source")
    mad_col = cols.get("madurese") or cols.get("mad") or cols.get("madura") or cols.get("target")
    if id_col is None or mad_col is None:
        raise ValueError(f"Kolom id/mad tidak ketemu. Kolom yang ada: {list(df.columns)}")
    out = df[[id_col, mad_col]].copy()
    out.columns = ["id", "mad"]
    return out

def fix_mojibake(s: str) -> str:
    if not isinstance(s, str):
        s = "" if s is None else str(s)

    # heuristik sederhana: kalau ada Ã/Â/� biasanya mojibake
    if any(ch in s for ch in ["Ã", "Â", "�", "\uFFFD"]):
        for src_enc in ["latin-1", "cp1252"]:
            try:
                s2 = s.encode(src_enc, errors="ignore").decode("utf-8", errors="ignore")
                if len(s2.strip()) > 0:
                    s = s2
                    break
            except Exception:
                pass
    return s
def clean_text(s: str) -> str:
    s = fix_mojibake(s)
    s = s.replace("\u200b", " ").replace("\ufeff", " ")
    s = re.sub(r"\s+", " ", s).strip()
    return s

def clean_df(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df["id"]  = df["id"].map(clean_text)
    df["mad"] = df["mad"].map(clean_text)
    df = df[(df["id"] != "") & (df["mad"] != "")]
    df = df.drop_duplicates(subset=["id","mad"]).reset_index(drop=True)
    return df


### import data nusax

In [4]:

nusax_train = clean_df(standardize_cols(pd.read_csv("/kaggle/input/nusaxdata/train.csv")))
nusax_valid = clean_df(standardize_cols(pd.read_csv("/kaggle/input/nusaxdata/valid.csv")))
nusax_test  = clean_df(standardize_cols(pd.read_csv("/kaggle/input/nusaxdata/test (1).csv")))  # sesuaikan nama file test kamu

print(len(nusax_train), len(nusax_valid), len(nusax_test))


500 100 400


In [5]:
lex = pd.read_csv("/kaggle/input/nusaxdata/madurese.csv")  # file lexicon
lex = standardize_cols(lex)        # jadi id, mad
lex = clean_df(lex)

# bikin mapping mad->mad "kanonik" berbasis bentuk yang paling sering / paling pendek
# (ini sederhana tapi efektif untuk merapikan variasi ejaan)
mad2canon = {}
for _, r in lex.iterrows():
    m = r["mad"]
    # pilih bentuk canon = bentuk yang "paling clean" (panjang paling pendek)
    if m not in mad2canon:
        mad2canon[m] = m

# kalau kamu mau mapping variasi ke satu bentuk (misal bulâ vs bula'), kamu butuh aturan tambahan.
# Untuk versi aman: kita pakai normalisasi karakter saja + perbaiki mojibake.
def normalize_madurese_with_lexicon(text: str) -> str:
    # perbaiki encoding & rapikan spasi (yang paling aman)
    return clean_text(text)


In [6]:
inmad = pd.read_csv("inmad_clean_v2.csv")


# normalisasi madurese pakai fungsi lexicon (safe)
inmad["mad"] = inmad["mad"].map(normalize_madurese_with_lexicon)

print("inmad:", len(inmad))


inmad: 21389


In [7]:
rng = np.random.default_rng(42)
idx = np.arange(len(inmad))
rng.shuffle(idx)

valid_frac = 0.05   # 5% valid dari InMad
n_valid = max(1, int(len(inmad) * valid_frac))

inmad_valid = inmad.iloc[idx[:n_valid]].reset_index(drop=True)
inmad_train = inmad.iloc[idx[n_valid:]].reset_index(drop=True)

# (opsional) kalau InMad jauh lebih besar, batasi rasio biar NusaX nggak ketimbun
max_ratio = 3  # InMad train max 3x NusaX train
target_inmad = min(len(inmad_train), max_ratio * len(nusax_train))
inmad_train = inmad_train.sample(n=target_inmad, random_state=42).reset_index(drop=True)

# tag sumber (optional tapi bagus buat kontrol domain)
nusax_train["src"] = "nusax"
nusax_valid["src"] = "nusax"
inmad_train["src"] = "inmad"
inmad_valid["src"] = "inmad"

train_mix = pd.concat([nusax_train, inmad_train], ignore_index=True)
valid_mix = pd.concat([nusax_valid, inmad_valid], ignore_index=True)

print("train_mix:", len(train_mix), "valid_mix:", len(valid_mix))


train_mix: 2000 valid_mix: 1169


In [None]:
def build_bidir(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    # ID -> MAD
    a = pd.DataFrame({
        "src_text": df.apply(lambda r: f"translate Indonesian to Madurese [{r['src']}]: {r['id']}", axis=1),
        "tgt_text": df["mad"].tolist()
    })
    # MAD -> ID
    b = pd.DataFrame({
        "src_text": df.apply(lambda r: f"translate Madurese to Indonesian [{r['src']}]: {r['mad']}", axis=1),
        "tgt_text": df["id"].tolist()
    })
    out = pd.concat([a, b], ignore_index=True)
    out = out.drop_duplicates(subset=["src_text","tgt_text"]).reset_index(drop=True)
    return out

train_bi = build_bidir(train_mix)
valid_bi = build_bidir(valid_mix)



# model

In [None]:
MODEL_NAME = "indonlp/cendol-mt5-small-inst"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/330 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/16.3M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/74.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/808 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.23G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
MAX_LEN_SRC = 128
MAX_LEN_TGT = 128

def tokenize_df(df: pd.DataFrame):
    src = df["src_text"].tolist()
    tgt = df["tgt_text"].tolist()

    model_inputs = tokenizer(
        src, max_length=MAX_LEN_SRC, truncation=True, padding=False
    )

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            tgt, max_length=MAX_LEN_TGT, truncation=True, padding=False
        )["input_ids"]

    model_inputs["labels"] = labels
    return model_inputs

train_tok = tokenize_df(train_bi)
valid_tok = tokenize_df(valid_bi)

# bungkus jadi dataset sederhana
class SimpleDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.enc = encodings
    def __len__(self):
        return len(self.enc["input_ids"])
    def __getitem__(self, idx):
        return {k: torch.tensor(v[idx]) for k, v in self.enc.items()}

train_ds = SimpleDataset(train_tok)
valid_ds = SimpleDataset(valid_tok)


In [None]:
OUTPUT_DIR = "./cendol_mt5_id_mad"

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,

    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=3,

    learning_rate=3e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=1,
    num_train_epochs=10,          

    fp16=True,
    logging_steps=100,
    report_to="none",

    prediction_loss_only=True,      

    load_best_model_at_end=True,
    metric_for_best_model="loss",    
    greater_is_better=False,
)



In [25]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tok,
    eval_dataset=valid_tok,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

trainer.train()

# Simpan model final (last used)
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,0.7869,0.647276
2,0.7378,0.627724
3,0.7203,0.617788
4,0.6985,0.610469
5,0.6706,0.604466
6,0.6591,0.600183
7,0.6703,0.597467
8,0.6484,0.594805
9,0.6282,0.59405
10,0.6615,0.59394


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].


('./cendol_mt5_id_mad/tokenizer_config.json',
 './cendol_mt5_id_mad/special_tokens_map.json',
 './cendol_mt5_id_mad/spiece.model',
 './cendol_mt5_id_mad/added_tokens.json',
 './cendol_mt5_id_mad/tokenizer.json')

In [26]:
import os
print([x for x in os.listdir(OUTPUT_DIR) if x.startswith("checkpoint-")])


['checkpoint-2500', 'checkpoint-2000', 'checkpoint-2250']


In [None]:
ckpt_paths = [os.path.join(OUTPUT_DIR, x) for x in os.listdir(OUTPUT_DIR) if x.startswith("checkpoint-")]
ckpt_paths = sorted(ckpt_paths, key=lambda p: int(p.split("-")[-1]))

# tambahkan model final (folder root)
ckpt_paths.append(OUTPUT_DIR)

ckpt_paths


In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_loaded = AutoModelForSeq2SeqLM.from_pretrained(OUTPUT_DIR)
tokenizer_loaded = AutoTokenizer.from_pretrained(OUTPUT_DIR)


# eval

In [27]:
import torch, evaluate

bleu = evaluate.load("sacrebleu")
rouge = evaluate.load("rouge")

device = "cuda" if torch.cuda.is_available() else "cpu"
model = trainer.model.to(device)
model.eval()

MAX_LEN_SRC = 128
MAX_NEW_TOK = 128

def generate_batch(sources, batch_size=8):
    preds = []
    for i in range(0, len(sources), batch_size):
        batch = sources[i:i+batch_size]
        enc = tokenizer(
            batch,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=MAX_LEN_SRC
        )
        enc = {k: v.to(device) for k, v in enc.items()}
        with torch.no_grad():
            out = model.generate(**enc, max_new_tokens=MAX_NEW_TOK)
        preds.extend(tokenizer.batch_decode(out, skip_special_tokens=True))
    return preds

def score(preds, refs):
    b = bleu.compute(predictions=preds, references=[[r] for r in refs])["score"]
    r = rouge.compute(predictions=preds, references=refs)
    return {"BLEU": b, "ROUGE-1": r["rouge1"], "ROUGE-L": r["rougeL"]}


## 10 epoch

In [None]:
# ID -> MAD
src_id2mad = ["translate Indonesian to Madurese: " + x for x in valid_mix["id"].tolist()]
ref_id2mad = valid_mix["mad"].tolist()
pred_id2mad = generate_batch(src_id2mad)
print("VALID ID → MAD:", score(pred_id2mad, ref_id2mad))

# MAD -> ID
src_mad2id = ["translate Madurese to Indonesian: " + x for x in valid_mix["mad"].tolist()]
ref_mad2id = valid_mix["id"].tolist()
pred_mad2id = generate_batch(src_mad2id)
print("VALID MAD → ID:", score(pred_mad2id, ref_mad2id))


VALID ID → MAD: {'BLEU': 24.71788162514927, 'ROUGE-1': np.float64(0.5011481927645369), 'ROUGE-L': np.float64(0.49727339984448127)}
VALID MAD → ID: {'BLEU': 38.37127563969642, 'ROUGE-1': np.float64(0.6216973155338186), 'ROUGE-L': np.float64(0.6161272815137366)}


In [None]:
# ID -> MAD
src_id2mad = ["translate Indonesian to Madurese: " + x for x in nusax_test["id"].tolist()]
ref_id2mad = nusax_test["mad"].tolist()
pred_id2mad = generate_batch(src_id2mad)
print("TEST ID → MAD:", score(pred_id2mad, ref_id2mad))

# MAD -> ID
src_mad2id = ["translate Madurese to Indonesian: " + x for x in nusax_test["mad"].tolist()]
ref_mad2id = nusax_test["id"].tolist()
pred_mad2id = generate_batch(src_mad2id)
print("TEST MAD → ID:", score(pred_mad2id, ref_mad2id))


TEST ID → MAD: {'BLEU': 27.10209210093647, 'ROUGE-1': np.float64(0.5355933192747154), 'ROUGE-L': np.float64(0.5303790040774587)}
TEST MAD → ID: {'BLEU': 37.99576781716567, 'ROUGE-1': np.float64(0.6282942687347244), 'ROUGE-L': np.float64(0.6212819222784924)}


In [32]:
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
model = trainer.model.to(device)   # kalau trainer masih ada
model.eval()

MAX_LEN_SRC = 128
MAX_NEW_TOK = 128

def translate(text: str, direction="id2mad"):
    """
    direction:
      - "id2mad" : Indonesian -> Madurese
      - "mad2id" : Madurese -> Indonesian
    """
    text = text.strip()
    if direction == "id2mad":
        src = "translate Indonesian to Madurese: " + text
    elif direction == "mad2id":
        src = "translate Madurese to Indonesian: " + text
    else:
        raise ValueError("direction harus 'id2mad' atau 'mad2id'")

    enc = tokenizer(src, return_tensors="pt", truncation=True, max_length=MAX_LEN_SRC).to(device)
    with torch.no_grad():
        out = model.generate(**enc, max_new_tokens=MAX_NEW_TOK)
    return tokenizer.decode(out[0], skip_special_tokens=True)

# loop input
while True:
    direction = input("\nPilih arah (id2mad / mad2id) atau ketik q: ").strip()
    if direction.lower() == "q":
        break
    text = input("Masukkan teks: ").strip()
    print("Hasil:", translate(text, direction=direction))



Pilih arah (id2mad / mad2id) atau ketik q: id2mad
Masukkan teks: saya pergi ke pasar kemarin
Hasil: Engkok meyos ka pasar e bekto

Pilih arah (id2mad / mad2id) atau ketik q: q


In [33]:
print("Best checkpoint:", trainer.state.best_model_checkpoint)
print("Best metric (eval_loss):", trainer.state.best_metric)


Best checkpoint: ./cendol_mt5_id_mad/checkpoint-2500
Best metric (eval_loss): 0.5939401388168335


In [35]:
load_best_model_at_end=True
trainer.save_model(OUTPUT_DIR)
print("Best checkpoint:", trainer.state.best_model_checkpoint)
print("Best val loss:", trainer.state.best_metric)


Best checkpoint: ./cendol_mt5_id_mad/checkpoint-2500
Best val loss: 0.5939401388168335


# coba 15 epoch

In [41]:
best_ckpt = trainer.state.best_model_checkpoint
print("Resume from:", best_ckpt)


Resume from: ./cendol_mt5_id_mad/checkpoint-2500


In [42]:
from transformers import TrainingArguments

training_args_2 = TrainingArguments(
    output_dir=OUTPUT_DIR,

    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=3,

    learning_rate=3e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=1,

    num_train_epochs=15,      # ⬅️ DITAMBAH
    fp16=True,

    prediction_loss_only=True,
    logging_steps=100,
    report_to="none",

    load_best_model_at_end=True,
    metric_for_best_model="loss",
    greater_is_better=False,
)


In [43]:
from transformers import Trainer, DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer2 = Trainer(
    model=model,                  # model hasil training kemarin
    args=training_args_2,
    train_dataset=train_tok,
    eval_dataset=valid_tok,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer2.train(resume_from_checkpoint=best_ckpt)
trainer2.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)


  trainer2 = Trainer(
There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].


Epoch,Training Loss,Validation Loss
11,0.6348,0.591472
12,0.6031,0.589894
13,0.6212,0.589024
14,0.6147,0.587794
15,0.6352,0.5876


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].


('./cendol_mt5_id_mad/checkpoint-2500/tokenizer_config.json',
 './cendol_mt5_id_mad/checkpoint-2500/special_tokens_map.json',
 './cendol_mt5_id_mad/checkpoint-2500/spiece.model',
 './cendol_mt5_id_mad/checkpoint-2500/added_tokens.json',
 './cendol_mt5_id_mad/checkpoint-2500/tokenizer.json')

In [44]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch, evaluate

device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(OUTPUT_DIR)
model = AutoModelForSeq2SeqLM.from_pretrained(OUTPUT_DIR).to(device)
model.eval()

bleu = evaluate.load("sacrebleu")
rouge = evaluate.load("rouge")
def generate_batch(sources, batch_size=8):
    preds = []
    for i in range(0, len(sources), batch_size):
        batch = sources[i:i+batch_size]
        enc = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=128)
        enc = {k: v.to(device) for k, v in enc.items()}
        with torch.no_grad():
            out = model.generate(**enc, max_new_tokens=128)
        preds.extend(tokenizer.batch_decode(out, skip_special_tokens=True))
    return preds

def score(preds, refs):
    b = bleu.compute(predictions=preds, references=[[r] for r in refs])["score"]
    r = rouge.compute(predictions=preds, references=refs)
    return {"BLEU": b, "ROUGE-L": r["rougeL"]}


The tokenizer you are loading from './cendol_mt5_id_mad/checkpoint-2500' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.


In [None]:
# ID -> MAD
src = ["translate Indonesian to Madurese: " + x for x in nusax_test["id"].tolist()]
ref = nusax_test["mad"].tolist()
pred = generate_batch(src)
print("TEST ID → MAD:", score(pred, ref))

# MAD -> ID
src = ["translate Madurese to Indonesian: " + x for x in nusax_test["mad"].tolist()]
ref = nusax_test["id"].tolist()
pred = generate_batch(src)
print("TEST MAD → ID:", score(pred, ref))


TEST ID → MAD: {'BLEU': 27.240936358955565, 'ROUGE-L': np.float64(0.5333184670384086)}
TEST MAD → ID: {'BLEU': 38.30457557061089, 'ROUGE-L': np.float64(0.6240571430890124)}


In [46]:
print("Best checkpoint:", trainer2.state.best_model_checkpoint)
print("Best val loss:", trainer2.state.best_metric)


Best checkpoint: ./cendol_mt5_id_mad/checkpoint-2500/checkpoint-3750
Best val loss: 0.5876002907752991


In [47]:
import torch, evaluate
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

device = "cuda" if torch.cuda.is_available() else "cpu"

# pakai model terbaik (hasil training lanjutan)
model = trainer2.model.to(device)
model.eval()

tokenizer = tokenizer  # tokenizer yang sama

bleu = evaluate.load("sacrebleu")
rouge = evaluate.load("rouge")
MAX_LEN_SRC = 128
MAX_NEW_TOK = 128

def generate_batch(sources, batch_size=8):
    preds = []
    for i in range(0, len(sources), batch_size):
        batch = sources[i:i+batch_size]
        enc = tokenizer(
            batch,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=MAX_LEN_SRC
        )
        enc = {k: v.to(device) for k, v in enc.items()}
        with torch.no_grad():
            out = model.generate(**enc, max_new_tokens=MAX_NEW_TOK)
        preds.extend(tokenizer.batch_decode(out, skip_special_tokens=True))
    return preds

def score(preds, refs):
    bleu_score = bleu.compute(
        predictions=preds,
        references=[[r] for r in refs]
    )["score"]
    rouge_score = rouge.compute(
        predictions=preds,
        references=refs
    )
    return {
        "BLEU": bleu_score,
        "ROUGE-1": rouge_score["rouge1"],
        "ROUGE-L": rouge_score["rougeL"]
    }



In [None]:
# ID -> MAD
src = ["translate Indonesian to Madurese: " + x for x in nusax_test["id"].tolist()]
ref = nusax_test["mad"].tolist()
pred = generate_batch(src)
print("TEST ID → MAD:", score(pred, ref))

# MAD -> ID
src = ["translate Madurese to Indonesian: " + x for x in nusax_test["mad"].tolist()]
ref = nusax_test["id"].tolist()
pred = generate_batch(src)
print("TEST MAD → ID:", score(pred, ref))


TEST ID → MAD: {'BLEU': 27.249789752505592, 'ROUGE-1': np.float64(0.5380458731147865), 'ROUGE-L': np.float64(0.5329918131419573)}
TEST MAD → ID: {'BLEU': 38.2623941177844, 'ROUGE-1': np.float64(0.6301389789275638), 'ROUGE-L': np.float64(0.6234563975235359)}


### analisis kesahalan

In [None]:
import torch
import evaluate
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Ensure clean_text and fix_mojibake are available
# If you encounter a NameError for clean_text or fix_mojibake, please run the preprocessing cells (tS4yoEHnq713) first.

device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the best model and tokenizer from the final checkpoint
# MODEL_DIR is defined in previous cells and points to the best checkpoint.
# Assuming `tokenizer` and `model` from cell `uzoYf3xlOElU` are the desired ones.
# If `model` and `tokenizer` are not defined, please re-run cell `uzoYf3xlOElU`.
if 'model' not in globals() or 'tokenizer' not in globals():
    print("Loading model and tokenizer from MODEL_DIR...")
    MODEL_DIR = "./cendol_mt5_id_mad/checkpoint-2500/checkpoint-3750"
    tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
    model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_DIR).to(device)
model.eval()

MAX_LEN_SRC = 128
MAX_NEW_TOK = 128

rouge_metric = evaluate.load("rouge")
bleu_metric = evaluate.load("sacrebleu") # Load BLEU metric

def generate_single_text(raw_text: str, direction: str, current_model, current_tokenizer):
    if 'clean_text' not in globals():
        raise NameError("clean_text function is not defined. Please run cell tS4yoEHnq713.")

    cleaned_text = clean_text(raw_text) # Apply clean_text to the actual content
    if direction == "id2mad":
        src_prompt = "translate Indonesian to Madurese: " + cleaned_text
    elif direction == "mad2id":
        src_prompt = "translate Madurese to Indonesian: " + cleaned_text
    else:
        raise ValueError("Invalid direction for translation.")

    enc = current_tokenizer(src_prompt, return_tensors="pt", truncation=True, max_length=MAX_LEN_SRC).to(current_model.device)
    with torch.no_grad():
        out = current_model.generate(**enc, max_new_tokens=MAX_NEW_TOK)
    return current_tokenizer.decode(out[0], skip_special_tokens=True)

def score_single_translation(prediction: str, reference: str):
    # Calculate ROUGE scores
    rouge_scores = rouge_metric.compute(predictions=[prediction], references=[reference])
    # Calculate BLEU score
    bleu_score = bleu_metric.compute(predictions=[prediction], references=[[reference]])["score"]
    return {
        "BLEU": bleu_score,
        "ROUGE-1": rouge_scores["rouge1"],
        "ROUGE-L": rouge_scores["rougeL"]
    }

print("\n--- Analysis of Test Data Translations ---")

# Assuming test_clean DataFrame is available from earlier cells
if 'test_clean' not in globals():
    print("Error: 'test_clean' DataFrame not found. Please ensure preprocessing cells are run.")
else:
    print("\nIndonesian -> Madurese Translations:")
    for i, row in nusax_test.head(30).iterrows(): # Limit to 30 samples
        id_text = row["id"]
        mad_ref = row["mad"]

        mad_pred = generate_single_text(id_text, "id2mad", model, tokenizer)
        scores = score_single_translation(mad_pred, mad_ref)

        print(f"--- Sample {i+1} (ID -> MAD) ---")
        print(f"Source (ID):     {id_text}")
        print(f"Reference (MAD): {mad_ref}")
        print(f"Prediction (MAD):{mad_pred}")
        print(f"BLEU Score:      {scores['BLEU']:.4f}")
        print(f"ROUGE-1 Score:   {scores['ROUGE-1']:.4f}")
        print(f"ROUGE-L Score:   {scores['ROUGE-L']:.4f}\n")

    print("\nMadurese -> Indonesian Translations:")
    for i, row in nusax_test.head(30).iterrows(): # Limit to 30 samples
        mad_text = row["mad"]
        id_ref = row["id"]

        id_pred = generate_single_text(mad_text, "mad2id", model, tokenizer)
        scores = score_single_translation(id_pred, id_ref)

        print(f"--- Sample {i+1} (MAD -> ID) ---")
        print(f"Source (MAD):     {mad_text}")
        print(f"Reference (ID): {id_ref}")
        print(f"Prediction (ID):{id_pred}")
        print(f"BLEU Score:      {scores['BLEU']:.4f}")
        print(f"ROUGE-1 Score:   {scores['ROUGE-1']:.4f}")
        print(f"ROUGE-L Score:   {scores['ROUGE-L']:.4f}\n")

Downloading builder script: 0.00B [00:00, ?B/s]


--- Analysis of Test Data Translations ---

Indonesian -> Madurese Translations:
--- Sample 1 (ID -> MAD) ---
Source (ID):     Dekat dengan hotel saya menginap, hanya ditempuh jalan kaki, di sini banyak sekali pilihan makanannya, tempat yang luas, dan menyenangkan
Reference (MAD): Semmak bik hotel engkok nginep, pera' ejeleni ajelen soko, ediye bennyak sarah pelean kakananna, kenengngan se leber, ben masenneng
Prediction (MAD):Dekat bik hotel engkok nginep, pera' empuh jelen kaki, e diye bennya' sarah pelean kakananna, kennengngan se luas, ben nyennengngaghi
BLEU Score:      32.9990
ROUGE-1 Score:   0.5128
ROUGE-L Score:   0.5128

--- Sample 2 (ID -> MAD) ---
Source (ID):     Iya benar, dia sedang jaga warung.
Reference (MAD): Iye bhender, rua ajege berung.
Prediction (MAD):Iya ongghu, rowa teppak jaga warung.
BLEU Score:      6.5673
ROUGE-1 Score:   0.0000
ROUGE-L Score:   0.0000

--- Sample 3 (ID -> MAD) ---
Source (ID):     Kangkungnya lumayan tapi kepiting saus padangnya mengecewa

In [50]:
import zipfile, os

SRC = "./cendol_mt5_id_mad/checkpoint-2500/checkpoint-3750"
OUT = "cendol_mt5_id_mad_15ep.zip"

with zipfile.ZipFile(OUT, "w", compression=zipfile.ZIP_STORED) as z:
    for root, _, files in os.walk(SRC):
        for f in files:
            full = os.path.join(root, f)
            arc = os.path.relpath(full, SRC)
            z.write(full, arcname=arc)

print("OK ->", OUT)


OK -> cendol_mt5_id_mad_15ep.zip


In [51]:
!tar -cf cendol_mt5_id_mad_best.tar ./cendol_mt5_id_mad/checkpoint-2500/checkpoint-3750


In [54]:
!cp cendol_mt5_id_mad_best.tar "/content/drive/MyDrive/Colab Notebooks/NLP/UAS/"


In [55]:
!ls -lh "/content/drive/MyDrive/Colab Notebooks/NLP/UAS"


total 3.4G
-rw------- 1 root root 3.4G Dec 13 06:08  cendol_mt5_id_mad_best.tar
drwx------ 2 root root 4.0K Dec 13 02:59  nusax
-rw------- 1 root root 582K Dec  5 14:11 'UAS_NLP_PAPER_KELOMPOK I.docx'


In [57]:
from huggingface_hub import login
login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [58]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

MODEL_DIR = "./cendol_mt5_id_mad/checkpoint-2500/checkpoint-3750"
REPO_ID = "addinda/cendol-mt5-id-mad-15ep"   # ganti username

AutoModelForSeq2SeqLM.from_pretrained(MODEL_DIR).push_to_hub(REPO_ID)
AutoTokenizer.from_pretrained(MODEL_DIR).push_to_hub(REPO_ID)


Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...xf6obqa/model.safetensors:   0%|          | 11.6kB / 1.20GB            

The tokenizer you are loading from './cendol_mt5_id_mad/checkpoint-2500/checkpoint-3750' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.


README.md: 0.00B [00:00, ?B/s]

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  .../tmp1c1wdes5/spiece.model:  95%|#########5| 4.11MB / 4.31MB            

  ...mp1c1wdes5/tokenizer.json:   3%|2         |  433kB / 16.3MB            

CommitInfo(commit_url='https://huggingface.co/addinda/cendol-mt5-id-mad-15ep/commit/a135c4d1f1eb871428a4f0e7a416b6ac11903418', commit_message='Upload tokenizer', commit_description='', oid='a135c4d1f1eb871428a4f0e7a416b6ac11903418', pr_url=None, repo_url=RepoUrl('https://huggingface.co/addinda/cendol-mt5-id-mad-15ep', endpoint='https://huggingface.co', repo_type='model', repo_id='addinda/cendol-mt5-id-mad-15ep'), pr_revision=None, pr_num=None)