In [None]:
# 1. Remove the previously installed versions of these packages
!pip uninstall -y pyarrow datasets evaluate transformers

# 2. Install the stable, compatible versions (as requested); the main libraries are pinned
!pip install pyarrow==14.0.1 datasets==2.15.0 evaluate==0.4.1 sacrebleu sentencepiece accelerate openpyxl -q
!pip install transformers==4.46.3 tokenizers==0.20.3 -q
!pip install pandas numpy scikit-learn -q



In [None]:
import os
import glob
import pandas as pd
import numpy as np
import torch
import gc
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    EarlyStoppingCallback
)
from transformers.optimization import Adafactor

# --- CONFIGURATION ---
# Base checkpoint: mt5-small, used as the replacement for Cendol (lightweight and stable)
MODEL_CHECKPOINT = "google/mt5-small" 

MAX_LENGTH = 128      # maximum sequence length in tokens; 100-128 is considered safe
BATCH_SIZE = 4        # small per-device batch, to be safe
GRAD_ACCUM = 4        # accumulate 4 steps (effective batch = 4 * 4 = 16)
EPOCHS = 5
OUTPUT_DIR = "/kaggle/working/model_madura_bolak_balik" 

# Pick the GPU when one is available, otherwise fall back to CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device: {device}")

# --- 1. LOAD DATASETS (NusaX, INMAD, Lexicon) ---
data_total = []

# Input file locations
DIR_NUSAX   = "/kaggle/input/nusaxdata"
FILE_LEXICON = "/kaggle/input/nusaxdata/madurese.csv"
FILE_INMAD  = "/kaggle/input/inmad-dataset/INMAD Dataset.csv"

# NusaX splits that feed training. The "test" split is deliberately held out:
# a BLEU score measured on sentences the model was trained on is not a real
# evaluation result.
NUSAX_TRAIN_SPLITS = ("train", "valid")

print("üìÇ Sedang membaca data...")


def _ambil_pasangan(path):
    """Read one parallel CSV and return a frame with columns 'indo' and 'madura'.

    Header names are matched case-insensitively and with surrounding
    whitespace ignored, accepting 'indonesian'/'indonesia' and
    'madurese'/'madura'.

    Raises:
        KeyError: when either language column is missing.
    """
    df = pd.read_csv(path)
    # Normalise the headers themselves so detection and indexing use the same names.
    df = df.rename(columns=lambda c: str(c).strip().lower())
    kol_indo = next((c for c in ("indonesian", "indonesia") if c in df.columns), None)
    kol_madura = next((c for c in ("madurese", "madura") if c in df.columns), None)
    if kol_indo is None or kol_madura is None:
        raise KeyError(f"kolom Indonesia/Madura tidak ditemukan: {list(df.columns)}")
    return pd.DataFrame({"indo": df[kol_indo], "madura": df[kol_madura]})


# A. NusaX split files (sorted so the row order, and therefore the later
#    random split, is reproducible across runs)
sumber_data = [
    f for f in sorted(glob.glob(f"{DIR_NUSAX}/*.csv"))
    if os.path.basename(f) != "madurese.csv"
    and any(k in os.path.basename(f) for k in NUSAX_TRAIN_SPLITS)
]
# B. Lexicon and C. INMAD, each only when the file is present
sumber_data += [p for p in (FILE_LEXICON, FILE_INMAD) if os.path.exists(p)]

for path in sumber_data:
    try:
        data_total.append(_ambil_pasangan(path))
    except Exception as e:
        # Best-effort load: skip the unreadable file, but say so instead of hiding it.
        print(f"Gagal membaca {path}: {e}")

if not data_total:
    # pd.concat([]) would fail with an unhelpful message; fail with a clear one.
    raise RuntimeError("Tidak ada dataset yang berhasil dimuat; periksa path input.")

# Merge everything
df_raw = pd.concat(data_total, ignore_index=True)

# Clean the data: drop missing cells first (so NaN never becomes the text "nan"),
# trim whitespace, then drop entries of 2 characters or fewer.
df_raw = df_raw.dropna(subset=["indo", "madura"])
df_raw = df_raw.assign(
    indo=df_raw["indo"].astype(str).str.strip(),
    madura=df_raw["madura"].astype(str).str.strip(),
)
df_raw = df_raw[(df_raw["indo"].str.len() > 2) & (df_raw["madura"].str.len() > 2)]
df_raw = df_raw.reset_index(drop=True)

print(f"‚úÖ Total Data Mentah: {len(df_raw)} pasang kalimat")

# --- 2. BIDIRECTIONAL SETUP (MIRRORING) ---
# Direction 1: Indonesian -> Madurese
df_indo_mad = pd.DataFrame({
    'source': df_raw['indo'],
    'target': df_raw['madura'],
    'prefix': "terjemahkan dari Bahasa Indonesia ke Bahasa Madura: "
})

# Direction 2: Madurese -> Indonesian
df_mad_indo = pd.DataFrame({
    'source': df_raw['madura'],
    'target': df_raw['indo'],
    'prefix': "terjemahkan dari Bahasa Madura ke Bahasa Indonesia: "
})

# Stack both directions into one dataset
df_final = pd.concat([df_indo_mad, df_mad_indo], ignore_index=True)
print(f"‚úÖ Total Data Training (Bolak-Balik): {len(df_final)}")

# Split by sentence PAIR, not by mirrored row. Splitting the mirrored frame
# row-wise can put "A -> B" in validation while "B -> A" stays in training,
# which leaks the validation answers into the training set.
jumlah_pasangan = len(df_raw)
idx_train, idx_val = train_test_split(
    np.arange(jumlah_pasangan), test_size=0.1, random_state=42
)

# In df_final, rows i and i + jumlah_pasangan are the two directions of pair i.
train_df = df_final.iloc[
    np.concatenate([idx_train, idx_train + jumlah_pasangan])
].reset_index(drop=True)
val_df = df_final.iloc[
    np.concatenate([idx_val, idx_val + jumlah_pasangan])
].reset_index(drop=True)

train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

# --- 3. TOKENIZATION ---
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

def preprocess_function(examples):
    """Tokenize one batch of rows into model inputs with ``labels``.

    Each input text is the row's ``prefix`` followed by its ``source``; the
    label ids come from tokenizing ``target``. Both sides are truncated to
    ``MAX_LENGTH`` tokens.
    """
    batas = {"max_length": MAX_LENGTH, "truncation": True}

    # Instruction prefix + source sentence
    teks_sumber = []
    for awalan, kalimat in zip(examples["prefix"], examples["source"]):
        teks_sumber.append(awalan + str(kalimat))
    teks_target = list(map(str, examples["target"]))

    encoded = tokenizer(teks_sumber, **batas)
    encoded["labels"] = tokenizer(text_target=teks_target, **batas)["input_ids"]
    return encoded

print("‚è≥ Sedang Tokenisasi...")
# The raw text columns are dropped so only the tokenizer outputs remain.
tokenized_train = train_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=train_dataset.column_names,
)
tokenized_val = val_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=val_dataset.column_names,
)
print("‚úÖ Tokenisasi Selesai.")

In [None]:
import evaluate
import numpy as np

# Metric setup
metric = evaluate.load("sacrebleu")

def compute_metrics(eval_preds):
    """Compute corpus BLEU (sacreBLEU) for one evaluation pass.

    Args:
        eval_preds: ``(predictions, labels)`` pair of token-id arrays as
            handed over by the trainer; ``predictions`` may be a tuple whose
            first element holds the generated ids.

    Returns:
        ``{"bleu": <sacreBLEU score>}``.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is a padding/ignore marker, not a vocabulary id, and cannot be
    # decoded. It can appear in the predictions as well as the labels, so both
    # are mapped back to the pad token before decoding.
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds = [pred.strip() for pred in decoded_preds]
    # sacreBLEU expects a list of references per prediction
    decoded_labels = [[label.strip()] for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["score"]}

# 1. Release leftover Python objects and cached GPU memory
gc.collect()
torch.cuda.empty_cache()

# 2. Load the model
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT)
model.gradient_checkpointing_enable() # saves memory
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# 3. TRAINING CONFIGURATION
training_args = Seq2SeqTrainingArguments(
    output_dir=OUTPUT_DIR,
    eval_strategy="epoch",
    save_strategy="epoch",

    # --- KEY SETTINGS ---
    per_device_train_batch_size=BATCH_SIZE, # 4
    per_device_eval_batch_size=BATCH_SIZE,  # 4
    gradient_accumulation_steps=GRAD_ACCUM, # 4 (effective batch 16)
    gradient_checkpointing=True,
    optim="adafactor",           # optimizer chosen for T5/mT5
    learning_rate=1e-3,          # the learning rate used with Adafactor here
    weight_decay=0.0,
    fp16=False,                  # kept off on purpose (author's note: avoids NaN / 0.00 loss)
    # --------------------

    save_total_limit=1,
    num_train_epochs=EPOCHS,
    predict_with_generate=True,
    # Without an explicit limit, evaluation-time generation falls back to the
    # model's default generation length, which is far shorter than MAX_LENGTH;
    # BLEU would then be measured on truncated translations.
    generation_max_length=MAX_LENGTH,
    load_best_model_at_end=True,
    report_to="none"
)

# Assemble the trainer from the model, arguments, tokenized datasets, collator
# and BLEU metric defined earlier; early stopping uses a patience of 3 evaluations.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    processing_class=tokenizer, 
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

print(f"üöÄ MULAI TRAINING MODEL: {MODEL_CHECKPOINT}")
trainer.train()

# Save the final model and its tokenizer side by side in OUTPUT_DIR
print("üíæ Menyimpan model...")
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print("‚úÖ SELESAI!")

# eval

In [4]:
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import evaluate
from tqdm import tqdm
import re

# ==========================================
# CONFIGURATION
# ==========================================
# 1. Model path (same as the one used earlier)
MODEL_PATH = r"D:/UNAIR/NLP/Project_Madura/model_google_mt5"

# 2. Test file path (locate the NusaX test.csv on your machine)
# If it is not available, use train.csv instead; only the first 100 rows are used for testing
FILE_TEST_PATH = r"D:/UNAIR/NLP/Project_Madura/dataset4/test.csv"  # <--- CHANGE THIS TO THE LOCATION OF YOUR CSV FILE!

# ==========================================
# TEXT CLEAN-UP HELPER (IMPORTANT)
# ==========================================
def bersihkan_teks(text):
    """Remove URLs and @mentions from ``text``, then trim the ends.

    Whitespace left behind in the middle of the text is not collapsed.
    """
    # First the https://... style links, then the @user mentions.
    for pola in (r'http\S+', r'@\w+'):
        text = re.sub(pola, '', text)
    return text.strip()

# ==========================================
# FULL EVALUATION (INDONESIAN -> MADURESE)
# ==========================================
def main():
    """Translate the first 100 test rows Indonesian -> Madurese and report BLEU.

    Loads the tokenizer and model from ``MODEL_PATH``, reads
    ``FILE_TEST_PATH``, generates one translation per row with beam search,
    scores the cleaned predictions against the reference column with
    sacreBLEU, and prints the score plus up to five examples. Every failure
    path prints a message and returns instead of raising.
    """
    print("üìÇ Memuat model...")
    device = "cuda" if torch.cuda.is_available() else "cpu"

    try:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
        model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_PATH).to(device)
    except Exception as e:
        # Show the underlying cause too: a wrong path is only one of the ways
        # loading can fail.
        print("‚ùå Model tidak ketemu. Cek path lagi.")
        print(f"Detail: {e}")
        return

    # Load the test data
    print(f"üìÇ Membaca data tes: {FILE_TEST_PATH}")
    try:
        df = pd.read_csv(FILE_TEST_PATH)

        # Map normalised header -> actual header, so detection is
        # case-insensitive while row access still uses the real column name.
        kolom = {str(c).strip().lower(): c for c in df.columns}
        cols = list(kolom)
        src_col = next((kolom[k] for k in ('indonesian', 'indonesia') if k in kolom), None)
        tgt_col = next((kolom[k] for k in ('madurese', 'madura') if k in kolom), None)

        if src_col is None or tgt_col is None:
            print("‚ùå Kolom 'indonesian' atau 'madurese' tidak ditemukan di CSV.")
            print(f"Kolom yang ada: {cols}")
            return

        # Only the first 100 rows, to keep the run short (drop .head(100) to score everything)
        df_sample = df.head(100)
        if df_sample.empty:
            print("‚ùå Gagal baca CSV: file tidak berisi baris data.")
            return
        print(f"‚úÖ Menguji pada {len(df_sample)} kalimat pertama.")

    except Exception as e:
        print(f"‚ùå Gagal baca CSV: {e}")
        return

    metric = evaluate.load("sacrebleu")
    predictions = []
    references = []

    print("üöÄ Mulai Menerjemahkan...")

    for _, row in tqdm(df_sample.iterrows(), total=len(df_sample)):
        src_text = str(row[src_col])
        ref_text = str(row[tgt_col])

        # Prepend the task prefix
        input_text = f"terjemahkan dari Bahasa Indonesia ke Bahasa Madura: {src_text}"

        inputs = tokenizer(input_text, return_tensors="pt", max_length=128, truncation=True).to(device)

        with torch.no_grad():
            # repetition_penalty discourages the model from repeating words
            outputs = model.generate(
                **inputs,
                max_length=128,
                num_beams=4,
                repetition_penalty=1.5,
                early_stopping=True
            )

        pred = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Clean the prediction (strip links and mentions)
        pred_bersih = bersihkan_teks(pred)

        predictions.append(pred_bersih)
        references.append([ref_text])

    # Compute the score
    results = metric.compute(predictions=predictions, references=references)

    print("\n" + "="*30)
    print(f"üèÜ REAL BLEU SCORE: {results['score']:.2f}")
    print("="*30)

    # Show up to five examples; bounded so a short test file cannot raise IndexError
    print("\nüîç 5 CONTOH HASIL:")
    for i in range(min(5, len(predictions))):
        print(f"üáÆüá© Indo  : {df_sample.iloc[i][src_col]}")
        print(f"ü§ñ Model : {predictions[i]}")
        print(f"üîë Kunci : {references[i][0]}")
        print("-" * 20)

if __name__ == "__main__":
    main()

üìÇ Memuat model...
üìÇ Membaca data tes: D:/UNAIR/NLP/Project_Madura/dataset4/test.csv
‚úÖ Menguji pada 100 kalimat pertama.
üöÄ Mulai Menerjemahkan...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [02:13<00:00,  1.34s/it]



üèÜ REAL BLEU SCORE: 18.89

üîç 5 CONTOH HASIL:
üáÆüá© Indo  : Dekat dengan hotel saya menginap, hanya ditempuh jalan kaki, di sini banyak sekali pilihan makanannya, tempat yang luas, dan menyenangkan
ü§ñ Model : semma' sareng hotel kaula ngenep, coma e ajh√¢l√¢n kaki, √® dinna' b√¢nnya' pilihan kakanan, kennengngan s√® lebar, b√¢n senneng
üîë Kunci : Semmak bik hotel engkok nginep, pera' ejeleni ajelen soko, ediye bennyak sarah pelean kakananna, kenengngan se leber, ben masenneng
--------------------
üáÆüá© Indo  : Iya benar, dia sedang jaga warung.
ü§ñ Model : Iya bendher, rowa teppa' jaga warung.
üîë Kunci : Iye bhender, rua ajege berung.
--------------------
üáÆüá© Indo  : Kangkungnya lumayan tapi kepiting saus padangnya mengecewakan kami dikasih kepiting yang kopong akhir kami tidak makan keptingnya dan dikembalikan.
ü§ñ Model : Kangkungna pendhenan tape kepiting saos padhangnga ngacewaaghi kami eberri' kepiting se kopong akhir kami ta' ngakan kepiting ban ekembaliag

In [5]:
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import evaluate
from tqdm import tqdm
import re

# ==========================================
# CONFIGURATION
# ==========================================
# 1. Model path
MODEL_PATH = r"D:/UNAIR/NLP/Project_Madura/model_google_mt5"

# 2. Test file path
FILE_TEST_PATH = r"D:/UNAIR/NLP/Project_Madura/dataset4/test.csv"

# ==========================================
# TEXT CLEAN-UP HELPER
# ==========================================
def bersihkan_teks(text):
    """Return ``text`` without URLs and @mentions, trimmed at both ends."""
    tanpa_tautan = re.sub(r'http\S+', '', text)
    tanpa_mention = re.sub(r'@\w+', '', tanpa_tautan)
    return tanpa_mention.strip()

# ==========================================
# FULL EVALUATION (MADURESE -> INDONESIAN)
# ==========================================
def main():
    """Translate the first 400 test rows Madurese -> Indonesian and report BLEU.

    Loads the tokenizer and model from ``MODEL_PATH``, reads
    ``FILE_TEST_PATH``, generates one translation per row with beam search,
    scores the cleaned predictions against the Indonesian column with
    sacreBLEU, and prints the score plus up to five examples. Every failure
    path prints a message and returns instead of raising.
    """
    print("üìÇ Memuat model...")
    device = "cuda" if torch.cuda.is_available() else "cpu"

    try:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
        model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_PATH).to(device)
    except Exception as e:
        # Show the underlying cause too: a wrong path is only one of the ways
        # loading can fail.
        print("‚ùå Model tidak ketemu. Cek path lagi.")
        print(f"Detail: {e}")
        return

    # Load the test data
    print(f"üìÇ Membaca data tes: {FILE_TEST_PATH}")
    try:
        df = pd.read_csv(FILE_TEST_PATH)

        # Map normalised header -> actual header, so detection is
        # case-insensitive while row access still uses the real column name.
        kolom = {str(c).strip().lower(): c for c in df.columns}

        # Indonesian column
        col_indo = next((kolom[k] for k in ('indonesian', 'indonesia') if k in kolom), None)
        # Madurese column
        col_madura = next((kolom[k] for k in ('madurese', 'madura') if k in kolom), None)

        if col_indo is None or col_madura is None:
            print("‚ùå Kolom tidak lengkap.")
            return

        # Take the first 400 rows
        df_sample = df.head(400)
        if df_sample.empty:
            print("‚ùå Gagal baca CSV: file tidak berisi baris data.")
            return
        print(f"‚úÖ Menguji arah MADURA -> INDONESIA pada {len(df_sample)} kalimat.")

    except Exception as e:
        print(f"‚ùå Gagal baca CSV: {e}")
        return

    metric = evaluate.load("sacrebleu")
    predictions = []
    references = []

    print("üöÄ Mulai Menerjemahkan...")

    for _, row in tqdm(df_sample.iterrows(), total=len(df_sample)):
        # The input is the Madurese sentence, the reference is the Indonesian
        # one, and the task prefix names the Madurese -> Indonesian direction.
        src_text = str(row[col_madura])
        ref_text = str(row[col_indo])
        input_text = f"terjemahkan dari Bahasa Madura ke Bahasa Indonesia: {src_text}"

        inputs = tokenizer(input_text, return_tensors="pt", max_length=128, truncation=True).to(device)

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_length=128,
                num_beams=4,
                repetition_penalty=1.5,
                early_stopping=True
            )

        pred = tokenizer.decode(outputs[0], skip_special_tokens=True)
        pred_bersih = bersihkan_teks(pred)

        predictions.append(pred_bersih)
        references.append([ref_text])

    # Compute the score
    results = metric.compute(predictions=predictions, references=references)

    print("\n" + "="*30)
    print(f"üèÜ REAL BLEU SCORE (MAD -> INDO): {results['score']:.2f}")
    print("="*30)

    # Show up to five examples; bounded so a short test file cannot raise IndexError
    print("\nüîç 5 CONTOH HASIL:")
    for i in range(min(5, len(predictions))):
        print(f"üêÇ Madura : {df_sample.iloc[i][col_madura]}")
        print(f"ü§ñ Model  : {predictions[i]}")
        print(f"üáÆüá© Kunci  : {references[i][0]}")  # Indonesian reference
        print("-" * 20)

if __name__ == "__main__":
    main()

üìÇ Memuat model...
üìÇ Membaca data tes: D:/UNAIR/NLP/Project_Madura/dataset4/test.csv
‚úÖ Menguji arah MADURA -> INDONESIA pada 400 kalimat.
üöÄ Mulai Menerjemahkan...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 400/400 [12:28<00:00,  1.87s/it]



üèÜ REAL BLEU SCORE (MAD -> INDO): 43.01

üîç 5 CONTOH HASIL:
üêÇ Madura : Semmak bik hotel engkok nginep, pera' ejeleni ajelen soko, ediye bennyak sarah pelean kakananna, kenengngan se leber, ben masenneng
ü§ñ Model  : dekat dengan hotel saya menginap, hanya dijalankan jalan kaki, di sini banyak sekali pilihan makanannya, tempat yang luas, dan menyenangkan
üáÆüá© Kunci  : Dekat dengan hotel saya menginap, hanya ditempuh jalan kaki, di sini banyak sekali pilihan makanannya, tempat yang luas, dan menyenangkan
--------------------
üêÇ Madura : Iye bhender, rua ajege berung.
ü§ñ Model  : iya benar, itu jaga warung.
üáÆüá© Kunci  : Iya benar, dia sedang jaga warung.
--------------------
üêÇ Madura : Kangkongnga pendhanan tape kopeteng saos padangnga ma kocaba, engko' bi' laenna e bharri' kopeteng se kopong akherra engko' bi' laenna ta' ngakan kopeteng ban e pabali.
ü§ñ Model  : makasih lumayan tapi kopeteng saos padangnya mengatakan, saya dan lainnya dikasih kopeteng yang kopo

In [4]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import re
import os

# ==========================================
# PATH CONFIGURATION
# ==========================================
MODEL_PATH = r"D:/UNAIR/NLP/Project_Madura/model_google_mt5"

# ==========================================
# OUTPUT CLEAN-UP HELPER (optional)
# ==========================================
def bersihkan_hasil(text):
    """Collapse every run of whitespace in ``text`` to one space and trim the ends."""
    dirapikan = re.sub(r'\s+', ' ', text)
    # Optional, disabled: also drop apostrophe-like marks for plain output
    # dirapikan = re.sub("['`\u2019\u2018]", "", dirapikan)
    return dirapikan.strip()

# ==========================================
# MAIN APPLICATION
# ==========================================
def main():
    """Run the interactive console translator.

    Loads the tokenizer and model from ``MODEL_PATH``, then loops over a menu:
    the user picks a direction (1 or 2), types a sentence, and the beam-search
    translation is printed. Choosing 3 ends the loop. A model-load failure is
    reported and the function returns.
    """
    garis = "="*50
    print(garis)
    print("ü§ñ MEMUAT MODEL PENERJEMAH MADURA...")
    print(f"üìÇ Path: {MODEL_PATH}")
    print(garis)

    # 1. Pick the device
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"‚ö° Sedang memuat ke {device.upper()}... Mohon tunggu.")

    # 2. Load tokenizer and model
    try:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
        model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_PATH).to(device)
        print("‚úÖ Model SIAP digunakan!")
    except Exception as e:
        print(f"‚ùå Gagal load model: {e}")
        print("Pastikan path benar dan library protobuf sudah diinstall.")
        return

    # Menu choice -> (task prefix, direction label shown with the result)
    arah_terjemahan = {
        '1': ("terjemahkan dari Bahasa Indonesia ke Bahasa Madura: ", "üáÆüá© INDO -> üêÇ MADURA"),
        '2': ("terjemahkan dari Bahasa Madura ke Bahasa Indonesia: ", "üêÇ MADURA -> üáÆüá© INDO"),
    }
    menu = ("PILIH MENU:", "1. Indo -> Madura", "2. Madura -> Indo", "3. Keluar")

    # 3. Interactive loop
    while True:
        print("\n" + "-"*30)
        for baris in menu:
            print(baris)

        pilihan = input("üëâ Masukkan angka (1/2/3): ").strip()

        if pilihan == '3':
            print("üëã Sampai jumpa!")
            return

        if pilihan not in arah_terjemahan:
            print("‚ö†Ô∏è Pilihan salah, coba lagi.")
            continue

        # Ask for the sentence
        print("\nüìù Ketik kalimat yang ingin diterjemahkan:")
        kalimat = input("üëâ Input: ").strip()

        if not kalimat:
            print("‚ö†Ô∏è Kalimat kosong.")
            continue

        prefix, arah = arah_terjemahan[pilihan]

        # Translate: prefix + sentence, beam search with a mild repetition penalty
        encoded = tokenizer(prefix + kalimat, return_tensors="pt", max_length=128, truncation=True).to(device)

        with torch.no_grad():
            generated = model.generate(
                **encoded,
                max_length=128,
                num_beams=5,
                repetition_penalty=1.2,
                early_stopping=True
            )

        hasil_bersih = bersihkan_hasil(tokenizer.decode(generated[0], skip_special_tokens=True))

        # Show the result
        bingkai = "="*40
        print("\n" + bingkai)
        print(f"üîÑ {arah}")
        print(f"üìù Input : {kalimat}")
        print(f"ü§ñ Hasil : {hasil_bersih}")
        print(bingkai)

if __name__ == "__main__":
    main()

ü§ñ MEMUAT MODEL PENERJEMAH MADURA...
üìÇ Path: D:/UNAIR/NLP/Project_Madura/model_google_mt5
‚ö° Sedang memuat ke CUDA... Mohon tunggu.




‚úÖ Model SIAP digunakan!

------------------------------
PILIH MENU:
1. Indo -> Madura
2. Madura -> Indo
3. Keluar

üìù Ketik kalimat yang ingin diterjemahkan:

üîÑ üáÆüá© INDO -> üêÇ MADURA
üìù Input : halo
ü§ñ Hasil : halo

------------------------------
PILIH MENU:
1. Indo -> Madura
2. Madura -> Indo
3. Keluar

üìù Ketik kalimat yang ingin diterjemahkan:

üîÑ üáÆüá© INDO -> üêÇ MADURA
üìù Input : halo apa kabar
ü§ñ Hasil : halo apa kabar

------------------------------
PILIH MENU:
1. Indo -> Madura
2. Madura -> Indo
3. Keluar
üëã Sampai jumpa!


In [2]:
import torch
import pandas as pd
import re
import os
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import evaluate
from tqdm import tqdm

# ==========================================
# PATH CONFIGURATION (AS REQUESTED)
# ==========================================
# 1. Model location (matches the path seen in the earlier error log)
MODEL_PATH = r"D:/UNAIR/NLP/Project_Madura/model_google_mt5"

# 2. Dataset locations (assumed to live on drive D)
# Make sure the CSV file names match the files actually present on the machine
PATH_NUSAX   = r"D:/UNAIR/NLP/Project_Madura/dataset4/test.csv"       # NusaX test file
PATH_LEXICON = r"D:/UNAIR/NLP/Project_Madura/dataset4/madurese.csv"   # dictionary (lexicon) file
PATH_INMAD   = r"D:/UNAIR/NLP/Project_Madura/dataset3/INMAD Dataset.csv" # INMAD formal-register file

# Number of items sampled per data source (total = 3 x 50 = 150 items)
JUMLAH_SAMPEL_PER_DATA = 50 

# ==========================================
# SPELLING NORMALISER
# ==========================================
def normalisasi_madura(text):
    """Normalise Madurese text so spelling variants compare as equal.

    Steps, in order: lower-case; drop URLs and @mentions; drop apostrophe-like
    marks (so "nase`", "nase'" and "nase" become the same); fold the digraphs
    dh/bh/jh to d/b/j; drop remaining punctuation; collapse whitespace runs to
    a single space and trim.

    Args:
        text: any value; it is converted with ``str()`` first.

    Returns:
        The normalised string.
    """
    text = str(text).lower()

    # 1. Drop links and mentions
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'@\w+', '', text)

    # 2. Drop apostrophe-like marks: ', `, right and left single quotes.
    # The curly quotes are written as escapes so the character class cannot be
    # corrupted by how this file is encoded or decoded; a mis-decoded class
    # would silently delete ordinary accented letters instead.
    text = re.sub("['`\u2019\u2018]", "", text)

    # 3. Fold common spelling variants: 'dh' -> 'd', 'bh' -> 'b', 'jh' -> 'j'
    text = text.replace("dh", "d").replace("bh", "b").replace("jh", "j")

    # 4. Drop remaining punctuation, then collapse the gaps left behind by the
    # removals above into single spaces.
    text = re.sub(r"[^\w\s]", "", text)
    return re.sub(r"\s+", " ", text).strip()

# ==========================================
# LOAD THE MIXED TEST SET (chat + dictionary + formal)
# ==========================================
def _ambil_soal(path, tipe, nama):
    """Sample up to JUMLAH_SAMPEL_PER_DATA rows from one CSV as test items.

    Returns a list of ``{"tipe", "indo", "madura"}`` dicts, or an empty list
    when the file is missing or cannot be read (a message is printed).
    """
    if not os.path.exists(path):
        print(f"‚ùå File tidak ketemu: {path}")
        return []

    try:
        df = pd.read_csv(path)
        # Map normalised header -> actual header: matching ignores case and
        # surrounding spaces, while row access uses the real column name.
        kolom = {str(c).strip().lower(): c for c in df.columns}
        src = next((kolom[k] for k in ("indonesian", "indonesia") if k in kolom), None)
        tgt = next((kolom[k] for k in ("madurese", "madura") if k in kolom), None)
        if src is None or tgt is None:
            raise KeyError(f"kolom Indonesia/Madura tidak ditemukan: {list(df.columns)}")

        # Fixed seed so the same items are drawn on every run
        sample = df.sample(n=min(len(df), JUMLAH_SAMPEL_PER_DATA), random_state=42)
        soal = [
            {"tipe": tipe, "indo": indo, "madura": madura}
            for indo, madura in zip(sample[src], sample[tgt])
        ]
        print(f"‚úÖ Masuk: {len(sample)} soal dari {nama}.")
        return soal
    except Exception as e:
        # Best effort: one unreadable source must not stop the others, but the
        # reason is reported rather than hidden.
        print(f"‚ö†Ô∏è Gagal baca {nama}.")
        print(f"   Detail: {e}")
        return []


def load_data_ujian():
    """Build the mixed test set from the NusaX, Lexicon and INMAD CSV files.

    Returns:
        A DataFrame with columns ``tipe``, ``indo`` and ``madura``; it is
        empty when no source could be loaded.
    """
    sumber = (
        (PATH_NUSAX, "Sosmed (NusaX)", "NusaX"),       # A. social-media / chat style
        (PATH_LEXICON, "Kamus (Lexicon)", "Lexicon"),  # B. dictionary entries
        (PATH_INMAD, "Formal (INMAD)", "INMAD"),       # C. formal register
    )

    soal_ujian = []
    for path, tipe, nama in sumber:
        soal_ujian.extend(_ambil_soal(path, tipe, nama))

    return pd.DataFrame(soal_ujian)

# ==========================================
# RUN THE SCORING
# ==========================================
def main():
    """Score Indonesian -> Madurese translation on the mixed test set.

    Loads the model from ``MODEL_PATH``, builds the test set with
    ``load_data_ujian()``, translates every item with beam search, normalises
    both the prediction and the reference with ``normalisasi_madura``, and
    prints the sacreBLEU score with a verdict line. A sample is printed for
    every row whose index is a multiple of 25.
    """
    print(f"\nü§ñ MEMUAT MODEL DARI: {MODEL_PATH}")
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"‚ö° Device: {device.upper()}")

    try:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
        model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_PATH).to(device)
    except Exception as e:
        print(f"‚ùå MODEL GAGAL DILOAD: {e}")
        print("Saran: Pastikan path foldernya benar dan sudah install protobuf.")
        return

    df_test = load_data_ujian()
    if not len(df_test):
        print("‚ùå Tidak ada soal ujian! Cek path file csv mu.")
        return

    print(f"\nüöÄ MULAI UJIAN ({len(df_test)} Soal)...")
    print("Parameter: Beam=5, Normalisasi=AKTIF (Anti-Petik)\n")

    metric = evaluate.load("sacrebleu")
    # Beam search over 5 candidates, with a mild penalty against repeated words
    opsi_generate = dict(
        max_length=128,
        num_beams=5,
        repetition_penalty=1.2,
        early_stopping=True,
    )
    daftar_prediksi = []
    daftar_referensi = []

    # Evaluation loop
    for nomor, baris in tqdm(df_test.iterrows(), total=len(df_test)):
        kalimat_indo = str(baris['indo'])
        kunci_asli = str(baris['madura'])

        # Input format (must match the one used in training)
        teks_masukan = f"terjemahkan dari Bahasa Indonesia ke Bahasa Madura: {kalimat_indo}"
        encoded = tokenizer(teks_masukan, return_tensors="pt", max_length=128, truncation=True).to(device)

        with torch.no_grad():
            generated = model.generate(**encoded, **opsi_generate)

        jawaban_asli = tokenizer.decode(generated[0], skip_special_tokens=True)

        # Both sides go through the same normaliser before scoring
        jawaban_norm = normalisasi_madura(jawaban_asli)
        kunci_norm = normalisasi_madura(kunci_asli)

        daftar_prediksi.append(jawaban_norm)
        daftar_referensi.append([kunci_norm])

        # Print a sample every 25 items
        if nomor % 25 == 0:
            print(f"\n[{baris['tipe']}] Indo: {kalimat_indo}")
            print(f"ü§ñ Jawab : {jawaban_asli}  \t-> (Norm: {jawaban_norm})")
            print(f"üîë Kunci : {kunci_asli}      \t-> (Norm: {kunci_norm})")

    # Final score
    hasil = metric.compute(predictions=daftar_prediksi, references=daftar_referensi)
    bleu = hasil['score']

    pembatas = "="*40
    print("\n" + pembatas)
    print(f"üèÜ SKOR BLEU FINAL (ADIL): {bleu:.2f}")
    print(pembatas)

    # Verdict: the first threshold the score exceeds wins; otherwise the low verdict
    vonis = "‚ùå RENDAH. (Mungkin data training terlalu sedikit/kotor)"
    for ambang, pesan in (
        (20, "‚úÖ Kualitas BAGUS! (Model paham makna, walau beda gaya)"),
        (10, "‚ö†Ô∏è LUMAYAN. (Perlu perbaikan gaya bahasa)"),
    ):
        if bleu > ambang:
            vonis = pesan
            break
    print(vonis)

if __name__ == "__main__":
    main()


ü§ñ MEMUAT MODEL DARI: D:/UNAIR/NLP/Project_Madura/model_google_mt5
‚ö° Device: CUDA




‚úÖ Masuk: 50 soal dari NusaX.
‚úÖ Masuk: 50 soal dari Lexicon.
‚úÖ Masuk: 50 soal dari INMAD.

üöÄ MULAI UJIAN (150 Soal)...
Parameter: Beam=5, Normalisasi=AKTIF (Anti-Petik)



  1%|          | 1/150 [00:04<11:36,  4.67s/it]


[Sosmed (NusaX)] Indo: Untuk menuju ke tempat ini jalannya sangat macet dan jauh dari pusat kota. Sesampainya di sana tidak ada hal yang spesial yang membuat perjalanan jauh terbayarkan. Menunya biasa saja.
ü§ñ Jawab : Kaangghuy entar ka kennengngan pan√®ka jh√¢l√¢nna c√®' macet b√¢n jh√¢u d√¢ri pusat kottha. Samp√®' √® dhissa' tad√¢' hal s√® istimewa s√® agh√¢b√¢y parjh√¢l√¢nan jh√¢u √®b√¢yar. Menuna biasa sajan.  	-> (Norm: kaangghuy entar ka kennengngan pan√®ka j√¢l√¢nna c√® macet b√¢n j√¢u d√¢ri pusat kottha samp√® √® dissa tad√¢ hal s√® istimewa s√® agh√¢b√¢y parj√¢l√¢nan j√¢u √®b√¢yar menuna biasa sajan)
üîë Kunci : Untuk depak ka kennengngan ria jhelenna macet ben jheu deri pusat kota. Sa depak en ka dissak tadek se spesial se ghebey perjhelenan jheu ria terbayarkan. Menuna biasa bhei.      	-> (Norm: untuk depak ka kennengngan ria jelenna macet ben jeu deri pusat kota sa depak en ka dissak tadek se spesial se ghebey perjelenan jeu ria terbayarkan menuna biasa bei)


 17%|‚ñà‚ñã        | 26/150 [00:59<05:00,  2.42s/it]


[Sosmed (NusaX)] Indo: Banyak orang yang kurang suka untuk berobat ke dokter. Selain biaya yang relatif cukup mahal bagi sebagian orang, ke dokter tak selamanya bisa menyembuhkan. Banyak dokter yang salah diagnosa. Pentingkah ataukah sudah lumrah, toh dokter juga manusia.
ü§ñ Jawab : B√¢nnya' or√®ng s√® ta' senneng kaangghuy aobat ka dokter. Sala√®n ongkos s√® relatif cokop larang gh√¢b√¢y sab√¢giy√¢n or√®ng, ka dokter ta' samp√®yan bisa mab√¢li. b√¢nnya' dokter s√® sala diagnosa. Pentingkah otab√¢ ampon lumrah, toh dokter jhugh√¢n manossa.  	-> (Norm: b√¢nnya or√®ng s√® ta senneng kaangghuy aobat ka dokter sala√®n ongkos s√® relatif cokop larang gh√¢b√¢y sab√¢giy√¢n or√®ng ka dokter ta samp√®yan bisa mab√¢li b√¢nnya dokter s√® sala diagnosa pentingkah otab√¢ ampon lumrah toh dokter jugh√¢n manossa)
üîë Kunci : Bennyak oreng se korang seneng ghebey aobat ka dokter. Salaen ongkos se bek cokop larang ghebey sabegien oreng, ka dokter tak saterrossa bisa maberes. Bennyak dokter se salah

 34%|‚ñà‚ñà‚ñà‚ñç      | 51/150 [02:00<02:45,  1.67s/it]


[Kamus (Lexicon)] Indo: mengembalikan
ü§ñ Jawab : ngembalikan  	-> (Norm: ngembalikan)
üîë Kunci : mabeliagi      	-> (Norm: mabeliagi)


 51%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè    | 77/150 [02:05<00:10,  6.94it/s]


[Kamus (Lexicon)] Indo: tidak
ü§ñ Jawab : ta'  	-> (Norm: ta)
üîë Kunci : √™nt√™n      	-> (Norm: √™nt√™n)


 67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 101/150 [02:10<00:24,  1.98it/s]


[Formal (INMAD)] Indo: Kau yang aku tunggu dengan Afgan oleh rossa?
ü§ñ Jawab : Budi Budi Budi Budi Budi Budi Budi Budi Budi Budi Budi Budi Budi Budi Budi Budi Budi Budi Budi Budi Budi Budi Budi Budi Budi Budi Budi Budi [URL]  	-> (Norm: budi budi budi budi budi budi budi budi budi budi budi budi budi budi budi budi budi budi budi budi budi budi budi budi budi budi budi budi url)
üîë Kunci : edisi posang malem minggu ? Kamu yang kutunggu ( bik Afgan) deri Rossa ? http://path.com/p/nrLjm      	-> (Norm: edisi posang malem minggu  kamu yang kutunggu  bik afgan deri rossa)


 84%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç | 126/150 [02:54<00:36,  1.53s/it]


[Formal (INMAD)] Indo: dia memberiku handuk kotor tidak ada wifi di dalam ruangan jangan katakan wifi!
ü§ñ Jawab : kaul√¢h aberri' handuk gheddhe' tad√¢' wifi √® kamar jh√¢' √®kab√¢'aghi wifi!  	-> (Norm: kaul√¢h aberri handuk ghedde tad√¢ wifi √® kamar j√¢ √®kab√¢aghi wifi)
üîë Kunci : sempat eberri' andok gheddhe'. tade' WiFi e kamar. jhe' nyator bede WiFi!      	-> (Norm: sempat eberri andok ghedde tade wifi e kamar je nyator bede wifi)


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 150/150 [03:40<00:00,  1.47s/it]



üèÜ SKOR BLEU FINAL (ADIL): 14.32
‚ö†Ô∏è LUMAYAN. (Perlu perbaikan gaya bahasa)
