<a href="https://colab.research.google.com/github/anantapk03/farmer-rice-chatbot-model/blob/main/BLEU_SCORE_EVAL_DETAIL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Step 1: Environment setup
# Install the third-party packages used below (IPython shell magic; only valid inside a notebook).
!pip install -q transformers datasets torch accelerate nltk

# Mount Google Drive at /content/drive (Colab only); the checkpoint and CSV paths used later live under it.
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AutoTokenizer, AutoModelForCausalLM
import nltk
# Download the NLTK 'punkt' resource.
nltk.download('punkt')
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import csv
import math

In [None]:
model_name = "afrizalha/Bakpia-V1-0.5B-Javanese" #change this base model

# Choose the device once: CUDA GPU first, then Apple MPS, otherwise CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using device: {device}")

# Make sure the NLTK punkt resource has been downloaded
nltk.download('punkt')

# Load the base tokenizer, then register the markers used by the chat format:
# pad / begin-of-string / end-of-string special tokens plus the "<bot>:" tag.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.add_special_tokens({"pad_token": "<pad>",
                                "bos_token": "<startofstring>",
                                "eos_token": "<endofstring>"})
tokenizer.add_tokens(["<bot>:"])

# Load the base model and resize its token embeddings to the enlarged
# vocabulary so it stays in sync with the tokenizer.
model = AutoModelForCausalLM.from_pretrained(model_name)
model.resize_token_embeddings(len(tokenizer))

model = model.to(device)

In [None]:
# Experiment identifiers; they select which folder on Drive is read and written.
TOTAL_TRIAL_DATA = 10000  # change to the matching value if it differs
TOTAL_EPOCH_TRIAL = 10  # change to the matching value if it differs
# NOTE(review): the folder is named "Komodo7B" while model_name points to a
# different base model -- confirm this is the intended checkpoint location.
model_path = (
    "/content/drive/MyDrive/POLINDRA/SKRIPSI/chatbot_trial_finetuning/Komodo7B/"
    f"{TOTAL_TRIAL_DATA}/EPOCH_{TOTAL_EPOCH_TRIAL}/model_state.pt"
)

# Restore the saved state dictionary into the model, mapping tensors onto `device`.
# NOTE(review): torch.load unpickles the file; only load checkpoints you trust
# (consider weights_only=True where the installed torch supports it).
model.load_state_dict(torch.load(model_path, map_location=device))

# Switch the model to evaluation mode
model.eval()

print("Model state loaded successfully.")

In [None]:
# Step 3: Load the dataset (ChatData is redefined here in case this notebook does not have it yet)
# Dataset class
class ChatData(Dataset):
    """Question/answer pairs read from a CSV file and tokenized up front.

    Every remaining CSV row is rendered as
    ``<startofstring> {pertanyaan} <bot>: {jawaban} <endofstring>`` and the
    whole list is tokenized once in the constructor.
    """

    def __init__(self, path: str, tokenizer, max_length: int = 150):
        """Read ``path``, clean it and tokenize every sample.

        Args:
            path: CSV file with ``pertanyaan`` and ``jawaban`` columns.
            tokenizer: Callable applied to the list of sample strings; its
                result must expose ``input_ids`` and ``attention_mask``.
            max_length: Length every sample is truncated / padded to.
        """
        qa_columns = ['pertanyaan', 'jawaban']
        # Rows with a missing question or answer, and exact repeats, are dropped.
        frame = pd.read_csv(path)
        frame = frame.dropna(subset=qa_columns).drop_duplicates(subset=qa_columns)

        samples = []
        for _, record in frame.iterrows():
            samples.append(
                f"<startofstring> {record['pertanyaan']} <bot>: {record['jawaban']} <endofstring>"
            )
        self.X = samples

        encoded = tokenizer(
            self.X,
            max_length=max_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )
        self.X_encoded = encoded
        self.input_ids = encoded['input_ids']
        self.attention_mask = encoded['attention_mask']

    def __len__(self):
        """Return the number of samples."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask)`` for sample ``idx``."""
        return (self.input_ids[idx], self.attention_mask[idx])

# Load the train / validation / test splits of the selected trial
train_data_path, val_data_path, test_data_path = (
    f"/content/drive/MyDrive/POLINDRA/SKRIPSI/chatbot_trial_finetuning/Komodo7B/{TOTAL_TRIAL_DATA}/DATASET/{split_name}.csv"
    for split_name in ("train", "valid", "test")
)

# Each split is read, cleaned and tokenized by ChatData.
train_data = ChatData(train_data_path, tokenizer)
val_data = ChatData(val_data_path, tokenizer)
test_data = ChatData(test_data_path, tokenizer)

print("Datasets loaded successfully.")

In [None]:
# Step 4: Prepare the data loaders
batch_size = 32

# Evaluation does not need shuffling, so every split keeps its original order.
train_loader, val_loader, test_loader = (
    DataLoader(split, batch_size=batch_size, shuffle=False)
    for split in (train_data, val_data, test_data)
)

print("DataLoaders created.")

In [None]:
import tqdm
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

def compute_bleu(preds, refs):
    """Return one sentence-level BLEU score per (prediction, reference) pair.

    Both texts are split on whitespace and scored with NLTK's
    ``sentence_bleu`` using ``SmoothingFunction().method4``. A prediction
    with no tokens scores 0.0 without calling ``sentence_bleu``.

    Args:
        preds: Iterable of predicted answer strings.
        refs: Iterable of reference answer strings, paired with ``preds``.

    Returns:
        List of BLEU scores, one per pair.
    """
    smoothing = SmoothingFunction().method4
    per_sample_scores = []
    for hypothesis, reference in zip(preds, refs):
        hypothesis_tokens = hypothesis.split()
        # sentence_bleu expects a list of tokenized references.
        reference_token_lists = [reference.split()]
        if not hypothesis_tokens:
            per_sample_scores.append(0.0)
            continue
        per_sample_scores.append(
            sentence_bleu(
                reference_token_lists,
                hypothesis_tokens,
                smoothing_function=smoothing,
            )
        )
    return per_sample_scores  # one BLEU score per sample


def _split_prompt(text, start_tag="<startofstring>", bot_tag="<bot>:"):
    """Return ``(prompt, question)`` taken from one decoded sample.

    ``prompt`` runs from ``start_tag`` through ``bot_tag`` inclusive and is the
    text fed to ``generate``; ``question`` is the text between the two tags.
    Both are "" when either tag is missing.
    """
    start = text.find(start_tag)
    bot = text.find(bot_tag)
    if start == -1 or bot == -1:
        return "", ""
    prompt = text[start:bot + len(bot_tag)].strip()
    question = text[start + len(start_tag):bot].strip()
    return prompt, question


def _extract_answer(text, bot_tag="<bot>:", end_tag="<endofstring>"):
    """Return the answer that follows ``bot_tag`` in ``text`` ("" if absent).

    The answer stops at the first ``end_tag`` after ``bot_tag``, or at the end
    of the text when there is none. Leftover ``<pad>`` / ``<startofstring>``
    markers are removed and surrounding whitespace is stripped.
    """
    # Partition on the complete tag so its trailing ":" never ends up as a
    # spurious token at the start of the answer.
    _, found, tail = text.partition(bot_tag)
    if not found:
        return ""
    answer = tail.partition(end_tag)[0]
    return answer.replace("<pad>", "").replace("<startofstring>", "").strip()


def evaluate(data_loader, model, tokenizer, return_details=False):
    """Generate answers for every sample in ``data_loader`` and score them with BLEU.

    Each batch of token ids is decoded back to text, the prompt (everything up
    to and including ``<bot>:``) is re-tokenized on its own, the model
    generates a continuation, and the generated answer is compared with the
    reference answer taken from the original sample.

    Args:
        data_loader: Iterable yielding ``(input_ids, attention_mask)`` batches.
        model: Causal LM exposing ``eval()``, ``parameters()`` and ``generate()``.
        tokenizer: Tokenizer used to decode the samples and encode the prompts.
        return_details: When True, return the per-sample data instead of the
            average score.

    Returns:
        The average BLEU score (0.0 when there are no samples), or, when
        ``return_details`` is True, the tuple
        ``(questions, refs, preds, bleu_scores_list)``.
    """
    model.eval()
    preds = []
    refs = []
    questions = []

    # Send the prompts to wherever the model lives instead of relying on a
    # notebook-level global.
    model_device = next(model.parameters()).device

    # A decoder-only model continues from the last position of each row, so
    # batched prompts must be padded on the left; with right padding the
    # shorter prompts would be continued from pad tokens. The tokenizer's
    # original setting is restored afterwards.
    original_padding_side = tokenizer.padding_side
    tokenizer.padding_side = "left"
    try:
        with torch.no_grad():
            for input_ids, _ in tqdm.tqdm(data_loader, desc="Evaluating"):
                # Decode with special tokens kept so the format tags can be located.
                decoded_inputs = [
                    tokenizer.decode(ids, skip_special_tokens=False) for ids in input_ids
                ]

                prompts = []
                extracted_questions = []
                reference_answers = []
                for text in decoded_inputs:
                    prompt, question = _split_prompt(text)
                    prompts.append(prompt)  # "" is the fallback when a tag is missing
                    extracted_questions.append(question)
                    reference_answers.append(_extract_answer(text))

                # Re-tokenize the prompt-only texts
                prompt_tokens = tokenizer(
                    prompts, return_tensors="pt", padding=True, truncation=True
                )

                # Generate predictions from the prompts alone
                generated_outputs = model.generate(
                    input_ids=prompt_tokens["input_ids"].to(model_device),
                    attention_mask=prompt_tokens["attention_mask"].to(model_device),
                    max_new_tokens=150,
                    pad_token_id=tokenizer.pad_token_id,
                    eos_token_id=tokenizer.eos_token_id,
                )

                decoded_preds = [
                    tokenizer.decode(g, skip_special_tokens=False) for g in generated_outputs
                ]

                preds.extend(_extract_answer(pred_text) for pred_text in decoded_preds)
                refs.extend(reference_answers)
                questions.extend(extracted_questions)
    finally:
        tokenizer.padding_side = original_padding_side

    # BLEU per sample
    bleu_scores_list = compute_bleu(preds, refs)

    if return_details:
        return questions, refs, preds, bleu_scores_list
    return sum(bleu_scores_list) / len(bleu_scores_list) if bleu_scores_list else 0.0


In [None]:
# Steps 6 & 7: Run the evaluation and save the results
import datetime

print("Starting evaluation...")

# Train and validation only need the average BLEU score
train_bleu = evaluate(train_loader, model, tokenizer)
val_bleu = evaluate(val_loader, model, tokenizer)

# The test split also keeps its per-row details
test_questions, test_refs, test_preds, test_bleu_scores_list = evaluate(
    test_loader, model, tokenizer, return_details=True
)

# Average BLEU for the test split (0.0 when there are no rows)
if test_bleu_scores_list:
    test_bleu = sum(test_bleu_scores_list) / len(test_bleu_scores_list)
else:
    test_bleu = 0.0

print("Evaluation finished.")

# Timestamp used in both output file names
current_date = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

# Output locations: one CSV with the averages, one with the per-row test details
results_path_avg = f"/content/drive/MyDrive/POLINDRA/SKRIPSI/chatbot_trial_finetuning/Komodo7B/{TOTAL_TRIAL_DATA}/EPOCH_{TOTAL_EPOCH_TRIAL}/bleu_evaluation_results_avg_{current_date}.csv"
results_path_test_details = f"/content/drive/MyDrive/POLINDRA/SKRIPSI/chatbot_trial_finetuning/Komodo7B/{TOTAL_TRIAL_DATA}/EPOCH_{TOTAL_EPOCH_TRIAL}/bleu_evaluation_results_test_details_{current_date}.csv"

# Save the average BLEU scores
results_df_avg = pd.DataFrame(
    {
        "Dataset": ["Train", "Validation", "Test"],
        "BLEU Score": [train_bleu, val_bleu, test_bleu],
    }
)
results_df_avg.to_csv(results_path_avg, index=False)
print(f"Average BLEU scores saved to {results_path_avg}")

# Save question, reference, prediction and BLEU score for every test row
results_df_test_details = pd.DataFrame(
    {
        "Pertanyaan": test_questions,
        "Jawaban Referensi": test_refs,
        "Jawaban Prediksi": test_preds,
        "BLEU Score": test_bleu_scores_list,
    }
)
results_df_test_details.to_csv(results_path_test_details, index=False)
print(f"Test dataset BLEU details saved to {results_path_test_details}")


# Step 8: Show the results
print("\n--- Average BLEU Scores ---")
print(results_df_avg)

# Step 9: Finish
print(f"\nEvaluation complete. Average BLEU scores for Train, Validation, and Test datasets have been calculated and saved to {results_path_avg}.")
print(f"Individual BLEU scores and details for the Test dataset have been saved to {results_path_test_details}.")

In [None]:
def manual_inference(question, model, tokenizer, device, max_new_tokens=150):
    """Generate the bot's answer to a single question.

    The question is wrapped in the chat format
    (``<startofstring> {question} <bot>:``), the model generates a
    continuation, and the text between ``<bot>:`` and ``<endofstring>`` is
    returned.

    Args:
        question: The user's question.
        model: Causal LM exposing ``eval()`` and ``generate()``.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        device: Device the prompt tensors are moved to before generation.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The extracted answer with ``<startofstring>`` / ``<pad>`` markers
        removed and whitespace stripped, or a fixed apology message when the
        decoded output contains no ``<bot>:`` tag.
    """
    bot_tag = "<bot>:"
    end_tag = "<endofstring>"

    model.eval()
    with torch.no_grad():
        # Build the prompt in the same format as the dataset samples
        input_text = f"<startofstring> {question} <bot>:"
        tokens = tokenizer(input_text, return_tensors='pt', padding=True)

        generated_output = model.generate(
            input_ids=tokens["input_ids"].to(device),
            attention_mask=tokens["attention_mask"].to(device),
            max_new_tokens=max_new_tokens,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

        # Keep special tokens so the format tags can be located in the text
        decoded_output = tokenizer.decode(generated_output[0], skip_special_tokens=False)

    # Partition on the complete "<bot>:" tag so the answer does not start
    # with a stray ":".
    _, found, tail = decoded_output.partition(bot_tag)
    if not found:
        return "Maaf, saya tidak dapat memahami pertanyaan Anda."

    response = tail.partition(end_tag)[0]
    return response.replace("<startofstring>", "").replace("<pad>", "").strip()


In [None]:
# Example: ask the model a single question and print the exchange
question = "Priben proses penyiapan lahan kanggo budidaya padi khusus?"
response = manual_inference(question, model, tokenizer, device)
print(
    f"Pertanyaan: {question}",
    f"Jawaban Bot: {response}",
    sep="\n",
)

# ACTUAL (REFERENCE) ANSWER
# Keuntungan nganggo biopestisida ning pengendalian hama lan penyakit tanduran yaiku murah lan bahan gampang didapat, ora menimbulkan residu ning tanaman, aman bagi manusia, hewan, lan ramah lingkungan, aman dinggo ning dosis tinggi, produk pertanian sing dihasilnang luwih sehat, ora gampang menyebab resistensi hama, kesehatan lema luwih terjaga, bisa ningkataken bahan organik tanah, bisa mempertahankan keberada

# PREDICTED ANSWER (from the BLEU test run)
# Keuntungan nganggo biopestisida ning pengendalian hama lan penyakit tanduran yaiku murah lan bahan gampang didapat, ora menimbulkan residu ning tanaman, aman bagi manusia, hewan, lan ramah lingkungan, aman dinggo ning dosis tinggi, produk pertanian sing dihasilnang luwih sehat, ora gampang menyebab resistensi hama, kesehatan lema luwih terjaga, bisa ningkataken bahan organik tanah, bisa mempertahankan keberadaan sumber pestisida misale dadi sistem non PTT, lan nguntak siklus insektisida sejen.