<a href="https://colab.research.google.com/github/anantapk03/farmer-rice-chatbot-model/blob/main/EvaluationRougeMeteorRevisi.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q transformers datasets torch accelerate nltk rouge_score

In [None]:
# Mount Google Drive at /content/drive; the checkpoint, the test CSV and the
# evaluation log used by later cells all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import pandas as pd
import math
import tqdm
import csv
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from nltk.translate.meteor_score import meteor_score
from nltk.tokenize import word_tokenize
from torch.utils.data import Dataset, DataLoader

model_name = "gpt2"

# Make sure the NLTK punkt tokenizer data has been downloaded
nltk.download('punkt')

# Pick the best available device: CUDA first, then Apple MPS, otherwise CPU
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

# Extend the base tokenizer with the chat markers used in the prompt template.
# The insertion order (pad, bos, eos, then "<bot>:") is kept as-is because it
# determines the ids assigned to the new tokens.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.add_special_tokens(
    {
        "pad_token": "<pad>",
        "bos_token": "<startofstring>",
        "eos_token": "<endofstring>",
    }
)
tokenizer.add_tokens(["<bot>:"])

# Load the base model and grow its embedding matrix to match the extended tokenizer
model = AutoModelForCausalLM.from_pretrained(model_name)
model.resize_token_embeddings(len(tokenizer))
model = model.to(device)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Location of the saved model_state.pt checkpoint.
# Adjust these values/path to match where your file is stored.
TOTAL_TRIAL_DATA = 10000  # replace with the appropriate value if different
TOTAL_EPOCH_TRIAL = 10  # replace with the appropriate value if different
model_path = f"/content/drive/MyDrive/POLINDRA/SKRIPSI/chatbot_trial_finetuning/Komodo7B/{TOTAL_TRIAL_DATA}/EPOCH_{TOTAL_EPOCH_TRIAL}/model_state.pt"

# Load the model state dictionary.
# weights_only=True restricts unpickling to tensors and plain containers, which is
# all a state dict needs and avoids executing arbitrary pickled code from the file.
model.load_state_dict(torch.load(model_path, map_location=device, weights_only=True))

# Switch the model to evaluation mode
model.eval()

print("Model state loaded successfully.")

Model state loaded successfully.


In [None]:
# Re-define the ChatData class here (or make sure it is already available in this notebook)
class ChatData(Dataset):
    """Question/answer pairs from a CSV file, rendered with the chat template and tokenized.

    Each row becomes ``"<startofstring> {pertanyaan} <bot>: {jawaban} <endofstring>"``.
    Rows with a missing question or answer, and duplicate (question, answer) pairs,
    are discarded before templating.

    Args:
        path: Path to a CSV file containing the columns ``pertanyaan`` and ``jawaban``.
        tokenizer: Callable invoked once on the full list of templated strings; its
            result must expose ``'input_ids'`` and ``'attention_mask'`` entries.
        max_length: Length every sequence is padded/truncated to.

    Attributes:
        X: The templated strings, one per retained row.
        X_encoded: Raw tokenizer output for ``X``.
        input_ids / attention_mask: The corresponding entries of ``X_encoded``.
    """

    def __init__(self, path: str, tokenizer, max_length: int = 150):
        pair_columns = ['pertanyaan', 'jawaban']
        frame = (
            pd.read_csv(path)
            .dropna(subset=pair_columns)
            .drop_duplicates(subset=pair_columns)
        )

        self.X = []
        for _, record in frame.iterrows():
            self.X.append(
                f"<startofstring> {record['pertanyaan']} <bot>: {record['jawaban']} <endofstring>"
            )

        encoded = tokenizer(
            self.X,
            max_length=max_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )
        self.X_encoded = encoded
        self.input_ids = encoded['input_ids']
        self.attention_mask = encoded['attention_mask']

    def __len__(self):
        """Number of templated samples."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(input_ids, attention_mask)`` pair for sample ``idx``."""
        return self.input_ids[idx], self.attention_mask[idx]

# Path to the test.csv file.
# Adjust this path to match where your file is stored.
test_data_path = f"/content/drive/MyDrive/POLINDRA/SKRIPSI/chatbot_trial_finetuning/Komodo7B/{TOTAL_TRIAL_DATA}/DATASET/test.csv"

# Load the test data and wrap it in a batched loader
test_data = ChatData(test_data_path, tokenizer)
test_loader = DataLoader(dataset=test_data, batch_size=32)  # adjust the batch size if needed

print("Test data loaded successfully.")

Test data loaded successfully.


In [None]:
# Download the NLTK 'wordnet' corpus.
# NOTE(review): presumably needed by meteor_score used later in this notebook — confirm against the NLTK docs.
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Download the NLTK 'punkt_tab' resource.
# NOTE(review): presumably needed by word_tokenize on recent NLTK versions — confirm against the NLTK docs.
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
# def compute_rouge_meteor(preds, refs):
#     """
#     Menghitung skor ROUGE dan METEOR.

#     Args:
#         preds (list): List teks hipotesis (output model).
#         refs (list): List teks referensi (jawaban sebenarnya).

#     Returns:
#         dict: Dictionary berisi skor ROUGE (rouge1, rouge2, rougel - precision, recall, fmeasure) dan METEOR.
#     """
#     scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
#     rouge_scores = {'rouge1': {'precision': [], 'recall': [], 'fmeasure': []},
#                     'rouge2': {'precision': [], 'recall': [], 'fmeasure': []},
#                     'rougeL': {'precision': [], 'recall': [], 'fmeasure': []}}
#     meteor_scores = []

#     # Pastikan panjang preds dan refs sama
#     if len(preds) != len(refs):
#         print("Warning: Panjang prediksi dan referensi tidak sama.")
#         min_len = min(len(preds), len(refs))
#         preds = preds[:min_len]
#         refs = refs[:min_len]


#     for pred, ref in zip(preds, refs):
#         # ROUGE Score
#         # Pastikan referensi dan prediksi tidak kosong sebelum menghitung skor
#         if ref and pred:
#             try:
#                 scores = scorer.score(ref, pred) # Referensi sebagai argumen pertama
#                 for metric in ['rouge1', 'rouge2', 'rougeL']:
#                     rouge_scores[metric]['precision'].append(scores[metric].precision)
#                     rouge_scores[metric]['recall'].append(scores[metric].recall)
#                     rouge_scores[metric]['fmeasure'].append(scores[metric].fmeasure)
#             except Exception as e:
#                 print(f"Error calculating ROUGE for ref: '{ref}', pred: '{pred}' - {e}")


#         # METEOR Score
#         # Tokenisasi untuk METEOR
#         ref_tokens = word_tokenize(ref)
#         pred_tokens = word_tokenize(pred)
#         # Pastikan token tidak kosong sebelum menghitung skor METEOR
#         if ref_tokens and pred_tokens:
#              try:
#                 meteor_scores.append(meteor_score([ref_tokens], pred_tokens)) # Referensi sebagai list of lists
#              except Exception as e:
#                 print(f"Error calculating METEOR for ref: '{ref}', pred: '{pred}' - {e}")


#     # Hitung rata-rata skor
#     avg_rouge_scores = {}
#     for metric in ['rouge1', 'rouge2', 'rougeL']:
#         avg_rouge_scores[metric] = {}
#         for score_type in ['precision', 'recall', 'fmeasure']:
#             if rouge_scores[metric][score_type]:
#                 avg_rouge_scores[metric][score_type] = sum(rouge_scores[metric][score_type]) / len(rouge_scores[metric][score_type])
#             else:
#                 avg_rouge_scores[metric][score_type] = 0.0


#     if meteor_scores:
#         avg_meteor_score = sum(meteor_scores) / len(meteor_scores)
#     else:
#         avg_meteor_score = 0.0


#     return {"rouge": avg_rouge_scores, "meteor": avg_meteor_score}

In [None]:
def compute_rouge_meteor(preds, refs):
    """
    Compute averaged ROUGE and METEOR scores for predicted vs. reference texts.

    Args:
        preds (list[str]): Hypothesis texts (model outputs).
        refs (list[str]): Reference texts (ground-truth answers), aligned
            index-by-index with ``preds``. If the lengths differ, a warning is
            printed and both lists are truncated to the shorter length.

    Returns:
        dict: ``{'rouge': {metric: {'precision': float, 'recall': float,
        'fmeasure': float}}, 'meteor': float}`` with ``metric`` in
        ``'rouge1'``, ``'rouge2'``, ``'rougeL'``. Each value is the mean over the
        scored pairs, or ``0.0`` when no pair could be scored.

    Notes:
        * A pair whose prediction or reference tokenizes to nothing contributes
          a METEOR score of 0.0. Skipping such pairs would average METEOR over a
          different (easier) subset than ROUGE and inflate it.
        * Scoring errors are reported via ``print`` and the affected pair is left
          out of that metric's average (best-effort behaviour).
        * NOTE(review): ``use_stemmer=True`` enables rouge_score's stemmer; the
          column names suggest Indonesian data, so confirm stemming is intended.
    """
    rouge_metrics = ['rouge1', 'rouge2', 'rougeL']
    score_types = ['precision', 'recall', 'fmeasure']

    scorer = rouge_scorer.RougeScorer(rouge_metrics, use_stemmer=True)
    rouge_scores = {
        metric: {score_type: [] for score_type in score_types}
        for metric in rouge_metrics
    }
    meteor_scores = []

    if len(preds) != len(refs):
        print("⚠️ Warning: Panjang prediksi dan referensi tidak sama.")
        min_len = min(len(preds), len(refs))
        preds = preds[:min_len]
        refs = refs[:min_len]

    for pred, ref in zip(preds, refs):
        try:
            # rouge_score expects (target, prediction) in that order
            scores = scorer.score(ref, pred)
            for metric in rouge_metrics:
                for score_type in score_types:
                    rouge_scores[metric][score_type].append(getattr(scores[metric], score_type))
        except Exception as e:
            print(f"[ROUGE Error] Ref: {ref} | Pred: {pred} | Err: {e}")

        try:
            ref_tokens = word_tokenize(ref)
            pred_tokens = word_tokenize(pred)
            if ref_tokens and pred_tokens:
                # meteor_score takes a list of tokenized references and one tokenized hypothesis
                meteor_scores.append(meteor_score([ref_tokens], pred_tokens))
            else:
                # Empty prediction/reference: count it as a miss instead of dropping it
                meteor_scores.append(0.0)
        except Exception as e:
            print(f"[METEOR Error] Ref: {ref} | Pred: {pred} | Err: {e}")

    def _mean(values):
        # Mean of a list, defined as 0.0 for an empty list
        return sum(values) / len(values) if values else 0.0

    avg_rouge_scores = {
        metric: {score_type: _mean(rouge_scores[metric][score_type]) for score_type in score_types}
        for metric in rouge_metrics
    }
    avg_meteor_score = _mean(meteor_scores)

    return {'rouge': avg_rouge_scores, 'meteor': avg_meteor_score}

In [None]:
# test_references = []
# test_hypotheses = []

# print("Generating predictions for test set...")

# with torch.no_grad():
#     for X, a in tqdm.tqdm(test_loader, desc="Evaluating"): # Gunakan tqdm untuk progress bar
#         X, a = X.to(device), a.to(device)

#         # Generate predictions
#         # Sesuaikan parameter generate seperti max_new_tokens sesuai kebutuhan
#         generated = model.generate(
#             X,
#             attention_mask=a,
#             max_new_tokens=150, # Sesuaikan dengan kebutuhan
#             eos_token_id=tokenizer.eos_token_id,
#             pad_token_id=tokenizer.pad_token_id # Tambahkan pad_token_id
#             )

#         # Decode predictions and references
#         # Hapus token khusus saat decoding untuk evaluasi
#         decoded_preds = [tokenizer.decode(g, skip_special_tokens=True) for g in generated]
#         decoded_refs = [tokenizer.decode(x, skip_special_tokens=True) for x in X]


#         test_hypotheses.extend(decoded_preds)
#         test_references.extend(decoded_refs)


# print("Calculating ROUGE and METEOR scores...")

# # Hitung skor ROUGE dan METEOR
# evaluation_results = compute_rouge_meteor(test_hypotheses, test_references)

# # Tampilkan hasilnya
# print("\nHasil Evaluasi ROUGE dan METEOR pada Test Set:")
# print(f"ROUGE Scores: {evaluation_results['rouge']}")
# print(f"METEOR Score: {evaluation_results['meteor']:.4f}")

In [None]:
true_answers = []
pred_answers = []

print("🔍 Generating predictions for test set...")


def _extract_answer(text):
    """Return the stripped text between the first "<bot>:" and the following "<endofstring>".

    Returns "" when "<bot>:" is absent; runs to the end of ``text`` when no
    "<endofstring>" follows the marker.
    """
    start = text.find("<bot>:")
    if start == -1:
        return ""
    end = text.find("<endofstring>", start)
    return text[start + len("<bot>:"):end if end != -1 else None].strip()


# The model is decoder-only, so batched prompts must be LEFT-padded: with right
# padding, shorter prompts would be followed by <pad> tokens and generation would
# continue after the padding instead of directly after "<bot>:".
# The previous setting is restored afterwards so other cells are unaffected.
_saved_padding_side = tokenizer.padding_side
tokenizer.padding_side = "left"
try:
    with torch.no_grad():
        for X, _ in tqdm.tqdm(test_loader, desc="Evaluating"):
            # Decode each encoded sample once; the text is reused for both the
            # prompt and the reference answer.
            decoded_inputs = tokenizer.batch_decode(X, skip_special_tokens=False)

            # Keep only the prompt part (question + "<bot>:"); if the marker is
            # missing, fall back to the whole decoded text.
            prompts = [
                text.split("<bot>:")[0] + "<bot>:" if "<bot>:" in text else text
                for text in decoded_inputs
            ]

            prompt_inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True).to(device)

            # Generate a continuation for every prompt in the batch
            generated = model.generate(
                prompt_inputs['input_ids'],
                attention_mask=prompt_inputs['attention_mask'],
                max_new_tokens=150,
                eos_token_id=tokenizer.eos_token_id,
                pad_token_id=tokenizer.pad_token_id
            )

            # Decode with special tokens kept so the answer markers can be located
            decoded_preds = tokenizer.batch_decode(generated, skip_special_tokens=False)

            # Predicted answers come from the generated text, references from the inputs
            pred_answers.extend(_extract_answer(text) for text in decoded_preds)
            true_answers.extend(_extract_answer(text) for text in decoded_inputs)
finally:
    tokenizer.padding_side = _saved_padding_side


🔍 Generating predictions for test set...


Evaluating: 100%|██████████| 32/32 [04:11<00:00,  7.86s/it]


In [None]:
# import datetime

# # Tentukan path untuk menyimpan hasil evaluasi ROUGE dan METEOR
# # Tambahkan timestamp pada nama file
# timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
# rouge_meteor_log_path = "/content/drive/MyDrive/POLINDRA/SKRIPSI/chatbot_trial_finetuning/Komodo7B/"+str(TOTAL_TRIAL_DATA)+"/EPOCH_"+str(TOTAL_EPOCH_TRIAL)+f"/rouge_meteor_evaluation_log_{timestamp}.csv"

# # Buka file CSV dalam mode 'write' ('w')
# with open(rouge_meteor_log_path, "w", newline='') as f:
#     writer = csv.writer(f)

#     # Tulis header
#     writer.writerow(["Metric", "Type", "Score"])

#     # Tulis skor ROUGE
#     for metric in ['rouge1', 'rouge2', 'rougeL']:
#         for score_type in ['precision', 'recall', 'fmeasure']:
#             writer.writerow([metric.upper(), score_type.capitalize(), evaluation_results['rouge'][metric][score_type]])

#     # Tulis skor METEOR
#     writer.writerow(["METEOR", "Score", evaluation_results['meteor']])

# print(f"Hasil evaluasi ROUGE dan METEOR telah disimpan ke: {rouge_meteor_log_path}")

In [None]:
# -------------------------------
# COMPUTE AND SAVE EVALUATION RESULTS
# -------------------------------

import datetime
import os

print("📊 Calculating ROUGE and METEOR scores...")
evaluation_results = compute_rouge_meteor(pred_answers, true_answers)
rouge_results = evaluation_results["rouge"]
meteor_result = evaluation_results["meteor"]

# Human-readable summary, one line per ROUGE variant followed by METEOR
print("\n✅ Hasil Evaluasi ROUGE dan METEOR pada Test Set:")
for rouge_name in rouge_results:
    rouge_entry = rouge_results[rouge_name]
    print(f"{rouge_name.upper()}: P={rouge_entry['precision']:.4f}, R={rouge_entry['recall']:.4f}, F1={rouge_entry['fmeasure']:.4f}")
print(f"METEOR Score: {meteor_result:.4f}")

# Save the evaluation results to a timestamped CSV next to the checkpoint
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
save_dir = f"/content/drive/MyDrive/POLINDRA/SKRIPSI/chatbot_trial_finetuning/Komodo7B/{TOTAL_TRIAL_DATA}/EPOCH_{TOTAL_EPOCH_TRIAL}/"
os.makedirs(save_dir, exist_ok=True)
log_path = os.path.join(save_dir, f"rouge_meteor_evaluation_log_{timestamp}.csv")

# Assemble every row first (header, ROUGE rows, METEOR row), then write them in one go
csv_rows = [["Metric", "Type", "Score"]]
for rouge_name in ['rouge1', 'rouge2', 'rougeL']:
    for stat_name in ['precision', 'recall', 'fmeasure']:
        stat_value = rouge_results[rouge_name][stat_name]
        csv_rows.append([rouge_name.upper(), stat_name.capitalize(), round(stat_value, 4)])
csv_rows.append(["METEOR", "F1", round(meteor_result, 4)])

with open(log_path, "w", newline="") as log_file:
    csv.writer(log_file).writerows(csv_rows)

print(f"📁 Hasil evaluasi disimpan di: {log_path}")

📊 Calculating ROUGE and METEOR scores...

✅ Hasil Evaluasi ROUGE dan METEOR pada Test Set:
ROUGE1: P=0.1549, R=0.1214, F1=0.1276
ROUGE2: P=0.0193, R=0.0133, F1=0.0148
ROUGEL: P=0.1187, R=0.0909, F1=0.0963
METEOR Score: 0.0986
📁 Hasil evaluasi disimpan di: /content/drive/MyDrive/POLINDRA/SKRIPSI/chatbot_trial_finetuning/Komodo7B/10000/EPOCH_10/rouge_meteor_evaluation_log_20250702_075820.csv
