In [1]:
# Step 1: Load the Synthetic Evaluation Dataset
import pandas as pd

dataset_path = "evaluation_dataset_synthetic.csv"
print(f"Loading synthetic evaluation dataset from: {dataset_path}")

eval_df_neutral = pd.read_csv(dataset_path)

# RAGAs expects the 'contexts' column to be a list, but CSV loads it as a string.
# We must convert it back to a list.
eval_df_neutral['contexts'] = eval_df_neutral['contexts'].apply(lambda x: eval(x))

print(f"Successfully loaded {len(eval_df_neutral)} questions from synthetic dataset.")
display(eval_df_neutral.head())

Loading synthetic evaluation dataset from: evaluation_dataset_synthetic.csv
Successfully loaded 30 questions from synthetic dataset.


Unnamed: 0,question,ground_truth,contexts
0,Berapa lama tempoh pemulangan standard untuk b...,Tempoh pemulangan standard adalah 15 hari dari...,[Polisi pemulangan standard kami adalah selama...
1,Apakah polisi pemulangan untuk item dari premi...,Item dari premium mall mempunyai tempoh pemula...,"[Untuk item yang dibeli dari premium mall, and..."
2,Bagaimana cara untuk memohon bayaran balik jik...,Anda boleh memohon bayaran balik melalui pusat...,"[Jika anda menerima produk yang rosak, anda bo..."
3,Adakah saya akan mendapat bayaran balik penuh ...,Bayaran balik penuh biasanya tidak termasuk ko...,[Bayaran balik penuh biasanya tidak termasuk k...
4,Berapa lama masa yang diambil untuk bayaran ba...,Bayaran balik ke e-wallet biasanya diproses da...,[Bayaran balik yang diproses ke e-wallet anda ...


In [2]:
# Step 2: Generate Responses from Both V1 and V2 Systems
# Full, Final, and Correct Code for Cell 2 (with Quality Gate)

import sys
from tqdm import tqdm
from datasets import Dataset

# Add project folders to the Python path
sys.path.append('./v1_malay_selfhosted')
sys.path.append('./v2_multilingual_api/backend')

from v1_adapter import get_v1_rag_response
from v2_adapter import get_v2_rag_response

print("\n--- Step 2: Generating responses with a Quality Gate ---")

# --- V2 Evaluation (Assumed to be reliable) ---
v2_results = []
for index, row in tqdm(eval_df_neutral.iterrows(), total=len(eval_df_neutral), desc="Evaluating V2 System"):
    response = get_v2_rag_response(row['question'])
    v2_results.append({
        "question": row['question'],
        "answer": response['answer'],
        "contexts": response['contexts'],
        "ground_truth": row['ground_truth']
    })
    
# --- V1 Evaluation with a strict Quality Gate ---
v1_results_good = []  # To be sent to RAGAs
v1_results_failed = [] # To count failures

for index, row in tqdm(eval_df_neutral.iterrows(), total=len(eval_df_neutral), desc="Evaluating V1 System"):
    response = get_v1_rag_response(row['question'])
    
    # --- THE CORRECTED AND COMPLETE QUALITY GATE ---
    # A response is considered a failure if the answer is invalid AND there's no context to fall back on.
    answer_text = response.get('answer', '')
    is_answer_valid = answer_text and answer_text.strip() and len(answer_text.strip()) > 5

    # Check for failure
    if not is_answer_valid and not response.get('contexts'):
        # This is a complete failure. Log it.
        v1_results_failed.append({
            "question": row['question'],
            "ground_truth": row['ground_truth']
        })
    else:
        # This is a valid-enough output for grading. Add it.
        v1_results_good.append({
            "question": row['question'],
            "answer": response['answer'],
            "contexts": response['contexts'],
            "ground_truth": row['ground_truth']
        })

# --- Convert ONLY the good results to a Dataset for RAGAs ---
# This will now work because v1_results_good only contains valid dictionaries.
v1_dataset_good = Dataset.from_list(v1_results_good)
v2_dataset = Dataset.from_list(v2_results)

print(f"\nResponse generation complete.")
print(f"V1 System: {len(v1_results_good)} successful generations passed to RAGAs.")
print(f"V1 System: {len(v1_results_failed)} complete failures were caught.")
print(f"V2 System: {len(v2_results)} successful generations.")

--- Initializing V1 RAG Pipeline for Evaluation ---


  embeddings = HuggingFaceEmbeddings(
Device set to use cpu
  llm_pipe = HuggingFacePipeline(pipeline=pipe)


--- V1 RAG Pipeline Ready ---
Initializing V2 models and services for evaluation...
V2 Adapter Initialized.

--- Step 2: Generating responses with a Quality Gate ---


Evaluating V2 System: 100%|██████████| 30/30 [01:16<00:00,  2.54s/it]
Evaluating V1 System: 100%|██████████| 30/30 [00:04<00:00,  6.27it/s]


Response generation complete.
V1 System: 30 successful generations passed to RAGAs.
V1 System: 0 complete failures were caught.
V2 System: 30 successful generations.





In [3]:
# Step 3: Running RAGAs Evaluation Using Gemini as the Judge
from ragas import evaluate
from ragas.metrics import faithfulness, answer_relevancy, context_precision, context_recall
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_huggingface import HuggingFaceEmbeddings  # Import the embeddings class
from dotenv import load_dotenv
import os

# --- Load the Gemini API key ---
load_dotenv()
gemini_api_key = os.getenv("GEMINI_API_KEY")

if not gemini_api_key:
    raise ValueError("GEMINI_API_KEY not found in .env file. Please ensure it is set.")

# --- 1. Define the Judge LLM using Google Gemini ---
# This is for the "chat" part of the evaluation.
judge_llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    google_api_key=gemini_api_key,
    temperature=0,
)

# --- 2. Define the Embedding Model for Evaluation ---
# This is for metrics like 'faithfulness' that need to compare sentences.
# We will use the same multilingual model from our RAG pipeline for consistency.
# This completely removes the need for OpenAI embeddings.
ragas_embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
)

# --- 3. Define the metrics we want to measure ---
metrics = [
    faithfulness,
    answer_relevancy,
    context_precision,
    context_recall,
]

print("\n--- Step 3: Running RAGAs evaluation using Gemini and local embeddings ---")

print("\nEvaluating V1...")
# --- The Definitive Fix: Pass BOTH the LLM and the embeddings to the evaluate function ---
v1_scores = evaluate(
    v1_dataset_good, 
    metrics=metrics, 
    llm=judge_llm, 
    embeddings=ragas_embeddings,
    raise_exceptions=False
)

print("\nEvaluating V2...")
v2_scores = evaluate(
    v2_dataset, 
    metrics=metrics, 
    llm=judge_llm, 
    embeddings=ragas_embeddings,
    raise_exceptions=False
)

print("\nEvaluation complete.")


--- Step 3: Running RAGAs evaluation using Gemini and local embeddings ---

Evaluating V1...


Evaluating:   0%|          | 0/120 [00:00<?, ?it/s]

Exception raised in Job[1]: IndexError(list index out of range)
Exception raised in Job[5]: IndexError(list index out of range)
Exception raised in Job[9]: IndexError(list index out of range)
Exception raised in Job[13]: IndexError(list index out of range)
Exception raised in Job[21]: IndexError(list index out of range)
Exception raised in Job[25]: IndexError(list index out of range)
Exception raised in Job[29]: IndexError(list index out of range)
Exception raised in Job[33]: IndexError(list index out of range)
Exception raised in Job[53]: IndexError(list index out of range)
Exception raised in Job[57]: IndexError(list index out of range)
Exception raised in Job[61]: IndexError(list index out of range)
Exception raised in Job[49]: IndexError(list index out of range)
Exception raised in Job[65]: IndexError(list index out of range)
Exception raised in Job[69]: IndexError(list index out of range)
Exception raised in Job[73]: IndexError(list index out of range)
Exception raised in Job[77]:


Evaluating V2...


Evaluating:   0%|          | 0/120 [00:00<?, ?it/s]

Exception raised in Job[13]: IndexError(list index out of range)
Exception raised in Job[9]: IndexError(list index out of range)
Exception raised in Job[1]: IndexError(list index out of range)
Exception raised in Job[5]: IndexError(list index out of range)
Exception raised in Job[17]: IndexError(list index out of range)
Exception raised in Job[21]: IndexError(list index out of range)
Exception raised in Job[29]: IndexError(list index out of range)
Exception raised in Job[33]: IndexError(list index out of range)
Exception raised in Job[37]: IndexError(list index out of range)
Exception raised in Job[41]: IndexError(list index out of range)
Exception raised in Job[45]: IndexError(list index out of range)
Exception raised in Job[49]: IndexError(list index out of range)
Exception raised in Job[53]: IndexError(list index out of range)
Exception raised in Job[57]: IndexError(list index out of range)
Exception raised in Job[61]: IndexError(list index out of range)
Exception raised in Job[65]:


Evaluation complete.


In [6]:
# Step 4: Display and Compare Results
import pandas as pd

v1_scores_df = v1_scores.to_pandas()
v2_scores_df = v2_scores.to_pandas()

print("--- V1 Evaluation Scores (Detailed) ---")
display(v1_scores_df.head())

print("\n--- V2 Evaluation Scores (Detailed) ---")
display(v2_scores_df.head())

# Create a final summary comparison table
summary_data = {
    "Metric": ["Context Precision", "Context Recall", "Answer Faithfulness", "Answer Relevancy"],
    "V1 Score (Avg)": [
        v1_scores_df['context_precision'].mean(),
        v1_scores_df['context_recall'].mean(),
        v1_scores_df['faithfulness'].mean(),
        v1_scores_df['answer_relevancy'].mean()
    ],
    "V2 Score (Avg)": [
        v2_scores_df['context_precision'].mean(),
        v2_scores_df['context_recall'].mean(),
        v2_scores_df['faithfulness'].mean(),
        v2_scores_df['answer_relevancy'].mean()
    ]
}
summary_df = pd.DataFrame(summary_data)

print("\n\n--- FINAL PERFORMANCE SUMMARY: V1 vs. V2 ---")
display(summary_df)


--- V1 Evaluation Scores (Detailed) ---


Unnamed: 0,user_input,retrieved_contexts,response,reference,faithfulness,answer_relevancy,context_precision,context_recall
0,Berapa lama tempoh pemulangan standard untuk b...,[Isu Selepas Tempoh Pemulangan Platform: Jika ...,7/10/14 hari,Tempoh pemulangan standard adalah 15 hari dari...,0.333333,,0.0,0.0
1,Apakah polisi pemulangan untuk item dari premi...,[52. Polisi Pemulangan Produk: Memahami Tempoh...,1).,Item dari premium mall mempunyai tempoh pemula...,,,0.0,0.0
2,Bagaimana cara untuk memohon bayaran balik jik...,[di mana anda boleh meminta bayaran balik jika...,1).,Anda boleh memohon bayaran balik melalui pusat...,,,0.0,0.0
3,Adakah saya akan mendapat bayaran balik penuh ...,[### Adakah saya perlu membayar kos penghantar...,1).,Bayaran balik penuh biasanya tidak termasuk ko...,,,0.5,0.0
4,Berapa lama masa yang diambil untuk bayaran ba...,[Jumlah Masa Bayaran Balik: Menjumlahkan semua...,i.,Bayaran balik ke e-wallet biasanya diproses da...,,0.0,1.0,0.0



--- V2 Evaluation Scores (Detailed) ---


Unnamed: 0,user_input,retrieved_contexts,response,reference,faithfulness,answer_relevancy,context_precision,context_recall
0,Berapa lama tempoh pemulangan standard untuk b...,[Standard Shipping:\n\nWithin Major Cities: Ty...,Tempoh pemulangan standard untuk barangan yang...,Tempoh pemulangan standard adalah 15 hari dari...,1.0,,0.0,0.0
1,Apakah polisi pemulangan untuk item dari premi...,"[4. AuraMall-Specific Policies\n\nAuraMall, ou...",Dasar pemulangan umum AuraCart terpakai untuk ...,Item dari premium mall mempunyai tempoh pemula...,1.0,,0.0,0.0
2,Bagaimana cara untuk memohon bayaran balik jik...,"[Klik ""Return/Refund"" atau ""Request Return"" di...",Untuk memohon bayaran balik bagi produk yang r...,Anda boleh memohon bayaran balik melalui pusat...,1.0,,1.0,1.0
3,Adakah saya akan mendapat bayaran balik penuh ...,[Jaminan Ketulenan: Produk AuraMall dijamin ke...,Yuran penghantaran asal umumnya tidak boleh di...,Bayaran balik penuh biasanya tidak termasuk ko...,1.0,,0.5,1.0
4,Berapa lama masa yang diambil untuk bayaran ba...,[5. Masa Pemprosesan Bayaran Balik\n\nSetelah ...,"Bagi bayaran balik ke AuraWallet anda, ia adal...",Bayaran balik ke e-wallet biasanya diproses da...,1.0,,0.5,0.0




--- FINAL PERFORMANCE SUMMARY: V1 vs. V2 ---


Unnamed: 0,Metric,V1 Score (Avg),V2 Score (Avg)
0,Context Precision,0.333333,0.336111
1,Context Recall,0.1,0.35
2,Answer Faithfulness,0.466667,0.714187
3,Answer Relevancy,0.036701,0.594228


In [7]:
# Step 5: Report on Generation Failure Rate

v1_failure_rate = len(v1_results_failed) / len(eval_df_neutral)

print("--- System Reliability Summary ---")
print(f"V1 System Generation Failure Rate: {v1_failure_rate:.2%}")
print(f"(V1 failed to produce any output for {len(v1_results_failed)} out of {len(eval_df_neutral)} questions.)")
print("\nFailed V1 Questions:")
for item in v1_results_failed:
    print(f"- {item['question']}")

--- System Reliability Summary ---
V1 System Generation Failure Rate: 0.00%
(V1 failed to produce any output for 0 out of 30 questions.)

Failed V1 Questions:
