In [1]:
# Step 1: Create the Neutral Evaluation Dataset
import pandas as pd
import json
import re
import os

print("--- Step 1: Creating a Brand-Neutral Golden Evaluation Dataset ---")

# --- Load the Original, Biased SQuAD Data from V1 ---
squad_filepath = 'v1_malay_selfhosted/squad_format_qa_pairs.json'
print(f"Loading original SQuAD data from: {squad_filepath}")
with open(squad_filepath, 'r', encoding='utf-8') as f:
    squad_data = json.load(f)

# --- Define Neutralization Rules ---
# We replace specific brand names with generic e-commerce terms.
replacements = {
    r'\bshopee\b': 'platform',
    r'\blazada\b': 'platform',
    r'\blazmall\b': 'premium mall',
    r'\bshopeepay\b': 'e-wallet',
    r'\blazada wallet\b': 'e-wallet',
    r'\bshopee coins\b': 'reward coins',
}

def neutralize_text(text: str) -> str:
    """Applies a series of regex replacements to make text brand-agnostic."""
    for pattern, replacement in replacements.items():
        text = re.sub(pattern, replacement, text, flags=re.IGNORECASE)
    return text

# --- Flatten and Neutralize the Data ---
print("Neutralizing questions, answers, and contexts...")
eval_data_neutral = []
for article in squad_data['data']:
    for paragraph in article['paragraphs']:
        neutral_context = neutralize_text(paragraph['context'])
        for qa in paragraph['qas']:
            if qa['answers']:
                neutral_question = neutralize_text(qa['question'])
                neutral_answer = neutralize_text(qa['answers'][0]['text'])
                
                eval_data_neutral.append({
                    'question': neutral_question,
                    'ground_truth': neutral_answer, # RAGAs expects this key for the true answer
                    'contexts': [neutral_context], # RAGAs expects a list of contexts
                })

# Convert to a Pandas DataFrame
eval_df_neutral = pd.DataFrame(eval_data_neutral)

# Save the neutralized dataset for review and reusability
output_path = "evaluation_dataset_neutral.csv"
eval_df_neutral.to_csv(output_path, index=False)

print(f"\nSuccessfully created and saved a NEUTRAL evaluation dataset to '{output_path}' with {len(eval_df_neutral)} questions.")
print("\n--- Neutralized Dataset Preview ---")
display(eval_df_neutral.head())


--- Step 1: Creating a Brand-Neutral Golden Evaluation Dataset ---
Loading original SQuAD data from: v1_malay_selfhosted/squad_format_qa_pairs.json
Neutralizing questions, answers, and contexts...

Successfully created and saved a NEUTRAL evaluation dataset to 'evaluation_dataset_neutral.csv' with 25 questions.

--- Neutralized Dataset Preview ---


Unnamed: 0,question,ground_truth,contexts
0,Bagaimana jika saya ingin membatalkan produk p...,"Mengikut dasar pembatalan platform, anda boleh...","[Mengikut dasar pembatalan platform, anda bole..."
1,Berapa jenis Polisi Pemulangan yang tersedia d...,Terdapat 3 jenis Polisi Pemulangan yang tersed...,[Terdapat 3 jenis Polisi Pemulangan yang terse...
2,Apakah yang perlu saya lakukan jika saya ingin...,anda mesti memulangkan semua item yang telah d...,"[Walau bagaimanapun, jika anda ingin memulangk..."
3,Berapa lama tempoh pemulangan untuk produk pre...,30 Hari,[premium mall & Choice (selepas 1 Februari 202...
4,Berapa lama tempoh pemulangan untuk produk Pas...,15 Hari,[Pasaran & LazGlobal (selepas 31 Okt 2024) | 1...


In [2]:
# Step 2: Generate Responses from Both V1 and V2 Systems
import sys
from tqdm import tqdm
from datasets import Dataset

# Add project folders to the Python path
sys.path.append('./v1_malay_selfhosted')
sys.path.append('./v2_multilingual_api/backend')

from v1_adapter import get_v1_rag_response
from v2_adapter import get_v2_rag_response

print("\n--- Step 2: Generating responses with a Quality Gate ---")

# --- V2 Evaluation ---
# We expect V2 to be highly reliable, so we don't need a complex gate yet.
v2_results = []
for index, row in tqdm(eval_df_neutral.iterrows(), total=len(eval_df_neutral), desc="Evaluating V2 System"):
    response = get_v2_rag_response(row['question'])
    v2_results.append({
        "question": row['question'],
        "answer": response['answer'],
        "contexts": response['contexts'],
        "ground_truth": row['ground_truth']
    })
    
# --- V1 Evaluation with a Quality Gate ---
v1_results_good = []  # To be sent to RAGAs
v1_results_failed = [] # To count failures

for index, row in tqdm(eval_df_neutral.iterrows(), total=len(eval_df_neutral), desc="Evaluating V1 System"):
    response = get_v1_rag_response(row['question'])
    
    # --- THE QUALITY GATE ---
    # Check if the response is an "empty box" (no answer AND no contexts)
    # RAGAs needs at least one of these to function.
    if (not response['answer'] or not response['answer'].strip() or response['answer'] == "No answer generated.") and not response['contexts']:
        # This is a complete failure. Log it and move on.
        v1_results_failed.append({
            "question": row['question'],
            "ground_truth": row['ground_truth']
        })
    else:
        # This is a valid output. Add it to the list to be graded.
        v1_results_good.append({
            "question": row['question'],
            "answer": response['answer'],
            "contexts": response['contexts'],
            "ground_truth": row['ground_truth']
        })

# --- Convert ONLY the good results to a Dataset for RAGAs ---
v1_dataset_good = Dataset.from_list(v1_results_good)
v2_dataset = Dataset.from_list(v2_results) # V2 dataset is assumed to be all good

print(f"\nResponse generation complete.")
print(f"V1 System: {len(v1_results_good)} successful generations, {len(v1_results_failed)} failed generations.")
print(f"V2 System: {len(v2_results)} successful generations.")

--- Initializing V1 RAG Pipeline for Evaluation ---


  embeddings = HuggingFaceEmbeddings(
Device set to use cpu
  llm_pipe = HuggingFacePipeline(pipeline=pipe)


--- V1 RAG Pipeline Ready ---
Initializing V2 models and services for evaluation...
V2 Adapter Initialized.

--- Step 2: Generating responses with a Quality Gate ---


Evaluating V2 System: 100%|██████████| 25/25 [00:58<00:00,  2.33s/it]
Evaluating V1 System: 100%|██████████| 25/25 [00:04<00:00,  6.13it/s]


Response generation complete.
V1 System: 25 successful generations, 0 failed generations.
V2 System: 25 successful generations.





In [4]:
# Step 3: Running RAGAs Evaluation Using Gemini as the Judge
from ragas import evaluate
from ragas.metrics import faithfulness, answer_relevancy, context_precision
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_huggingface import HuggingFaceEmbeddings  # Import the embeddings class
from dotenv import load_dotenv
import os

# --- Load the Gemini API key ---
load_dotenv()
gemini_api_key = os.getenv("GEMINI_API_KEY")

if not gemini_api_key:
    raise ValueError("GEMINI_API_KEY not found in .env file. Please ensure it is set.")

# --- 1. Define the Judge LLM using Google Gemini ---
# This is for the "chat" part of the evaluation.
judge_llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    google_api_key=gemini_api_key,
    temperature=0,
)

# --- 2. Define the Embedding Model for Evaluation ---
# This is for metrics like 'faithfulness' that need to compare sentences.
# We will use the same multilingual model from our RAG pipeline for consistency.
# This completely removes the need for OpenAI embeddings.
ragas_embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
)

# --- 3. Define the metrics we want to measure ---
metrics = [
    faithfulness,
    answer_relevancy,
    context_precision,
]

print("\n--- Step 3: Running RAGAs evaluation using Gemini and local embeddings ---")

print("\nEvaluating V1...")
# --- The Definitive Fix: Pass BOTH the LLM and the embeddings to the evaluate function ---
v1_scores = evaluate(
    v1_dataset_good, 
    metrics=metrics, 
    llm=judge_llm, 
    embeddings=ragas_embeddings
)

print("\nEvaluating V2...")
v2_scores = evaluate(
    v2_dataset, 
    metrics=metrics, 
    llm=judge_llm, 
    embeddings=ragas_embeddings
)

print("\nEvaluation complete.")


--- Step 3: Running RAGAs evaluation using Gemini and local embeddings ---

Evaluating V1...


Evaluating:   0%|          | 0/75 [00:00<?, ?it/s]

Exception raised in Job[10]: IndexError(list index out of range)
Exception raised in Job[4]: IndexError(list index out of range)
Exception raised in Job[13]: IndexError(list index out of range)
Exception raised in Job[7]: IndexError(list index out of range)
Exception raised in Job[1]: IndexError(list index out of range)
Exception raised in Job[19]: IndexError(list index out of range)
Exception raised in Job[28]: IndexError(list index out of range)
Exception raised in Job[31]: IndexError(list index out of range)
Exception raised in Job[34]: IndexError(list index out of range)
Exception raised in Job[40]: IndexError(list index out of range)
Exception raised in Job[46]: IndexError(list index out of range)
Exception raised in Job[43]: IndexError(list index out of range)
Exception raised in Job[55]: IndexError(list index out of range)
Exception raised in Job[61]: IndexError(list index out of range)
Exception raised in Job[67]: IndexError(list index out of range)
Exception raised in Job[58]:


Evaluating V2...


Evaluating:   0%|          | 0/75 [00:00<?, ?it/s]

Exception raised in Job[7]: IndexError(list index out of range)
Exception raised in Job[4]: IndexError(list index out of range)
Exception raised in Job[10]: IndexError(list index out of range)
Exception raised in Job[1]: IndexError(list index out of range)
Exception raised in Job[13]: IndexError(list index out of range)
Exception raised in Job[19]: IndexError(list index out of range)
Exception raised in Job[16]: IndexError(list index out of range)
Exception raised in Job[25]: IndexError(list index out of range)
Exception raised in Job[28]: IndexError(list index out of range)
Exception raised in Job[31]: IndexError(list index out of range)
Exception raised in Job[34]: IndexError(list index out of range)
Exception raised in Job[37]: IndexError(list index out of range)
Exception raised in Job[49]: IndexError(list index out of range)
Exception raised in Job[52]: IndexError(list index out of range)
Exception raised in Job[55]: IndexError(list index out of range)
Exception raised in Job[61]:


Evaluation complete.


In [5]:
# Step 4: Display and Compare Results
import pandas as pd

v1_scores_df = v1_scores.to_pandas()
v2_scores_df = v2_scores.to_pandas()

print("--- V1 Evaluation Scores (Detailed) ---")
display(v1_scores_df.head())

print("\n--- V2 Evaluation Scores (Detailed) ---")
display(v2_scores_df.head())

# Create a final summary comparison table
summary_data = {
    "Metric": ["Context Precision", "Answer Faithfulness", "Answer Relevancy"],
    "V1 Score (Avg)": [
        v1_scores_df['context_precision'].mean(),
        v1_scores_df['faithfulness'].mean(),
        v1_scores_df['answer_relevancy'].mean()
    ],
    "V2 Score (Avg)": [
        v2_scores_df['context_precision'].mean(),
        v2_scores_df['faithfulness'].mean(),
        v2_scores_df['answer_relevancy'].mean()
    ]
}
summary_df = pd.DataFrame(summary_data)

print("\n\n--- FINAL PERFORMANCE SUMMARY: V1 vs. V2 ---")
display(summary_df)


--- V1 Evaluation Scores (Detailed) ---


Unnamed: 0,user_input,retrieved_contexts,response,reference,faithfulness,answer_relevancy,context_precision
0,Bagaimana jika saya ingin membatalkan produk p...,[Melihat Diskaun yang Diterapkan: Setelah anda...,i.,"Mengikut dasar pembatalan platform, anda boleh...",,,0.0
1,Berapa jenis Polisi Pemulangan yang tersedia d...,[Rujuk Polisi Platform: Fahami polisi pemulang...,1).,Terdapat 3 jenis Polisi Pemulangan yang tersed...,,,0.0
2,Apakah yang perlu saya lakukan jika saya ingin...,"[Walau bagaimanapun, jika anda ingin memulangk...",i.,anda mesti memulangkan semua item yang telah d...,,,1.0
3,Berapa lama tempoh pemulangan untuk produk pre...,[Tempoh Pemulangan yang Lebih Lama: Produk Laz...,14 hari,30 Hari,0.0,,0.0
4,Berapa lama tempoh pemulangan untuk produk Pas...,[2. Apakah jenis Polisi Pemulangan yang tersed...,i.,15 Hari,,,1.0



--- V2 Evaluation Scores (Detailed) ---


Unnamed: 0,user_input,retrieved_contexts,response,reference,faithfulness,answer_relevancy,context_precision
0,Bagaimana jika saya ingin membatalkan produk p...,"[4. Dasar Khusus AuraMall\n\nAuraMall, pasaran...",Jika anda perlu membatalkan pesanan daripada A...,"Mengikut dasar pembatalan platform, anda boleh...",1.0,,0.0
1,Berapa jenis Polisi Pemulangan yang tersedia d...,[Platform Policies and Community Guidelines\n\...,"Setakat maklumat yang saya ada, saya tidak dap...",Terdapat 3 jenis Polisi Pemulangan yang tersed...,0.0,,0.0
2,Apakah yang perlu saya lakukan jika saya ingin...,[General Conditions for All Returns\n\nWithin ...,Saya tidak mempunyai maklumat yang mencukupi d...,anda mesti memulangkan semua item yang telah d...,0.0,,0.0
3,Berapa lama tempoh pemulangan untuk produk pre...,[2. Syarat untuk Pemulangan\n\nUntuk pemulanga...,Tempoh pemulangan untuk kebanyakan item yang d...,30 Hari,1.0,,1.0
4,Berapa lama tempoh pemulangan untuk produk Pas...,[General Conditions for All Returns\n\nWithin ...,"Maaf, saya tidak mempunyai maklumat mengenai t...",15 Hari,0.0,,0.0




--- FINAL PERFORMANCE SUMMARY: V1 vs. V2 ---


Unnamed: 0,Metric,V1 Score (Avg),V2 Score (Avg)
0,Context Precision,0.14,0.16
1,Answer Faithfulness,0.333333,0.606476
2,Answer Relevancy,0.020689,0.22407


In [None]:
# Step 5: Report on Generation Failure Rate

v1_failure_rate = len(v1_results_failed) / len(eval_df_neutral)

print("--- System Reliability Summary ---")
print(f"V1 System Generation Failure Rate: {v1_failure_rate:.2%}")
print(f"(V1 failed to produce any output for {len(v1_results_failed)} out of {len(eval_df_neutral)} questions.)")
print("\nFailed V1 Questions:")
for item in v1_results_failed:
    print(f"- {item['question']}")

--- System Reliability Summary ---
V1 System Generation Failure Rate: 0.00%
(V1 failed to produce any output for 0 out of 25 questions.)

Failed V1 Questions:
