In [None]:
# Step 1: Create the Neutral Evaluation Dataset
import pandas as pd
import json
import re
import os

print("--- Step 1: Creating a Brand-Neutral Golden Evaluation Dataset ---")

# --- Read the original, brand-specific SQuAD-format QA file from the V1 folder ---
squad_filepath = 'v1_malay_selfhosted/squad_format_qa_pairs.json'
print(f"Loading original SQuAD data from: {squad_filepath}")
# Read the whole file as UTF-8 text, then parse it into nested dicts/lists.
with open(squad_filepath, mode='r', encoding='utf-8') as squad_file:
    squad_data = json.loads(squad_file.read())

# --- Define Neutralization Rules ---
# We replace specific brand names with generic e-commerce terms.
# ORDER MATTERS: the rules are applied one after another in insertion order,
# so multi-word product names must be listed before the bare brand names.
# Otherwise "Shopee Coins" is first rewritten to "platform Coins" and the more
# specific "shopee coins" rule can never match (likewise "Lazada Wallet").
replacements = {
    # Most specific (multi-word / compound) product names first.
    r'\bshopee coins\b': 'reward coins',
    r'\blazada wallet\b': 'e-wallet',
    r'\bshopeepay\b': 'e-wallet',
    r'\blazmall\b': 'premium mall',
    # Bare brand names last, so they only catch what is left over.
    r'\bshopee\b': 'platform',
    r'\blazada\b': 'platform',
}

def neutralize_text(text: str) -> str:
    """Return `text` with known brand terms replaced by generic ones.

    Every pattern in the module-level `replacements` mapping is applied in
    insertion order, case-insensitively, to the running result. The
    replacement strings are inserted as written (lowercase), regardless of
    the casing of the matched text.

    Args:
        text: The string to neutralize.

    Returns:
        The neutralized string; text without any brand term is returned
        unchanged.
    """
    for pattern, replacement in replacements.items():
        text = re.sub(pattern, replacement, text, flags=re.IGNORECASE)
    return text

# --- Flatten the nested SQuAD structure into one neutralized record per answered question ---
print("Neutralizing questions, answers, and contexts...")
eval_data_neutral = []
for article in squad_data['data']:
    for paragraph in article['paragraphs']:
        # The context is shared by every question in the paragraph, so neutralize it once.
        shared_context = neutralize_text(paragraph['context'])
        eval_data_neutral.extend(
            {
                'question': neutralize_text(qa['question']),
                # RAGAs expects this key for the true answer; only the first answer is used.
                'ground_truth': neutralize_text(qa['answers'][0]['text']),
                # RAGAs expects a list of contexts, hence the single-element list.
                'contexts': [shared_context],
            }
            for qa in paragraph['qas']
            if qa['answers']  # questions without any answer are skipped
        )

# Tabular form of the records (one row per question)
eval_df_neutral = pd.DataFrame(eval_data_neutral)

# Persist the neutralized dataset so it can be reviewed and reused
output_path = "evaluation_dataset_neutral.csv"
eval_df_neutral.to_csv(output_path, index=False)

print(f"\nSuccessfully created and saved a NEUTRAL evaluation dataset to '{output_path}' with {len(eval_df_neutral)} questions.")
print("\n--- Neutralized Dataset Preview ---")
display(eval_df_neutral.head())


In [None]:
# Step 2: Generate Responses from Both V1 and V2 Systems
import sys
from tqdm import tqdm
from datasets import Dataset

# Make the V1 and V2 adapter modules importable by putting their folders
# (given relative to the current working directory) at the end of sys.path.
sys.path.extend([
    './v1_malay_selfhosted',
    './v2_multilingual_api/backend',
])

from v1_adapter import get_v1_rag_response
from v2_adapter import get_v2_rag_response

print("\n--- Step 2: Generating responses for the evaluation dataset ---")


def _collect_rag_responses(rag_fn, eval_df, desc):
    """Query one RAG system with every evaluation question.

    Args:
        rag_fn: Callable that takes a question string and returns a mapping
            from which the 'answer' and 'contexts' entries are read.
        eval_df: DataFrame with 'question' and 'ground_truth' columns.
        desc: Label shown on the tqdm progress bar.

    Returns:
        A list with one dict per row of `eval_df`, in row order, holding the
        keys 'question', 'answer', 'contexts' and 'ground_truth'.
    """
    results = []
    for _, row in tqdm(eval_df.iterrows(), total=len(eval_df), desc=desc):
        response = rag_fn(row['question'])
        results.append({
            "question": row['question'],
            "answer": response['answer'],
            "contexts": response['contexts'],
            "ground_truth": row['ground_truth']
        })
    return results


# Both systems answer exactly the same questions through the same helper,
# so the resulting records are directly comparable. V2 runs first, then V1.
v2_results = _collect_rag_responses(get_v2_rag_response, eval_df_neutral, "Evaluating V2 System")
v1_results = _collect_rag_responses(get_v1_rag_response, eval_df_neutral, "Evaluating V1 System")

# Convert results to Hugging Face Dataset objects for RAGAs
v1_dataset = Dataset.from_list(v1_results)
v2_dataset = Dataset.from_list(v2_results)
print("\nResponse generation complete.")

In [None]:
# Step 3: Execute RAGAs Evaluation
from ragas import evaluate
from ragas.metrics import faithfulness, answer_relevancy, context_precision

# Metrics both systems are scored on (the same list is used for V1 and V2)
metrics = [faithfulness, answer_relevancy, context_precision]

print("\n--- Step 3: Running RAGAs evaluation (this will take time)... ---")

# Score the V1 responses
print("\nEvaluating V1...")
v1_scores = evaluate(v1_dataset, metrics=metrics)

# Score the V2 responses
print("\nEvaluating V2...")
v2_scores = evaluate(v2_dataset, metrics=metrics)
print("\nEvaluation complete.")

In [None]:
# Step 4: Display and Compare Results
import pandas as pd

# Per-question score tables for each system
v1_scores_df = v1_scores.to_pandas()
v2_scores_df = v2_scores.to_pandas()

print("--- V1 Evaluation Scores (Detailed) ---")
display(v1_scores_df.head())

print("\n--- V2 Evaluation Scores (Detailed) ---")
display(v2_scores_df.head())

# Summary table: one row per metric, with the column-wise mean for each system.
# Each entry pairs the display label with the score column it is read from.
metric_columns = [
    ("Context Precision", 'context_precision'),
    ("Answer Faithfulness", 'faithfulness'),
    ("Answer Relevancy", 'answer_relevancy'),
]
summary_data = {
    "Metric": [label for label, column in metric_columns],
    "V1 Score (Avg)": [v1_scores_df[column].mean() for label, column in metric_columns],
    "V2 Score (Avg)": [v2_scores_df[column].mean() for label, column in metric_columns],
}
summary_df = pd.DataFrame(summary_data)

print("\n\n--- FINAL PERFORMANCE SUMMARY: V1 vs. V2 ---")
display(summary_df)
