In [6]:
import pandas as pd
from tqdm import tqdm
from openai import OpenAI
import os  
import re

In [7]:
# Load the RAG-generated answers and the reference answers.
rag_df = pd.read_csv("../../data/evaluation_input/answer_1090.csv")
ref_df = pd.read_csv("../../data/evaluation_input/reference_answer_1090.csv")

# Strip stray whitespace from the headers so the column lookups below match.
rag_df.columns = rag_df.columns.str.strip()
ref_df.columns = ref_df.columns.str.strip()

# Use "respondent_id" as the shared key name. Only the RAG file needs renaming;
# the reference file is melted on "respondent_id" directly below.
rag_df = rag_df.rename(columns={"Interview File": "respondent_id"})

# Reshape both frames to long format: one row per (respondent_id, question).
rag_long = rag_df.melt(id_vars=["respondent_id"], var_name="question", value_name="answer_rag")
ref_long = ref_df.melt(id_vars=["respondent_id"], var_name="question", value_name="answer_ref")

# Left merge keeps every RAG answer, even when no reference answer exists
# (answer_ref is then NaN).
merged_df = pd.merge(rag_long, ref_long, on=["respondent_id", "question"], how="left")

# Rows without a generated answer cannot be evaluated.
merged_df = merged_df.dropna(subset=["answer_rag"])

# Client used for the GPT-based evaluation. The key is read from the environment
# so that no secret is stored in the notebook; a missing OPENAI_API_KEY fails
# here with a KeyError rather than later, on the first API call.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

# Optional: check structure
merged_df.head()

Unnamed: 0,respondent_id,question,answer_rag,answer_ref
0,c7d7640b-9344-48aa-9d48-7395eaeda149,"Hey, what’s the biggest news story or issue yo...",the upcoming election,The upcoming election
1,c7d7640b-9344-48aa-9d48-7395eaeda149,Can you tell me more about that news story? Wh...,it’s a woman Kamala against Donald Trump; hear...,It is a woman Kamala against Donald Trump
2,c7d7640b-9344-48aa-9d48-7395eaeda149,"So, why does that event or issue feel like the...",best president for our country,It’s important we chose the best president for...
3,c7d7640b-9344-48aa-9d48-7395eaeda149,You mentioned you feel like you can help make ...,if grownups and kids listened and tried to be ...,I think if we talked to each other more and ar...
4,c7d7640b-9344-48aa-9d48-7395eaeda149,What are you thinking of dressing up as for Ha...,Briar Rose and Lydia.,Briar Rose | Lydia |


In [10]:
# Matches "Score: <number>" followed on a later line by "Feedback: <text>".
_SCORE_FEEDBACK_PATTERN = re.compile(
    r"Score\s*:\s*([0-9]+(?:\.[0-9]+)?)"                  # numeric score, e.g. 4 or 4.5
    r"(?:[^\S\r\n]*/[^\S\r\n]*[0-9]+(?:\.[0-9]+)?)?"      # optional "/5" denominator (not captured)
    r"[^\S\r\n]*[\r\n]+\s*"                               # trailing spaces, line break(s), indentation
    r"Feedback\s*:\s*(.*)",                               # everything after "Feedback:"
    re.IGNORECASE | re.DOTALL,
)


def _parse_score_and_feedback(content: str) -> tuple[str, str]:
    """Split a raw model reply into (score, feedback); score is "" if the format is unexpected."""
    match = _SCORE_FEEDBACK_PATTERN.search(content)
    if match is None:
        # Fallback: keep the whole reply as feedback and leave the score blank.
        return "", content
    return match.group(1).strip(), match.group(2).strip()


def ask_score_and_feedback(prompt: str, temperature: float = 0.0) -> tuple[str, str]:
    """
    Ask GPT to return a numeric score and feedback using a structured format.

    Expected LLM format:
    Score: X
    Feedback: ...

    Parameters:
    - prompt: User message describing what to evaluate.
    - temperature: Sampling temperature forwarded to the chat completion call.

    Returns:
    - (score, feedback): Both are strings extracted via regex. A score written
      as "X/5" or followed by trailing spaces is accepted. If the reply does not
      follow the expected format, score is "" and feedback is the raw reply.
      An empty or missing reply yields ("", "").
    """
    messages = [
        {"role": "system", "content": "You are a helpful evaluation assistant. Respond in this format:\nScore: <number>\nFeedback: <short explanation>"},
        {"role": "user", "content": prompt}
    ]

    response = client.chat.completions.create(
        model="gpt-4",
        messages=messages,
        temperature=temperature
    )

    # The message content may be None; treat that as an empty reply instead of
    # failing on .strip().
    content = (response.choices[0].message.content or "").strip()

    return _parse_score_and_feedback(content)
    

def build_prompt(metric: str, row: dict) -> str:
    """
    Build the evaluation prompt for one metric from one data row.

    Parameters:
    - metric: One of "relevance", "faithfulness", "precision", "recall",
      "correctness" or "consistency".
    - row: Mapping with required keys "question" and "answer_rag"; the optional
      keys "retrieved_context" and "answer_ref" default to "" when absent.

    Returns:
    - The prompt text, ending with the "Score: X / Feedback: ..." format request.

    Raises:
    - KeyError: if "question" or "answer_rag" is missing from the row.
    - ValueError: if the metric name is not recognised.
    """
    # Read the row fields first, so a missing required key fails before the
    # metric name is checked.
    values = {
        "question": row["question"],
        "answer": row["answer_rag"],
        "context": row.get("retrieved_context", ""),
        "reference": row.get("answer_ref", ""),
    }

    # Closing instructions shared by every metric prompt.
    response_format = "Respond in the format:\nScore: X\nFeedback: ..."

    # Metric-specific part of each prompt; placeholders are filled from `values`.
    instructions = {
        "relevance": (
            "Evaluate the relevance of the answer to the question.\n"
            "Question: {question}\n"
            "Answer: {answer}\n"
            "First, rate from 1 (not relevant) to 5 (fully relevant).\n"
            "Then explain briefly.\n"
        ),
        "faithfulness": (
            "Evaluate the faithfulness of the answer to the retrieved context.\n"
            "Context: {context}\n"
            "Answer: {answer}\n"
            "Rate from 1 (hallucinated) to 5 (fully grounded), then explain.\n"
        ),
        "precision": (
            "Evaluate whether the context includes only necessary info to generate the answer.\n"
            "Context: {context}\n"
            "Answer: {answer}\n"
            "Rate from 1 (verbose) to 5 (precise), then explain.\n"
        ),
        "recall": (
            "Evaluate whether the context includes all necessary info to answer the question.\n"
            "Question: {question}\n"
            "Context: {context}\n"
            "Answer: {answer}\n"
            "Rate from 1 (missing info) to 5 (complete), then explain.\n"
        ),
        "correctness": (
            "Compare the generated answer with the reference.\n"
            "Question: {question}\n"
            "Answer: {answer}\n"
            "Reference: {reference}\n"
            "Rate from 1 (wrong) to 5 (semantically equivalent), then explain.\n"
        ),
        "consistency": (
            "Evaluate the structural and stylistic consistency of the answer.\n"
            "Answer: {answer}\n"
            "Rate from 1 (unclear or inconsistent) to 5 (clear and well-structured), then explain.\n"
        ),
    }

    if metric not in instructions:
        raise ValueError(f"Unknown metric: {metric}")

    return instructions[metric].format(**values) + response_format
    

def score_ragas(row: pd.Series) -> pd.Series:
    """
    Score one row on every metric its available fields allow.

    "relevance" and "consistency" are always scored. "faithfulness", "precision"
    and "recall" are added (first) when the row has a non-null
    "retrieved_context"; "correctness" is added (last) when it has a non-null
    "answer_ref".

    Returns:
    - A Series with a "<metric>_score" and a "<metric>_feedback" entry per
      scored metric, in the order the metrics were evaluated.
    """
    has_context = pd.notna(row.get("retrieved_context"))
    has_reference = pd.notna(row.get("answer_ref"))

    selected = (
        (["faithfulness", "precision", "recall"] if has_context else [])
        + ["relevance", "consistency"]
        + (["correctness"] if has_reference else [])
    )

    scored = {}
    for name in selected:
        # One LLM call per metric
        score, feedback = ask_score_and_feedback(build_prompt(name, row))
        scored.update({f"{name}_score": score, f"{name}_feedback": feedback})

    return pd.Series(scored)


In [11]:
# Register tqdm's progress-bar variants of the pandas apply methods
tqdm.pandas()

# Score each merged row; axis=1 hands score_ragas one row at a time
scores_df = merged_df.progress_apply(score_ragas, axis=1)

# Attach the score/feedback columns next to the answers they belong to
result_df = pd.concat(objs=[merged_df, scores_df], axis="columns")

# Optional: save to CSV
# result_df.to_csv("ragas_evaluation_result.csv", index=False)

# Show the combined table as the cell output
result_df

100%|█████████████████████████████████████████████| 9/9 [01:01<00:00,  6.86s/it]


Unnamed: 0,respondent_id,question,answer_rag,answer_ref,relevance_score,relevance_feedback,consistency_score,consistency_feedback,correctness_score,correctness_feedback
0,c7d7640b-9344-48aa-9d48-7395eaeda149,"Hey, what’s the biggest news story or issue yo...",the upcoming election,The upcoming election,5,The answer is fully relevant as it responds di...,1,The answer provided is too brief and lacks con...,5.0,The generated answer is semantically equivalen...
1,c7d7640b-9344-48aa-9d48-7395eaeda149,Can you tell me more about that news story? Wh...,it’s a woman Kamala against Donald Trump; hear...,It is a woman Kamala against Donald Trump,4,The answer is mostly relevant to the question....,2,The answer lacks clear structure and coherence...,4.0,The generated answer is mostly in line with th...
2,c7d7640b-9344-48aa-9d48-7395eaeda149,"So, why does that event or issue feel like the...",best president for our country,It’s important we chose the best president for...,1,The answer provided does not directly address ...,1,The answer is unclear and lacks structure. It ...,4.0,The generated answer and the reference are sim...
3,c7d7640b-9344-48aa-9d48-7395eaeda149,You mentioned you feel like you can help make ...,if grownups and kids listened and tried to be ...,I think if we talked to each other more and ar...,2,The answer is somewhat relevant as it suggests...,4,"The answer is clear and straightforward, but i...",4.0,The generated answer and the reference are qui...
4,c7d7640b-9344-48aa-9d48-7395eaeda149,What are you thinking of dressing up as for Ha...,Briar Rose and Lydia.,Briar Rose | Lydia |,5,The answer is fully relevant as it directly re...,1,The answer is unclear and lacks context. It's ...,5.0,The generated answer is semantically equivalen...
5,c7d7640b-9344-48aa-9d48-7395eaeda149,"When you’re out trick-or-treating, what’s the ...","Hershey bars, like the full-size ones.",Hershey bars,5,The answer is fully relevant to the question a...,2,The answer is clear but lacks detail and conte...,4.5,The generated answer is almost identical to th...
6,c7d7640b-9344-48aa-9d48-7395eaeda149,"On the flip side, what’s the worst thing someo...",Twizzlers.,Twizzlers,5,The answer is fully relevant as it directly re...,1,"The answer provided, ""Twizzlers,"" is not clear...",5.0,The generated answer is exactly the same as th...
7,c7d7640b-9344-48aa-9d48-7395eaeda149,You said you’re planning to watch something Ha...,Beetlejuice and Nightmare Before Christmas.,Beetlejuice and Nightmare before Christmas,5,The answer is fully relevant as it directly re...,1,"The answer lacks context and detail, making it...",5.0,The generated answer is semantically equivalen...
8,c7d7640b-9344-48aa-9d48-7395eaeda149,Tell me everything about how you feel about Ha...,"Excited to wear my costume, go trick or treati...",I’m excited to wear my costume to school and a...,5,The answer is fully relevant to the question. ...,4,The answer is mostly clear and well-structured...,4.0,The generated answer and the reference are qui...
