In [6]:
import pandas as pd
from tqdm import tqdm
from openai import OpenAI
import os  
import re
from dotenv import load_dotenv

In [12]:
# Load the generated (RAG) answers and the reference answers for the same interview.
rag_df = pd.read_csv("../../data/evaluation_input/small_sample/answer_1090.csv")
ref_df = pd.read_csv("../../data/evaluation_input/small_sample/reference_answer_1090.csv")

# Strip stray whitespace from the headers so the question columns of both
# files produce identical merge keys after melting.
rag_df.columns = rag_df.columns.str.strip()
ref_df.columns = ref_df.columns.str.strip()

# Only the RAG file needs its respondent column renamed; the reference file
# is melted on "respondent_id" directly, so no rename is required there.
rag_df = rag_df.rename(columns={"Interview File": "respondent_id"})

# Melt both frames to long format: one row per (respondent_id, question).
rag_long = rag_df.melt(id_vars=["respondent_id"], var_name="question", value_name="answer_rag")
ref_long = ref_df.melt(id_vars=["respondent_id"], var_name="question", value_name="answer_ref")

# Left merge keeps every RAG answer even when no reference answer exists
# (answer_ref is then NaN for that row).
merged_df = pd.merge(rag_long, ref_long, on=["respondent_id", "question"], how="left")

# Drop rows where the RAG pipeline produced no answer; there is nothing to score.
merged_df = merged_df.dropna(subset=["answer_rag"])

# Load environment variables from .env and create the OpenAI client that the
# evaluation functions use via the module-level name `client`.
load_dotenv()
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Optional: check structure
merged_df.head()

Unnamed: 0,respondent_id,question,answer_rag,answer_ref
0,c7d7640b-9344-48aa-9d48-7395eaeda149,"Hey, what’s the biggest news story or issue yo...",the upcoming election,The upcoming election
1,c7d7640b-9344-48aa-9d48-7395eaeda149,Can you tell me more about that news story? Wh...,it’s a woman Kamala against Donald Trump; hear...,It is a woman Kamala against Donald Trump
2,c7d7640b-9344-48aa-9d48-7395eaeda149,"So, why does that event or issue feel like the...",best president for our country,It’s important we chose the best president for...
3,c7d7640b-9344-48aa-9d48-7395eaeda149,You mentioned you feel like you can help make ...,if grownups and kids listened and tried to be ...,I think if we talked to each other more and ar...
4,c7d7640b-9344-48aa-9d48-7395eaeda149,What are you thinking of dressing up as for Ha...,Briar Rose and Lydia.,Briar Rose | Lydia |


In [23]:
def ask_score_and_feedback(prompt: str, temperature: float = 0.0, model: str = "gpt-4o-mini") -> tuple[float, str]:
    """
    Ask GPT to return a numeric score and feedback using a structured format.
    Expected LLM format:
        Score: <number>
        Feedback: <text>

    Parameters:
    - prompt (str): The instruction sent to GPT.
    - temperature (float): Temperature setting for response.
    - model (str): GPT model to use.

    Returns:
    - (score, feedback): float score and string feedback.
      * If the request fails, score is NaN and feedback is
        "Error calling OpenAI: <error>".
      * If the reply does not match the expected format, score is NaN and
        feedback is the raw reply text.

    This function never raises for API or parsing problems; failures are
    reported through the NaN score so a row-wise apply can keep going.
    """
    # float("nan") is the same value as np.nan, but does not require numpy to
    # be imported (the notebook's import cell does not import it).
    nan = float("nan")

    messages = [
        {
            "role": "system",
            "content": "You are a helpful evaluation assistant. Respond in this format:\nScore: <number>\nFeedback: <short explanation>"
        },
        {"role": "user", "content": prompt}
    ]

    # Only the network call is guarded, so parsing problems are not
    # misreported as API errors.
    try:
        response = client.chat.completions.create(
            model=model,
            messages=messages,
            temperature=temperature
        )
        # NOTE(review): the SDK types message.content as optional; treat a
        # missing content as an empty reply rather than failing on .strip().
        content = (response.choices[0].message.content or "").strip()
    except Exception as e:
        return nan, f"Error calling OpenAI: {e}"

    # Match flexible "Score: X" and "Feedback: ..." (ASCII or full-width colon).
    # The number group accepts "4", "4.", "4.5" and ".5" but rejects tokens
    # such as "." or "1.2.3" that float() cannot convert.
    match = re.search(
        r"Score\s*[:：]\s*([0-9]+(?:\.[0-9]*)?|\.[0-9]+)\s*[\n\r]+Feedback\s*[:：]\s*(.*)",
        content,
        re.IGNORECASE | re.DOTALL
    )

    if match is None:
        # Fallback if the format is not matched: keep the raw reply for inspection.
        return nan, content

    return float(match.group(1)), match.group(2).strip()
    

def build_prompt(metric: str, row: dict) -> str:
    """
    Build the evaluation prompt for one metric and one merged answer row.

    Parameters:
    - metric (str): One of "relevance", "faithfulness", "precision",
      "recall", "correctness" or "consistency".
    - row: Mapping-like object supporting ``row[key]`` and ``row.get(key, default)``.
      "question" and "answer_rag" are required; "retrieved_context" and
      "answer_ref" are optional and default to an empty string.

    Returns:
    - str: The prompt text, ending with the "Score: X / Feedback: ..." format
      instructions.

    Raises:
    - KeyError: if "question" or "answer_rag" is missing from the row.
    - ValueError: if the metric name is not recognised.
    """
    # Required fields are read first so a missing key fails before the
    # metric name is even inspected.
    fields = {
        "question": row["question"],
        "answer": row["answer_rag"],
        "context": row.get("retrieved_context", ""),
        "reference": row.get("answer_ref", ""),
    }

    # One template per metric; placeholders are filled from `fields` below.
    templates = {
        "relevance": """Evaluate the relevance of the answer to the question.
Question: {question}
Answer: {answer}
First, rate from 1 (not relevant) to 5 (fully relevant).
Then explain briefly.
Respond in the format:
Score: X
Feedback: ...""",
        "faithfulness": """Evaluate the faithfulness of the answer to the retrieved context.
Context: {context}
Answer: {answer}
Rate from 1 (hallucinated) to 5 (fully grounded), then explain.
Respond in the format:
Score: X
Feedback: ...""",
        "precision": """Evaluate whether the context includes only necessary info to generate the answer.
Context: {context}
Answer: {answer}
Rate from 1 (verbose) to 5 (precise), then explain.
Respond in the format:
Score: X
Feedback: ...""",
        "recall": """Evaluate whether the context includes all necessary info to answer the question.
Question: {question}
Context: {context}
Answer: {answer}
Rate from 1 (missing info) to 5 (complete), then explain.
Respond in the format:
Score: X
Feedback: ...""",
        "correctness": """Compare the generated answer with the reference.
Question: {question}
Answer: {answer}
Reference: {reference}
Rate from 1 (wrong) to 5 (semantically equivalent), then explain.
Respond in the format:
Score: X
Feedback: ...""",
        "consistency": """Evaluate the structural and stylistic consistency of the answer.
Answer: {answer}
Rate from 1 (unclear or inconsistent) to 5 (clear and well-structured), then explain.
Respond in the format:
Score: X
Feedback: ...""",
    }

    template = templates.get(metric)
    if template is None:
        raise ValueError(f"Unknown metric: {metric}")

    # Unused fields are simply ignored by str.format.
    return template.format(**fields)
    

def score_ragas(row: pd.Series) -> pd.Series:
    """
    Score one merged answer row on every metric that applies to it.

    Metric selection:
    - "faithfulness", "precision", "recall": only when the row has a non-null
      "retrieved_context".
    - "relevance", "consistency": always.
    - "correctness": only when the row has a non-null "answer_ref".

    Each metric triggers one call to ask_score_and_feedback with the prompt
    produced by build_prompt.

    Returns:
    - pd.Series with "<metric>_score" and "<metric>_feedback" entries for
      each metric that was evaluated.
    """
    has_context = pd.notna(row.get("retrieved_context"))
    has_reference = pd.notna(row.get("answer_ref"))

    # Order matters only for the column order of the resulting Series.
    selected = (
        (["faithfulness", "precision", "recall"] if has_context else [])
        + ["relevance", "consistency"]
        + (["correctness"] if has_reference else [])
    )

    collected = {}
    for name in selected:
        score, feedback = ask_score_and_feedback(build_prompt(name, row))
        collected[f"{name}_score"] = score
        collected[f"{name}_feedback"] = feedback

    return pd.Series(collected)


In [24]:
# Register tqdm's progress_apply on pandas objects, then score every row
# (one or more LLM calls per row, so this cell is the slow one).
tqdm.pandas()
scores_df = merged_df.progress_apply(score_ragas, axis=1)

# Attach the score/feedback columns to the answers they belong to;
# both frames share merged_df's index, so rows align one-to-one.
result_df = pd.concat((merged_df, scores_df), axis="columns")

# Optional: save to CSV
# result_df.to_csv("ragas_evaluation_result.csv", index=False)

# Show the combined table as the cell output
result_df

100%|█████████████████████████████████████████████| 9/9 [00:37<00:00,  4.14s/it]


Unnamed: 0,respondent_id,question,answer_rag,answer_ref,relevance_score,relevance_feedback,consistency_score,consistency_feedback,correctness_score,correctness_feedback
0,c7d7640b-9344-48aa-9d48-7395eaeda149,"Hey, what’s the biggest news story or issue yo...",the upcoming election,The upcoming election,4.0,The answer is relevant as it addresses a signi...,2.0,"The answer is vague and lacks context, making ...",5.0,The generated answer is semantically equivalen...
1,c7d7640b-9344-48aa-9d48-7395eaeda149,Can you tell me more about that news story? Wh...,it’s a woman Kamala against Donald Trump; hear...,It is a woman Kamala against Donald Trump,4.0,The answer provides relevant information about...,3.0,The answer presents a clear subject (Kamala ag...,3.0,The generated answer provides some relevant in...
2,c7d7640b-9344-48aa-9d48-7395eaeda149,"So, why does that event or issue feel like the...",best president for our country,It’s important we chose the best president for...,2.0,The answer does not directly address the quest...,2.0,"The answer is vague and lacks clarity, making ...",3.0,The generated answer and the reference both ex...
3,c7d7640b-9344-48aa-9d48-7395eaeda149,You mentioned you feel like you can help make ...,if grownups and kids listened and tried to be ...,I think if we talked to each other more and ar...,3.0,The answer touches on the importance of kindne...,4.0,The answer is clear and conveys a straightforw...,3.0,Both answers emphasize the importance of kindn...
4,c7d7640b-9344-48aa-9d48-7395eaeda149,What are you thinking of dressing up as for Ha...,Briar Rose and Lydia.,Briar Rose | Lydia |,5.0,The answer directly addresses the question by ...,2.0,"The answer is unclear and lacks context, makin...",5.0,The answer directly lists the same characters ...
5,c7d7640b-9344-48aa-9d48-7395eaeda149,"When you’re out trick-or-treating, what’s the ...","Hershey bars, like the full-size ones.",Hershey bars,5.0,The answer directly addresses the question by ...,2.0,"The answer is unclear and lacks context, makin...",5.0,The generated answer is semantically equivalen...
6,c7d7640b-9344-48aa-9d48-7395eaeda149,"On the flip side, what’s the worst thing someo...",Twizzlers.,Twizzlers,4.0,The answer is relevant as it directly addresse...,1.0,"The answer ""Twizzlers"" is unclear and lacks co...",5.0,The generated answer is semantically equivalen...
7,c7d7640b-9344-48aa-9d48-7395eaeda149,You said you’re planning to watch something Ha...,Beetlejuice and Nightmare Before Christmas.,Beetlejuice and Nightmare before Christmas,5.0,The answer directly addresses the question by ...,2.0,"The answer lacks clarity and context, as it si...",5.0,The generated answer is semantically equivalen...
8,c7d7640b-9344-48aa-9d48-7395eaeda149,Tell me everything about how you feel about Ha...,"Excited to wear my costume, go trick or treati...",I’m excited to wear my costume to school and a...,5.0,The answer directly addresses the question by ...,4.0,The answer is clear and conveys excitement eff...,4.0,The generated answer captures the excitement f...
