In [1]:
import os
import re
import pandas as pd
import numpy as np
from tqdm import tqdm
from openai import OpenAI
from dotenv import load_dotenv

In [2]:
# Set up the OpenAI client used for the GPT-based evaluation below.
# load_dotenv() runs first so the "api_key" lookup happens after any .env values are loaded.
load_dotenv()
client = OpenAI(api_key=os.environ.get("api_key"))

In [9]:
# Load files
# All three evaluation inputs are read from the same folder; keep it in one place
# so the notebook can be repointed by editing a single line.
INPUT_DIR = "~/Desktop/temp/evaluation_input"
rag_df = pd.read_csv(f"{INPUT_DIR}/answer_1090.csv")
ref_df = pd.read_csv(f"{INPUT_DIR}/reference_answer_1090.csv")
context_df = pd.read_csv(f"{INPUT_DIR}/retrieved_contexts_cleaned.csv")

# Clean column names (strip surrounding whitespace from the CSV headers)
rag_df.columns = rag_df.columns.str.strip()
ref_df.columns = ref_df.columns.str.strip()
context_df.columns = context_df.columns.str.strip()

# Standardize the join keys so every frame exposes "respondent_id" and "question".
# ref_df is melted on "respondent_id" as-is, so it needs no rename.
rag_df = rag_df.rename(columns={"Interview File": "respondent_id"})
context_df = context_df.rename(columns={"guide_question": "question"})

# Melt the wide answer tables to long format: one row per (respondent_id, question)
rag_long = rag_df.melt(id_vars=["respondent_id"], var_name="question", value_name="answer_rag")
ref_long = ref_df.melt(id_vars=["respondent_id"], var_name="question", value_name="answer_ref")

# Merge everything; left joins keep every generated answer even when the
# reference answer or the retrieved context is missing for that pair.
merged_df = pd.merge(rag_long, ref_long, on=["respondent_id", "question"], how="left")
merged_df = pd.merge(merged_df, context_df, on=["respondent_id", "question"], how="left")
# Rows without a generated answer cannot be scored
merged_df = merged_df.dropna(subset=["answer_rag"])

# Optional: check structure
merged_df.head()

Unnamed: 0,respondent_id,question,answer_rag,answer_ref,retrieved_context
0,c7d7640b-9344-48aa-9d48-7395eaeda149,"Hey, what’s the biggest news story or issue yo...",the upcoming election,The upcoming election,"Interviewer: Alright, thanks for hanging out w..."
1,387bf5f4-4944-4247-9980-d69983b44a6f,"Hey, what’s the biggest news story or issue yo...",Social media threats about a school shooting.,The social media threats that was made about a...,"Interviewer: Alright, thanks for chatting with..."
2,6c8cf423-2a8d-4a67-9133-c1c34e3ee04f,"Hey, what’s the biggest news story or issue yo...",Woman running for president.,That a woman is running for president.,"Interviewer: Alright, thanks for hopping on wi..."
3,f405a20e-d532-4abc-a197-8099f2270344,"Hey, what’s the biggest news story or issue yo...",P Diddy allegations.,The P Diddy allegations.,"Interviewer: Oh hey, thanks so much for hangin..."
4,7b18e570-043a-4b9d-8e6a-5880c770e96b,"Hey, what’s the biggest news story or issue yo...","Trump versus Harvey, close race in election.",There is a election. And if Trump versus Harve...,"Interviewer: Alright, thanks for hopping on! L..."


In [10]:
# Metric activation control: set a metric to False to leave it out of the scoring run.
ACTIVE_METRICS = dict(
    faithfulness=True,
    precision=True,
    recall=True,
    relevance=True,
    correctness=True,
)

def ask_score_and_feedback(prompt: str, temperature: float = 0.0, model: str = "gpt-4o-mini") -> tuple[float, str]:
    """
    Query GPT to evaluate a response using a scoring rubric.

    The model is instructed (via the system message) to reply as
    ``Score: <number>`` followed by ``Feedback: <short explanation>``;
    both parts are then extracted from the reply with regular expressions.

    Parameters:
    - prompt: The user message containing the rubric and the material to rate.
    - temperature: Sampling temperature forwarded to the chat completion call.
    - model: Model name forwarded to the chat completion call.

    Returns:
    - (score, feedback): Parsed float score and feedback string
      (feedback is "" when no "Feedback:" section is present).

    Raises:
    - ValueError: If no numeric score can be found in the reply, including
      when the reply has no text content at all.
    """
    messages = [
        {
            "role": "system",
            "content": "You are a helpful evaluation assistant. Respond in this format:\nScore: <number>\nFeedback: <short explanation>"
        },
        {"role": "user", "content": prompt}
    ]

    response = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=temperature
    )

    # Guard against a reply whose content is None so the failure surfaces as the
    # documented ValueError below instead of an AttributeError on .strip().
    content = (response.choices[0].message.content or "").strip()

    # Normalize line breaks (literal "\n" sequences and carriage returns)
    content = content.replace("\\n", "\n").replace("\r", "\n")

    # Match score and feedback separately.
    # The score pattern accepts "4", "4.5" or ".5" only, so sentence punctuation
    # after the number ("Score: 4.5.") is not swallowed into the float() input.
    # Optional "*" characters around the colon tolerate markdown-bold replies.
    score_match = re.search(
        r"Score\s*\**\s*[:：]\s*\**\s*([0-9]+(?:\.[0-9]+)?|\.[0-9]+)",
        content,
        re.IGNORECASE,
    )
    feedback_match = re.search(r"Feedback\s*[:：]\s*(.*)", content, re.IGNORECASE | re.DOTALL)

    if not score_match:
        raise ValueError(f"Could not find Score in response:\n{content}")

    score = float(score_match.group(1))
    feedback = feedback_match.group(1).strip() if feedback_match else ""

    return score, feedback
    

def build_prompt(metric: str, row: dict) -> str:
    """
    Assemble the GPT scoring prompt for one metric from one row of data.

    `row` must provide "question" and "answer_rag". "retrieved_context" and
    "answer_ref" are read with .get() and fall back to "" when the key is absent.

    Supported metrics: relevance, faithfulness, precision, recall, correctness.
    Any other metric name raises ValueError.
    """
    # Pull the row values up front; the two required keys are looked up first.
    fields = {
        "question": row["question"],
        "answer": row["answer_rag"],
        "context": row.get("retrieved_context", ""),
        "reference": row.get("answer_ref", ""),
    }

    # Every prompt ends with the same reply-format reminder.
    reply_format = "Score: X\nFeedback: ..."

    templates = {
        "relevance": (
            "Evaluate the relevance of the answer to the question.\n"
            "Question: {question}\n"
            "Answer: {answer}\n"
            "Rate from 1 (not relevant) to 5 (fully relevant). Explain briefly.\n"
        ),
        "faithfulness": (
            "Evaluate the faithfulness of the answer to the retrieved context.\n"
            "Context: {context}\n"
            "Answer: {answer}\n"
            "Rate from 1 (hallucinated) to 5 (fully grounded). Explain.\n"
        ),
        "precision": (
            "Evaluate whether the context includes only necessary info to generate the answer.\n"
            "Context: {context}\n"
            "Answer: {answer}\n"
            "Rate from 1 (verbose) to 5 (precise). Explain.\n"
        ),
        "recall": (
            "Evaluate whether the context includes all necessary info to answer the question.\n"
            "Question: {question}\n"
            "Context: {context}\n"
            "Answer: {answer}\n"
            "Rate from 1 (missing info) to 5 (complete). Explain.\n"
        ),
        "correctness": (
            "Compare the generated answer with the reference.\n"
            "Question: {question}\n"
            "Answer: {answer}\n"
            "Reference: {reference}\n"
            "Rate from 1 (wrong) to 5 (semantically equivalent). Explain.\n"
        ),
    }

    template = templates.get(metric)
    if template is None:
        raise ValueError(f"Unknown metric: {metric}")

    # Substituted values are inserted verbatim; braces inside them are not re-parsed.
    return template.format(**fields) + reply_format

    

def score_ragas(row: pd.Series) -> pd.Series:
    """
    Score a single row with every applicable GPT-judged metric.

    A metric is applied only when it is switched on in ACTIVE_METRICS and the
    row carries the field it depends on:
    - faithfulness / precision / recall need a non-null "retrieved_context"
    - correctness needs a non-null "answer_ref"
    - relevance has no field requirement

    Returns a Series holding "<metric>_score" and "<metric>_feedback" entries
    for each metric that was run.
    """
    selected = []

    # Context-dependent metrics, kept in their fixed order
    if pd.notna(row.get("retrieved_context")):
        selected.extend(
            name
            for name in ("faithfulness", "precision", "recall")
            if ACTIVE_METRICS.get(name, False)
        )

    if ACTIVE_METRICS.get("relevance", False):
        selected.append("relevance")

    if ACTIVE_METRICS.get("correctness", False) and pd.notna(row.get("answer_ref")):
        selected.append("correctness")

    # One GPT call per selected metric
    outputs = {}
    for name in selected:
        score, feedback = ask_score_and_feedback(build_prompt(name, row))
        outputs[f"{name}_score"] = score
        outputs[f"{name}_feedback"] = feedback

    return pd.Series(outputs)


In [11]:
# Only take the first 20 rows for testing
test_df = merged_df.iloc[:20]

# Register tqdm's pandas integration, then score row by row with a progress bar
tqdm.pandas()
scores_df = test_df.progress_apply(score_ragas, axis=1)

# Put the score/feedback columns next to the inputs they were computed from
result_df = pd.concat((test_df, scores_df), axis="columns")

# Display results
result_df

100%|███████████████████████████████████████████| 20/20 [01:54<00:00,  5.75s/it]


Unnamed: 0,respondent_id,question,answer_rag,answer_ref,retrieved_context,faithfulness_score,faithfulness_feedback,precision_score,precision_feedback,recall_score,recall_feedback,relevance_score,relevance_feedback,correctness_score,correctness_feedback
0,c7d7640b-9344-48aa-9d48-7395eaeda149,"Hey, what’s the biggest news story or issue yo...",the upcoming election,The upcoming election,"Interviewer: Alright, thanks for hanging out w...",5.0,The answer directly reflects the context provi...,4.0,The context provides relevant information abou...,5.0,The context provides a clear and complete resp...,4.0,The answer is relevant as it addresses a signi...,5.0,The generated answer is semantically equivalen...
1,387bf5f4-4944-4247-9980-d69983b44a6f,"Hey, what’s the biggest news story or issue yo...",Social media threats about a school shooting.,The social media threats that was made about a...,"Interviewer: Alright, thanks for chatting with...",5.0,The answer accurately reflects the context pro...,4.0,The context provides relevant information abou...,5.0,The context provides a clear and relevant resp...,5.0,The answer directly addresses the question by ...,4.0,The generated answer captures the essence of t...
2,6c8cf423-2a8d-4a67-9133-c1c34e3ee04f,"Hey, what’s the biggest news story or issue yo...",Woman running for president.,That a woman is running for president.,"Interviewer: Alright, thanks for hopping on wi...",5.0,The answer accurately reflects the context pro...,4.0,The context provides relevant information abou...,5.0,The context provides a clear and complete answ...,3.0,"The answer provides a specific news story, but...",4.0,The answer conveys the same idea as the refere...
3,f405a20e-d532-4abc-a197-8099f2270344,"Hey, what’s the biggest news story or issue yo...",P Diddy allegations.,The P Diddy allegations.,"Interviewer: Oh hey, thanks so much for hangin...",5.0,"The answer ""P Diddy allegations"" directly corr...",5.0,The context provides a clear and focused discu...,5.0,The context provides a clear and complete answ...,4.0,The answer is relevant as it addresses a curre...,5.0,The generated answer and the reference are sem...
4,7b18e570-043a-4b9d-8e6a-5880c770e96b,"Hey, what’s the biggest news story or issue yo...","Trump versus Harvey, close race in election.",There is a election. And if Trump versus Harve...,"Interviewer: Alright, thanks for hopping on! L...",3.0,The answer captures a key element of the conte...,4.0,The context provides relevant information abou...,4.0,The context provides a clear indication of the...,4.0,The answer addresses a current news story rela...,4.0,The generated answer conveys the same idea as ...
5,fe189a48-e69d-4c97-b004-b25789b1f63d,"Hey, what’s the biggest news story or issue yo...",the shootings at schools,Shootings at schools,"Interviewer: Alright, let's jump right in. So,...",5.0,The answer directly reflects the interviewee's...,4.0,The context provides relevant information abou...,5.0,The context provides a clear and direct respon...,5.0,The answer directly addresses the question by ...,5.0,The generated answer is semantically equivalen...
6,ea99ab44-c149-4919-9601-7d6c013af9c2,"Hey, what’s the biggest news story or issue yo...",Trump shot in the ear.,I heard that Donald Trump got shot in the ear.,"Interviewer: Alright, thanks for hanging out w...",2.0,The answer refers to a specific event mentione...,4.0,The context provides necessary information abo...,5.0,The context provides a clear and direct answer...,2.0,"The answer mentions a news story, but it lacks...",4.0,The generated answer conveys the same informat...
7,fbfe46f7-24aa-465c-a744-587f472077a7,"Hey, what’s the biggest news story or issue yo...",Donald Trump was almost killed.,Donald Trump was almost killed by someone.,"Interviewer: Alright, thanks for doing this! L...",4.0,The answer accurately reflects a statement mad...,4.0,The context provides relevant information abou...,4.0,The context provides a clear answer to the que...,4.0,The answer addresses a significant news story ...,4.0,The generated answer conveys the same core inf...
8,57b45fe6-c016-4e1d-aef6-d50309d92c17,"Hey, what’s the biggest news story or issue yo...",Trump claiming people eat dogs and cats.,Trump saying people eat dogs and cats.,"Interviewer: Haha, gym saves the day! Okay, le...",5.0,The answer accurately reflects the content of ...,4.0,The context provides relevant information abou...,5.0,The context provides a clear and detailed resp...,3.0,The answer mentions a news story related to Tr...,4.0,The generated answer captures the essence of t...
9,19ab5410-a614-4a1b-99ca-f15ad467cb54,"Hey, what’s the biggest news story or issue yo...",hurricane in Florida,A hurricane in Florida,"Interviewer: Alright, thanks for hanging out w...",5.0,"The answer ""hurricane in Florida"" is directly ...",5.0,The context provides only necessary informatio...,5.0,The context provides a clear and complete answ...,4.0,The answer is relevant as it addresses a signi...,5.0,The generated answer is semantically equivalen...


In [12]:
# save to csv
# Create the output folder first so a missing directory does not make the write fail
# after the scoring run has already completed.
output_path = os.path.expanduser("~/Desktop/temp/evaluation_output/ragas_evaluation_result.csv")
os.makedirs(os.path.dirname(output_path), exist_ok=True)
result_df.to_csv(output_path, index=False)