In [38]:
import os
import re
import pandas as pd
import numpy as np
from tqdm import tqdm
from openai import OpenAI
from dotenv import load_dotenv

In [39]:
# Set up the OpenAI client used for the GPT-based evaluation below.
# load_dotenv() (python-dotenv) loads variables from a .env file into the
# process environment, so the key does not have to be written in the notebook.
load_dotenv()
# The key is read from the API_KEY environment variable; os.getenv returns None when it is unset.
# NOTE(review): with api_key=None the OpenAI client presumably falls back to its
# own default key lookup or raises at construction time - confirm.
client = OpenAI(api_key=os.getenv("API_KEY"))

In [40]:
# Read the three inputs: generated answers, reference answers, retrieved contexts.
rag_df = pd.read_csv("~/Desktop/temp/evaluation_input/answer_1090.csv")
ref_df = pd.read_csv("~/Desktop/temp/evaluation_input/reference_answer_1090.csv")
context_df = pd.read_csv("~/Desktop/temp/evaluation_output/retrieved_contexts_cleaned.csv")

# Trim stray whitespace from every column header so the renames and joins below match.
for frame in (rag_df, ref_df, context_df):
    frame.columns = frame.columns.str.strip()

# Align key column names across the frames:
# the answers file labels the respondent as "Interview File", and the contexts
# file labels the question as "guide_question". ref_df is used as-is.
rag_df = rag_df.rename(columns={"Interview File": "respondent_id"})
context_df = context_df.rename(columns={"guide_question": "question"})

# Wide -> long: one row per (respondent_id, question) pair.
rag_long = rag_df.melt(id_vars=["respondent_id"], var_name="question", value_name="answer_rag")
ref_long = ref_df.melt(id_vars=["respondent_id"], var_name="question", value_name="answer_ref")

# Left-join references and contexts onto the generated answers, then drop the
# rows that have no generated answer to evaluate.
merged_df = (
    rag_long
    .merge(ref_long, on=["respondent_id", "question"], how="left")
    .merge(context_df, on=["respondent_id", "question"], how="left")
    .dropna(subset=["answer_rag"])
)

# Quick look at the merged structure
merged_df.head()

Unnamed: 0,respondent_id,question,answer_rag,answer_ref,retrieved_context
0,c7d7640b-9344-48aa-9d48-7395eaeda149,"Hey, what’s the biggest news story or issue yo...",the upcoming election,The upcoming election,"Interviewer: Alright, thanks for hanging out w..."
1,387bf5f4-4944-4247-9980-d69983b44a6f,"Hey, what’s the biggest news story or issue yo...",Social media threats about a school shooting.,The social media threats that was made about a...,"Interviewer: Alright, thanks for chatting with..."
2,6c8cf423-2a8d-4a67-9133-c1c34e3ee04f,"Hey, what’s the biggest news story or issue yo...",Woman running for president.,That a woman is running for president.,"Interviewer: Alright, thanks for hopping on wi..."
3,f405a20e-d532-4abc-a197-8099f2270344,"Hey, what’s the biggest news story or issue yo...",P Diddy allegations.,The P Diddy allegations.,"Interviewer: Oh hey, thanks so much for hangin..."
4,7b18e570-043a-4b9d-8e6a-5880c770e96b,"Hey, what’s the biggest news story or issue yo...","Trump versus Harvey, close race in election.",There is a election. And if Trump versus Harve...,"Interviewer: Alright, thanks for hopping on! L..."


In [41]:
def ask_score_and_feedback(prompt: str, temperature: float = 0.0, model: str = "gpt-4o-mini") -> tuple[float, str]:
    """
    Query GPT to evaluate a response using a scoring rubric.

    The system message asks the model to reply as ``Score: <number>`` followed by
    ``Feedback: <short explanation>``; both parts are then extracted from the
    reply text with regular expressions.

    Parameters:
    - prompt: User message containing the rubric and the material to evaluate.
    - temperature: Sampling temperature forwarded to the chat completion call.
    - model: Model name forwarded to the chat completion call.

    Returns:
    - (score, feedback): Parsed float score and feedback string. The feedback is
      "" when the reply contains no "Feedback:" label.

    Raises:
    - ValueError: If the reply has no text content or no parseable
      "Score: <number>" entry.
    """
    messages = [
        {
            "role": "system",
            "content": "You are a helpful evaluation assistant. Respond in this format:\nScore: <number>\nFeedback: <short explanation>"
        },
        {"role": "user", "content": prompt}
    ]

    response = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=temperature
    )

    raw_content = response.choices[0].message.content
    if raw_content is None:
        # Guard against a reply without text so the failure is a clear parse
        # error rather than an AttributeError on None.strip().
        raise ValueError("Model reply has no text content; could not find Score.")

    # Normalize line breaks: literal "\n" sequences and carriage returns become newlines
    content = raw_content.strip().replace("\\n", "\n").replace("\r", "\n")

    # The number pattern accepts "4", "4.5" or ".5" only, so trailing sentence
    # punctuation ("4.5.") or stray dots are never handed to float().
    # Markdown emphasis around the labels ("**Score:** 4") is tolerated.
    score_match = re.search(
        r"Score[\s*_]*[:：][\s*_]*(\d+(?:\.\d+)?|\.\d+)",
        content,
        re.IGNORECASE,
    )
    # DOTALL lets the feedback span several lines up to the end of the reply.
    feedback_match = re.search(
        r"Feedback[\s*_]*[:：]\**\s*(.*)",
        content,
        re.IGNORECASE | re.DOTALL,
    )

    if not score_match:
        raise ValueError(f"Could not find Score in response:\n{content}")

    score = float(score_match.group(1))
    feedback = feedback_match.group(1).strip() if feedback_match else ""

    return score, feedback
    

# Trailer shared by every prompt: shows the model the expected reply layout.
_PROMPT_REPLY_LAYOUT = "Score: X\nFeedback: ..."

# One template per metric; placeholders are filled from the row in build_prompt.
_PROMPT_TEMPLATES = {
    "relevance": (
        "Evaluate the relevance of the answer to the question.\n"
        "Question: {question}\n"
        "Answer: {answer}\n"
        "Rate from 1 (not relevant) to 5 (fully relevant). Explain briefly.\n"
    ),
    "faithfulness": (
        "Evaluate the faithfulness of the answer to the retrieved context.\n"
        "Context: {context}\n"
        "Answer: {answer}\n"
        "Rate from 1 (hallucinated) to 5 (fully grounded). Explain.\n"
    ),
    "precision": (
        "Evaluate whether the context includes only necessary info to generate the answer.\n"
        "Context: {context}\n"
        "Answer: {answer}\n"
        "Rate from 1 (verbose) to 5 (precise). Explain.\n"
    ),
    "recall": (
        "Evaluate whether the context includes all necessary info to answer the question.\n"
        "Question: {question}\n"
        "Context: {context}\n"
        "Answer: {answer}\n"
        "Rate from 1 (missing info) to 5 (complete). Explain.\n"
    ),
    "correctness": (
        "Compare the generated answer with the reference.\n"
        "Question: {question}\n"
        "Answer: {answer}\n"
        "Reference: {reference}\n"
        "Rate from 1 (wrong) to 5 (semantically equivalent). Explain.\n"
    ),
    "consistency": (
        "Evaluate the structural and stylistic consistency of the answer.\n"
        "Answer: {answer}\n"
        "Rate from 1 (inconsistent or unclear) to 5 (clear and well-structured). Explain.\n"
    ),
}


def build_prompt(metric: str, row: dict) -> str:
    """
    Build the scoring prompt for one metric from one row of data.

    Parameters:
    - metric: One of "relevance", "faithfulness", "precision", "recall",
      "correctness" or "consistency".
    - row: Mapping that must provide "question" and "answer_rag";
      "retrieved_context" and "answer_ref" are optional and default to "".

    Returns:
    - The prompt text, ending with the "Score: X / Feedback: ..." reply layout.

    Raises:
    - KeyError: If "question" or "answer_rag" is missing from the row.
    - ValueError: If the metric name is not recognised.
    """
    # Pull the fields first so a malformed row fails before the metric lookup.
    fields = {
        "question": row["question"],
        "answer": row["answer_rag"],
        "context": row.get("retrieved_context", ""),
        "reference": row.get("answer_ref", ""),
    }

    template = _PROMPT_TEMPLATES.get(metric)
    if template is None:
        raise ValueError(f"Unknown metric: {metric}")

    # Placeholders a template does not mention are simply ignored by format().
    return template.format(**fields) + _PROMPT_REPLY_LAYOUT

    

def score_ragas(row: pd.Series) -> pd.Series:
    """
    Score one merged row with GPT on every metric its data supports.

    Metric selection:
    - "faithfulness", "precision", "recall" only when "retrieved_context" is present;
    - "relevance" and "consistency" always;
    - "correctness" only when "answer_ref" is present.

    Returns:
    - A Series holding "<metric>_score" and "<metric>_feedback" for each metric
      that was evaluated, in evaluation order.
    """
    has_context = pd.notna(row.get("retrieved_context"))
    has_reference = pd.notna(row.get("answer_ref"))

    selected = [
        *(("faithfulness", "precision", "recall") if has_context else ()),
        "relevance",
        "consistency",
        *(("correctness",) if has_reference else ()),
    ]

    scored = {}
    for name in selected:
        # One model call per metric
        value, comment = ask_score_and_feedback(build_prompt(name, row))
        scored.update({f"{name}_score": value, f"{name}_feedback": comment})

    return pd.Series(scored)


In [43]:
# For a quick smoke test, evaluate only the first 20 rows instead of the full frame:
# test_df = merged_df.head(20)

# Run the GPT evaluation on every row of merged_df.
# tqdm.pandas() registers progress_apply on pandas objects so the run shows a progress bar.
# score_ragas issues one model call per selected metric, so this cell is slow
# (the recorded run took about 11 minutes for 91 rows).
tqdm.pandas()
scores_df = merged_df.progress_apply(score_ragas, axis=1)

# Place the score/feedback columns next to the original inputs (aligned on the row index).
result_df = pd.concat([merged_df, scores_df], axis=1)

# Show the combined frame as the cell output
result_df

100%|███████████████████████████████████████████| 91/91 [10:58<00:00,  7.24s/it]


Unnamed: 0,respondent_id,question,answer_rag,answer_ref,retrieved_context,consistency_feedback,consistency_score,correctness_feedback,correctness_score,faithfulness_feedback,faithfulness_score,precision_feedback,precision_score,recall_feedback,recall_score,relevance_feedback,relevance_score
0,c7d7640b-9344-48aa-9d48-7395eaeda149,"Hey, what’s the biggest news story or issue yo...",the upcoming election,The upcoming election,"Interviewer: Alright, thanks for hanging out w...","The phrase ""the upcoming election"" is clear bu...",2.0,The generated answer is semantically equivalen...,5.0,The answer accurately reflects the main topic ...,5.0,The context includes only necessary informatio...,5.0,The context provides a clear and direct answer...,5.0,The answer is relevant as it addresses a signi...,4.0
1,387bf5f4-4944-4247-9980-d69983b44a6f,"Hey, what’s the biggest news story or issue yo...",Social media threats about a school shooting.,The social media threats that was made about a...,"Interviewer: Alright, thanks for chatting with...","The answer is vague and lacks detail, making i...",2.0,The generated answer captures the essence of t...,4.0,The answer accurately reflects the key point f...,5.0,The context provides all necessary information...,5.0,The context provides a clear and specific answ...,5.0,The answer directly addresses the question by ...,5.0
2,6c8cf423-2a8d-4a67-9133-c1c34e3ee04f,"Hey, what’s the biggest news story or issue yo...",Woman running for president.,That a woman is running for president.,"Interviewer: Alright, thanks for hopping on wi...","The answer is very brief and lacks context, ma...",2.0,The answer conveys the same idea as the refere...,4.0,The answer accurately reflects the context pro...,5.0,The context provides only the necessary inform...,5.0,The context provides a clear and complete answ...,5.0,The answer mentions a significant political ev...,3.0
3,f405a20e-d532-4abc-a197-8099f2270344,"Hey, what’s the biggest news story or issue yo...",P Diddy allegations.,The P Diddy allegations.,"Interviewer: Oh hey, thanks so much for hangin...",The answer is vague and lacks context or detai...,2.0,The generated answer and the reference are sem...,5.0,The answer accurately reflects the main topic ...,5.0,The context provides only the necessary inform...,5.0,The context provides a clear and complete resp...,5.0,The answer is relevant as it addresses a curre...,4.0
4,7b18e570-043a-4b9d-8e6a-5880c770e96b,"Hey, what’s the biggest news story or issue yo...","Trump versus Harvey, close race in election.",There is a election. And if Trump versus Harve...,"Interviewer: Alright, thanks for hopping on! L...",The answer presents a clear comparison between...,3.0,The generated answer captures the essence of t...,4.0,The answer captures the essence of the context...,3.0,The context provides relevant information abou...,4.0,The context provides a clear discussion about ...,4.0,The answer addresses a current news story rela...,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86,ea99ab44-c149-4919-9601-7d6c013af9c2,Tell me everything about how you feel about Ha...,Halloween is my favorite holiday; it reminds m...,Halloween is my favorite holiday. My dad died ...,"Interviewer: Nice, keeping it on theme. So, te...",The answer is clear and conveys a personal sen...,4.0,The generated answer captures the essence of t...,4.0,"The answer diverges from the context, which fo...",2.0,The context provides relevant information abou...,2.0,The context provides a good amount of informat...,4.0,The answer expresses a personal sentiment abou...,4.0
87,fbfe46f7-24aa-465c-a744-587f472077a7,Tell me everything about how you feel about Ha...,Fun and candy.,I like it because it’s fun and we get candy.,"Interviewer: Such good picks! So, tell me—how ...","The answer is vague and lacks clarity, making ...",2.0,The generated answer captures the essence of t...,4.0,"The answer ""Fun and candy"" captures a general ...",2.0,The context provides necessary information abo...,4.0,The context provides a comprehensive view of t...,5.0,"The answer is very brief and lacks detail, fai...",2.0
88,57b45fe6-c016-4e1d-aef6-d50309d92c17,Tell me everything about how you feel about Ha...,"I love Halloween; it's my favorite holiday, an...","I love Halloween—dressing up, decorations, can...","Interviewer: Yesss, Beetlejuice is such a good...","The answer is clear and well-structured, with ...",5.0,The generated answer captures the enthusiasm f...,4.0,The answer does not accurately reflect the int...,2.0,The context provides relevant information abou...,2.0,The context provides some information about th...,3.0,The answer directly addresses the question by ...,5.0
89,19ab5410-a614-4a1b-99ca-f15ad467cb54,Tell me everything about how you feel about Ha...,Excited about getting candy.,"I’m excited to get candy, dress up, and hang o...",Interviewer: Those are the best! I still remem...,The answer is brief and lacks clarity and stru...,2.0,The generated answer captures the excitement a...,3.0,The answer captures a part of the interviewee'...,3.0,The context provides necessary information abo...,3.0,The context provides a comprehensive view of t...,5.0,The answer addresses the excitement for Hallow...,3.0


In [44]:
# Persist the evaluated rows (inputs plus per-metric scores and feedback) to CSV.
# index=False keeps the DataFrame index out of the written file.
result_df.to_csv("~/Desktop/temp/evaluation_output/ragas_evaluation_result.csv", index=False)