In [1]:
import os

from openai import OpenAI

In [2]:
# Connection parameters (only has to be executed once during a session)

# Note: `localhost` is only valid as long as you are logged in on madagaskarweihe.
# Otherwise, you will need to perform port forwarding on your machine using
# `ssh -fNL 8006:127.0.0.1:8006 ashousaa@madagaskarweihe`.
# This might require an ssh key to be uploaded to madagaskarweihe first.
VLLM_API_ENDPOINT = 'http://localhost:8006/v1'

# The API key is a secret and must not be stored in the notebook (notebooks get
# shared and committed). Export it before starting Jupyter, e.g.
# `export VLLM_API_KEY=<your key>`.
VLLM_KEY = os.environ.get("VLLM_API_KEY", "")
if not VLLM_KEY:
    # Warn right here so a later authentication error from the server is easy to trace back.
    print("WARNING: VLLM_API_KEY is not set; requests will fail if the server requires an API key.")

# Model identifier sent as `model=` with every request.
MODEL = "default"

In [3]:
# Shared client for the LLM server (only has to be created once per session).
# It talks to the endpoint in VLLM_API_ENDPOINT and authenticates with VLLM_KEY.
client = OpenAI(base_url=VLLM_API_ENDPOINT, api_key=VLLM_KEY)

In [4]:
from pydantic import BaseModel
from typing import Literal


# A rating on the 1-5 Likert scale; any other value fails validation.
LikertScore = Literal[1, 2, 3, 4, 5]


class EvaluationScores(BaseModel):
    """Validated verdict for one evaluated answer.

    Holds four Likert ratings, each paired with a free-text justification.
    """

    # Rating/justification pair named "correctness".
    correctness: LikertScore
    correctness_reason: str
    # Rating/justification pair named "completeness_reference".
    completeness_reference: LikertScore
    completeness_reference_reason: str
    # Rating/justification pair named "faithfulness".
    faithfulness: LikertScore
    faithfulness_reason: str
    # Rating/justification pair named "completeness_question".
    completeness_question: LikertScore
    completeness_question_reason: str




In [5]:
def build_prompt(response, reference, context, question):
    """Assemble the judge prompt for a single answer.

    Args:
        response: Text inserted under the RESPONSE heading (the answer being rated).
        reference: Text inserted under the REFERENCE heading.
        context: Text inserted under the CONTEXT heading.
        question: Text inserted under the QUESTION heading.

    Returns:
        The full prompt string: rating instructions, the expected JSON layout,
        the four criteria, and the four inputs in the order QUESTION,
        REFERENCE, RESPONSE, CONTEXT.
    """
    # Doubled braces are emitted as literal braces by str.format, so the JSON
    # skeleton in the instructions reaches the model with single braces.
    template = """
You are an expert evaluator assessing hallucination in RAG-based answers. Rate the following RESPONSE using a 1–5 Likert scale on four dimensions. Each rating must be justified with a short reason.

Use this scale:
1 - Very Poor
2 - Poor
3 - Fair
4 - Good
5 - Excellent

Respond strictly in this format (only return valid JSON, no extra text):
{{
  "correctness": <1-5>,
  "correctness_reason": "<short justification>",
  "completeness_reference": <1-5>,
  "completeness_reference_reason": "<short justification>",
  "faithfulness": <1-5>,
  "faithfulness_reason": "<short justification>",
  "completeness_question": <1-5>,
  "completeness_question_reason": "<short justification>"
}}

Criteria:
1. **Correctness** – How accurately does the RESPONSE convey the same meaning as the REFERENCE, regardless of wording?
2. **Completeness (Reference)** – Does the RESPONSE include all key information from the REFERENCE?
3. **Faithfulness to Context** – Does the RESPONSE strictly reflect what is present in the CONTEXT (no made-up or contradictory info)?
4. **Completeness (Question)** – Does the RESPONSE fully address all aspects of the QUESTION?

QUESTION:
{question}

REFERENCE:
{reference}

RESPONSE:
{response}

CONTEXT:
{context}
"""
    return template.format(
        question=question,
        reference=reference,
        response=response,
        context=context,
    )


In [6]:
from pydantic import ValidationError
import json

def _parse_first_json_object(text):
    """Decode the JSON value that starts at the first ``{`` in ``text``.

    Text before that brace (e.g. a "JSON:" label or an opening code fence) and
    text after the decoded value (e.g. a closing fence or trailing commentary)
    is ignored.

    Raises:
        json.JSONDecodeError: If ``text`` contains no ``{`` or the text
            starting at it is not valid JSON.
    """
    start = text.find("{")
    if start == -1:
        # str.find signals "not found" with -1; slicing with it would silently
        # keep only the last character of the output.
        raise json.JSONDecodeError("No JSON object found in model output", text, 0)
    # raw_decode stops at the end of the first JSON value instead of failing on
    # whatever follows it.
    parsed, _ = json.JSONDecoder().raw_decode(text[start:])
    return parsed


def evaluate_response(question, response, reference, context):
    """Ask the judge model to rate one answer and validate its verdict.

    Args:
        question: The question that was asked.
        response: The answer to be rated.
        reference: The reference answer to compare against.
        context: The context the answer is checked against for faithfulness.

    Returns:
        An ``EvaluationScores`` instance, or ``None`` when the model output
        is not parseable JSON or does not match the schema (the raw output and
        the error are printed in that case).

    Errors raised by the API call itself are not handled here and propagate
    to the caller.
    """
    prompt = build_prompt(response, reference, context, question)

    # Kept outside the try block: the handlers below print `output`, which
    # only exists once the request has succeeded.
    completion = client.completions.create(
        model=MODEL,
        prompt=prompt,
        temperature=0,
        max_tokens=512,
    )
    output = completion.choices[0].text.strip()

    try:
        return EvaluationScores(**_parse_first_json_object(output))
    except json.JSONDecodeError as e:
        print("❌ Failed to parse JSON:")
        print(output)
        print(e)
    except ValidationError as e:
        print("❌ Schema validation failed:")
        print(output)
        print(e)
    return None

In [10]:
import pandas as pd

def evaluate_all(input_csv: str, output_csv: str):
    """Evaluate every row of ``input_csv`` and write the scores to ``output_csv``.

    The input CSV must provide the columns "Question Number", "Question",
    "Generated Answer", "Ref_Answer", "Top-k Chunks Used" and
    "Original Chunk ID". Rows for which ``evaluate_response`` returns ``None``
    are left out of the output and listed in a summary at the end.

    Args:
        input_csv: Path of the CSV file holding the answers to evaluate.
        output_csv: Path the result CSV is written to (without the index).
    """
    df = pd.read_csv(input_csv)

    score_fields = [
        "correctness",
        "correctness_reason",
        "completeness_reference",
        "completeness_reference_reason",
        "faithfulness",
        "faithfulness_reason",
        "completeness_question",
        "completeness_question_reason",
    ]
    # Fixed column order, so the output has a header even if no row succeeded.
    columns = [
        "question_index",
        "correct_chunk",
        "reference_answer",
        "retrieved_chunks",
        *score_fields,
        "judge_model_used",
    ]

    results = []
    failed_questions = []

    for _, row in df.iterrows():
        question_number = row["Question Number"]
        print(f"Evaluating question {question_number}...")

        res = evaluate_response(
            question=row["Question"],
            response=row["Generated Answer"],
            reference=row["Ref_Answer"],
            context=row["Top-k Chunks Used"]
        )

        if res is None:
            # Remember the failure so it shows up in the final summary.
            failed_questions.append(question_number)
            continue

        result_row = {
            "question_index": question_number,
            "correct_chunk": row["Original Chunk ID"],
            "reference_answer": row["Ref_Answer"],
            "retrieved_chunks": row["Top-k Chunks Used"],
        }
        result_row.update({name: getattr(res, name) for name in score_fields})
        result_row["judge_model_used"] = MODEL
        results.append(result_row)

    pd.DataFrame(results, columns=columns).to_csv(output_csv, index=False)

    if failed_questions:
        skipped = ", ".join(str(q) for q in failed_questions)
        print(f"\n⚠️ {len(failed_questions)} of {len(df)} question(s) could not be evaluated and were skipped: {skipped}")
    print(f"\n✅ Evaluation complete. Output saved to: {output_csv}")

In [12]:
if __name__ == "__main__":
    # Example single evaluation
    example_question = "What is considered an official trip?"
    example_response = "An official trip is any journey taken for work purposes, approved by a supervisor."
    example_reference = "An official trip is travel undertaken for official duties, typically with prior approval."
    example_context = "Official trips are defined as journeys carried out due to job responsibilities, requiring authorization."

    # Keyword arguments keep the four text inputs from being mixed up.
    result = evaluate_response(
        question=example_question,
        response=example_response,
        reference=example_reference,
        context=example_context,
    )

    # evaluate_response returns None when the model output could not be
    # parsed or validated; it has already printed the details in that case.
    if result is None:
        print("⚠️ Example evaluation failed; see the error output above.")
    else:
        print(result.model_dump_json(indent=2))

    # # Uncomment to run batch evaluation:
    # evaluate_all("AddedColManually_RAG_Output_Answers.csv", "evaluated_output.csv")

{
  "correctness": 4,
  "correctness_reason": "The response accurately conveys the meaning of the reference, though it uses slightly different wording ('work purposes' vs 'official duties').",
  "completeness_reference": 4,
  "completeness_reference_reason": "The response includes the key information from the reference: travel for work and the need for approval/authorization.",
  "faithfulness": 5,
  "faithfulness_reason": "The response is entirely based on the provided context and does not introduce any new or contradictory information.",
  "completeness_question": 5,
  "completeness_question_reason": "The response directly and fully answers the question of what constitutes an official trip."
}
