In [31]:
from openai import OpenAI 

In [32]:
# Connection parameters (only has to be executed once during a session)

# Note: `localhost` is only valid as long as you are logged in on madagaskarweihe. 
# Otherwise, you will need to perform port forwarding on your machine using `ssh -fNL 8006:127.0.0.1:8006 ashousaa@madagaskarweihe`.
# This might require an ssh key to be uploaded to madagaskarweihe first.
VLLM_API_ENDPOINT = 'http://localhost:8006/v1' 
VLLM_KEY = 's7Vtzeyq3kfhVkPhlWdL95pPRBq36KDP1d5bBj54BqQ'
MODEL = "default"

In [33]:
# Connection to LLM server (only has to be executed once during a session)
client = OpenAI(api_key=VLLM_KEY,
                base_url=VLLM_API_ENDPOINT)

In [34]:
from pydantic import BaseModel
from typing import Literal


class EvaluationScores(BaseModel):
    correctness: Literal[1, 2, 3, 4, 5]
    correctness_reason: str
    completeness_reference: Literal[1, 2, 3, 4, 5]
    completeness_reference_reason: str
    faithfulness: Literal[1, 2, 3, 4, 5]
    faithfulness_reason: str
    completeness_question: Literal[1, 2, 3, 4, 5]
    completeness_question_reason: str




In [35]:
#Original version of the prompt: uses the context for completeness to reference instead of only the reference
def build_prompt_v1(response, reference, context, question):
  return f"""
You are an expert evaluator assessing hallucination in RAG-based answers. Rate the following RESPONSE using a 1–5 Likert scale on four dimensions. Each rating must be justified with a short reason.

Use this scale:
1 - Very Poor
2 - Poor
3 - Fair
4 - Good
5 - Excellent

Respond strictly in this format (only return valid JSON, no extra text):
{{
  "correctness": <1-5>,
  "correctness_reason": "<short justification>",
  "completeness_reference": <1-5>,
  "completeness_reference_reason": "<short justification>",
  "faithfulness": <1-5>,
  "faithfulness_reason": "<short justification>",
  "completeness_question": <1-5>,
  "completeness_question_reason": "<short justification>"
}}

Criteria:
1. **Correctness** – How accurately does the RESPONSE convey the same meaning as the REFERENCE, regardless of wording?
2. **Completeness (Reference)** – Does the RESPONSE include all key information from the REFERENCE?
3. **Faithfulness to Context** – Does the RESPONSE strictly reflect what is present in the CONTEXT (no made-up or contradictory info)?
4. **Completeness (Question)** – Does the RESPONSE fully address all aspects of the QUESTION?

QUESTION:
{question}

REFERENCE:
{reference}

RESPONSE:
{response}

CONTEXT:
{context}
"""


In [36]:
# Second version of the prompt: 
# Added ignore...
# Reduces points because wants answer to have everything mentioned in the context even if not relevant to the question.
#  Faithfulness is only about checking if the answer does not contradict the context or invent info. 
#  Completeness (question) is about checking if the answer is complete and answers the question fully

def build_prompt_v2(response, reference, context, question):
    return f"""
You are an expert evaluator assessing hallucination in RAG-based answers. Rate the following RESPONSE using a 1–5 Likert scale on four dimensions. Each rating must be justified with a short reason.

Use this scale:
1 - Very Poor
2 - Poor
3 - Fair
4 - Good
5 - Excellent

Respond strictly in this format (only return valid JSON, no extra text):
{{
  "correctness": <1-5>,
  "correctness_reason": "<short justification>",
  "completeness_reference": <1-5>,
  "completeness_reference_reason": "<short justification>",
  "faithfulness": <1-5>,
  "faithfulness_reason": "<short justification>",
  "completeness_question": <1-5>,
  "completeness_question_reason": "<short justification>"
}}

Criteria:
1. **Correctness** – How accurately does the RESPONSE convey the same meaning as the REFERENCE, regardless of wording?  Ignore the context and question for this rating
2. **Completeness (Reference)** – Does the RESPONSE include all key information from the REFERENCE? Ignore the context and question for this rating 
3. **Faithfulness to Context** – Does the RESPONSE strictly reflect what is present in the CONTEXT (no made-up or contradictory info)? Ignore the reference and question for this rating
4. **Completeness (Question)** – Does the RESPONSE fully address all aspects of the QUESTION?  Ignore the reference and context for this rating

RESPONSE:
{response}

REFERENCE (for correctness & completeness_reference):
{reference}

CONTEXT (for faithfulness):
{context}

QUESTION (for completeness_question):
{question}

"""


In [37]:
# Third version of the prompt: refined the defintions of faithfulness and completeness to question.

def build_prompt_v3(response, reference, context, question):
    return f"""
You are an expert evaluator assessing hallucination in RAG-based answers. Rate the following RESPONSE using a 1–5 Likert scale on four dimensions. Each rating must be justified with a short reason.

Use this scale:
1 - Very Poor
2 - Poor
3 - Fair
4 - Good
5 - Excellent

Respond strictly in this format (only return valid JSON, no extra text):
{{
  "correctness": <1-5>,
  "correctness_reason": "<short justification>",
  "completeness_reference": <1-5>,
  "completeness_reference_reason": "<short justification>",
  "faithfulness": <1-5>,
  "faithfulness_reason": "<short justification>",
  "completeness_question": <1-5>,
  "completeness_question_reason": "<short justification>"
}}

Criteria:
1. **Correctness** – How accurately does the RESPONSE convey the same meaning as the REFERENCE, regardless of wording?  Ignore the context and question for this rating
2. **Completeness (Reference)** – Does the RESPONSE include all key information from the REFERENCE? Ignore the context and question for this rating 
3. **Faithfulness to Context** – Check ONLY whether the RESPONSE is faithful to the CONTEXT.Ignore the reference and question for this rating
   - Faithful means: no made-up facts, no contradictions with the context.
   - Do NOT penalize for leaving out context information.
   - Faithfulness is about truthfulness, NOT completeness.
4. **Completeness (Question)** – Does the RESPONSE fully answer what is asked in the QUESTION? Ignore the reference and context for this rating
   - Use CONTEXT **only as needed** to support this.
   - Do NOT penalize if RESPONSE omits unrelated context details.

RESPONSE:
{response}

REFERENCE (for correctness & completeness_reference):
{reference}

CONTEXT (for faithfulness):
{context}

QUESTION (for completeness_question):
{question}

"""


In [38]:
from pydantic import ValidationError
import json

def evaluate_response(question, response, reference, context,prompt_builder):
    prompt = build_prompt_v3(response, reference, context, question)
    try:
        response = client.chat.completions.create(
            model=MODEL,
            messages=[
                {"role": "system", "content": "You are an expert evaluator assessing hallucination in RAG-based answers. Always return your ratings in valid JSON format. Do not include anything else."},
                {"role": "user", "content": prompt}
            ],
            temperature=0,
            max_tokens=512,
        )

        output = response.choices[0].message.content.strip()
        json_start_index = output.find('{')
        output = output[json_start_index:].strip().rstrip("```")
        result = EvaluationScores(**json.loads(output))
        return result
    except json.JSONDecodeError as e:
        print("❌ Failed to parse JSON:")
        print(output)
        print(e)
    except ValidationError as e:
        print("❌ Schema validation failed:")
        print(output)
        print(e)
    return None



In [39]:
import pandas as pd

def evaluate_all(input_csv: str, output_csv: str, prompt_builder, prompt_id: str):
    df = pd.read_csv(input_csv)
    results = []

    for i, row in df.iterrows():
        res = evaluate_response(
            question=row["Question"],
            response=row["Generated Answer"],
            reference=row["Ref_Answer"],
            context=row["Top-k Chunks Used"],
            prompt_builder=prompt_builder 
        )

        if res:
            result_row = {
                "question_index": row["Question Number"],
                "correct_chunk": row["Original Chunk ID"],
                "reference_answer": row["Ref_Answer"],
                "retrieved_chunks": row["Top-k Chunks Used"],
                "correctness": res.correctness,
                "correctness_reason": res.correctness_reason,
                "completeness_reference": res.completeness_reference,
                "completeness_reference_reason": res.completeness_reference_reason,
                "faithfulness": res.faithfulness,
                "faithfulness_reason": res.faithfulness_reason,
                "completeness_question": res.completeness_question,
                "completeness_question_reason": res.completeness_question_reason,
                "judge_model_used": MODEL,
                "prompt_variant_id": prompt_id
            }
            results.append(result_row)
    result_df = pd.DataFrame(results)
    result_df.to_csv(output_csv, index=False)
    print(f"\n✅ Evaluation complete. Output saved to: {output_csv}")
    return result_df

In [40]:
if __name__ == "__main__":
#     # Example single evaluation
#     example_question = "What is considered an official trip?"
#     example_response = f"""Official trips in the sense of this Act are journeys undertaken to conduct official  
# business outside the usual place of work, which have been ordered or approved by the  
# responsible superior, unless an order or approval is not applicable due to the nature of the  
# official’s office or the nature of the official business."""
#     example_reference = f"""Official trips in the sense of this Act are journeys undertaken to conduct official  
# business outside the usual place of work, which have been ordered or approved by the  
# responsible superior, unless an order or approval is not applicable due to the nature of the  
# official’s office or the nature of the official business."""
#     example_context =  f"""Official trips in the sense of this Act are journeys undertaken to conduct official  
# business outside the usual place of work, which have been ordered or approved by the  
# responsible superior, unless an order or approval is not applicable due to the nature of the  
# official’s office or the nature of the official business. The order or approval must be given 
# in writing or electronically. Official trips also include journeys from a location serving as a 
# temporary residence to the place of work, provided that the conditions of sentences 1 
# and 2 are otherwise met. Official trips should only be carried out if a less costly method of 
# conducting the official business is not possible or reasonable."""

#     result = evaluate_response(example_question, example_response, example_reference, example_context)


    # Uncomment to run batch evaluation:
    result=evaluate_all(
        input_csv="AddedColManually_RAG_Output_Answers.csv",
        output_csv="evaluated_output_v2.csv",
        prompt_builder=build_prompt_v2,
        prompt_id="prompt_v2"
    )
    print(result.model_dump_json(indent=2))


✅ Evaluation complete. Output saved to: evaluated_output_v2.csv


AttributeError: 'DataFrame' object has no attribute 'model_dump_json'