In [1]:
from openai import OpenAI 

In [2]:
# Connection parameters (only has to be executed once during a session)

# Note: `localhost` is only valid as long as you are logged in on madagaskarweihe. 
# Otherwise, you will need to perform port forwarding on your machine using `ssh -fNL 8006:127.0.0.1:8006 ashousaa@madagaskarweihe`.
# This might require an ssh key to be uploaded to madagaskarweihe first.
VLLM_API_ENDPOINT = 'http://localhost:8006/v1' 
VLLM_KEY = 's7Vtzeyq3kfhVkPhlWdL95pPRBq36KDP1d5bBj54BqQ'
MODEL = "default"

In [3]:
# Connection to LLM server (only has to be executed once during a session)
client = OpenAI(api_key=VLLM_KEY,
                base_url=VLLM_API_ENDPOINT)

In [4]:
from pydantic import BaseModel
from typing import Literal


class EvaluationScores(BaseModel):
    correctness: Literal[1, 2, 3, 4, 5]
    correctness_reason: str
    completeness_reference: Literal[1, 2, 3, 4, 5]
    completeness_reference_reason: str
    faithfulness: Literal[1, 2, 3, 4, 5]
    faithfulness_reason: str
    completeness_question: Literal[1, 2, 3, 4, 5]
    completeness_question_reason: str




In [None]:
#Original version of the prompt: uses the context for completeness to reference instead of only the reference
def build_prompt_v1(response, reference, context, question):
  return f"""
You are an expert evaluator assessing hallucination in RAG-based answers. Rate the following RESPONSE using a 1–5 Likert scale on four dimensions. Each rating must be justified with a short reason.

Use this scale:
1 - Very Poor
2 - Poor
3 - Fair
4 - Good
5 - Excellent

Respond strictly in this format (only return valid JSON, no extra text):
{{
  "correctness": <1-5>,
  "correctness_reason": "<short justification>",
  "completeness_reference": <1-5>,
  "completeness_reference_reason": "<short justification>",
  "faithfulness": <1-5>,
  "faithfulness_reason": "<short justification>",
  "completeness_question": <1-5>,
  "completeness_question_reason": "<short justification>"
}}

Criteria:
1. **Correctness** – How accurately does the RESPONSE convey the same meaning as the REFERENCE, regardless of wording?
2. **Completeness (Reference)** – Does the RESPONSE include all key information from the REFERENCE?
3. **Faithfulness to Context** – Does the RESPONSE strictly reflect what is present in the CONTEXT (no made-up or contradictory info)?
4. **Completeness (Question)** – Does the RESPONSE fully address all aspects of the QUESTION?

QUESTION:
{question}

REFERENCE:
{reference}

RESPONSE:
{response}

CONTEXT:
{context}
"""


In [5]:
# Second version of the prompt: 
# Added ignore...
# Reduces points because wants answer to have everything mentioned in the context even if not relevant to the question.
#  Faithfulness is only about checking if the answer does not contradict the context or invent info. 
#  Completeness (question) is about checking if the answer is complete and answers the question fully

def build_prompt_v2(response, reference, context, question):
    return f"""
You are an expert evaluator assessing hallucination in RAG-based answers. Rate the following RESPONSE using a 1–5 Likert scale on four dimensions. Each rating must be justified with a short reason.

Use this scale:
1 - Very Poor
2 - Poor
3 - Fair
4 - Good
5 - Excellent

Respond strictly in this format (only return valid JSON, no extra text):
{{
  "correctness": <1-5>,
  "correctness_reason": "<short justification>",
  "completeness_reference": <1-5>,
  "completeness_reference_reason": "<short justification>",
  "faithfulness": <1-5>,
  "faithfulness_reason": "<short justification>",
  "completeness_question": <1-5>,
  "completeness_question_reason": "<short justification>"
}}

Criteria:
1. **Correctness** – How accurately does the RESPONSE convey the same meaning as the REFERENCE, regardless of wording?  Ignore the context and question for this rating
2. **Completeness (Reference)** – Does the RESPONSE include all key information from the REFERENCE? Ignore the context and question for this rating 
3. **Faithfulness to Context** – Does the RESPONSE strictly reflect what is present in the CONTEXT (no made-up or contradictory info)? Ignore the reference and question for this rating
4. **Completeness (Question)** – Does the RESPONSE fully address all aspects of the QUESTION?  Ignore the reference and context for this rating

RESPONSE:
{response}

REFERENCE (for correctness & completeness_reference):
{reference}

CONTEXT (for faithfulness):
{context}

QUESTION (for completeness_question):
{question}

"""


In [None]:
# Third version of the prompt: refined the defintions of faithfulness and completeness to question.

def build_prompt_v3(response, reference, context, question):
    return f"""
You are an expert evaluator assessing hallucination in RAG-based answers. Rate the following RESPONSE using a 1–5 Likert scale on four dimensions. Each rating must be justified with a short reason.

Use this scale:
1 - Very Poor
2 - Poor
3 - Fair
4 - Good
5 - Excellent

Respond strictly in this format (only return valid JSON, no extra text):
{{
  "correctness": <1-5>,
  "correctness_reason": "<short justification>",
  "completeness_reference": <1-5>,
  "completeness_reference_reason": "<short justification>",
  "faithfulness": <1-5>,
  "faithfulness_reason": "<short justification>",
  "completeness_question": <1-5>,
  "completeness_question_reason": "<short justification>"
}}

Criteria:
1. **Correctness** – How accurately does the RESPONSE convey the same meaning as the REFERENCE, regardless of wording?  Ignore the context and question for this rating
2. **Completeness (Reference)** – Does the RESPONSE include all key information from the REFERENCE? Ignore the context and question for this rating 
3. **Faithfulness to Context** – Check ONLY whether the RESPONSE is faithful to the CONTEXT.Ignore the reference and question for this rating
   - Faithful means: no made-up facts, no contradictions with the context.
   - Do NOT penalize for leaving out context information.
   - Faithfulness is about truthfulness, NOT completeness.
4. **Completeness (Question)** – Does the RESPONSE fully answer what is asked in the QUESTION? Ignore the reference and context for this rating
   - Use CONTEXT **only as needed** to support this.
   - Do NOT penalize if RESPONSE omits unrelated context details.

RESPONSE:
{response}

REFERENCE (for correctness & completeness_reference):
{reference}

CONTEXT (for faithfulness):
{context}

QUESTION (for completeness_question):
{question}

"""


In [None]:
# Uses RDNA definitions (Role,Description,Narrative,Aspects) to define the task.
def build_prompt_v4(response, reference, context, question):
    return f"""
You are evaluating an answer generated by a retrieval-augmented generation (RAG) system.

The person who asked the question was seeking an accurate, complete, and context-faithful answer to a specific travel-related problem. You are given the original QUESTION they asked, a CONTEXT that was used to generate the answer, a REFERENCE answer created by a human expert, and the actual RESPONSE from the system.

You must rate the RESPONSE using a 1–5 Likert scale on four specific aspects.

---

### DESCRIPTION
The task is to assess how well the RESPONSE aligns with factual accuracy, includes relevant content, and satisfies the user's information need.

### NARRATIVE
Your evaluation should consider:
- If the RESPONSE accurately captures the intent and content of the REFERENCE (Correctness)
- If the RESPONSE includes all key information from the REFERENCE (Completeness to Reference)
- If the RESPONSE is grounded in the CONTEXT and avoids introducing unsupported claims (Faithfulness)
- If the RESPONSE fully and directly answers the QUESTION (Completeness to Question)

Do not penalize for differences in wording, writing style, or length unless they affect clarity or substance.

---

### ASPECTS (RATED 1–5)
Use this scale:
1 - Very Poor
2 - Poor
3 - Fair
4 - Good
5 - Excellent

Return your evaluation strictly as a valid JSON object, like this:
{{
  "correctness": <1-5>,
  "correctness_reason": "<short justification>",
  "completeness_reference": <1-5>,
  "completeness_reference_reason": "<short justification>",
  "faithfulness": <1-5>,
  "faithfulness_reason": "<short justification>",
  "completeness_question": <1-5>,
  "completeness_question_reason": "<short justification>"
}}

---

### RESPONSE:
{response}

### REFERENCE (for correctness & completeness_reference):
{reference}

### CONTEXT (for faithfulness):
{context}

### QUESTION (for completeness_question):
{question}
"""


In [None]:
# Uses DNA definitions (Role,Description,Narrative,Aspects) to define the task.
def build_prompt_v5(response, reference, context, question):
    return f"""
The following task involves evaluating an answer produced by a retrieval-augmented generation (RAG) system in response to a real-world user query about travel regulations.

---

### DESCRIPTION
You will assess the quality of the system's RESPONSE based on how accurately it reflects the REFERENCE answer, includes important information, stays faithful to the CONTEXT used, and fully addresses the QUESTION.

You are provided with:
- A RESPONSE generated by the system
- A REFERENCE answer written by a human expert
- A CONTEXT document retrieved by the system
- The original QUESTION asked by the user

---

### NARRATIVE
Consider the following four evaluation dimensions:

1. **Correctness** – Does the RESPONSE convey the same meaning as the REFERENCE?  
   ↪ Focus only on semantic accuracy compared to the REFERENCE. Ignore context and question.

2. **Completeness (Reference)** – Does the RESPONSE include all key information from the REFERENCE?  
   ↪ Check for omission of important details found in the REFERENCE. Ignore context and question.

3. **Faithfulness (to Context)** – Is the RESPONSE factually consistent with the CONTEXT?  
   ↪ Look for hallucinations, contradictions, or made-up facts. Do not penalize for omissions. Ignore the REFERENCE and QUESTION.

4. **Completeness (to Question)** – Does the RESPONSE fully answer the QUESTION?  
   ↪ Judge whether the RESPONSE addresses everything asked. Use CONTEXT only as needed. Ignore the REFERENCE.

---

### ASPECTS (RATED 1–5)
Use this 5-point Likert scale:
1 - Very Poor  
2 - Poor  
3 - Fair  
4 - Good  
5 - Excellent  

Output your judgment in **strict JSON** format with one score and one short justification per dimension:

```json
{{
  "correctness": <1-5>,
  "correctness_reason": "<short justification>",
  "completeness_reference": <1-5>,
  "completeness_reference_reason": "<short justification>",
  "faithfulness": <1-5>,
  "faithfulness_reason": "<short justification>",
  "completeness_question": <1-5>,
  "completeness_question_reason": "<short justification>"
}}
RESPONSE:
{response}

REFERENCE (for correctness & completeness_reference):
{reference}

CONTEXT (for faithfulness):
{context}

QUESTION (for completeness_question):
{question}
"""

In [6]:
import pandas as pd
# Load the chunk ID to text mapping
chunk_df = pd.read_csv("NEW_chunks.csv") 
chunk_lookup = dict(zip(chunk_df['chunk_id'], chunk_df['chunk_text']))
print(chunk_lookup)

{'chunk_1': 'English Travel Reimbursement Law Revised Version of the State Travel Expense Act \nPreliminary Page A. Objective The previous travel expense regulations are outdated and require \nupdating and legal simplification to facilitate the conduct and administrative processing of official \ntravel. In addition, with regard to mobility behavior, the requirements of climate protection shall \nbe taken into account (the exemplary function of the state administration pursuant to §  of the \nBaden-Württemberg Climate Protection Act).', 'chunk_2': 'Revised Version of the State Travel Expense Act \nPreliminary Page B. Essential Content A revision of the State Travel Expense Act resulting in a \nmodern regulatory framework. The focal points are: . A new regulation for travel costs and \nmileage allowance. . . . . Adjustment of the reduction of the per diem allowance in the case of \ncomplimentary meals in line with tax law provisions, thereby eliminating the need to tax parts of \nthe per

In [11]:
# def resolve_chunk_id_to_text(chunk_id_string, chunk_lookup):

#     chunk_ids = [chunk.strip() for chunk in chunk_id_string.split(";") if chunk.strip()]
#     resolved_chunks = [chunk_lookup.get(chunk_id, f"[Missing: {chunk_id}]") for chunk_id in chunk_ids]
#     return "\n---\n".join(resolved_chunks)  # Join with separator if multiple

def resolve_chunk_id_to_text(chunk_id, chunk_lookup):

    return chunk_lookup.get(chunk_id.lower(), f"[Missing: {chunk_id}]")

res= resolve_chunk_id_to_text("Chunk_1", chunk_lookup)
print(res)

English Travel Reimbursement Law Revised Version of the State Travel Expense Act 
Preliminary Page A. Objective The previous travel expense regulations are outdated and require 
updating and legal simplification to facilitate the conduct and administrative processing of official 
travel. In addition, with regard to mobility behavior, the requirements of climate protection shall 
be taken into account (the exemplary function of the state administration pursuant to §  of the 
Baden-Württemberg Climate Protection Act).


In [12]:
from pydantic import ValidationError
import json

last_input = None

def evaluate_response(question, response, reference, context,prompt_builder):
    global last_input
    prompt = prompt_builder(response, reference, context, question)
    print(prompt)
    #   # DEBUG: Print what we're actually sending
    # print(f"DEBUG - Question: {question}")
    # print(f"DEBUG - Response (first 200 chars): {str(response)[:200]}...")
    # print(f"DEBUG - Reference (first 200 chars): {str(reference)[:200]}...")
    # print(f"DEBUG - Context (first 200 chars): {str(context)[:200]}...")
    # print("="*50)
    try:
        last_input = prompt  # Store the last input for debugging
        response = client.chat.completions.create(
            model=MODEL,
            messages=[
                {"role": "system", "content": "You are an expert evaluator assessing hallucination in RAG-based answers. Always return your ratings in valid JSON format. Do not include anything else."},
                {"role": "user", "content": prompt}
            ],
            temperature=0,
            max_tokens=512,
        )
         # Extract the JSON response
        output = response.choices[0].message.content.strip()

        json_start_index = output.find('{')
        output = output[json_start_index:].strip().rstrip("```") # Removing the words "json" and "```" at the end

        result = EvaluationScores(**json.loads(output))
         # Takes JSON string and converts it to a dictionary
        # Then double asterrisk operator does tuple unpacking and gives it to EvaluationScores constructor to convert the dictionary to a class object
        return result
    
    except json.JSONDecodeError as e:
        print("❌ Failed to parse JSON:")
        print(output)
        print(e)
    except ValidationError as e:
        print("❌ Schema validation failed:")
        print(output)
        print(e)
    return None



# from pydantic import ValidationError
# import json

# last_input = None

# def evaluate_response(question, response, reference, context, prompt_builder):
#     global last_input
    
#     # Store original data before any API calls
#     original_response = response
#     original_reference = reference
#     original_context = context
#     original_question = question
    
#     prompt = prompt_builder(original_response, original_reference, original_context, original_question)
    
#     # DEBUG: Print what we're actually sending
#     print(f"DEBUG - Question: {original_question}")
#     print(f"DEBUG - Response (first 200 chars): {str(original_response)[:200]}...")
#     print(f"DEBUG - Reference (first 200 chars): {str(original_reference)[:200]}...")
#     print(f"DEBUG - Context (first 200 chars): {str(original_context)[:200]}...")
#     print("="*50)
    
#     try:
#         last_input = prompt  # Store the last input for debugging
        
#         # ✅ FIXED: Use different variable name for API response
#         api_response = client.chat.completions.create(
#             model=MODEL,
#             messages=[
#                 {"role": "system", "content": "You are an expert evaluator assessing hallucination in RAG-based answers. Always return your ratings in valid JSON format. Do not include anything else."},
#                 {"role": "user", "content": prompt}
#             ],
#             temperature=0,
#             max_tokens=512,
#         )
        
#         # Extract the JSON response
#         output = api_response.choices[0].message.content.strip()

#         json_start_index = output.find('{')
#         output = output[json_start_index:].strip().rstrip("```")

#         result = EvaluationScores(**json.loads(output))
#         return result
    
#     except json.JSONDecodeError as e:
#         print("❌ Failed to parse JSON:")
#         print(output)
#         print(e)
#     except ValidationError as e:
#         print("❌ Schema validation failed:")
#         print(output)
#         print(e)
#     return None



In [None]:
import pandas as pd

def evaluate_all(input_csv: str, output_csv: str, prompt_builder, judge_prompt_id: str):
    df = pd.read_csv(input_csv)
    results = []


    for i, row in df.iterrows():
        
        # print(f"\n🔍 Processing row {i}")
        # print(f"Excel Generated Answer: {str(row['Generated Answer'])[:100]}...")
        # Call method to evaluate each row and pass to it the required parameters
        res = evaluate_response(
            question=row["Question"],
            response=row["Generated Answer"],
            reference=row["Reference Answer"],
            # context=row["Chunks Retrieved"],
            # context=resolve_chunk_ids_to_text(row["Chunks Retrieved"], chunk_lookup),
            context=resolve_chunk_id_to_text(row["Original Chunk"], chunk_lookup),
            prompt_builder=prompt_builder 
        )

        if res:

            # Create the output row with the scores
            result_row = {
                "Generation Model": row["Generation Model"],
                "Question Index": row["Question Index"],
                "Question": row["Question"],
                "Type": row["Type"],
                "Source_QID": row["Source_QID"],
                "Original Chunk": row["Original Chunk"],
                "Chunks Retrieved": row["Chunks Retrieved"],
                "Generated Answer": row["Generated Answer"],
                "Reference Answer": row["Reference Answer"],
                "Generation Prompt Used": row["Generation Prompt Used"],
                "Encoding Used": row["Encoding Used"],
                "Enable Thinking": row["Enable Thinking"],
                "correctness": res.correctness,
                "correctness_reason": res.correctness_reason,
                "completeness_reference": res.completeness_reference,
                "completeness_reference_reason": res.completeness_reference_reason,
                "faithfulness": res.faithfulness,
                "faithfulness_reason": res.faithfulness_reason,
                "completeness_question": res.completeness_question,
                "completeness_question_reason": res.completeness_question_reason,
                "Judge Model": MODEL,
                "Judge Prompt": judge_prompt_id
            }
            results.append(result_row)
            
    result_df = pd.DataFrame(results) # Create a DataFrame from the results, a DataFrame is a table-like structure in pandas
    result_df.to_csv(output_csv, index=False) # Save the results to a CSV file
    print(f"\n✅ Evaluation complete. Output saved to: {output_csv}") # Print a success message
    return result_df

In [None]:
 # Example single evaluation:

 
#     example_question = "What is considered an official trip?"
#     example_response = f"""Official trips in the sense of this Act are journeys undertaken to conduct official  
# business outside the usual place of work, which have been ordered or approved by the  
# responsible superior, unless an order or approval is not applicable due to the nature of the  
# official’s office or the nature of the official business."""
#     example_reference = f"""Official trips in the sense of this Act are journeys undertaken to conduct official  
# business outside the usual place of work, which have been ordered or approved by the  
# responsible superior, unless an order or approval is not applicable due to the nature of the  
# official’s office or the nature of the official business."""
#     example_context =  f"""Official trips in the sense of this Act are journeys undertaken to conduct official  
# business outside the usual place of work, which have been ordered or approved by the  
# responsible superior, unless an order or approval is not applicable due to the nature of the  
# official’s office or the nature of the official business. The order or approval must be given 
# in writing or electronically. Official trips also include journeys from a location serving as a 
# temporary residence to the place of work, provided that the conditions of sentences 1 
# and 2 are otherwise met. Official trips should only be carried out if a less costly method of 
# conducting the official business is not possible or reasonable."""

#     result = evaluate_response(example_question, example_response, example_reference, example_context, prompt_builder=build_prompt_v5)
#     print(result)



    



In [None]:
# Batch evaluation:


# CHANGE NAMES ACCORDING TO JUDGE PROMPT USED AND WHAT YOU ARE TESTING

output_csv = ""
# judge_prompt_id="prompt_v3"
judge_prompt_id="prompt_v2"

result=evaluate_all(
    
     judge_prompt_id=judge_prompt_id,

    # CHANGE NAMES ACCORDING TO JUDGE PROMPT USED AND WHAT YOU ARE TESTING


    # prompt_builder=build_prompt_v3,
    
    prompt_builder=build_prompt_v2,
    
        
    # input_csv="NEW_combined_size_Qwen3_outputs_Basic_RAG_Prompt.csv",
    # input_csv="NEW_combined_size_Phi-3_outputs_Basic_RAG_Prompt.csv", 
    # input_csv="NEW_combined_size_Llama-3.2_outputs_Basic_RAG_Prompt.csv",

    # input_csv="NEW_combined_en_prompt_outputs_Qwen3-4B.csv",
    input_csv="NEW_combined_en_size_Qwen3_outputs_Basic_RAG_Prompt.csv",
    #input_csv="NEW_combined_prompt_outputs_Phi-3-mini-128k-instruct.csv",
    # input_csv="NEW_combined_prompt_outputs_Llama-3.2-3B-Instruct.csv",
    # input_csv="NEW_combined_prompt_outputs_Qwen3-4B.csv",
    # input_csv="combined_size_Phi-3_outputs_Basic_RAG_Prompt.csv",
    # input_csv="combined_prompt_outputs_Phi-3-mini-128k-instruct.csv",
    # input_csv="combined_prompt_outputs_Phi-3-mini-4k-instruct.csv",
    # input_csv="combined_prompt_outputs_Llama-3.2-3B-Instruct.csv",
    # input_csv="combined_prompt_outputs_Qwen3-4B.csv",
    # input_csv= "combined_en_prompt_outputs_Qwen3-0.6B.csv",
    # input_csv="combined_en_size_Qwen3_outputs_Basic_RAG_Prompt.csv",
    # input_csv="combined_size_Llama-3.2_outputs_Basic_RAG_Prompt.csv",
    # input_csv="combined_size_Qwen3_outputs_Basic_RAG_Prompt_added_model.csv",
    # input_csv="combined_size_Qwen3_outputs_Basic_RAG_Prompt.csv",
    # input_csv="combined_prompt_outputs_Qwen3-0.6B.csv",

    # output_csv=f"evaluated_No_Context_Prompt_outputs_Phi-3-mini-4k-instruct_{judge_prompt_id}.csv" 
    # output_csv=f"evaluated_No_Context_Prompt_outputs_Phi-3-mini-128k-instruct_{judge_prompt_id}.csv"

    # output_csv=f"NEW2_evaluated_size_Qwen3_outputs_Basic_RAG_Prompt_{judge_prompt_id}.csv",
    output_csv=f"NEW2_evaluated_en_size_Qwen3_outputs_Basic_RAG_Prompt_{judge_prompt_id}.csv"


#    output_csv=f"NEW_evaluated_size_Phi-3_outputs_Basic_RAG_Prompt_{judge_prompt_id}.csv",
    # output_csv=f"NEW_evaluated_size_Llama-3.2-Instruct_outputs_Basic_RAG_Prompt_{judge_prompt_id}.csv",
   

    # output_csv=f"NEW_evaluated_en_prompt_outputs_Qwen3-4B_{judge_prompt_id}.csv" 
    # output_csv=f"NEW_evaluated_en_size_Qwen3_outputs_Basic_RAG_Prompt_{judge_prompt_id}.csv"
    # output_csv=f"NEW_evaluated_prompt_outputs_Phi-3-mini-128k-instruct_{judge_prompt_id}.csv" 
    #output_csv=f"NEW_evaluated_prompt_outputs_Llama-3.2-3B-Instruct_{judge_prompt_id}.csv"
    #output_csv=f"NEW_evaluated_prompt_outputs_Qwen3-4B_{judge_prompt_id}.csv" 
    # output_csv=f"evaluated_size_Phi-3_outputs_Basic_RAG_Prompt_{judge_prompt_id}.csv"
    # output_csv=f"evaluated_prompt_outputs_Phi-3-mini-128k-instruct_{judge_prompt_id}.csv" 
    # output_csv=f"evaluated_prompt_outputs_Phi-3-mini-4k-instruct_{judge_prompt_id}.csv" 
    # output_csv=f"evaluated_prompt_outputs_Llama-3.2-3B-Instruct_{judge_prompt_id}.csv" 
    # output_csv=f"evaluated_prompt_outputs_Qwen3-4B_{judge_prompt_id}.csv" 
    # output_csv=f"evaluated_en_prompt_outputs_Qwen3-0.6B_{judge_prompt_id}.csv" 
    # output_csv=f"evaluated_en_size_Qwen3_outputs_Basic_RAG_Prompt_{judge_prompt_id}.csv"
    # output_csv=f"evaluated_size_Llama-3.2_outputs_Basic_RAG_Prompt_{judge_prompt_id}.csv"
    # output_csv=f"evaluated_size_Qwen3_outputs_Basic_RAG_Prompt_{judge_prompt_id}_added_model.csv", 
    # output_csv=f"evaluated_size_Qwen3_outputs_Basic_RAG_Prompt_{judge_prompt_id}.csv", 
    # output_csv=f"evaluated_prompt_outputs_Qwen3-0.6B_{judge_prompt_id}.csv" 

)
print(result.to_json(indent=2, force_ascii=False))

In [None]:
print (last_input)

In [None]:
import pandas as pd
import re

# Read the CSV file into a pandas DataFrame. 

# CHANGE NAME ACCORDING TO WHAT YOU ARE TESTING
df = pd.read_csv("NEW2_evaluated_size_Qwen3_outputs_Basic_RAG_Prompt_prompt_v2.csv")

# # ===================== For testing different model sizes ===================================================

summary = df.groupby("Generation Model")[[
    "correctness",
    "completeness_reference",
    "faithfulness",
    "completeness_question"
]].mean().reset_index()

# Extract model size from the name for sorting
def extract_size(model_name):
    match = re.search(r"(\d+\.?\d*)B", model_name) # Match pattern like "0.6B", "1.3B", etc. If a match is found, the match variable will be a match object; otherwise, it will be None.
    return float(match.group(1)) if match else float('inf') # Group 1 captures the numeric part of the model size (e.g., "0.6" from "0.6B"). #  If no match is found, return infinity to sort it last.

summary["model_size"] = summary["Generation Model"].apply(extract_size) # Apply the extract_size function to the "Generation Model" column to create a new column "model_size" with the extracted numeric size.

# Sort by extracted numeric size
summary = summary.sort_values("model_size")

# This line removes the "model_size" column from the DataFrame, as it was only needed for sorting and is not required in the final output.
summary = summary.drop(columns=["model_size"]) 

# # CHANGE NAME ACCORDING TO WHAT YOU ARE TESTING
summary.to_csv(f"NEW2_size_performance_Qwen3_summary_Basic_RAG_Prompt_{judge_prompt_id}.csv", index=False) 
# summary.to_csv(f"NEW_size_en_performance_Qwen3_summary_Basic_RAG_Prompt_{judge_prompt_id}.csv", index=False)
# # summary.to_csv(f"NEW_size_performance_Qwen3_summary_Basic_RAG_Prompt_{judge_prompt_id}.csv", index=False) 
# # summary.to_csv(f"NEW_size_performance_Phi-3_summary_Basic_RAG_Prompt_{judge_prompt_id}.csv", index=False) 
# summary.to_csv(f"NEW_size_performance_Llama-3.2_summary_Basic_RAG_Prompt_{judge_prompt_id}.csv", index=False) 
# # summary.to_csv(f"NEW_size_performance_Qwen3_summary_Basic_RAG_Prompt_{judge_prompt_id}.csv", index=False) 
# # summary.to_csv(f"size_performance_Phi-3_summary_Basic_RAG_Prompt_{judge_prompt_id}.csv", index=False)
# # summary.to_csv(f"size_performance_Qwen3_summary_Basic_RAG_Prompt_{judge_prompt_id}.csv", index=False) 
# # summary.to_csv(f"size_performance_Qwen3_summary_Basic_RAG_Prompt_{judge_prompt_id}_added_model.csv", index=False) 
# # summary.to_csv(f"size_performance_Llama-3.2_summary_Basic_RAG_Prompt_{judge_prompt_id}.csv", index=False)
# # summary.to_csv(f"size_en_performance_Qwen3_summary_Basic_RAG_Prompt_{judge_prompt_id}.csv", index=False)
# # ===================== ======= ===================================================================================

# ===================== For testing different prompts ===================================================
# summary = df.groupby("Generation Prompt Used")[[
#     "correctness",
#     "completeness_reference",
#     "faithfulness",
#     "completeness_question"
# ]].mean().reset_index()

# #  # CHANGE NAME ACCORDING TO WHAT YOU ARE TESTING

# summary.to_csv(f"NEW_prompt_en_performance_summary_Qwen3-4B_{judge_prompt_id}.csv", index=False)
# # summary.to_csv(f"NEW_prompt_performance_summary_mini-128k-instruct_{judge_prompt_id}.csv", index=False)
# # # summary.to_csv(f"NEW_prompt_performance_summary_Llama-3.2-3B-Instruct_{judge_prompt_id}.csv", index=False)
# # #summary.to_csv(f"NEW_prompt_performance_summary_Qwen3-4B_{judge_prompt_id}.csv", index=False)
# # # summary.to_csv(f"prompt_performance_summary_mini-128k-instruct_{judge_prompt_id}.csv", index=False)
# # # summary.to_csv(f"prompt_performance_summary_Llama-3.2-3B-Instruct_{judge_prompt_id}.csv", index=False)
# # # summary.to_csv(f"prompt_performance_summary_Qwen3-4B_{judge_prompt_id}.csv", index=False)
# # # summary.to_csv(f"prompt_performance_summary_Qwen3-0.6B_{judge_prompt_id}.csv", index=False)
# # # summary.to_csv(f"prompt_en_performance_summary_Qwen3-0.6B_{judge_prompt_id}.csv", index=False)
#==================================================================================================