# In [None]:
import os
from openai import OpenAI
import pandas as pd
import json

# Initialize the OpenAI client.
# The key is taken from the OPENAI_API_KEY environment variable instead of a
# string literal: the previous empty-string literal could not authenticate,
# and filling it in would have stored a credential in the source file.
# If the variable is unset, .get() returns None and the client is left to
# resolve (or reject) the missing key itself.
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY")
)

# Load the rubric and generated question data.
# The rubric is expected to provide 'Criteria' and 'Explanation' columns, and
# the generations sheet 'question', 'choices' and 'answer' columns (these are
# the columns read further down in this script).
rubric_data = pd.read_excel('MCQ_Evaluation_Rubric.xlsx')
generations_data = pd.read_excel('formatted_generated_test_results_few_shot.xlsx')

# Define a function to evaluate a question using the rubric
def evaluate_question(question, choices, answer, rubric: pd.DataFrame, *,
                      model: str = "gpt-4o", max_tokens: int = 2000) -> str:
    """
    Evaluates a question against a rubric using an LLM.

    Builds a system + user prompt from the rubric rows and the question data,
    sends it through the module-level ``client`` and returns the model's text.

    Args:
        question (str): The generated question text.
        choices (list): List of answer choices (formatted into the prompt as-is).
        answer (str): The correct answer.
        rubric (pd.DataFrame): The rubric dataframe; its 'Criteria' and
            'Explanation' columns are read.
        model (str): Chat model to query. Keyword-only; defaults to "gpt-4o".
        max_tokens (int): Upper bound on completion tokens. Keyword-only;
            defaults to 2000.

    Returns:
        str: The raw evaluation response from the model, or an empty string
        when the response carries no text content.

    Raises:
        KeyError: If the rubric lacks a 'Criteria' or 'Explanation' column.
    """
    # Walk the two needed columns in lockstep instead of building a Series
    # per row with iterrows().
    rubric_text = "\n".join(
        f"{criterion}: {explanation}"
        for criterion, explanation in zip(rubric["Criteria"], rubric["Explanation"])
    )
    # The number of criteria is derived from the rubric itself so the prompt
    # stays correct if rows are added to or removed from the spreadsheet.
    criteria_count = len(rubric)

    system_prompt = "You are an expert evaluator for educational questions. Be meticulous, accurate, and thorough in your evaluations. Ensure that grades (0-3) are assigned with strict adherence to the rubric's explanations. All grades across the range must be used appropriately, and your evaluations should include clear, unbiased reasoning."

    # NOTE: the leading whitespace inside this literal is part of the prompt
    # and is kept as-is so results remain comparable with earlier runs.
    user_prompt = f"""
                You are an expert evaluator assessing a multiple-choice question based on the following rubric:
                
                Rubric Criteria:
                {rubric_text}

                Question: {question}
                Choices: {choices}
                Answer: {answer}

                Provide an evaluation for each of the {criteria_count} criteria using the full grade range (0-3). For each criterion, provide:
                - Criterion name
                - Grade (0-3)
                - Detailed reason for the grade

                Calculate the total percentage score as the average of all grades (e.g., Total Percentage: 85.4%).

                Output Format:
                Criterion: <Criterion Name> | Grade: <Grade (0-3)> | Reason: <Reason>
                ...
                Total Percentage: <Percentage>
                Ensure the response strictly follows this format for easier parsing.
                """

    # Use the OpenAI client to evaluate
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        max_tokens=max_tokens,
    )
    evaluation_text = response.choices[0].message.content

    # The API may return a message without text content; normalise that to ""
    # so callers always receive the documented str.
    return evaluation_text if evaluation_text is not None else ""

# Main loop for evaluations
# Each row of the generations sheet is graded with one API call. If a call
# fails (or the run is interrupted) part-way through, the evaluations gathered
# so far are written to a separate "partial" workbook before the error is
# re-raised, so completed work is not lost.
evaluations = []
try:
    # Iterate the three needed columns in lockstep rather than via iterrows().
    for question, choices, answer in zip(
        generations_data["question"],
        generations_data["choices"],
        generations_data["answer"],
    ):
        evaluation_text = evaluate_question(question, choices, answer, rubric_data)
        evaluations.append({
            "Question": question,
            "Choices": choices,
            "Answer": answer,
            "Raw Evaluation": evaluation_text,
        })
except BaseException:
    # BaseException so that a manual interrupt of a long run is covered too;
    # the exception is always re-raised after the partial save.
    if evaluations:
        pd.DataFrame(evaluations).to_excel(
            'Evaluated_Questions_few_shot.partial.xlsx', index=False
        )
        print(
            f"Evaluation interrupted after {len(evaluations)} of "
            f"{len(generations_data)} questions. Partial results saved to "
            "Evaluated_Questions_few_shot.partial.xlsx"
        )
    raise

# Convert evaluations to a DataFrame
evaluation_results = pd.DataFrame(evaluations)

# Save the results to an Excel file
evaluation_results.to_excel('Evaluated_Questions_few_shot.xlsx', index=False)

print("Evaluation complete. Results saved to Evaluated_Questions_few_shot.xlsx")


# Output: Evaluation complete. Results saved to Evaluated_Questions_few_shot.xlsx
