In [1]:
import pandas as pd
import numpy as np
import re
import json
from openai import OpenAI
import os
from dotenv import load_dotenv

# Run load_dotenv() first so that any key it places in the process environment
# (python-dotenv; presumably from a local .env file) is visible to the lookup below.
load_dotenv()
DEEPSEEK_API_KEY = os.environ.get("DEEPSEEK_API_KEY")  # None when the variable is unset

# Judge models the evaluation loop iterates over.
DEEPSEEK_MODELS = ["deepseek-chat"]

In [2]:
# Excel workbook loaded with pd.read_excel; the evaluation loop reads the
# question from its first column and the answer from its second column.
INPUT_FILE = "4o-QA.xlsx"

In [3]:
class LLMEvaluator:
    """LLM-as-judge scorer that grades question/answer pairs via the DeepSeek chat API.

    The judge is asked to return a JSON object holding one numeric score per
    entry in ``SCORE_DIMENSIONS`` plus a free-text ``Explanation``.
    """

    # Models whose request/response handling is implemented by this class.
    SUPPORTED_MODELS = ("deepseek-chat",)

    # Numeric dimensions requested from the judge, each on a 1-10 scale.
    SCORE_DIMENSIONS = (
        "Answer_Relevance",
        "Context_Precision",
        "Completeness",
        "Coherency",
        "Formatting_Quality",
    )

    # Stored when a dimension is absent or unusable. It lies outside the valid
    # 1-10 range on purpose so consumers can tell it apart from a real score.
    MISSING_SCORE = 0

    def __init__(self, deepseek_api_key: str = None):
        """Create an OpenAI-compatible client pointed at the DeepSeek endpoint.

        Args:
            deepseek_api_key: API key forwarded to the client as ``api_key``.
        """
        self.deepseek_api_key = deepseek_api_key
        self.client = OpenAI(
            api_key=self.deepseek_api_key,
            base_url="https://api.deepseek.com"
        )

    def call_deepseek_api(self, prompt: str, model: str = "deepseek-chat") -> str:
        """Send ``prompt`` as a single user message and return the reply text.

        Args:
            prompt: Full prompt text.
            model: Model name; must be one of ``SUPPORTED_MODELS``.

        Returns:
            The stripped message content, or ``""`` when the reply carries no content.

        Raises:
            ValueError: If ``model`` is not supported (previously this silently
                returned ``None``).
        """
        if model not in self.SUPPORTED_MODELS:
            raise ValueError(
                f"Unsupported model {model!r}; supported models: {list(self.SUPPORTED_MODELS)}"
            )

        response = self.client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.1,
            max_tokens=2000,
            response_format={"type": "json_object"}
        )
        # The message content may be None; normalise to "" so callers always get a str.
        content = response.choices[0].message.content
        return (content or "").strip()

    def save_to_json(self, results, output_path):
        """Write ``results`` to ``output_path`` as indented UTF-8 JSON and print a confirmation."""
        with open(output_path, 'w', encoding='utf-8') as jsonfile:
            json.dump(results, jsonfile, indent=2, ensure_ascii=False)
        print(f"Evaluation Results Saved in JSON: {output_path}")

    @classmethod
    def _normalize_score(cls, raw):
        """Return ``raw`` as a float clamped to [1, 10], or ``MISSING_SCORE`` if unusable."""
        try:
            value = float(raw)
        except (TypeError, ValueError):
            # None (missing/null) or non-numeric text such as "n/a".
            return cls.MISSING_SCORE
        if value != value:
            # NaN: min(10.0, nan) evaluates to 10.0, so without this guard an
            # undefined score would be reported as a perfect one.
            return cls.MISSING_SCORE
        return max(1.0, min(10.0, value))

    def _parse_evaluation(self, response: str) -> dict:
        """Extract the outermost JSON object from ``response`` and normalise its scores."""
        start_idx = response.find('{')
        end_idx = response.rfind('}')
        if start_idx == -1 or end_idx < start_idx:
            # Empty, truncated, or brace-free reply: fail with a message that says so.
            raise json.JSONDecodeError("No JSON object found in model response", response, 0)

        result = json.loads(response[start_idx:end_idx + 1])

        # Only the numeric dimensions are normalised; other keys (e.g. "Explanation")
        # are passed through untouched.
        for key in self.SCORE_DIMENSIONS:
            result[key] = self._normalize_score(result.get(key))
        return result

    def evaluate_with_llm(self, question, candidate, model="deepseek-chat"):
        """Ask the judge model to score ``candidate`` as an answer to ``question``.

        Args:
            question: Question text inserted into the prompt.
            candidate: Answer text inserted into the prompt.
            model: Judge model name; must be one of ``SUPPORTED_MODELS``.

        Returns:
            The parsed JSON object as a dict. Every key in ``SCORE_DIMENSIONS`` is
            present: a float clamped to [1, 10], or ``MISSING_SCORE`` when the
            judge omitted the dimension or returned an unusable value.

        Raises:
            ValueError: If ``model`` is not supported.
            json.JSONDecodeError: If the reply contains no parseable JSON object.
        """
        prompt = f"""
Below is a question and a student's answer based on literature about Additive Manufacturing, specifically regarding Fused Deposition Modeling (FDM) and Polylactic Acid (PLA). 
The question and the referenced literature may also include content related to PLA composites.

**Question:**
{question}

**Student Answer:**
{candidate}

Evaluate the answer on the following dimensions (score each from 1 to 10, decimals allowed, 10 = best):

1. Answer_Relevance: The degree to which the content of the student's answer is relevant to the question.
2. Context_Precision: The proportion of information in the student's answer that is relevant to the question, relative to all information provided in the answer.
3. Completeness: Whether the student's answer is complete, for example, whether there are any incomplete sentences.
4. Coherency: Whether the meaning of the student's answer is smooth and clear, for example, how well the sentences are connected to each other.
5. Formatting_Quality: Evaluate the use of Markdown. Consider whether Markdown syntax is present and whether the formatting meaningfully improves readability and organization.

Also provide an Explanation: Give the rationale for each score or identify potential issues, covering all dimensions.

Output only valid JSON in the following format, with no additional text outside the JSON. 
The values in this example are placeholders and should not be taken as the reference.

{{
  "Answer_Relevance": 5.0,
  "Context_Precision": 5.0,
  "Completeness": 5.0,
  "Coherency": 5.0,
  "Formatting_Quality": 5.0,
  "Explanation": "Scoring rationale or potential issues if exist."
}}

Note:
- Evaluate objectively and rigorously, and do not be lenient in scoring.
- Score EACH dimension independently. Do not blend dimensions or compensate a low score in one dimension with a high score in another. Do not compute or imply an overall/average score.
- Completeness: if any sentence or paragraph is clearly truncated, apply an additional penalty on this dimension, as truncation indicates missing content.
- Formatting_Quality: if the answer, even when short, could be made clearer with basic Markdown but none is used, deduct points; also deduct for incorrect or harmful formatting.
"""

        response = self.call_deepseek_api(prompt, model)
        return self._parse_evaluation(response)

In [4]:
# Load the question/answer sheet, then build the judge that will score its rows.
df = pd.read_excel(io=INPUT_FILE)
evaluator = LLMEvaluator(deepseek_api_key=DEEPSEEK_API_KEY)

In [5]:
def evaluate_responses_with_models(df, llm_evaluator=None, models=None):
    """Score every row of ``df`` with each judge model and average the scores.

    Args:
        df: DataFrame whose first column holds the question and whose second
            column holds the answer to grade (both are converted with ``str``).
        llm_evaluator: Object exposing ``evaluate_with_llm(question=, candidate=,
            model=)``. Defaults to the notebook-level ``evaluator``.
        models: Iterable of model names. Defaults to the notebook-level
            ``DEEPSEEK_MODELS``.

    Returns:
        ``{model: {"individual_results": [...], "averages": {dimension: mean}}}``.
        Each individual result holds the question, the response, and the raw
        evaluation dict. A dimension's average covers only valid scores (>= 1);
        it is NaN when no row produced a valid score for that dimension.
    """
    active_evaluator = evaluator if llm_evaluator is None else llm_evaluator
    active_models = DEEPSEEK_MODELS if models is None else models

    dimensions = ["Answer_Relevance", "Context_Precision", "Completeness", "Coherency", "Formatting_Quality"]
    model_results = {}
    total_rows = len(df)

    print(f"Start Evaluation of {total_rows} Data Rows.")
    print("=" * 50)

    for model in active_models:
        print(f"Evaluating Using {model}.")
        results = []

        # Count rows by position: the DataFrame index is not guaranteed to be a
        # 0-based integer range, so it cannot be used for the progress display.
        for position, (_, row) in enumerate(df.iterrows(), 1):
            question = str(row.iloc[0])
            response = str(row.iloc[1])
            print(f"   Processing Row {position}/{total_rows}...", end=" ")

            evaluation_results = active_evaluator.evaluate_with_llm(
                question=question,
                candidate=response,
                model=model
            )

            results.append({
                'question': question,
                'response': response,
                'LLM_evaluation_results': evaluation_results
            })
            print("LLM Evaluation for This Row Completed.")

        averages = {}
        for dim in dimensions:
            scores = []
            for r in results:
                evaluation = r["LLM_evaluation_results"]
                if not isinstance(evaluation, dict):
                    # No evaluation was produced for this row.
                    continue
                score = evaluation.get(dim, 0)
                # Valid scores live on the 1-10 scale; 0 is the "missing" sentinel
                # and must not drag the mean down. NaN fails the comparison too.
                if isinstance(score, (int, float)) and score >= 1:
                    scores.append(score)
            # np.mean([]) warns and yields NaN; make the empty case explicit.
            averages[dim] = np.mean(scores) if scores else float("nan")

        model_results[model] = {
            'individual_results': results,
            'averages': averages
        }

        print(f"Evaluation Using {model} Completed.")
    return model_results

# Run the full evaluation: one judge call per DataFrame row for every model in DEEPSEEK_MODELS.
results = evaluate_responses_with_models(df)

Start Evaluation of 45 Data Rows.
Evaluating Using deepseek-chat.
   Processing Row 1/45... LLM Evaluation for This Row Completed.
   Processing Row 2/45... LLM Evaluation for This Row Completed.
   Processing Row 3/45... LLM Evaluation for This Row Completed.
   Processing Row 4/45... LLM Evaluation for This Row Completed.
   Processing Row 5/45... LLM Evaluation for This Row Completed.
   Processing Row 6/45... LLM Evaluation for This Row Completed.
   Processing Row 7/45... LLM Evaluation for This Row Completed.
   Processing Row 8/45... LLM Evaluation for This Row Completed.
   Processing Row 9/45... LLM Evaluation for This Row Completed.
   Processing Row 10/45... LLM Evaluation for This Row Completed.
   Processing Row 11/45... LLM Evaluation for This Row Completed.
   Processing Row 12/45... LLM Evaluation for This Row Completed.
   Processing Row 13/45... LLM Evaluation for This Row Completed.
   Processing Row 14/45... LLM Evaluation for This Row Completed.
   Processing Row 1

In [6]:
# Report each model's per-dimension mean, then the unweighted mean of those five values.
for model, model_data in results.items():
    print(f"\n{model.upper()} Evaluation Results:")
    avg = model_data['averages']

    reported = []
    for dim in ("Answer_Relevance", "Context_Precision", "Completeness", "Coherency", "Formatting_Quality"):
        print(f"  {dim}: {avg[dim]:.2f}/10")
        reported.append(avg[dim])

    overall_avg = np.mean(reported)
    print(f"  Overall Average: {overall_avg:.2f}/10")


DEEPSEEK-CHAT Evaluation Results:
  Answer_Relevance: 9.40/10
  Context_Precision: 8.99/10
  Completeness: 9.64/10
  Coherency: 9.38/10
  Formatting_Quality: 6.91/10
  Overall Average: 8.86/10


In [7]:
# Persist per-row evaluations and per-model averages as JSON.
evaluator.save_to_json(results=results, output_path="4o-QA-3.json")

Evaluation Results Saved in JSON: 4o-QA-3.json
