In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from deepeval import evaluate
from deepeval.evaluate import DisplayConfig
# Display options passed to the deepeval `evaluate(...)` calls further down.
display_config = DisplayConfig(show_indicator=True, print_results=False, verbose_mode=False)
import toml
# --- Configuration -----------------------------------------------------------
with open('config.toml', 'r', encoding='utf-8') as toml_file:
    config = toml.load(toml_file)

model_names = config['model_names']

# For each model, use the entry from config['alternative_names'] when one
# exists; otherwise keep the original name. `model_names` is left untouched.
alter_names = [
    config['alternative_names'][name] if name in config['alternative_names'] else name
    for name in model_names
]

for shown_name in alter_names:
    print(shown_name)

# --- Question workbook -------------------------------------------------------
# sheet_name=None makes read_excel return a dict {sheet name: DataFrame};
# the sheet names are used as the list of domains.
INPUT_EXCEL_FILE = Path("goldens") / config['QA_file_name']
quesion_dfs = pd.read_excel(INPUT_EXCEL_FILE, sheet_name=None, index_col=0)
DOMAIN = list(quesion_dfs.keys())
# Row count of the first sheet only.
# NOTE(review): the message below assumes every sheet has this many rows — confirm.
num_questions_per_domain = len(quesion_dfs[DOMAIN[0]])
print(f"共有{len(DOMAIN)}个领域，每个领域有{num_questions_per_domain}个问题")

# --- Per-model answer workbooks ----------------------------------------------
# One workbook per model under QA/, again loaded as {sheet name: DataFrame}.
QA_df = {}
for model in model_names:
    QA_FILE = Path("QA") / f"{model}_answers.xlsx"
    QA_df[model] = pd.read_excel(QA_FILE, sheet_name=None, index_col=0)

DeepSeek-R1-Distill-Qwen-32B
DeepSeek-R1-Distill-Qwen-14B
deepseek/deepseek-r1
openai/gpt-4.1
openai/o3
anthropic/claude-sonnet-4
google/gemini-2.5-pro-preview
x-ai/grok-3-beta
共有13个领域，每个领域有10个问题


In [2]:
from custom_metrics import get_dataset, correctness_metric

In [None]:
from deepeval import evaluate
from deepeval.dataset import EvaluationDataset
# Smoke test: evaluate a single (inference, reference) pair before launching
# the full pairwise sweep.
test_modelA = model_names[0]
test_modelB = model_names[1]
case_dataset = get_dataset(
    infer_model=test_modelA,
    ref_model=test_modelB,
    question_dataframe=quesion_dfs,
    QA_dataframe=QA_df,
    domains=DOMAIN,
)

# Run the evaluation exactly once. The previous version also called
# `case_dataset.evaluate([correctness_metric])` and then immediately overwrote
# its result with this call, so every test case was evaluated twice.
evaluation_output = evaluate(
    case_dataset,
    [correctness_metric],
    display_config=display_config,
)

Evaluating 130 test case(s) in parallel: |          |  0% (0/130) [Time Taken: 00:00, ?test case/s]

In [None]:
# Pairwise correctness scores.
# correctness_matrix[a, b, :] holds the per-test-case scores obtained with
# model_names[a] as the inference model and model_names[b] as the reference
# model. Diagonal slices (a == b) are never evaluated and stay at 0.
num_models = len(model_names)
num_cases = len(DOMAIN) * num_questions_per_domain
correctness_matrix = np.zeros((num_models, num_models, num_cases))
print(correctness_matrix.shape)


def _evaluate_pair(infer_idx, ref_idx):
    """Evaluate one ordered (inference, reference) model pair.

    Args:
        infer_idx: Index into ``model_names`` of the model whose answers are judged.
        ref_idx: Index into ``model_names`` of the model used as the reference.

    Returns:
        1-D numpy array with the first metric's score for each entry of
        ``evaluation_output.test_results``, in the order deepeval returns them.
        NOTE(review): this assumes that order matches the dataset order — confirm.
    """
    inference_model = model_names[infer_idx]
    reference_model = model_names[ref_idx]
    case_dataset = get_dataset(
        infer_model=inference_model,
        ref_model=reference_model,
        question_dataframe=quesion_dfs,
        QA_dataframe=QA_df,
        domains=DOMAIN,
    )
    print(f"Evaluating {infer_idx}_{inference_model} vs {ref_idx}_{reference_model}")
    evaluation_output = evaluate(
        case_dataset,
        [correctness_metric],
        display_config=display_config,
        hyperparameters={"Temperature": 0.1, "Max Tokens": 50000},
    )
    return np.array([result.metrics_data[0].score for result in evaluation_output.test_results])


for i in range(num_models):
    for j in range(i + 1, num_models):
        # Evaluate both directions of each unordered pair. The helper derives
        # the dataset from the same indices used to address the matrix, so the
        # reversed direction really swaps inference and reference models
        # (previously the [j, i] slice was filled with a second i-vs-j run).
        for infer_idx, ref_idx in ((i, j), (j, i)):
            correctness_matrix[infer_idx, ref_idx, :] = _evaluate_pair(infer_idx, ref_idx)

np.save('deepeval_correctness_matrix.npy', correctness_matrix)