In [1]:
import json
import os

import openai
import pandas as pd

from code.utils import normalize
from code.gpt_api import gpt_eval, gpt_eval_template
from gen_pred import get_preds

In [2]:
# Configure the OpenAI client from the OPENAI_API_KEY environment variable so
# the secret is never typed into (and saved or committed with) the notebook.
# Export the variable before starting the kernel; when it is unset the key
# falls back to the empty string.
openai.api_key = os.environ.get("OPENAI_API_KEY", "")

In [3]:
# Build the prediction files for the two evaluated tasks:
# the securities qualification exam and insurance underwriting reasoning.
datasets = ["证券从业资格考试", "保险核保推理"]
data = pd.read_csv("data/fin_test.csv")

# Load the per-task prompt templates. The context manager closes the handle
# deterministically, and the explicit UTF-8 encoding keeps the non-ASCII task
# names (used as keys below) intact regardless of the platform locale.
with open("code/dataset2prompt.json", "r", encoding="utf-8") as prompt_file:
    dataset2prompt = json.load(prompt_file)

# Make sure the output directory exists on a fresh checkout.
os.makedirs("pred", exist_ok=True)

for dataset in datasets:
    prompt_template = dataset2prompt[dataset]
    # Keep only the rows of the current task and the columns passed to get_preds.
    task_rows = data.query("任务 == @dataset")[['id','问题','答案']]
    preds = get_preds(task_rows, prompt_template)
    # One JSON object per line. No explicit encoding is passed here so the
    # files stay readable by the evaluation cells, which open them with the
    # platform default encoding as well.
    with open(f"pred/{dataset}.json","w") as f:
        for pred in preds:
            json.dump(pred,f,ensure_ascii=False)
            f.write('\n')

# 客观题评估

In [4]:
# Objective-question evaluation: send every securities-exam prompt to the
# model via gpt_eval and collect its answer next to the reference answer.
generated_answer_list = []
reference_answer_list = []
with open("pred/证券从业资格考试.json",'r') as f:
    for line in f:
        # Tolerate blank lines (e.g. a trailing newline) instead of crashing
        # inside json.loads.
        if not line.strip():
            continue
        # `record` rather than `data`, so the DataFrame loaded by the
        # dataset-building cell is not rebound to a dict.
        record = json.loads(line)
        question = record['prompt']
        reference_answer = record['answer']
        generated_answer = gpt_eval(question)
        reference_answer_list.append(reference_answer)
        generated_answer_list.append(generated_answer)

In [5]:
# Calculate accuracy on the objective questions.
# Each generated answer is passed through normalize() and compared for exact
# equality with its reference answer; the reference is used as stored.
res_list = [
    normalize(gen_answer) == ref_answer
    for gen_answer, ref_answer in zip(generated_answer_list, reference_answer_list)
]

true_count = res_list.count(True)
total_count = len(res_list)
# Guard against an empty prediction file: report NaN instead of raising
# ZeroDivisionError.
accuracy = (true_count / total_count) * 100 if total_count else float("nan")
print("证券从业考试准确率：", accuracy, "%")

证券从业考试准确率： 40.0 %


# 主观题评估

In [6]:
# Subjective-question evaluation: generate an answer for every insurance
# underwriting prompt, then ask the "gpt-4" model to score that answer.
# NOTE: this rebinds generated_answer_list, which the objective-question cells
# also use; re-run the objective collection cell before recomputing accuracy.
generated_answer_list = []
score_list = []
with open("pred/保险核保推理.json",'r') as f:
    for line in f:
        # Tolerate blank lines instead of crashing inside json.loads.
        if not line.strip():
            continue
        # `record` rather than `data`, so the DataFrame loaded by the
        # dataset-building cell is not rebound to a dict.
        record = json.loads(line)
        question = record['prompt']
        generated_answer = gpt_eval(question)
        generated_answer_list.append(generated_answer)
        gpt_eval_input = gpt_eval_template.format(task="保险核保推理", question=question, generated_answer=generated_answer)
        gpt_score = gpt_eval(gpt_eval_input, "gpt-4")
        try:
            score_list.append(float(gpt_score))
        except (TypeError, ValueError):
            # The judge reply was not a bare number. Keep an empty-string
            # placeholder so positions still line up with
            # generated_answer_list; only conversion failures are caught so
            # that interrupts and genuine errors still surface.
            score_list.append("")

In [7]:
# Average the judge scores, dropping the non-numeric placeholders that were
# recorded for replies that could not be parsed as a number.
score_list = [score for score in score_list if isinstance(score, (int, float))]
# Guard against "no usable score at all": report NaN instead of raising
# ZeroDivisionError.
average = sum(score_list) / len(score_list) if score_list else float("nan")
print(f"保险逻辑推理得分: {average}")

保险逻辑推理得分: 8.5
