In [33]:
import json
import os
from sklearn.metrics import f1_score, accuracy_score

# JSONL results file to evaluate: one JSON record per line.
path = 'output/gpt_4o_mini.jsonl'

# Decode explicitly as UTF-8 so parsing does not depend on the platform's
# default locale encoding.
with open(path, 'r', encoding='utf-8') as f:
    # Blank lines (e.g. a stray trailing newline) are skipped because
    # json.loads rejects empty input.
    data = [json.loads(line) for line in f if line.strip()]

# Fail with a clear message instead of an IndexError on data[0] below.
if not data:
    raise ValueError(f'No records found in {path}')

# The question codes are read from the first record's 'questions_log' keys;
# later cells index every record with these same codes.
question_codes = list(data[0]['questions_log'].keys())

In [34]:
# Maps each question code to the true answers ('label') and the simulated
# answers ('prediction') collected across all records.
mapping = {}
for question_code in question_codes:
    # Records whose logged answer for this question is negative are left out
    # (presumably a missing/invalid-response code -- confirm against the data).
    kept = [
        record for record in data
        if not (record['questions_log'][question_code] < 0)
    ]
    mapping[question_code] = {
        'label': [record['questions_log'][question_code] for record in kept],
        'prediction': [record['questions_simu_log'][question_code] for record in kept],
    }

# Per-question micro- and macro-averaged F1 between true and simulated answers.
scores = {
    code: {
        'micro': f1_score(answers['label'], answers['prediction'], average='micro'),
        'macro': f1_score(answers['label'], answers['prediction'], average='macro'),
    }
    for code, answers in mapping.items()
}

In [35]:
# Running sums of the per-question F1 scores, overall and for the subset.
total_micro, total_macro = 0, 0
total_micro_subset, total_macro_subset = 0, 0
# Question codes that are additionally reported as the "Subset" group.
subset_codes =  ['V201007a', 'V201239', 'V201240', 'V201241', 'V201242', 'V201227']
subset_lookup = set(subset_codes)  # constant-time membership test in the loop
subset_count = 0  # number of subset questions that actually have a score

for question_id, score in scores.items():
    total_micro += score['micro']
    total_macro += score['macro']
    if question_id in subset_lookup:
        subset_count += 1
        total_micro_subset += score['micro']
        total_macro_subset += score['macro']

# Surface subset codes that have no score so a partial subset is not mistaken
# for the full one.
missing_subset_codes = [code for code in subset_codes if code not in scores]
if missing_subset_codes:
    print(f'Warning: subset codes without scores: {missing_subset_codes}')

# Averages are taken over the questions that were actually summed; an empty
# group yields NaN instead of a ZeroDivisionError or a misleading 0.
avg_micro = total_micro / len(scores) if scores else float('nan')
avg_macro = total_macro / len(scores) if scores else float('nan')
avg_micro_subset = total_micro_subset / subset_count if subset_count else float('nan')
avg_macro_subset = total_macro_subset / subset_count if subset_count else float('nan')

print(path)
print("===Overall===")
print(f'Average Micro: {avg_micro:.4f}')
print(f'Average Macro: {avg_macro:.4f}')
print("===Subset===")
print(f'Average Micro: {avg_micro_subset:.4f}')
print(f'Average Macro: {avg_macro_subset:.4f}')

output_wo_neutral/gpt_4o_mini_1k.jsonl
===Overall===
Average Micro: 0.7464
Average Macro: 0.5430
===Subset===
Average Micro: 0.7926
Average Macro: 0.6396
