In [50]:
import json
import pandas as pd
import os

# Recursively gather every CSV file under ../eval_results, in sorted path order.
all_eval_csvs = sorted(
    os.path.join(dirpath, filename)
    for dirpath, _subdirs, filenames in os.walk("../eval_results")
    for filename in filenames
    if filename.endswith(".csv")
)

# Load each CSV and tag its rows with the name of the directory that contains
# the file (stored in the `source` column), then stack everything into one frame.
frames = []
for csv_path in all_eval_csvs:
    run_name = os.path.basename(os.path.dirname(csv_path))
    frames.append(pd.read_csv(csv_path).assign(source=run_name))

df = pd.concat(frames, ignore_index=True)

In [51]:
# Summarise per-criterion scores for every source as mean ± sample standard deviation.
import ast
import statistics

# Each entry of df['scores'] is a string that ast.literal_eval turns into a
# dict. Parse every row exactly once and reuse the result everywhere below,
# instead of re-parsing the same strings for each (source, criterion) pair.
parsed_scores = [ast.literal_eval(raw_scores) for raw_scores in df['scores']]

# Extract all unique criteria from the scores dictionaries
all_criteria = set()
for score_dict in parsed_scores:
    all_criteria.update(score_dict.keys())

print("All criteria:", all_criteria)

# Bucket the raw values as source -> criterion -> [scores] in a single pass.
# Sources are inserted in order of first appearance and values keep row order,
# so the sums below add the numbers in the same order as a per-source filter would.
scores_by_source = {}
for source, score_dict in zip(df['source'], parsed_scores):
    source_bucket = scores_by_source.setdefault(source, {})
    for criterion, value in score_dict.items():
        source_bucket.setdefault(criterion, []).append(value)

# Calculate the average and standard deviation for each criterion by source
criteria_avgs_by_source = {}
criteria_stdevs_by_source = {}
for source, source_bucket in scores_by_source.items():
    criteria_avgs = {}
    criteria_stdevs = {}
    # Walk all_criteria (rather than the bucket's own keys) so every source
    # lists its criteria in the same order; criteria with no scores are skipped.
    for criterion in all_criteria:
        values = source_bucket.get(criterion)
        if not values:
            continue
        criteria_avgs[criterion] = sum(values) / len(values)
        # statistics.stdev needs at least two data points; report 0 for a single score.
        criteria_stdevs[criterion] = statistics.stdev(values) if len(values) > 1 else 0
    criteria_avgs_by_source[source] = criteria_avgs
    criteria_stdevs_by_source[source] = criteria_stdevs

# Print results by source
for source, avgs in criteria_avgs_by_source.items():
    print(f"\nAverage scores for {source}:")
    stdevs = criteria_stdevs_by_source[source]
    for criterion, avg in avgs.items():
        print(f"{criterion}: {avg:.2f} ± {stdevs[criterion]:.2f}")

All criteria: {'Believable Character Actions', 'Emotionally Engaging', 'Coherent', 'Adherence to Instructions', 'Consistent Voice/Tone of Writing'}

Average scores for eq_bench_writing_gpt-4.1-nano_04-21-20-39_3:
Believable Character Actions: 15.89 ± 1.48
Emotionally Engaging: 15.03 ± 1.74
Coherent: 16.25 ± 1.26
Adherence to Instructions: 17.68 ± 2.22
Consistent Voice/Tone of Writing: 17.79 ± 0.74
