In [2]:
import pandas as pd

# 1. Read the model-answer file and the reference file, in that order
llama_df, centaur_df = (
    pd.read_csv(csv_name)
    for csv_name in (
        "llama70b_answers_second_round_with_origin.csv",
        "Centaur_Lab_Second_Round.csv",
    )
)

# 2. Merge on Origin and bring in answer_corr and data_source_corr
#    The inner join keeps only rows whose Origin appears in both frames, so
#    rows without a partner on the other side are dropped without warning.
#    NOTE(review): if an Origin value repeats in centaur_df, every repeat
#    multiplies the matching llama_df rows — confirm the key is unique there.
comparison = (
    llama_df
    .merge(
        centaur_df[['Origin', 'answer_corr', 'data_source_corr']],
        on='Origin',
        how='inner'
    )
)

# 3. Compute match flag
#    astype(str) renders a missing value as the text "nan", so comparing the
#    strings alone would score two missing answers as a correct match.
#    Require both sides to be present before comparing the stripped strings;
#    a row with a missing answer on either side is counted as a non-match.
comparison['is_match'] = (
    comparison['Extracted_Answer'].notna()
    & comparison['answer_corr'].notna()
    & (
        comparison['Extracted_Answer'].astype(str).str.strip()
        == comparison['answer_corr'].astype(str).str.strip()
    )
)

# 4. Overall accuracy and std
#    The mean of the boolean flag is the fraction of matching rows. The std is
#    the sample standard deviation of that same 0/1 flag, so it is determined
#    by the accuracy itself rather than being an independent measurement.
overall_acc, overall_std = (
    comparison['is_match'].mean(),
    comparison['is_match'].std(),
)
print(f"Overall accuracy       : {overall_acc:.2%}")
print(f"Overall std deviation  : {overall_std:.2%}")

# 5. Accuracy and std by data_source_corr
#    One row per data_source_corr value: mean and sample std of the match
#    flag. Both columns are then replaced by percentage strings for display,
#    so by_source holds text (not numbers) after this step.
by_source = (
    comparison
    .groupby('data_source_corr')['is_match']
    .agg(accuracy='mean', std='std')
    .reset_index()
    .assign(
        accuracy=lambda stats: stats['accuracy'].map("{:.2%}".format),
        std=lambda stats: stats['std'].map("{:.2%}".format),
    )
)

# Heading and table are emitted on separate lines, heading first
print("\nAccuracy and std by data_source_corr:", by_source, sep="\n")


Overall accuracy       : 45.54%
Overall std deviation  : 49.82%

Accuracy and std by data_source_corr:
  data_source_corr accuracy     std
0             jama   52.23%  49.99%
1       medbullets   48.79%  50.11%
2         medxpert   21.38%  41.07%
3             mmlu   61.66%  48.75%
