In [5]:
import pandas as pd
import numpy as np

def normalize_answers(values):
    """Return answer labels stripped and upper-cased, keeping missing values missing.

    Casting straight to ``str`` turns a missing entry into the literal text
    "nan"; after upper-casing, a missing prediction would then compare equal
    to a missing ground-truth label. Masking with the original null positions
    keeps those entries as NaN instead.

    Args:
        values: pandas Series of raw answer labels (any dtype).

    Returns:
        Series on the same index with cleaned labels, NaN where ``values``
        was null.
    """
    cleaned = values.astype(str).str.strip().str.upper()
    return cleaned.where(values.notna())


def score_matches(predicted, expected):
    """Return a boolean Series that is True where prediction equals ground truth.

    A row with a missing value on either side is scored False, so two missing
    values are never counted as a correct answer.

    Args:
        predicted: normalized predicted labels (NaN for missing).
        expected: normalized ground-truth labels (NaN for missing), on the
            same index as ``predicted``.

    Returns:
        Boolean Series on the same index.
    """
    both_present = predicted.notna() & expected.notna()
    return both_present & (predicted == expected)


# Guarded so the helpers above can be imported without touching the CSV files.
# In a notebook cell or a script run directly, __name__ is "__main__", so the
# evaluation below still executes and defines the same global variables.
if __name__ == "__main__":
    # Step 1: Load both files
    df = pd.read_csv("llama70b_answers_summary.csv")           # predictions
    truth_df = pd.read_csv("merged_llm_4k_questions.csv")       # ground truth

    # Step 2: Create artificial QA_ID for row alignment
    # The ground-truth file is keyed by row position: row i becomes "Merge Q<i+1>".
    truth_df = truth_df.reset_index()
    truth_df["QA_ID"] = "Merge Q" + (truth_df["index"] + 1).astype(str)

    # Step 3: Standardize formats (missing values stay NaN, not the text "NAN")
    df["Extracted_Answer"] = normalize_answers(df["Extracted_Answer"])
    truth_df["answer_idx"] = normalize_answers(truth_df["answer"])

    # Step 4: Merge using QA_ID
    # Inner join: predictions without a matching ground-truth row are dropped.
    merged = pd.merge(df, truth_df[["QA_ID", "answer_idx", "data_source"]], on="QA_ID", how="inner")

    # Step 5: Compute match (boolean)
    merged["match"] = score_matches(merged["Extracted_Answer"], merged["answer_idx"])

    # Step 6: Overall statistics
    correct_count = merged["match"].sum()
    # "match" is always a plain boolean, so every merged row is an evaluated row.
    total_count = len(merged)
    accuracy = correct_count / total_count if total_count > 0 else 0
    std_dev = merged["match"].std(ddof=1)  # sample standard deviation

    print(f"Correct Predictions: {correct_count}")
    print(f"Total Evaluated: {total_count}")
    print(f"Accuracy: {accuracy:.2%}")
    print(f"Standard Deviation (Overall): {std_dev:.4f}")

    # Step 7: Per-data source statistics
    grouped = (
        merged.groupby("data_source")["match"]
        .agg(accuracy="mean", std_dev=lambda x: x.std(ddof=1))
        .reset_index()
    )

    # Step 8: Display per-source results
    print("\nAccuracy and Std Dev by Data Source:")
    print(grouped)


Correct Predictions: 1218
Total Evaluated: 4054
Accuracy: 30.04%
Standard Deviation (Overall): 0.4585

Accuracy and Std Dev by Data Source:
  data_source  accuracy   std_dev
0        jama  0.480658  0.499868
1  medbullets  0.486577  0.500661
2    medxpert  0.148571  0.355738
3        mmlu  0.779412  0.415408
