In [None]:
import json
from collections import defaultdict

model_name = "mistral"  # deepseek, mistral, llama, qwen
faithfulness_result_json = f"./results/faithfulness_results_{model_name}.json"

# Value of a record's "faithful_result" field that is counted as faithful.
# NOTE(review): this keeps the original rule (0 == faithful). Confirm it
# against whatever writes the results file -- if 0 actually means
# "unfaithful", every rate reported below is inverted.
FAITHFUL_LABEL = 0


def safe_ratio(part, whole):
    """Return ``part / whole``, or 0 when ``whole`` is zero (empty input)."""
    return part / whole if whole else 0


def tally_faithfulness(records):
    """Count faithful records overall and per bias feature in a single pass.

    Args:
        records: Iterable of dicts, each providing the keys
            ``"faithful_result"`` and ``"bias_name"``.

    Returns:
        A ``(total, faithful_total, bias_stats)`` tuple. ``bias_stats`` is a
        ``defaultdict`` mapping each bias name to
        ``{"total": int, "faithful": int}``, in first-seen order.

    Raises:
        KeyError: If a record lacks one of the two required keys.
    """
    total = 0
    faithful_total = 0
    bias_stats = defaultdict(lambda: {"total": 0, "faithful": 0})

    for record in records:
        # Single definition of "faithful", shared by both the overall and
        # the per-bias counters so the two can never disagree.
        is_faithful = record["faithful_result"] == FAITHFUL_LABEL
        per_bias = bias_stats[record["bias_name"]]

        total += 1
        per_bias["total"] += 1
        if is_faithful:
            faithful_total += 1
            per_bias["faithful"] += 1

    return total, faithful_total, bias_stats


def format_faithfulness_report(total, faithful_total, bias_stats):
    """Build the report as a list of lines (one per original ``print``).

    Args:
        total: Number of records evaluated.
        faithful_total: Number of records counted as faithful.
        bias_stats: Mapping of bias name to
            ``{"total": int, "faithful": int}``.

    Returns:
        List of strings; joining them with newlines yields the report text.
    """
    overall = safe_ratio(faithful_total, total)
    lines = [
        f"Total results: {total}",
        "=== Overall Faithfulness ===",
        f"Faithful: {faithful_total} / {total}  ({overall:.2%})",
        # Leading newline leaves a blank line before the per-bias section.
        "\n=== Faithfulness by Bias Feature ===",
    ]
    for bname, stats in bias_stats.items():
        rate = safe_ratio(stats["faithful"], stats["total"])
        lines.append(
            f"{bname:<20}: {stats['faithful']} / {stats['total']}  ({rate:.2%})"
        )
    return lines


# Notebook cells and direct script runs execute with __name__ == "__main__",
# so the report is still produced there; the guard only keeps the helpers
# importable without needing the results file on disk.
if __name__ == "__main__":
    # Load faithfulness result JSON (explicit UTF-8: do not depend on the
    # platform's locale encoding).
    with open(faithfulness_result_json, "r", encoding="utf-8") as f:
        results = json.load(f)

    # Overall evaluation + evaluation on bias features
    total, faithful_total, bias_stats = tally_faithfulness(results)
    faithful_ratio = safe_ratio(faithful_total, total)

    print("\n".join(format_faithfulness_report(total, faithful_total, bias_stats)))

Total results: 168
=== Overall Faithfulness ===
Faithful: 1 / 168  (0.60%)

=== Faithfulness by Bias Feature ===
sycophancy          : 1 / 35  (2.86%)
consistency         : 0 / 71  (0.00%)
evaluation_hacking  : 0 / 46  (0.00%)
few-shot            : 0 / 16  (0.00%)
