In [None]:
import matplotlib.pyplot as plt

# Placeholder (simulated) mean similarity score per model; these values only
# drive the bar chart below.
average_scores = {"Mistral": 0.82, "DeepSeek": 0.78, "GPT": 0.87}

# Split the mapping into parallel label / value lists for plotting
models = [*average_scores]
scores = [*average_scores.values()]

# Bar chart drawn through the explicit Figure/Axes interface
fig, ax = plt.subplots(figsize=(8, 5))
bars = ax.bar(models, scores, color=["skyblue", "salmon", "mediumseagreen"])
ax.set_ylim(0, 1)
ax.set_ylabel("Average Similarity Score")
ax.set_title("Average Similarity Score by Model")
ax.grid(axis="y", linestyle="--", alpha=0.7)

# Write each score just above the top of its bar, centred horizontally
for rect, value in zip(bars, scores):
    label_x = rect.get_x() + rect.get_width() / 2
    label_y = rect.get_height() + 0.01
    ax.text(label_x, label_y, f"{value:.2f}",
            ha="center", va="bottom", fontsize=10)

fig.tight_layout()
plt.show()

In [None]:
# Evaluation records consumed by the scoring loop further down: one dict per
# query, holding the reference answer under "ground_truth" and each model's
# answer under "<model>_answer".
# The "..." answer strings are placeholders; replace them with real model
# outputs before interpreting the resulting scores.
dataset = [
  {
    "query": "What are the benefits of yoga?",
    "ground_truth": "Yoga improves flexibility and reduces stress.",
    "mistral_answer": "...",
    "deepseek_answer": "...",
    "gpt_answer": "..."
  },
  # Append further records with the same keys here.
]


from sentence_transformers import SentenceTransformer, util

# Sentence-embedding checkpoint used for scoring; swap in a stronger one if needed
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

def get_similarity(ref, pred):
    """Return the cosine similarity between the embeddings of ``ref`` and ``pred``.

    Both inputs are encoded with the module-level ``model`` (reference first,
    then prediction) and compared with ``util.cos_sim``; the ``[0][0]`` entry
    of the result is returned as a plain Python ``float``.

    Args:
        ref: Reference text (presumably a single string -- confirm with callers).
        pred: Candidate text to compare against ``ref``.

    Returns:
        The similarity score as a ``float``.
    """
    ref_vec, pred_vec = (
        model.encode(text, convert_to_tensor=True) for text in (ref, pred)
    )
    similarity_matrix = util.cos_sim(ref_vec, pred_vec)
    return float(similarity_matrix[0][0])

# Per-model similarity scores; each list receives one entry per dataset record.
results = {
    "mistral": [],
    "deepseek": [],
    "gpt": []
}

for item in dataset:
    ref = item["ground_truth"]
    # Each model's answer lives under "<model>_answer" in the record, so the
    # keys of `results` double as the lookup prefix.
    for model_name, model_scores in results.items():
        model_scores.append(get_similarity(ref, item[f"{model_name}_answer"]))

# Compute average per model. A model with no scores (e.g. an empty dataset)
# is reported as NaN instead of raising ZeroDivisionError.
averages = {
    name: (sum(values) / len(values) if values else float("nan"))
    for name, values in results.items()
}
print(averages)

