In [None]:
# !pip install --upgrade pip
# !pip install -q sentence-transformers
# !pip install ragas datasets

In [21]:
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import (
    context_recall,
    context_precision,
    faithfulness,
    answer_correctness,
    answer_similarity
)


In [22]:

import numpy as np
from sentence_transformers import SentenceTransformer, util
from dotenv import load_dotenv
from openai import OpenAI
import os

# --- Load API Key ---
load_dotenv(override=True)
my_api_key = os.getenv("OPENAI_API_KEY")

client = OpenAI(api_key=my_api_key)


In [23]:
# --- Retriever: Get top-k docs ---
def get_top_k_similar(query, k=3):
    documents = [
        {"section": "Pay Policies", "content": "Employees are paid bi-weekly via direct deposit."},
        {"section": "Leave of Absence", "content": "Employees must submit a leave request for approval."},
        {"section": "Internet Use", "content": "Company internet must be used for work-related tasks only."}
    ]

    texts = [doc["content"] for doc in documents]
    model = SentenceTransformer("all-MiniLM-L6-v2")
    doc_vectors = model.encode(texts, convert_to_tensor=True)
    query_vec = model.encode(query, convert_to_tensor=True)

    similarities = util.cos_sim(query_vec, doc_vectors)[0].cpu().numpy()
    top_k_idx = np.argsort(similarities)[::-1][:k]

    return [documents[int(idx)] for idx in top_k_idx]



In [24]:



# --- Generator: Use OpenAI with retrieved docs ---
def generate_answer(query, contexts):
    context_text = " ".join(contexts)
    prompt = f"Answer the question based only on the following context:\n{context_text}\n\nQuestion: {query}\nAnswer:"
    
    completion = client.chat.completions.create(
        model="gpt-4o-mini",  # or gpt-3.5-turbo
        messages=[{"role": "user", "content": prompt}],
        max_tokens=200,
    )
    return completion.choices[0].message.content.strip()



In [25]:

# --- Build dataset for Ragas ---
def build_dataset():
    query = "How often do employees get paid?"
    retrieved_docs = get_top_k_similar(query, 3)
    contexts = [d["content"] for d in retrieved_docs]

    # Gold reference
    gold_answer = "Employees are paid bi-weekly via direct deposit."

    # Generate answer using LLM
    model_answer = generate_answer(query, contexts)

    examples = [
        {
            "question": query,
            "answer": model_answer,        # LLM-generated answer
            "contexts": contexts,         
            "reference": gold_answer,     
            "ground_truths": [gold_answer]
        }
    ]
    return Dataset.from_list(examples)



In [29]:


dataset = build_dataset()

# --- All metrics across retriever, generator, and end-to-end ---
all_metrics = [
    context_recall, context_precision,  # Retriever
    
    faithfulness, answer_correctness, answer_similarity  # Generator

]

results = evaluate(dataset, metrics=all_metrics)

print("\n🔹 Full RAG Evaluation Results")
print(results)


Evaluating: 100%|██████████| 5/5 [00:05<00:00,  1.16s/it]



🔹 Full RAG Evaluation Results
{'context_recall': 1.0000, 'context_precision': 1.0000, 'faithfulness': 1.0000, 'answer_correctness': 0.7408, 'answer_similarity': 0.9632}


In [34]:
dataset = build_dataset()

# --- All metrics across retriever, generator, and end-to-end ---
all_metrics = [
    context_recall, context_precision,      # Retriever
    faithfulness, answer_correctness, answer_similarity  # Generator
]

results = evaluate(dataset, metrics=all_metrics)

print("\n🔹 Full RAG Evaluation Results (Aggregated)")
print(results)

# --- Show detailed per-example breakdown ---
print("\n🔹 Detailed Scores (per question & metric)")
for i, row in enumerate(dataset):
    print(f"\nQ{i+1}: {row['question']}")
    print(f"  Answer: {row['answer']}")
    print(f"  Ground Truth: {row['reference']}")
    print(f"  Contexts: {row['contexts']}")
    
    # Each metric is a column in `results` matching the dataset rows
for metric in all_metrics:
    metric_name = metric.name  # ✅ use the metric's internal name
    scores = results[metric_name]
    for i, score in enumerate(scores):
        print(f"Example {i+1} | {metric_name}: {score:.3f}")


Evaluating: 100%|██████████| 5/5 [00:05<00:00,  1.12s/it]



🔹 Full RAG Evaluation Results (Aggregated)
{'context_recall': 1.0000, 'context_precision': 1.0000, 'faithfulness': 1.0000, 'answer_correctness': 0.7417, 'answer_similarity': 0.9666}

🔹 Detailed Scores (per question & metric)

Q1: How often do employees get paid?
  Answer: Employees are paid bi-weekly.
  Ground Truth: Employees are paid bi-weekly via direct deposit.
  Contexts: ['Employees are paid bi-weekly via direct deposit.', 'Employees must submit a leave request for approval.', 'Company internet must be used for work-related tasks only.']
Example 1 | context_recall: 1.000
Example 1 | context_precision: 1.000
Example 1 | faithfulness: 1.000
Example 1 | answer_correctness: 0.742
Example 1 | answer_similarity: 0.967
