## Reference Data

In [None]:
# %pip install pytest

In [1]:
reference_data = [
  {
    "question": "What’s the leave policy?", 
    "ground_truth": "employees must submit a leave request for approval.", #Expected llm generated answer
    "context": "Employees must submit a leave request for approval. " #Expected retrieved context
  }
]

# reference_data = [
#   {
#     "question": "What is the company's policy on remote work?", 
#     "ground_truth": "Remote work is allowed up to 3 days per week.", #Expected llm generated answer
#     "context": "Remote work is allowed up to 3 days per week." #Expected retrieved context
#   }
# ]
question = reference_data[0]['question']
ground_truth = reference_data[0]['ground_truth']
context = reference_data[0]['context']
print (f"question: {question}")
print (f"ground_truth: {ground_truth}")
print (f"context: {context}")

question: What’s the leave policy?
ground_truth: employees must submit a leave request for approval.
context: Employees must submit a leave request for approval. 


In [2]:
# Retrieve context from Milvus DB

from milvus_chatbot_with_rag import retrieve_similiar_contexts, generate_answer

def perform_retrieval(question):

    retrieved_context = retrieve_similiar_contexts(question, "policy_docs_collection", 1)[0]['content']
    print (f"perform_retrieval.retrieved_context: {retrieved_context}")
    return retrieved_context

# Generate answer using LLM

question = reference_data[0]['question']
context = perform_retrieval(question)
answer = generate_answer(question, context)
answer


  from .autonotebook import tqdm as notebook_tqdm


Connected to Milvus on Zilliz Cloud
perform_retrieval.retrieved_context: Employees must submit a leave request for approval.


'Employees must submit a leave request for approval.'

In [None]:
# %pip install ragas datasets 

In [3]:
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import faithfulness, answer_correctness

from dotenv import load_dotenv
from openai import OpenAI
import os

# --- Load API Key ---
load_dotenv(override=True, dotenv_path="../.env")
my_api_key = os.getenv("OPENAI_API_KEY")


client = OpenAI(api_key=my_api_key)

# Question User asked
question = reference_data[0]['question']

# Reference context (should be a string)
reference_context = reference_data[0]['context']

# ground truth answer
ground_truth = reference_data[0]['ground_truth']

# Retrieved context (a string from perform_retrieval)
retrieved_context = [perform_retrieval(question)]
llm_answer = generate_answer(question, retrieved_context[0])

# Build dataset properly
dataset_dict = {
    "question": [question],
    "contexts": [retrieved_context],    # list of strings INSIDE another list
    "ground_truth": [ground_truth],   # single string/ reference answer
    "answer": [llm_answer]
}

print(f"dataset_dict: {dataset_dict}")

ragas_dataset = Dataset.from_dict(dataset_dict)


  from ragas.metrics import faithfulness, answer_correctness
  from ragas.metrics import faithfulness, answer_correctness


Connected to Milvus on Zilliz Cloud
perform_retrieval.retrieved_context: Employees must submit a leave request for approval.
dataset_dict: {'question': ['What’s the leave policy?'], 'contexts': [['Employees must submit a leave request for approval.']], 'ground_truth': ['employees must submit a leave request for approval.'], 'answer': ['Employees must submit a leave request for approval.']}


In [4]:
from ragas.llms.base import llm_factory
from ragas import evaluate
from ragas.metrics import answer_correctness

results = evaluate(
    dataset=ragas_dataset,
    metrics=[faithfulness, answer_correctness]  
)


print("LLM Generation Evaluation Results:")
results.to_pandas()



  from ragas.metrics import answer_correctness
Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]Exception raised in Job[1]: TypeError(Cannot use aembed_text() with a synchronous client. Use embed_text() instead.)
Evaluating: 100%|██████████| 2/2 [00:11<00:00,  5.83s/it]

LLM Generation Evaluation Results:





Unnamed: 0,user_input,retrieved_contexts,response,reference,faithfulness,answer_correctness
0,What’s the leave policy?,[Employees must submit a leave request for app...,Employees must submit a leave request for appr...,employees must submit a leave request for appr...,1.0,


In [None]:
from ragas.llms.base import llm_factory
from ragas import evaluate
from ragas.metrics import answer_correctness

# Create the modern LLM wrapper
client = OpenAI()
llm = llm_factory("gpt-4o-mini", client=client)

# Run evaluation
results = evaluate(
    dataset=ragas_dataset,
    metrics=[answer_correctness],
    llm=llm
)

print("LLM Generation Evaluation Results:")
results.to_pandas()


## 