In [1]:
# ============================================================================
# RAG EVALUATION WITH RAGAS
# ============================================================================
# This notebook demonstrates how to evaluate a RAG (Retrieval-Augmented 
# Generation) system using RAGAS metrics. It uses a PDF document about 
# "The Ember & Oak Kitchen" restaurant as the knowledge base.
# ============================================================================

# Load environment variables from .env file (for API keys)
from dotenv import load_dotenv

# LangChain imports for building the RAG pipeline
from langchain.chat_models import init_chat_model
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader

# RAGAS metrics for evaluation
from ragas.metrics import (
    LLMContextPrecisionWithoutReference,  # Measures how relevant retrieved context is
    LLMContextRecall,                      # Measures if all relevant info was retrieved
    ContextEntityRecall,                   # Measures entity coverage in retrieved context
    NoiseSensitivity,                      # Measures robustness to irrelevant context
    ResponseRelevancy,                     # Measures relevance of generated response
    Faithfulness                            # Measures if response is grounded in context
)
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.metrics import LLMContextRecall
from ragas import evaluate

# Pinecone vector database for storing embeddings
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone
from pinecone import ServerlessSpec

import os
from datasets import Dataset  # HuggingFace datasets for RAGAS

# Import test examples with ground truth answers
from data import examples

# LangChain chains for RAG pipeline
from langchain_classic.chains import create_retrieval_chain 
from langchain_classic.chains.combine_documents.stuff import create_stuff_documents_chain 
from langchain_core.prompts import ChatPromptTemplate

# Load variables from a local .env file into the process environment.
# PINECONE_API_KEY is read explicitly via os.getenv() when the Pinecone client
# is created; OPENAI_API_KEY is presumably picked up implicitly by the OpenAI
# chat/embedding clients (confirm against the provider docs).
# load_dotenv() returns a bool; as the last expression of the cell it is echoed
# as the cell output.
load_dotenv()

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


True

In [2]:
# ----------------------------------------------------------------------------
# STEP 1 - RAG PIPELINE SETUP
# ----------------------------------------------------------------------------
# Creates the objects the following cells depend on:
#   Model        chat LLM that writes the answers
#   all_splits   the PDF, loaded and cut into overlapping chunks
#   embeddings   OpenAI embedding model used for indexing and querying
#   vectorstore  Pinecone-backed store of the chunk embeddings
#   retriever    top-3 retriever on top of that store
# ----------------------------------------------------------------------------

# --- Generation model -------------------------------------------------------
# GPT-4o with a sampling temperature of 0.6.
Model = init_chat_model("gpt-4o", temperature=0.6)

# --- Source document --------------------------------------------------------
# The knowledge base is a single PDF profile of The Ember & Oak Kitchen.
loader = PyPDFLoader("The_Ember_and_Oak_Kitchen_Profile.pdf")
document = loader.load()

# --- Chunking ---------------------------------------------------------------
# Chunks are capped at 1000 characters and neighbouring chunks share up to
# 200 characters, so text cut at a boundary still appears intact in one chunk.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
all_splits = text_splitter.split_documents(document)

# --- Embedding model --------------------------------------------------------
# text-embedding-3-large; the Pinecone index below is created with the
# matching vector dimension (3072).
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

# ----------------------------------------------------------------------------
# PINECONE VECTOR DATABASE
# ----------------------------------------------------------------------------
index_name = "rag-evaluation"
api_key = os.getenv("PINECONE_API_KEY")
pc = Pinecone(api_key=api_key)

# First run only: create the serverless index when no index of that name exists.
index_is_missing = index_name not in pc.list_indexes().names()
if index_is_missing:
    pc.create_index(
        name=index_name,
        dimension=3072,   # has to equal the embedding vector size
        metric="cosine",  # similarity measure used for search
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

index = pc.Index(index_name)

# The vector count tells us whether the chunks were uploaded by an earlier run.
stats = index.describe_index_stats()
total_vectors = stats["total_vector_count"]

if total_vectors != 0:
    # Already populated: attach to the existing index, do not upload again.
    print(f"Index already has {total_vectors} vectors, skipping upload")
    vectorstore = PineconeVectorStore(index=index, embedding=embeddings)
else:
    # Empty index: embed every chunk and upload it.
    print("Index is empty, adding documents...")
    vectorstore = PineconeVectorStore.from_documents(
        documents=all_splits,
        embedding=embeddings,
        index_name=index_name,
    )
    print(f"Added {len(all_splits)} documents to Pinecone")

# Retriever that returns the 3 best-matching chunks per query.
retriever = vectorstore.as_retriever(search_kwargs=dict(k=3))



Index already has 33 vectors, skipping upload


In [3]:
# ----------------------------------------------------------------------------
# STEP 2 - ASSEMBLE THE RAG CHAIN AND SMOKE-TEST IT
# ----------------------------------------------------------------------------
# Wires the retriever and the LLM into one chain, then sends a single
# question through it to confirm the pipeline runs end to end.
# ----------------------------------------------------------------------------

# Prompt: the retrieved text is injected into the system message through
# {context}; the user's question fills {input}.
system_message = ("system", "Answer based on context:\n{context}")
human_message = ("human", "{input}")
prompt = ChatPromptTemplate.from_messages([system_message, human_message])

# Generation step: places the retrieved documents into the prompt and calls the LLM.
document_chain = create_stuff_documents_chain(Model, prompt)

# Full pipeline: retrieval followed by generation.
rag_chain = create_retrieval_chain(retriever, document_chain)

# Sample question used only to verify that the chain works.
query = "Who founded The Ember & Oak Kitchen and what is their background?"
response = rag_chain.invoke({"input": query})

# `response` is a dict; the generated text is stored under response["answer"].



In [4]:
# ============================================================================
# STEP 3: PREPARE EVALUATION DATASET
# ============================================================================
# Runs every test question through the RAG chain and records, per question:
# - user_input:         the question
# - response:           the answer generated by the chain
# - retrieved_contexts: the text of the chunks the chain retrieved
# - reference:          the ground-truth answer from `examples`
# ============================================================================

# Column-oriented storage; every list gets exactly one entry per question.
evaluation_data = {
    "user_input": [],          # The questions being asked
    "response": [],            # RAG system's generated answers
    "retrieved_contexts": [],  # Context chunks retrieved from vector DB
    "reference": []            # Ground truth answers for comparison
}

for item in examples:
    # Read both fields up front so a malformed example fails before any API call.
    question = item["question"]
    ground_truth = item["ground_truth"]

    # One chain call performs retrieval and generation together.
    response = rag_chain.invoke({
        "input": question
    })
    rag_answer = response["answer"]

    # Take the documents from the chain output instead of querying the
    # retriever a second time: this guarantees the recorded contexts are the
    # ones the answer was actually generated from, and avoids a redundant
    # embedding + vector-store round trip per question.
    contexts = response["context"]

    # Keep only the plain text: the HuggingFace Dataset built later cannot
    # store Document objects.
    context_texts = [doc.page_content for doc in contexts]

    evaluation_data["user_input"].append(question)
    evaluation_data["response"].append(rag_answer)
    evaluation_data["retrieved_contexts"].append(context_texts)
    evaluation_data["reference"].append(ground_truth)

print(f"Collected {len(evaluation_data['user_input'])} samples for evaluation")



Collected 8 samples for evaluation


In [5]:
# ============================================================================
# STEP 4: EVALUATE WITH RAGAS METRICS
# ============================================================================
# Scores the collected samples with several RAGAS metrics covering retrieval
# quality, response relevance, and faithfulness.
# ============================================================================

# evaluate() is given a HuggingFace Dataset, so convert the dict of columns.
dataset = Dataset.from_dict(evaluation_data)

# Use a smaller, faster embedding model for the evaluation metrics.
# NOTE: this rebinds the notebook-level name `embeddings`, which previously
# referred to the text-embedding-3-large model used to build the index.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Wrap the LangChain components so RAGAS can drive them.
evaluator_llm = LangchainLLMWrapper(Model)
evaluator_embeddings = LangchainEmbeddingsWrapper(embeddings)

# Each metric looks at a different aspect of RAG performance.
metrics = [
    LLMContextPrecisionWithoutReference(),  # How precise is the retrieved context?
    LLMContextRecall(),                      # Did we retrieve all relevant information?
    ContextEntityRecall(),                   # Are key entities in the retrieved context?
    NoiseSensitivity(),                      # Is the system robust to irrelevant context?
    ResponseRelevancy(),                     # Is the response relevant to the question?
    Faithfulness(),                          # Is the response faithful to the context?
]

print("\nStarting RAGAS Evaluation...")
# Every sample is scored by every metric, so this can take several minutes.
# Both evaluator components are passed in their RAGAS-wrapped form so the LLM
# and the embeddings are handled consistently.
result = evaluate(
    dataset=dataset,
    metrics=metrics,
    llm=evaluator_llm,
    embeddings=evaluator_embeddings
)

print("\nRAGAS Results:")
print(result)



tarting RAGAS Evaluation...


  evaluator_llm = LangchainLLMWrapper(Model)
  evaluator_embeddings = LangchainEmbeddingsWrapper(embeddings)
Evaluating:   0%|          | 0/48 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating:  17%|█▋        | 8/48 [00:50<03:20,  5.00s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating:  19%|█▉        | 9/48 [00:54<03:10,  4.90s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating:  31%|███▏      | 15/48 [01:24<02:20,  4.26s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating:  50%|█████     | 24/48 [02:01<01:17,  3.21s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Evaluating:  60%|██████    | 29/48 [02:37<01:33,  4.94s/it]LLM returned 1 generations instead of requested 3. Proce


RAGAS Results:
{'llm_context_precision_without_reference': 0.7917, 'context_recall': 0.8333, 'context_entity_recall': 0.2677, 'noise_sensitivity(mode=relevant)': 0.0000, 'answer_relevancy': 0.8086, 'faithfulness': 0.9041}
