# Activity #1: Advanced Retrieval Evaluation

Evaluates 6 retriever methods using Ragas SDG and LangSmith

### YOUR CODE HERE
# Section 1: Imports
import os
from getpass import getpass
import pandas as pd
from operator import itemgetter
import time
import random
import pypdf

# LangChain
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough

# Retrievers
from langchain_community.retrievers import BM25Retriever
from langchain.retrievers import (
    MultiQueryRetriever,
    ContextualCompressionRetriever,
    EnsembleRetriever,
    ParentDocumentRetriever
)
from langchain.retrievers.document_compressors import CohereRerank
from langchain.storage import InMemoryStore

# Ragas (0.2.10) 
from ragas import evaluate
from ragas.metrics import Faithfulness, ContextPrecision, ContextRecall, AnswerRelevancy, ContextEntityRecall
from ragas.testset import TestsetGenerator
from datasets import Dataset  # For Ragas 0.2.10 evaluation format

# LangSmith
from langsmith import Client

print("✅ All imports successful")

In [8]:
## YOUR CODE HERE
# Section 1: Imports
import os
from getpass import getpass
import pandas as pd
from operator import itemgetter
import time
import random
import pypdf

# LangChain
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.vectorstores import FAISS
from langchain_qdrant import Qdrant
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough

# Retrievers
from langchain_community.retrievers import BM25Retriever
from langchain.retrievers import (
    MultiQueryRetriever,
    ContextualCompressionRetriever,
    EnsembleRetriever,
    ParentDocumentRetriever
)
from langchain_cohere import CohereRerank
from langchain.storage import InMemoryStore

# Ragas (0.2.10) - 
from ragas import evaluate
from ragas.metrics import Faithfulness, ContextPrecision, ContextRecall, AnswerRelevancy, ContextEntityRecall
from ragas.testset import TestsetGenerator
from datasets import Dataset  # For Ragas 0.2.10 evaluation format

# LangSmith
from langsmith import Client

print("✅ All imports successful")


✅ All imports successful


In [2]:
# API Keys Setup
os.environ["OPENAI_API_KEY"] = getpass("Enter OpenAI API Key: ")
os.environ["COHERE_API_KEY"] = getpass("Enter Cohere API Key: ")
os.environ["LANGCHAIN_API_KEY"] = getpass("Enter LangChain/LangSmith API Key: ")

# Enable LangSmith tracing
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_PROJECT"] = "Advanced_Retrieval_Evaluation"

langsmith_client = Client()
print("✅ API keys configured and LangSmith tracing enabled")


✅ API keys configured and LangSmith tracing enabled


In [3]:
# Section 2: Load PDF
pdf_path = "data/TC.pdf"
print(f"📄 Loading PDF: {pdf_path}")

loader = PyPDFLoader(pdf_path)
documents = loader.load()

print(f"✅ Loaded {len(documents)} pages from PDF")
print(f"   First page preview: {documents[0].page_content[:200]}...")

# Split for retrieval
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = text_splitter.split_documents(documents)
print(f"✅ Split into {len(chunks)} chunks for retrieval")


📄 Loading PDF: data/TC.pdf
✅ Loaded 6 pages from PDF
   First page preview: Traffic Congestion and Reliability: Trends, 
Causes, Measurement, and Strategic 
Approaches 
Traffic congestion is a pervasive issue that impacts economic efficiency, quality of life, and 
overall tra...
✅ Split into 18 chunks for retrieval


In [4]:
# Section 3: Golden Dataset Generation (Ragas SDG 0.2.10) - 10 Questions


from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper

generator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini"))
generator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings(model="text-embedding-3-small"))

generator = TestsetGenerator(llm=generator_llm, embedding_model=generator_embeddings)
golden_dataset = generator.generate_with_langchain_docs(documents, testset_size=10)

# Convert to pandas for easy access
golden_df = golden_dataset.to_pandas()

print(f"✅ Generated {len(golden_df)} synthetic questions!")
print("\\n📋 Sample questions:")
for i in range(min(3, len(golden_df))):
    print(f"   {i+1}. {golden_df.iloc[i]['user_input']}")

# Display full dataset
golden_df


Applying SummaryExtractor:   0%|          | 0/6 [00:00<?, ?it/s]

Applying CustomNodeFilter:   0%|          | 0/6 [00:00<?, ?it/s]

Applying [EmbeddingExtractor, ThemesExtractor, NERExtractor]:   0%|          | 0/18 [00:00<?, ?it/s]

Applying OverlapScoreBuilder:   0%|          | 0/1 [00:00<?, ?it/s]

Generating personas:   0%|          | 0/3 [00:00<?, ?it/s]

Generating Scenarios:   0%|          | 0/2 [00:00<?, ?it/s]

Generating Samples:   0%|          | 0/10 [00:00<?, ?it/s]

✅ Generated 10 synthetic questions!
\n📋 Sample questions:
   1. What role do traffic incidents play in causing traffic congestion?
   2. Wut are the effects of special evnts on traffic demand?
   3. How does congestion at border crossings affect international trade?


Unnamed: 0,user_input,reference_contexts,reference,synthesizer_name
0,What role do traffic incidents play in causing...,"[Traffic Congestion and Reliability: Trends, \...","Traffic incidents, such as crashes, breakdowns...",single_hop_specifc_query_synthesizer
1,Wut are the effects of special evnts on traffi...,[speeds and increasing delays. These disruptio...,"Special events, such as major sporting contest...",single_hop_specifc_query_synthesizer
2,How does congestion at border crossings affect...,[Travel time is the fundamental metric for con...,Congestion at border crossings elevates costs ...,single_hop_specifc_query_synthesizer
3,How can travler informashun help in reducing c...,[Households face both financial and time-relat...,Traveler information provides real-time data o...,single_hop_specifc_query_synthesizer
4,What are ITS technolgies used for in traffic m...,[efficiently not only restores traffic flow mo...,ITS technologies are integrated into comprehen...,single_hop_specifc_query_synthesizer
5,What are the primary causes of traffic congest...,[<1-hop>\n\nefficiently not only restores traf...,Traffic congestion is primarily caused by a co...,multi_hop_specific_query_synthesizer
6,What are the causes of traffic congestion and ...,[<1-hop>\n\nefficiently not only restores traf...,Traffic congestion is caused by multiple inter...,multi_hop_specific_query_synthesizer
7,What are the causes of traffic congestion and ...,[<1-hop>\n\nefficiently not only restores traf...,Traffic congestion occurs when the number of v...,multi_hop_specific_query_synthesizer
8,How do traffic incidents contribute to traffic...,[<1-hop>\n\nefficiently not only restores traf...,"Traffic incidents, such as crashes, breakdowns...",multi_hop_specific_query_synthesizer
9,What are the primary causes of traffic congest...,[<1-hop>\n\nefficiently not only restores traf...,Traffic congestion is primarily caused by a co...,multi_hop_specific_query_synthesizer


In [7]:
# Section 4: Setup All 6 Retrievers
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

# 1. Naive
vectorstore = FAISS.from_documents(chunks, embeddings)
naive = vectorstore.as_retriever(search_kwargs={"k": 5})

# 2. BM25
bm25 = BM25Retriever.from_documents(chunks); bm25.k = 5

# 3. Multi-Query
multi_query = MultiQueryRetriever.from_llm(retriever=naive, llm=llm)

# 4. Parent Document
store = InMemoryStore()
parent_doc = ParentDocumentRetriever(
    vectorstore=FAISS.from_documents(RecursiveCharacterTextSplitter(chunk_size=400, chunk_overlap=50).split_documents(documents), embeddings),
    docstore=store,
    child_splitter=RecursiveCharacterTextSplitter(chunk_size=400, chunk_overlap=50),
    parent_splitter=RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
)
parent_doc.add_documents(documents)

# 5. Contextual Compression (Cohere Rerank)
compression = ContextualCompressionRetriever(
    base_compressor=CohereRerank(model="rerank-english-v3.0", top_n=5),
    base_retriever=naive
)

# 6. Ensemble
ensemble = EnsembleRetriever(retrievers=[bm25, naive, multi_query], weights=[0.3, 0.4, 0.3])

retrievers = {"Naive": naive, "BM25": bm25, "Multi-Query": multi_query, 
              "Parent Document": parent_doc, "Contextual Compression": compression, "Ensemble": ensemble}

print(f"✅ All {len(retrievers)} retrievers ready")


✅ All 6 retrievers ready


  base_compressor=CohereRerank(model="rerank-english-v3.0", top_n=5),


In [9]:
# Section 5: Build RAG Chains with LangSmith Tracking
RAG_TEMPLATE = """You are a helpful AI assistant. Use the context to answer the question.
If you don't know, say so. Don't make up information.

Question: {question}
Context: {context}
Answer:"""

rag_prompt = ChatPromptTemplate.from_template(RAG_TEMPLATE)
rag_llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

def build_chain(retriever):
    return (
        {"context": itemgetter("question") | retriever, "question": itemgetter("question")}
        | RunnablePassthrough.assign(context=lambda x: "\\n\\n".join(d.page_content for d in x["context"]))
        | {"response": rag_prompt | rag_llm, "context": itemgetter("context")}
    )

rag_chains = {name: build_chain(r) for name, r in retrievers.items()}
print(f"✅ {len(rag_chains)} RAG chains ready with LangSmith tracking")


✅ 6 RAG chains ready with LangSmith tracking


In [None]:
# Section 6: Evaluate All Retrievers with Ragas Metrics (0.2.10 API)
evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini"))
evaluator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings(model="text-embedding-3-small"))

# Initialize metrics with LLM and embeddings
metrics = [
    ContextPrecision(llm=evaluator_llm), 
    ContextRecall(llm=evaluator_llm), 
    AnswerRelevancy(llm=evaluator_llm, embeddings=evaluator_embeddings), 
    Faithfulness(llm=evaluator_llm), 
    ContextEntityRecall(llm=evaluator_llm)
]

print(f"📊 Evaluating {len(retrievers)} retrievers with {len(metrics)} Ragas metrics...")
evaluation_results = {}

for name, chain in rag_chains.items():
    print(f"\\n{'='*50}\\nEvaluating: {name}\\n{'='*50}")
    start = time.time()
    
    # Build evaluation data (Ragas 0.2.10 format)
    # Required columns: question, answer, contexts, ground_truths, reference
    eval_data = {"question": [], "answer": [], "contexts": [], "ground_truths": [], "reference": []}
    
    # Use enumerate to get proper sequential count
    for idx in range(len(golden_df)):
        row = golden_df.iloc[idx]
        print(f"  Question {idx+1}/{len(golden_df)}...", end="\\r")
        try:
            question = row['user_input']
            reference = row.get('reference', '')
            
            result = chain.invoke({"question": question})
            
            eval_data["question"].append(question)
            eval_data["answer"].append(result["response"].content)
            eval_data["contexts"].append([result["context"]])
            eval_data["ground_truths"].append([reference] if reference else [question])
            eval_data["reference"].append(reference if reference else question)
            
            time.sleep(0.3)
        except Exception as e:
            print(f"\\n  ✗ Error on question {idx+1}: {e}")
            continue  # Skip failed questions
    
    print(f"\\n  Running Ragas evaluation...")
    from datasets import Dataset
    eval_dataset = Dataset.from_dict(eval_data)
    ragas_results = evaluate(eval_dataset, metrics=metrics, llm=evaluator_llm)
    elapsed = time.time() - start
    
    # Extract metric scores for display
    metric_scores = {}
    for metric in metrics:
        if hasattr(ragas_results, metric.name):
            metric_scores[metric.name] = f"{getattr(ragas_results, metric.name):.4f}"
    
    evaluation_results[name] = {"ragas": ragas_results, "latency": elapsed, "count": len(eval_data["question"])}
    print(f"  ✅ Complete ({elapsed:.1f}s): {metric_scores}")

print("\\n" + "="*70)
print("📊 EVALUATION COMPLETE - QUICK SUMMARY")
print("="*70)

# Quick summary table
summary_data = []
for name, res in evaluation_results.items():
    ragas_result = res["ragas"]
    row = {
        "Retriever": name,
        "Questions": res["count"],
        "Latency (s)": f"{res['latency']:.1f}"
    }
    # Add first few metrics for quick view
    for metric in metrics[:3]:  # Show first 3 metrics
        if hasattr(ragas_result, metric.name):
            row[metric.name] = f"{getattr(ragas_result, metric.name):.3f}"
    summary_data.append(row)

summary_df = pd.DataFrame(summary_data)
print(summary_df.to_string(index=False))
print("\\n✅ Full detailed results in next cell...")


📊 Evaluating 6 retrievers with 5 Ragas metrics...
  Question 1/10...\r  Question 2/10...\r  Question 3/10...\r  Question 4/10...\r  Question 5/10...\r  Question 6/10...\r  Question 7/10...\r  Question 8/10...\r  Question 9/10...\r  Question 10/10...\r\n  Running Ragas evaluation...


Evaluating:   0%|          | 0/50 [00:00<?, ?it/s]

  ✅ Complete (122.9s): {'context_precision': '1.0000', 'context_recall': '0.9250', 'answer_relevancy': '0.8696', 'faithfulness': '0.9537', 'context_entity_recall': '0.3681'}
  Question 1/10...\r  Question 2/10...\r  Question 3/10...\r  Question 4/10...\r  Question 5/10...\r  Question 6/10...\r  Question 7/10...\r  Question 8/10...\r  Question 9/10...\r  Question 10/10...\r\n  Running Ragas evaluation...


Evaluating:   0%|          | 0/50 [00:00<?, ?it/s]

  ✅ Complete (110.1s): {'context_precision': '1.0000', 'context_recall': '0.8000', 'answer_relevancy': '0.8773', 'faithfulness': '0.9329', 'context_entity_recall': '0.2899'}
  Question 1/10...\r  Question 2/10...\r  Question 3/10...\r  Question 4/10...\r  Question 5/10...\r  Question 6/10...\r  Question 7/10...\r  Question 8/10...\r  Question 9/10...\r  Question 10/10...\r\n  Running Ragas evaluation...


Evaluating:   0%|          | 0/50 [00:00<?, ?it/s]

  ✅ Complete (149.3s): {'context_precision': '1.0000', 'context_recall': '0.9167', 'answer_relevancy': '0.8697', 'faithfulness': '0.9758', 'context_entity_recall': '0.4250'}
  Question 1/10...\r  Question 2/10...\r  Question 3/10...\r  Question 4/10...\r  Question 5/10...\r  Question 6/10...\r  Question 7/10...\r  Question 8/10...\r  Question 9/10...\r  Question 10/10...\r\n  Running Ragas evaluation...


Evaluating:   0%|          | 0/50 [00:00<?, ?it/s]

  ✅ Complete (124.4s): {'context_precision': '1.0000', 'context_recall': '0.8250', 'answer_relevancy': '0.8697', 'faithfulness': '0.8739', 'context_entity_recall': '0.2947'}
  Question 1/10...\r  Question 2/10...\r  Question 3/10...\r  Question 4/10...\r  Question 5/10...\r  Question 6/10...\r  Question 7/10...\r  Question 8/10...\r  Question 9/10...\r  Question 10/10...\r\n  Running Ragas evaluation...


Evaluating:   0%|          | 0/50 [00:00<?, ?it/s]

  ✅ Complete (128.3s): {'context_precision': '1.0000', 'context_recall': '0.8917', 'answer_relevancy': '0.8695', 'faithfulness': '0.9446', 'context_entity_recall': '0.3136'}
  Question 1/10...\r  Question 2/10...\r  Question 3/10...\r  Question 4/10...\r  Question 5/10...\r  Question 6/10...\r  Question 7/10...\r  Question 8/10...\r  Question 9/10...\r  Question 10/10...\r\n  Running Ragas evaluation...


Evaluating:   0%|          | 0/50 [00:00<?, ?it/s]

  ✅ Complete (173.5s): {'context_precision': '1.0000', 'context_recall': '0.9500', 'answer_relevancy': '0.8685', 'faithfulness': '0.9800', 'context_entity_recall': '0.3349'}
📊 EVALUATION COMPLETE - QUICK SUMMARY
             Retriever  Questions Latency (s)
                 Naive         10       122.9
                  BM25         10       110.1
           Multi-Query         10       149.3
       Parent Document         10       124.4
Contextual Compression         10       128.3
              Ensemble         10       173.5
\n✅ Full detailed results in next cell...


In [14]:
# Section 7: Results Compilation and Analysis
results_data = []
for name, res in evaluation_results.items():
    # Ragas 0.2.10: EvaluationResult has metric scores as attributes
    ragas_result = res["ragas"]
    row = {
        "Retriever": name, 
        "Latency (s)": res["latency"], 
        "Questions": res["count"]
    }
    
    # Extract metric scores from EvaluationResult
    for metric in metrics:
        metric_name = metric.name
        if hasattr(ragas_result, metric_name):
            row[metric_name] = getattr(ragas_result, metric_name)
    
    results_data.append(row)

results_df = pd.DataFrame(results_data).sort_values("Latency (s)")
print("\\n📊 FINAL RESULTS:")
print(results_df.to_string(index=False))

results_df.to_csv("retriever_evaluation_results.csv", index=False)
print("\\n✅ Saved to: retriever_evaluation_results.csv")

# Analysis
best = {col: results_df.loc[results_df[col].idxmax(), "Retriever"] 
        for col in results_df.columns if col not in ["Retriever", "Latency (s)", "Questions"]}

print("\\n🏆 Best by Metric:")
for metric, retriever in best.items():
    print(f"   {metric}: {retriever}")

print("\\n📝 RECOMMENDATION:")
print("""
Based on cost, latency, and performance:
- For best performance: Check scores above
- For best speed: Check latency column  
- For balanced approach: Consider all factors
""")

# LangSmith URLs
project_name = "Advanced_Retrieval_Evaluation"
print("\\n🔗 LangSmith Links:")
print(f"   📊 Project Dashboard: https://smith.langchain.com/o/default/projects/p/{project_name.replace('_', '-').lower()}")
print(f"   🔍 View All Traces: https://smith.langchain.com/")
print(f"   📁 Project: {project_name}")
print("\\n   💡 Tip: Use the LangSmith dashboard to:")
print("      - View detailed traces for each retriever")
print("      - Compare latency and token usage")
print("      - Debug retrieval quality issues")
print("      - Analyze cost breakdowns")

results_df


\n📊 FINAL RESULTS:
             Retriever  Latency (s)  Questions
                  BM25   110.129075         10
                 Naive   122.944868         10
       Parent Document   124.400105         10
Contextual Compression   128.307050         10
           Multi-Query   149.312663         10
              Ensemble   173.492774         10
\n✅ Saved to: retriever_evaluation_results.csv
\n🏆 Best by Metric:
\n📝 RECOMMENDATION:

Based on cost, latency, and performance:
- For best performance: Check scores above
- For best speed: Check latency column  
- For balanced approach: Consider all factors

\n🔗 LangSmith Links:
   📊 Project Dashboard: https://smith.langchain.com/o/default/projects/p/advanced-retrieval-evaluation
   🔍 View All Traces: https://smith.langchain.com/
   📁 Project: Advanced_Retrieval_Evaluation
\n   💡 Tip: Use the LangSmith dashboard to:
      - View detailed traces for each retriever
      - Compare latency and token usage
      - Debug retrieval quality issues
  

Unnamed: 0,Retriever,Latency (s),Questions
1,BM25,110.129075,10
0,Naive,122.944868,10
3,Parent Document,124.400105,10
4,Contextual Compression,128.30705,10
2,Multi-Query,149.312663,10
5,Ensemble,173.492774,10
