# RAPTOR (Recursive Abstractive Processing for Tree-Organized Retrieval)


**RAPTOR (Recursive Abstractive Processing for Tree-Organized Retrieval)** is
another hierarchical document summarisation technique that can enhance the performance
of a RAG system. The method builds a tree structure that captures both high- and low-level
details of the knowledge base. To achieve this, it groups text into clusters, summarises each
cluster, and repeats the process iteratively, building the tree from the ground up. This way,
RAPTOR allows the system to incorporate knowledge on multiple levels, traversing from
high-level summaries down to detailed passages, which enhances the quality of the retrieved
context. Besides this structure, retrieval is also aided by the clustering step, which places
contextually similar knowledge in the same group. This is expected to improve the coherence
of the retrieved context by a noticeable margin.

In [1]:
import os

# Configure LangSmith tracing for this notebook session.
langsmith_settings = {
    "LANGSMITH_TRACING": "true",
    "LANGSMITH_ENDPOINT": "https://api.smith.langchain.com",
    "LANGSMITH_PROJECT": "RAPTOR",
}
os.environ.update(langsmith_settings)

# Read the values back from the environment to confirm they were applied.
print("Environment Variables Set:")
for variable in langsmith_settings:
    print(f"{variable}: {os.getenv(variable)}")

Environment Variables Set:
LANGSMITH_TRACING: true
LANGSMITH_ENDPOINT: https://api.smith.langchain.com
LANGSMITH_PROJECT: RAPTOR


In [3]:
import numpy as np
import pandas as pd
from langsmith import traceable
from typing import List, Dict, Any
from sklearn.mixture import GaussianMixture
from langchain.chains.llm import LLMChain
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain.schema import AIMessage
from langchain.docstore.document import Document
from langchain_openai import OpenAIEmbeddings

import matplotlib.pyplot as plt
import logging
import os
import sys
from dotenv import load_dotenv

In [4]:
import getpass
import os

# Ask for the key interactively only when the environment does not already
# provide one, so an exported OPENAI_API_KEY is never overwritten.
if os.environ.get("OPENAI_API_KEY") is None:
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API key: ")

Enter your OpenAI API key: ········


## LLM and Embedding Model

In [6]:
# Load the chat LLM. This module-level `llm` is the model piped into the
# summarisation prompt, handed to the contextual-compression extractor, and
# passed to `raptor_query` for answer generation further down the notebook.
llm = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0,
    max_tokens=4000,
)


In [7]:
# Module-level embedding model. `embed_texts` uses it to embed tree nodes and
# `tree_traversal_retrieval` uses it to embed the query.
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-large"
)

## Building the RAPTOR structure

In [None]:
@traceable
def extract_text(item):
    """Return the plain text carried by *item*.

    An ``AIMessage`` is unwrapped to its ``content``; any other value is
    handed back unchanged.
    """
    return item.content if isinstance(item, AIMessage) else item

@traceable
def embed_texts(texts: List[str]) -> List[List[float]]:
    """Embed *texts* with the module-level ``embeddings`` model.

    Each item is first passed through ``extract_text`` so that ``AIMessage``
    objects are embedded by their content.
    """
    logging.info(f"Embedding {len(texts)} texts")
    plain_texts = list(map(extract_text, texts))
    return embeddings.embed_documents(plain_texts)

@traceable
def perform_clustering(embeddings: np.ndarray, n_clusters: int = 10) -> np.ndarray:
    """Assign each embedding to a cluster with a Gaussian Mixture Model.

    Args:
        embeddings: The vectors to cluster (passed straight to ``fit_predict``).
        n_clusters: Number of mixture components to fit.

    Returns:
        The label array produced by ``GaussianMixture.fit_predict``.
    """
    logging.info(f"Performing clustering with {n_clusters} clusters")
    # random_state is pinned so repeated runs use the same seed.
    mixture = GaussianMixture(n_components=n_clusters, random_state=42)
    cluster_labels = mixture.fit_predict(embeddings)
    return cluster_labels

@traceable
def summarize_texts(texts: List[str]) -> str:
    """Summarize a list of texts with the module-level ``llm``.

    The texts are joined with blank lines into a single passage before being
    placed in the prompt. Previously the list object itself was interpolated,
    so the model saw a Python list repr (brackets, quotes, escaped newlines).

    Args:
        texts: Passages to summarise. ``AIMessage`` items are accepted and
            reduced to their content via ``extract_text``.

    Returns:
        The summary text, matching the declared ``str`` return type (the raw
        chat message is unwrapped with ``extract_text``).
    """
    logging.info(f"Summarizing {len(texts)} texts")
    prompt = ChatPromptTemplate.from_template(
        "Summarize the following text concisely:\n\n{text}"
    )
    chain = prompt | llm
    combined_text = "\n\n".join(str(extract_text(text)) for text in texts)
    response = chain.invoke({"text": combined_text})
    return extract_text(response)

@traceable
def visualize_clusters(embeddings: np.ndarray, labels: np.ndarray, level: int):
    """Show a 2-D scatter plot of the embeddings, coloured by cluster label.

    The embeddings are projected onto their first two principal components;
    *level* only appears in the plot title.
    """
    from sklearn.decomposition import PCA

    projected = PCA(n_components=2).fit_transform(embeddings)
    first_component = projected[:, 0]
    second_component = projected[:, 1]

    plt.figure(figsize=(10, 8))
    points = plt.scatter(first_component, second_component, c=labels, cmap='viridis')
    plt.colorbar(points)
    plt.title(f'Cluster Visualization - Level {level}')
    plt.xlabel('First Principal Component')
    plt.ylabel('Second Principal Component')
    plt.show()

## The Core Function

In [None]:
@traceable
def build_raptor_tree(texts: List[str], max_levels: int = 3) -> Dict[int, pd.DataFrame]:
    """Build the RAPTOR tree bottom-up and return one DataFrame per level.

    Level 0 holds the original texts. Each following level holds one summary
    per cluster of the level below. Every DataFrame has the columns ``text``,
    ``embedding``, ``cluster`` and ``metadata``.

    Fixes relative to the earlier version:
      * Leaf nodes now carry an ``id`` (``original_<index>``), so the
        ``child_ids`` of level-1 summaries reference real nodes instead of
        being a list of ``None``.
      * Summaries are stored as plain strings (via ``extract_text``) rather
        than as chat-message objects.
      * The top level is always stored. Previously the summaries produced in
        the last round were discarded unless only one summary remained.
      * Empty input returns an empty tree, and a single input text is stored
        as the only level instead of being sent to the mixture model.

    Args:
        texts: Source passages (strings or ``AIMessage`` objects).
        max_levels: Maximum number of summarisation rounds above level 0.

    Returns:
        Mapping from level number to that level's DataFrame.
    """
    results: Dict[int, pd.DataFrame] = {}
    current_texts = [extract_text(text) for text in texts]
    if not current_texts:
        logging.info("No texts supplied; returning an empty tree")
        return results

    current_metadata = [
        {"level": 0, "origin": "original", "parent_id": None, "id": f"original_{index}"}
        for index in range(len(current_texts))
    ]

    for level in range(max_levels + 1):
        logging.info(f"Processing level {level}")
        level_embeddings = embed_texts(current_texts)

        # The level is terminal when the depth budget is used up or there is
        # nothing left to group. It is stored as a single cluster.
        if level == max_levels or len(current_texts) <= 1:
            results[level] = pd.DataFrame({
                'text': current_texts,
                'embedding': level_embeddings,
                'cluster': [0] * len(current_texts),
                'metadata': current_metadata,
            })
            if len(current_texts) <= 1:
                logging.info(f"Stopping at level {level} as we have only one summary")
            break

        # At least two texts remain here, so n_clusters is always >= 1.
        n_clusters = min(10, len(current_texts) // 2)
        cluster_labels = perform_clustering(np.array(level_embeddings), n_clusters)

        df = pd.DataFrame({
            'text': current_texts,
            'embedding': level_embeddings,
            'cluster': cluster_labels,
            'metadata': current_metadata,
        })
        results[level] = df

        summaries = []
        new_metadata = []
        for cluster in df['cluster'].unique():
            members = df[df['cluster'] == cluster]
            summary = summarize_texts(members['text'].tolist())
            # Store text only, whatever type the summariser returned.
            summaries.append(extract_text(summary))
            new_metadata.append({
                "level": level + 1,
                "origin": f"summary_of_cluster_{cluster}_level_{level}",
                "child_ids": [meta.get('id') for meta in members['metadata'].tolist()],
                "id": f"summary_{level + 1}_{cluster}",
            })

        current_texts = summaries
        current_metadata = new_metadata

    return results

## Vectorstore


In [None]:
@traceable
def create_vector_store(docs, save_path="nvidia_RAPTOR"):
    """Embed *docs* into a new FAISS index, save it under *save_path*, and return it.

    A fresh ``text-embedding-3-large`` embedding client is created for the
    index rather than reusing the module-level one.
    """
    embedding_model = OpenAIEmbeddings(model="text-embedding-3-large")
    store = FAISS.from_documents(docs, embedding_model)
    store.save_local(save_path)
    logging.info(f"FAISS vectorstore saved to {save_path}")
    return store

In [None]:
@traceable
def build_vectorstore(tree_results: Dict[int, pd.DataFrame], save_path="nvidia_RAPTOR") -> FAISS:
    """Build a FAISS vectorstore from every node of the RAPTOR tree and save it.

    The embeddings already stored in the tree are reused through
    ``FAISS.from_embeddings``. Previously they were collected but never used,
    and every text was embedded a second time. Texts are unwrapped with
    ``extract_text`` so a chat-message node is indexed by its content rather
    than by the repr that ``str()`` would produce.

    Args:
        tree_results: Mapping of level to a DataFrame with ``text``,
            ``embedding`` and ``metadata`` columns.
        save_path: Location passed to ``save_local``.

    Returns:
        The saved FAISS vectorstore.

    Raises:
        ValueError: If the tree contains no texts.
    """
    all_texts: List[str] = []
    all_embeddings = []
    all_metadatas = []

    for df in tree_results.values():
        all_texts.extend(str(extract_text(text)) for text in df['text'].tolist())
        all_embeddings.extend(
            vector.tolist() if isinstance(vector, np.ndarray) else vector
            for vector in df['embedding'].tolist()
        )
        all_metadatas.extend(df['metadata'].tolist())

    if not all_texts:
        raise ValueError("Cannot build a vectorstore from an empty RAPTOR tree")

    logging.info(f"Building vectorstore with {len(all_texts)} texts")

    # The module-level `embeddings` model is attached so later queries are
    # embedded by the model that produced the stored vectors (see embed_texts).
    vectorstore = FAISS.from_embeddings(
        text_embeddings=list(zip(all_texts, all_embeddings)),
        embedding=embeddings,
        metadatas=all_metadatas,
    )
    vectorstore.save_local(save_path)
    logging.info(f"FAISS vectorstore saved to {save_path}")
    return vectorstore

## Define tree traversal retrieval

In [None]:
@traceable
def tree_traversal_retrieval(query: str, vectorstore: FAISS, k: int = 3) -> List[Document]:
    """Retrieve documents by walking the RAPTOR tree from its top level down.

    At each level the ``k`` nearest nodes to the query are kept. The search on
    the next level down is then restricted to the children of those nodes.

    Fixes relative to the earlier version:
      * Uses ``similarity_search_with_score_by_vector``; the previously called
        method name is not part of the FAISS store API as far as can be told.
      * Returns ``Document`` objects as declared, not ``(doc, score)`` tuples.
      * Finds the top level through the public ``index_to_docstore_id`` map and
        ``docstore.search`` instead of calling ``.values()`` on the docstore.
      * Metadata is read with ``.get`` so nodes without an ``id`` (such as
        leaves from older trees) no longer raise ``KeyError``.
      * ``fetch_k`` covers the whole index, so the metadata filter is not
        starved by the default candidate count.

    Args:
        query: Natural-language query.
        vectorstore: FAISS store whose documents carry ``level``, ``id`` and,
            for summaries, ``child_ids`` metadata.
        k: Maximum number of documents kept per level.

    Returns:
        Documents from the top level down to level 0, in traversal order.
        Empty when the store holds no documents with a ``level``.
    """
    query_embedding = embeddings.embed_query(query)

    stored_docs = [
        vectorstore.docstore.search(doc_id)
        for doc_id in vectorstore.index_to_docstore_id.values()
    ]
    levels = [
        doc.metadata['level']
        for doc in stored_docs
        if isinstance(doc, Document) and 'level' in doc.metadata
    ]
    if not levels:
        return []

    # The filter is applied after the nearest-neighbour fetch, so every stored
    # vector is requested to keep matching nodes from being cut off.
    fetch_k = max(k, len(stored_docs))

    def retrieve_level(level: int, allowed_ids: List[str] = None) -> List[Document]:
        allowed = None if allowed_ids is None else set(allowed_ids)

        def keep(meta: Dict[str, Any]) -> bool:
            if meta.get('level') != level:
                return False
            return allowed is None or meta.get('id') in allowed

        scored = vectorstore.similarity_search_with_score_by_vector(
            query_embedding,
            k=k,
            filter=keep,
            fetch_k=fetch_k,
        )
        docs = [doc for doc, _score in scored]

        if not docs or level == 0:
            return docs

        child_ids = [
            child_id
            for doc in docs
            for child_id in (doc.metadata.get('child_ids') or [])
            if child_id is not None
        ]
        # With no usable child ids the next level is searched unrestricted,
        # as the earlier version did for an empty id list.
        return docs + retrieve_level(level - 1, child_ids or None)

    return retrieve_level(max(levels))

## Create Retriever

In [None]:
@traceable
def create_retriever(vectorstore: FAISS) -> ContextualCompressionRetriever:
    """Wrap *vectorstore* in a retriever that compresses results with the LLM.

    The store's default retriever supplies candidate documents. An
    ``LLMChainExtractor`` built on the module-level ``llm`` and a custom
    extraction prompt is attached as the compressor.
    """
    extraction_template = (
        "Given the following context and question, extract only the relevant information for answering the question:\n\n"
        "Context: {context}\n"
        "Question: {question}\n\n"
        "Relevant Information:"
    )
    compressor = LLMChainExtractor.from_llm(
        llm,
        prompt=ChatPromptTemplate.from_template(extraction_template),
    )
    return ContextualCompressionRetriever(
        base_compressor=compressor,
        base_retriever=vectorstore.as_retriever(),
    )


## Define hierarchical retrieval

In [None]:
@traceable
def hierarchical_retrieval(query: str, retriever: ContextualCompressionRetriever, max_level: int) -> List[Document]:
    """Retrieve documents level by level, from ``max_level`` down to level 0.

    The children of the documents found on one level restrict the search on
    the next level down. The restriction is expressed through the metadata
    ``filter`` passed to the retriever. The earlier version appended a pseudo
    boolean clause (``" AND id:(...)"``) to the query text, which changed the
    text that is embedded and shown to the compressor without restricting the
    search.

    Args:
        query: Natural-language query; passed to the retriever unmodified.
        retriever: Retriever invoked once per level with a ``filter`` keyword.
            NOTE(review): whether the filter takes effect depends on the base
            retriever forwarding keyword arguments to the vector store —
            confirm for the installed LangChain version.
        max_level: Highest tree level to start from.

    Returns:
        All documents retrieved across the levels, highest level first.
    """
    all_retrieved_docs: List[Document] = []
    allowed_ids = None  # None means the level is searched without an id restriction.

    for level in range(max_level, -1, -1):
        # Defaults bind the current values so the callable stays correct even
        # if it is evaluated after the loop variables have moved on.
        def level_filter(meta, level=level, allowed_ids=allowed_ids):
            if meta.get('level') != level:
                return False
            return allowed_ids is None or meta.get('id') in allowed_ids

        level_docs = retriever.invoke(query, filter=level_filter)
        all_retrieved_docs.extend(level_docs)

        child_ids = {
            child_id
            for doc in level_docs
            for child_id in (doc.metadata.get('child_ids') or [])
            if child_id is not None
        }
        # With no usable child ids the next level is searched unrestricted,
        # which also covers levels above the actual tree height.
        allowed_ids = child_ids or None

    return all_retrieved_docs

## Load the Vector Store Back

In [8]:
def load_existing_vectorstore(save_path="nvidia_large_baseline") -> FAISS:
    """Load a FAISS vector store previously saved under *save_path*.

    A ``text-embedding-3-large`` embedding client is attached to the loaded
    store.
    """
    embedding_model = OpenAIEmbeddings(model="text-embedding-3-large")
    # NOTE(review): the flag name indicates the load path deserialises data
    # that is unsafe for untrusted input — only load indexes you created.
    return FAISS.load_local(
        save_path,
        embedding_model,
        allow_dangerous_deserialization=True,
    )

## Create Retriever

In [9]:
def create_retriever(vectorstore: FAISS) -> ContextualCompressionRetriever:
    """Return a contextual-compression retriever on top of *vectorstore*.

    Candidates come from the store's default retriever. Each is then reduced
    to its query-relevant part by an ``LLMChainExtractor`` that runs the
    module-level ``llm`` with the extraction prompt below.
    """
    logging.info("Creating contextual compression retriever")

    extraction_template = (
        "Given the following context and question, extract only the relevant information for answering the question:\n\n"
        "Context: {context}\n"
        "Question: {question}\n\n"
        "Relevant Information:"
    )
    compressor = LLMChainExtractor.from_llm(
        llm,
        prompt=ChatPromptTemplate.from_template(extraction_template),
    )
    return ContextualCompressionRetriever(
        base_compressor=compressor,
        base_retriever=vectorstore.as_retriever(),
    )


## RAPTOR Query Process 

In [10]:
from typing import Dict, Any
import logging
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.retrievers import BaseRetriever


@traceable
def raptor_query(
    query: str,
    retriever: BaseRetriever,
    max_level: int,
    llm: Any
) -> Dict[str, Any]:
    """
    Answer a query with retrieval-augmented generation.

    Documents are retrieved once, and the same documents are both reported in
    the result and used as the generation context. The earlier version ran the
    retriever a second time inside the chain, so the reported documents could
    differ from the ones the answer was based on, and every compression call
    was paid for twice. Retrieval now also sits inside the error handler, so a
    retrieval failure yields the error dictionary instead of an exception.

    Args:
        query (str): The input query to process
        retriever (BaseRetriever): The retriever to use for document retrieval
        max_level (int): Maximum hierarchical retrieval depth. Accepted for
            interface compatibility; this function does not read it.
        llm: The language model to use for generation

    Returns:
        On success, a dict with ``query``, ``retrieved_documents``,
        ``num_docs_retrieved``, ``contexts`` (list of page contents),
        ``answer``, ``model_used`` and ``success`` (True). On failure, a dict
        with ``query``, ``error`` and ``success`` (False).
    """
    logging.info(f"Processing query: {query}")

    # Define the RAG prompt
    rag_prompt = ChatPromptTemplate.from_template(
        "You are an expert assistant. Answer the question based on the following context:\n\n"
        "Context:\n{context}\n\n"
        "Question: {question}\n\n"
        "Provide a detailed and accurate answer."
    )
    answer_chain = rag_prompt | llm | StrOutputParser()

    try:
        retrieved_docs = retriever.invoke(query)
        context = [doc.page_content for doc in retrieved_docs]

        # The passages are joined into plain text for the prompt. The result
        # still exposes them as a list under "contexts".
        answer = answer_chain.invoke(
            {"context": "\n\n".join(context), "question": query}
        )

        result = {
            "query": query,
            "retrieved_documents": [
                {
                    "index": position,
                    "content": doc.page_content,
                    "metadata": doc.metadata,
                    "level": doc.metadata.get('level', 'Unknown'),
                    "similarity_score": doc.metadata.get('score', 'N/A')
                } for position, doc in enumerate(retrieved_docs, start=1)
            ],
            "num_docs_retrieved": len(retrieved_docs),
            "contexts": context,  # Explicitly include the full context
            "answer": answer,
            "model_used": getattr(llm, 'model_name', 'Unknown'),
            "success": True,
        }

        logging.info("Query processing completed successfully")
        return result

    except Exception as e:
        # Deliberate catch-all: callers receive an error dictionary.
        logging.exception(f"Error processing query: {str(e)}")
        return {
            "query": query,
            "error": str(e),
            "success": False
        }


In [11]:
def print_query_details(result: Dict[str, Any]):
    """Print a readable report of a ``raptor_query`` result.

    Handles both result shapes. A failed query (a dict with an ``error`` key)
    prints the query and the error and then stops; it used to raise
    ``KeyError``. A successful query prints the retrieved documents with their
    tree metadata, the context and the answer.

    The child count is read from ``child_ids``, the key the tree builder
    stores. The legacy ``child_docs`` key is still honoured.
    """
    print(f"Query: {result['query']}")

    if 'error' in result:
        print(f"\nQuery failed: {result['error']}")
        return

    print(f"\nNumber of documents retrieved: {result['num_docs_retrieved']}")
    print("\nRetrieved Documents:")
    for doc in result['retrieved_documents']:
        metadata = doc['metadata']
        print(f"  Document {doc['index']}:")
        print(f"    Content: {doc['content'][:100]}...")  # Show first 100 characters
        print(f"    Similarity Score: {doc['similarity_score']}")
        print(f"    Tree Level: {metadata.get('level', 'Unknown')}")
        print(f"    Origin: {metadata.get('origin', 'Unknown')}")
        children = metadata.get('child_ids', metadata.get('child_docs'))
        if children is not None:
            print(f"    Number of Child Documents: {len(children)}")
        print()

    print("\nContext used for answer generation:")
    print(result['contexts'])

    print("\nGenerated Answer:")
    print(result['answer'])

    print(f"\nModel Used: {result['model_used']}")


In [12]:
# Load the index saved under "nvidia_RAPTOR". The path is passed explicitly
# because the loader's default path is a different one.
vectorstore = load_existing_vectorstore("nvidia_RAPTOR")

2025-03-04 15:13:19,711 - INFO - Loading existing FAISS vectorstore from nvidia_RAPTOR
2025-03-04 15:13:19,821 - INFO - Loading faiss with AVX2 support.
2025-03-04 15:13:19,869 - INFO - Successfully loaded faiss with AVX2 support.
2025-03-04 15:13:19,883 - INFO - Successfully loaded FAISS vectorstore


In [13]:
# Create the contextual-compression retriever over the loaded index; this
# module-level `retriever` is what the prediction functions below query.
retriever = create_retriever(vectorstore)

2025-03-04 15:13:21,219 - INFO - Creating contextual compression retriever


In [14]:
# Run the pipeline end to end for one sample question and print the report.
# NOTE(review): raptor_query, as defined in this notebook, accepts max_level
# but does not read it.
max_level = 3  # Adjust based on your tree depth
query = "Who is the CEO of NVIDIA?"
result = raptor_query(query, retriever, max_level, llm)
print_query_details(result)

2025-03-04 15:13:22,802 - ERROR - Failed to use model_dump to serialize <class 'langchain.retrievers.contextual_compression.ContextualCompressionRetriever'> to JSON: PydanticSerializationError(Unable to serialize unknown type: <class 'langchain_community.vectorstores.faiss.FAISS'>)
2025-03-04 15:13:22,804 - INFO - Processing query: Who is the CEO of NVIDIA?
2025-03-04 15:13:24,020 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2025-03-04 15:13:25,043 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-03-04 15:13:25,769 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-03-04 15:13:26,682 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-03-04 15:13:27,517 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-03-04 15:14:06,106 - INFO - HTTP Request: POST https://api.openai.com/v1/emb

Query: Who is the CEO of NVIDIA?

Number of documents retrieved: 4

Retrieved Documents:
  Document 1:
    Content: Jen-Hsun Huang is the President and Chief Executive Officer of NVIDIA....
    Similarity Score: N/A
    Tree Level: 0
    Origin: original

  Document 2:
    Content: Jen-Hsun Huang is the President and Chief Executive Officer of NVIDIA Corporation....
    Similarity Score: N/A
    Tree Level: 0
    Origin: original

  Document 3:
    Content: Jen-Hsun Huang is the CEO of NVIDIA....
    Similarity Score: N/A
    Tree Level: 0
    Origin: original

  Document 4:
    Content: Jen-Hsun Huang is the CEO of NVIDIA....
    Similarity Score: N/A
    Tree Level: 0
    Origin: original


Context used for answer generation:
['Jen-Hsun Huang is the President and Chief Executive Officer of NVIDIA.', 'Jen-Hsun Huang is the President and Chief Executive Officer of NVIDIA Corporation.', 'Jen-Hsun Huang is the CEO of NVIDIA.', 'Jen-Hsun Huang is the CEO of NVIDIA.']

Generated Answer:
Th

In [15]:
# Display the list of retrieved passages stored under "contexts".
result['contexts']

['Jen-Hsun Huang is the President and Chief Executive Officer of NVIDIA.',
 'Jen-Hsun Huang is the President and Chief Executive Officer of NVIDIA Corporation.',
 'Jen-Hsun Huang is the CEO of NVIDIA.',
 'Jen-Hsun Huang is the CEO of NVIDIA.']

In [16]:
# Display the generated answer text.
result["answer"]

'The CEO of NVIDIA is Jen-Hsun Huang. He holds the position of President and Chief Executive Officer of NVIDIA Corporation, which is commonly referred to simply as NVIDIA. His leadership role is emphasized in multiple statements, confirming his status as the head of the company.'

## RAG Evaluation Dataset

In [17]:
# Import the LangSmith client
from langsmith import Client

# Initialize the LangSmith client
ls_client = Client()

# Read an existing dataset by name
# NOTE(review): `dataset` is not referenced again in the visible cells; the
# call presumably serves as an existence check — confirm.
dataset_name = "RAG_Eval_QA"
dataset = ls_client.read_dataset(dataset_name=dataset_name)

# List examples in the dataset
examples = ls_client.list_examples(dataset_name=dataset_name)

# Print each example's inputs and outputs to inspect the dataset
for example in examples:
    print(example.inputs)  # Print input data
    print(example.outputs)  # Print output data


{'question': "What was NVIDIA's revenue in 2024 and 2023? Which year saw higher revenues and by how much?"}
{'answer': "Revenue in 2024 was $60,922 million ($60.9 billion) , revenue in 2023 was $26,974 million ($27.0 billion). NVIDIA's revenue increased by $33,948 million (or $33.95 billion depending on the rounding) in 2024 compared to 2023."}
{'question': 'Between January 29, 2023, and January 28, 2024, NVIDIA’s total assets increased by approximately $24.5 billion. Based on the balance sheet, which two line items contributed the most to this increase, and what does this suggest about the company’s capital deployment focus?'}
{'answer': 'Between January 29, 2023, and January 28, 2024, NVIDIA’s total assets increased by approximately $24.5 billion, rising from $41.2 billion to $65.7 billion. Based on the balance sheet, the two line items that contributed the most to this increase were marketable securities, which increased from $9.9 billion to $18.7 billion, and cash and cash equivale

In [18]:
from langchain_core.rate_limiters import InMemoryRateLimiter

# Throttle the grader model. 0.1 requests per second corresponds to one
# request every 10 seconds.
rate_limiter = InMemoryRateLimiter(
    requests_per_second=0.1,  # Slow down requests
    check_every_n_seconds=0.1,
    max_bucket_size=10
)

# Apply to your model
# `model` is the LLM the evaluator functions below use as the judge.

model = ChatOpenAI(model="gpt-4o", temperature=0, rate_limiter=rate_limiter)

  rate_limiter = InMemoryRateLimiter(


## RAG Evaluators

### Type 1: Reference Answer

First, let's consider the case in which we want to compare our RAG chain answer to a reference answer.

#### Eval flow

We simply use an LLM-as-judge with an easily customized grader prompt: 

In [19]:
# RAG chain
def predict_rag_answer(example: dict):
    """Use this for answer evaluation.

    Runs ``raptor_query`` on ``example["question"]`` with the module-level
    ``retriever``, ``max_level`` and ``llm``, and returns the answer.

    Raises:
        RuntimeError: If the query failed. ``raptor_query`` reports failures
            as a dict without an ``"answer"`` key, which used to surface here
            as an uninformative ``KeyError``.
    """
    response = raptor_query(example["question"], retriever, max_level, llm)
    if "answer" not in response:
        raise RuntimeError(
            f"raptor_query failed: {response.get('error', 'no answer returned')}"
        )
    return {"answer": response["answer"]}

In [20]:
def predict_rag_answer_with_context(example: dict):
    """Use this for evaluation of retrieved documents and hallucinations.

    Runs ``raptor_query`` on ``example["question"]`` with the module-level
    ``retriever``, ``max_level`` and ``llm``, and returns the answer together
    with the list of retrieved contexts.

    Raises:
        RuntimeError: If the query failed. ``raptor_query`` reports failures
            as a dict without an ``"answer"`` key, which used to surface here
            as an uninformative ``KeyError``.
    """
    response = raptor_query(example["question"], retriever, max_level, llm)
    if "answer" not in response:
        raise RuntimeError(
            f"raptor_query failed: {response.get('error', 'no answer returned')}"
        )
    return {"answer": response["answer"], "contexts": response["contexts"]}

In [21]:
# set the LANGSMITH_API_KEY environment variable (create key in settings)
from langchain import hub
# Grader prompt for answer accuracy; answer_evaluator pipes it into the judge
# model and invokes it with "question", "correct_answer" and "student_answer".
# NOTE(review): the handle has no owner prefix — presumably it resolves to a
# prompt in the caller's own LangSmith workspace; confirm.
grade_prompt_answer_accuracy = hub.pull("answer_vs_reference")

In [22]:
def answer_evaluator(run, example) -> dict:
    """
    Grade the predicted answer against the reference answer with an LLM judge.

    The question and reference come from the dataset example and the
    prediction from the run outputs. The module-level ``model`` runs the
    ``grade_prompt_answer_accuracy`` prompt, and the ``"Score"`` entry of its
    response is reported under the key ``answer_score``.
    """
    grader_inputs = {
        "question": example.inputs["question"],
        "correct_answer": example.outputs["answer"],
        "student_answer": run.outputs["answer"],
    }

    # Prompt piped into the judge model
    grader_chain = grade_prompt_answer_accuracy | model
    verdict = grader_chain.invoke(grader_inputs)

    return {"key": "answer_score", "score": verdict["Score"]}

In [55]:
from langsmith.evaluation import evaluate

# Answer-accuracy experiment: predict_rag_answer is the target and
# answer_evaluator the only evaluator, on the "RAG_test_NVIDIA" dataset.
dataset_name = "RAG_test_NVIDIA"
experiment_results = evaluate(
    predict_rag_answer,
    data=dataset_name,
    evaluators=[answer_evaluator],
    experiment_prefix="RAPTOR",
    metadata={"variant": "NVIDIA context, gpt-4o-mini"},
)

View the evaluation results for experiment: 'RAPTOR-478de21e' at:
https://smith.langchain.com/o/f09c54fc-44b8-4ffe-9055-bc40601f9137/datasets/7fad5e82-7b2d-438e-90e6-1e232f388896/compare?selectedSessions=4dc555d3-eeec-4ea4-9572-a53e194dedef




0it [00:00, ?it/s]

2025-03-03 14:54:24,332 - ERROR - Failed to use model_dump to serialize <class 'langchain.retrievers.contextual_compression.ContextualCompressionRetriever'> to JSON: PydanticSerializationError(Unable to serialize unknown type: <class 'langchain_community.vectorstores.faiss.FAISS'>)
2025-03-03 14:54:24,334 - ERROR - Failed to use model_dump to serialize <class 'langchain.retrievers.contextual_compression.ContextualCompressionRetriever'> to JSON: PydanticSerializationError(Unable to serialize unknown type: <class 'langchain_community.vectorstores.faiss.FAISS'>)
2025-03-03 14:54:24,337 - ERROR - Failed to use model_dump to serialize <class 'langchain.retrievers.contextual_compression.ContextualCompressionRetriever'> to JSON: PydanticSerializationError(Unable to serialize unknown type: <class 'langchain_community.vectorstores.faiss.FAISS'>)
2025-03-03 14:54:24,338 - INFO - Processing query: In which large markets does NVIDIA apply its expertise and provide platforms?
2025-03-03 14:54:24,34

### Type 2: Answer Hallucination

#### Eval flow

We simply use an LLM-as-judge with an easily customized grader prompt: 

https://smith.langchain.com/hub/langchain-ai/rag-answer-hallucination

In [23]:
# set the LANGSMITH_API_KEY environment variable (create key in settings)
from langchain import hub
# Grader prompt for hallucination checks; answer_hallucination_evaluator
# invokes it with "documents" and "student_answer".
# NOTE(review): the handle has no owner prefix — presumably it resolves to a
# prompt in the caller's own LangSmith workspace; confirm.
grade_prompt_hallucinations = hub.pull("hallucination")

In [24]:
# Prompt 

# grade_prompt_hallucinations = hub.pull("langchain-ai/rag-answer-hallucination")

def answer_hallucination_evaluator(run, example) -> dict:
    """
    Grade whether the generated answer is supported by the retrieved contexts.

    The contexts and the answer are read from the run outputs and sent to the
    ``grade_prompt_hallucinations`` prompt, run by the module-level ``model``.
    The ``"Score"`` entry of the response is reported under the key
    ``answer_hallucination``. ``example`` is part of the evaluator signature
    but is not needed: the grader prompt takes no question input.
    """
    # RAG outputs
    contexts = run.outputs["contexts"]
    prediction = run.outputs["answer"]

    # Prompt piped into the judge model
    answer_grader = grade_prompt_hallucinations | model

    # Get score
    score = answer_grader.invoke({"documents": contexts,
                                  "student_answer": prediction})

    return {"key": "answer_hallucination", "score": score["Score"]}

In [58]:
# Hallucination experiment: the target also returns the retrieved contexts,
# which answer_hallucination_evaluator reads from the run outputs.
dataset_name = "RAG_test_NVIDIA"
experiment_results = evaluate(
    predict_rag_answer_with_context,
    data=dataset_name,
    evaluators=[answer_hallucination_evaluator],
    experiment_prefix="RAPTOR",
    metadata={"variant": "NVIDIA context, gpt-4o-mini"},
)

View the evaluation results for experiment: 'RAPTOR-18775abf' at:
https://smith.langchain.com/o/f09c54fc-44b8-4ffe-9055-bc40601f9137/datasets/7fad5e82-7b2d-438e-90e6-1e232f388896/compare?selectedSessions=b4dac82c-0f01-464c-a8ef-7c2d2ab3d230




0it [00:00, ?it/s]

2025-03-03 14:56:35,814 - ERROR - Failed to use model_dump to serialize <class 'langchain.retrievers.contextual_compression.ContextualCompressionRetriever'> to JSON: PydanticSerializationError(Unable to serialize unknown type: <class 'langchain_community.vectorstores.faiss.FAISS'>)
2025-03-03 14:56:35,816 - ERROR - Failed to use model_dump to serialize <class 'langchain.retrievers.contextual_compression.ContextualCompressionRetriever'> to JSON: PydanticSerializationError(Unable to serialize unknown type: <class 'langchain_community.vectorstores.faiss.FAISS'>)
2025-03-03 14:56:35,817 - ERROR - Failed to use model_dump to serialize <class 'langchain.retrievers.contextual_compression.ContextualCompressionRetriever'> to JSON: PydanticSerializationError(Unable to serialize unknown type: <class 'langchain_community.vectorstores.faiss.FAISS'>)
2025-03-03 14:56:35,819 - INFO - Processing query: In which large markets does NVIDIA apply its expertise and provide platforms?
2025-03-03 14:56:35,82

### Type 3: Document Relevance to Question

#### Eval flow

We simply use an LLM-as-judge with an easily customized grader prompt: 

https://smith.langchain.com/hub/langchain-ai/rag-document-relevance

In [25]:
# set the LANGSMITH_API_KEY environment variable (create key in settings)
from langchain import hub
# Grader prompt for document relevance; docs_relevance_evaluator invokes it
# with "question" and "documents".
# NOTE(review): the handle has no owner prefix — presumably it resolves to a
# prompt in the caller's own LangSmith workspace; confirm.
grade_prompt_doc_relevance = hub.pull("doc_question_relevance")

In [26]:
# Grade prompt 
# grade_prompt_doc_relevance = hub.pull("langchain-ai/rag-document-relevance")

def docs_relevance_evaluator(run, example) -> dict:
    """
    Grade how relevant the retrieved contexts are to the question.

    The question comes from the dataset example and the contexts from the run
    outputs. Both are sent to the ``grade_prompt_doc_relevance`` prompt, run
    by the module-level ``model``. The ``"Score"`` entry of the response is
    reported under the key ``document_relevance``. The generated answer is
    not part of this grade, so it is no longer read.
    """
    # RAG inputs
    input_question = example.inputs["question"]
    contexts = run.outputs["contexts"]

    # Prompt piped into the judge model
    answer_grader = grade_prompt_doc_relevance | model

    # Get score
    score = answer_grader.invoke({"question": input_question,
                                  "documents": contexts})

    return {"key": "document_relevance", "score": score["Score"]}

In [61]:
# Document-relevance experiment: docs_relevance_evaluator scores the contexts
# returned by predict_rag_answer_with_context against each question.
dataset_name = "RAG_test_NVIDIA"
experiment_results = evaluate(
    predict_rag_answer_with_context,
    data=dataset_name,
    evaluators=[docs_relevance_evaluator],
    experiment_prefix="RAPTOR",
    metadata={"variant": "NVIDIA context, gpt-4o-mini"},
)

View the evaluation results for experiment: 'RAPTOR-19c10d49' at:
https://smith.langchain.com/o/f09c54fc-44b8-4ffe-9055-bc40601f9137/datasets/7fad5e82-7b2d-438e-90e6-1e232f388896/compare?selectedSessions=a933762d-6704-4bb0-9303-a7274b7018a8




0it [00:00, ?it/s]

2025-03-03 14:59:16,480 - ERROR - Failed to use model_dump to serialize <class 'langchain.retrievers.contextual_compression.ContextualCompressionRetriever'> to JSON: PydanticSerializationError(Unable to serialize unknown type: <class 'langchain_community.vectorstores.faiss.FAISS'>)
2025-03-03 14:59:16,482 - ERROR - Failed to use model_dump to serialize <class 'langchain.retrievers.contextual_compression.ContextualCompressionRetriever'> to JSON: PydanticSerializationError(Unable to serialize unknown type: <class 'langchain_community.vectorstores.faiss.FAISS'>)
2025-03-03 14:59:16,484 - ERROR - Failed to use model_dump to serialize <class 'langchain.retrievers.contextual_compression.ContextualCompressionRetriever'> to JSON: PydanticSerializationError(Unable to serialize unknown type: <class 'langchain_community.vectorstores.faiss.FAISS'>)
2025-03-03 14:59:16,486 - INFO - Processing query: In which large markets does NVIDIA apply its expertise and provide platforms?
2025-03-03 14:59:16,48

## Complete Evaluation

In [27]:
from langsmith.evaluation import evaluate

# Full experiment on the QA evaluation dataset: every prediction is graded
# by all three evaluators (answer, hallucination and document relevance).
dataset_name = "RAG_Eval_QA"

# All options are passed by keyword, so their order does not matter; the
# resulting experiment name is derived from `experiment_prefix`.
experiment_results = evaluate(
    predict_rag_answer_with_context,
    experiment_prefix="RAPTOR",
    metadata={"variant": "NVIDIA context, gpt-4o-mini"},
    evaluators=[
        answer_evaluator,
        answer_hallucination_evaluator,
        docs_relevance_evaluator,
    ],
    data=dataset_name,
)

View the evaluation results for experiment: 'RAPTOR-98deaee5' at:
https://smith.langchain.com/o/f09c54fc-44b8-4ffe-9055-bc40601f9137/datasets/176f75c1-604f-435b-b0c7-9eda1bdef775/compare?selectedSessions=ac8d5f21-57a6-4025-b704-9b3b6755d7f1




0it [00:00, ?it/s]

2025-03-04 15:16:27,973 - ERROR - Failed to use model_dump to serialize <class 'langchain.retrievers.contextual_compression.ContextualCompressionRetriever'> to JSON: PydanticSerializationError(Unable to serialize unknown type: <class 'langchain_community.vectorstores.faiss.FAISS'>)
2025-03-04 15:16:27,976 - ERROR - Failed to use model_dump to serialize <class 'langchain.retrievers.contextual_compression.ContextualCompressionRetriever'> to JSON: PydanticSerializationError(Unable to serialize unknown type: <class 'langchain_community.vectorstores.faiss.FAISS'>)
2025-03-04 15:16:27,978 - ERROR - Failed to use model_dump to serialize <class 'langchain.retrievers.contextual_compression.ContextualCompressionRetriever'> to JSON: PydanticSerializationError(Unable to serialize unknown type: <class 'langchain_community.vectorstores.faiss.FAISS'>)
2025-03-04 15:16:27,979 - ERROR - Failed to use model_dump to serialize <class 'langchain.retrievers.contextual_compression.ContextualCompressionRetrie

2025-03-04 15:16:33,777 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-03-04 15:16:34,015 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-03-04 15:16:34,287 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-03-04 15:16:34,369 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-03-04 15:16:34,640 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-03-04 15:16:34,725 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-03-04 15:16:34,923 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-03-04 15:16:34,984 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2025-03-04 15:16:35,246 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1

2025-03-04 15:16:45,546 - INFO - Query processing completed successfully
2025-03-04 15:16:45,548 - ERROR - Failed to use model_dump to serialize <class 'langchain.retrievers.contextual_compression.ContextualCompressionRetriever'> to JSON: PydanticSerializationError(Unable to serialize unknown type: <class 'langchain_community.vectorstores.faiss.FAISS'>)
2025-03-04 15:16:45,552 - INFO - Processing query: What was the net value of P&E in 2024 and how is it calculated in this specific report?
2025-03-04 15:16:45,583 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-03-04 15:16:45,864 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-03-04 15:16:45,894 - INFO - Query processing completed successfully
2025-03-04 15:16:45,895 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2025-03-04 15:16:45,897 - ERROR - Failed to use model_dump to serialize <class 'langchain.retriever

2025-03-04 15:16:54,805 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-03-04 15:16:55,035 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-03-04 15:16:55,154 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2025-03-04 15:16:55,445 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-03-04 15:16:56,001 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-03-04 15:16:56,100 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-03-04 15:16:56,156 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-03-04 15:16:56,573 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2025-03-04 15:16:56,724 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200

2025-03-04 15:17:08,845 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-03-04 15:17:09,353 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-03-04 15:17:09,813 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-03-04 15:17:09,814 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-03-04 15:17:09,820 - INFO - Query processing completed successfully
2025-03-04 15:17:09,824 - INFO - Query processing completed successfully
2025-03-04 15:17:10,735 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-03-04 15:17:11,988 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-03-04 15:17:11,993 - INFO - Query processing completed successfully
2025-03-04 15:17:12,422 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.

2025-03-04 15:25:41,366 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-03-04 15:25:47,908 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-03-04 15:26:02,980 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-03-04 15:26:12,176 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-03-04 15:26:19,649 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-03-04 15:26:29,582 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-03-04 15:26:40,232 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-03-04 15:26:56,412 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
