In [1]:
# Prints a fixed greeting; presumably a smoke test that the kernel executes cells.
print("Hello World")

Hello World


In [2]:
# Load environment variables from a .env file (presumably the API keys the later
# cells rely on). The value displayed for this cell is load_dotenv()'s return value.
from dotenv import load_dotenv
load_dotenv()


True

In [4]:
import os
from pathlib import Path

import pandas as pd

# QA
# Golden questions and their reference answers; the two lists are index-aligned.
inputs = [
    "For customer-facing applications, which company's models dominate the top rankings?",
    "What percentage of respondents are using RAG in some form?",
    "How often are most respondents updating their models?",
]

outputs = [
    "OpenAI models dominate, with 3 of the top 5 and half of the top 10 most popular models for customer-facing apps.",
    "70% of respondents are using RAG in some form.",
    "More than 50% update their models at least monthly, with 17% doing so weekly.",
]

# zip() silently truncates to the shorter list, which would drop goldens
# without any warning, so fail loudly if the lists drift apart.
assert len(inputs) == len(outputs), "each question needs exactly one reference answer"

# Dataset
qa_pairs = [{"question": q, "answer": a} for q, a in zip(inputs, outputs)]
df = pd.DataFrame(qa_pairs)

# Write to csv
# The project root defaults to the original checkout location but can be
# overridden with the RAG_PROJECT_ROOT environment variable so the notebook
# also runs on other machines.
project_root = Path(os.getenv("RAG_PROJECT_ROOT", "/Users/yashchinawale/Desktop/end-end-rag"))
csv_path = str(project_root / "data" / "goldens.csv")
# Create only the `data` folder; a missing project root still raises, which
# surfaces a misconfigured path instead of creating a stray directory tree.
Path(csv_path).parent.mkdir(exist_ok=True)
df.to_csv(csv_path, index=False)

In [5]:
from langsmith import Client

client = Client()
dataset_name = "AgenticAIReportGoldens"

# Store
# Creating a dataset whose name already exists fails, and uploading the
# examples a second time would duplicate the goldens. Only create and populate
# the dataset when it is missing so this cell can be re-run safely.
# NOTE(review): an existing dataset is reused as-is; its examples are not
# compared against `inputs`/`outputs`.
if client.has_dataset(dataset_name=dataset_name):
    dataset = client.read_dataset(dataset_name=dataset_name)
    upload_result = None
else:
    dataset = client.create_dataset(
        dataset_name=dataset_name,
        description="Input and expected output pairs for AgenticAIReport",
    )
    upload_result = client.create_examples(
        inputs=[{"question": q} for q in inputs],
        outputs=[{"answer": a} for a in outputs],
        dataset_id=dataset.id,
    )

# Last expression of the cell: displays the upload summary on the first run
# and nothing when the dataset already existed.
upload_result

{'example_ids': ['18ee822f-f06e-4080-980f-d35a181470f5',
  'a8b88298-3d5c-41f7-981c-fca46da17113',
  '5a7898a8-41b5-4974-9d7b-caf02120d7d5'],
 'count': 3}

In [6]:
import sys
# Extend the import search path; presumably so the `multi_doc_chat` imports
# below resolve when the notebook is run from outside the project root.
# NOTE(review): machine-specific absolute path — will not exist on other machines.
sys.path.append("/Users/yashchinawale/Desktop/end-end-rag")

from pathlib import Path
from multi_doc_chat.src.document_ingestion.data_ingestion import ChatIngestor
from multi_doc_chat.src.document_chat.retrieval import ConversationalRAG
import os

# Simple file adapter for local file paths
class LocalFileAdapter:
    """Expose a file on local disk through ``name`` and ``getbuffer()``.

    Used in this notebook to pass a local path to ``ChatIngestor`` as an
    "uploaded file".
    """

    def __init__(self, file_path: str):
        local_path = Path(file_path)
        self.path = local_path
        # Base file name only (no directory components).
        self.name = local_path.name

    def getbuffer(self) -> bytes:
        """Read and return the whole file as bytes."""
        with self.path.open("rb") as stream:
            return stream.read()


def answer_ai_report_question(
    inputs: dict,
    data_path: str = "/Users/yashchinawale/Desktop/end-end-rag/data/The 2025 AI Engineering Report.txt",
    chunk_size: int = 1000,
    chunk_overlap: int = 200,
    k: int = 5
) -> dict:
    """
    Run one question through the RAG pipeline over the AI Engineering Report.

    Every call constructs a new ``ChatIngestor`` and ``ConversationalRAG``:
    the report is ingested, a retriever is built, and the question is asked
    with an empty chat history.

    Args:
        inputs: Mapping holding the question under the "question" key.
        data_path: Location of the report text file.
        chunk_size: Chunk size forwarded to the ingestor.
        chunk_overlap: Chunk overlap forwarded to the ingestor.
        k: Number of documents to retrieve.

    Returns:
        ``{"answer": ...}``. The value is the RAG result on success; it is a
        plain message when the question is empty, when the file is missing,
        or when any exception is raised ("Error: <message>"). The function
        itself never raises ``Exception`` subclasses.
    """
    try:
        question = inputs.get("question", "")

        if not question:
            outcome = "No question provided"
        elif not Path(data_path).exists():
            outcome = f"Data file not found: {data_path}"
        else:
            # Wrap the local file so the ingestor can treat it like an upload.
            uploads = [LocalFileAdapter(data_path)]

            ingestor = ChatIngestor(
                temp_base="data",
                faiss_base="faiss_index",
                use_session_dirs=True,
            )
            ingestor.built_retriver(
                uploaded_files=uploads,
                chunk_size=chunk_size,
                chunk_overlap=chunk_overlap,
                k=k,
            )

            # The index is loaded back from the per-session folder.
            sid = ingestor.session_id
            rag = ConversationalRAG(session_id=sid)
            rag.load_retriever_from_faiss(
                index_path=f"faiss_index/{sid}",
                k=k,
                index_name=os.getenv("FAISS_INDEX_NAME", "index"),
            )

            outcome = rag.invoke(question, chat_history=[])

        return {"answer": outcome}

    except Exception as e:
        # Best-effort contract: failures are reported inside the answer text.
        return {"answer": f"Error: {e!s}"}


In [7]:
# Smoke-test the RAG helper on one golden question
sample_question = "For customer-facing applications, which company's models dominate the top rankings?"
test_input = {"question": sample_question}
result = answer_ai_report_question(test_input)

print("Question:", sample_question)
print("\nAnswer:", result["answer"])


{"timestamp": "2025-11-11T11:14:15.936263Z", "level": "info", "event": "Running in LOCAL mode: .env loaded"}
{"timestamp": "2025-11-11T11:14:15.938286Z", "level": "info", "event": "Loaded GROQ_API_KEY from individual env var"}
{"timestamp": "2025-11-11T11:14:15.938955Z", "level": "info", "event": "Loaded GOOGLE_API_KEY from individual env var"}
{"keys": {"GROQ_API_KEY": "gsk_tf...", "GOOGLE_API_KEY": "AIzaSy..."}, "timestamp": "2025-11-11T11:14:15.939499Z", "level": "info", "event": "API keys loaded"}
{"config_keys": ["embedding_model", "retriever", "llm"], "timestamp": "2025-11-11T11:14:15.943387Z", "level": "info", "event": "YAML config loaded"}
{"session_id": "session_20251111_164415_2ad535c8", "temp_dir": "data/session_20251111_164415_2ad535c8", "faiss_dir": "faiss_index/session_20251111_164415_2ad535c8", "sessionized": true, "timestamp": "2025-11-11T11:14:15.944731Z", "level": "info", "event": "ChatIngestor initialized"}
{"uploaded": "The 2025 AI Engineering Report.txt", "saved_as

Question: For customer-facing applications, which company's models dominate the top rankings?

Answer: OpenAI models dominate the top rankings for customer-facing applications. Specifically, 3 of the top 5 models and half of the top 10 most popular models in production come from OpenAI. This dominance reflects strong industry preference and trust in OpenAI’s offerings for real-world user interactions.


In [8]:
from langsmith.evaluation import evaluate, LangChainStringEvaluator


In [9]:
# Run every golden question through the RAG helper and print each Q/A pair
print("Testing all questions from the dataset:\n")
divider = "-" * 80 + "\n"
for i, q in enumerate(inputs, start=1):
    test_input = dict(question=q)
    result = answer_ai_report_question(test_input)
    # One print call emits the question, the answer and the divider,
    # each followed by the same line breaks as separate prints would.
    print(f"Q{i}: {q}", f"A{i}: {result['answer']}\n", divider, sep="\n")


{"timestamp": "2025-11-11T11:15:21.122264Z", "level": "info", "event": "Running in LOCAL mode: .env loaded"}
{"timestamp": "2025-11-11T11:15:21.123524Z", "level": "info", "event": "Loaded GROQ_API_KEY from individual env var"}
{"timestamp": "2025-11-11T11:15:21.124902Z", "level": "info", "event": "Loaded GOOGLE_API_KEY from individual env var"}
{"keys": {"GROQ_API_KEY": "gsk_tf...", "GOOGLE_API_KEY": "AIzaSy..."}, "timestamp": "2025-11-11T11:15:21.125901Z", "level": "info", "event": "API keys loaded"}
{"config_keys": ["embedding_model", "retriever", "llm"], "timestamp": "2025-11-11T11:15:21.130747Z", "level": "info", "event": "YAML config loaded"}
{"session_id": "session_20251111_164521_47245cbe", "temp_dir": "data/session_20251111_164521_47245cbe", "faiss_dir": "faiss_index/session_20251111_164521_47245cbe", "sessionized": true, "timestamp": "2025-11-11T11:15:21.131947Z", "level": "info", "event": "ChatIngestor initialized"}
{"uploaded": "The 2025 AI Engineering Report.txt", "saved_as

Testing all questions from the dataset:



{"added": 1, "index": "faiss_index/session_20251111_164521_47245cbe", "timestamp": "2025-11-11T11:15:22.269975Z", "level": "info", "event": "FAISS index updated"}
{"k": 5, "fetch_k": 20, "lambda_mult": 0.5, "timestamp": "2025-11-11T11:15:22.270619Z", "level": "info", "event": "Using MMR search"}
{"timestamp": "2025-11-11T11:15:22.272505Z", "level": "info", "event": "Running in LOCAL mode: .env loaded"}
{"timestamp": "2025-11-11T11:15:22.272837Z", "level": "info", "event": "Loaded GROQ_API_KEY from individual env var"}
{"timestamp": "2025-11-11T11:15:22.273145Z", "level": "info", "event": "Loaded GOOGLE_API_KEY from individual env var"}
{"keys": {"GROQ_API_KEY": "gsk_tf...", "GOOGLE_API_KEY": "AIzaSy..."}, "timestamp": "2025-11-11T11:15:22.273446Z", "level": "info", "event": "API keys loaded"}
{"config_keys": ["embedding_model", "retriever", "llm"], "timestamp": "2025-11-11T11:15:22.275485Z", "level": "info", "event": "YAML config loaded"}
{"provider": "google", "model": "gemini-2.0-fla

Q1: For customer-facing applications, which company's models dominate the top rankings?
A1: OpenAI models dominate the top rankings for customer-facing applications. Specifically, 3 of the top 5 models and half of the top 10 most popular models in production come from OpenAI. This dominance reflects strong industry preference and trust in OpenAI’s offerings for real-world user interactions.

--------------------------------------------------------------------------------



{"added": 1, "index": "faiss_index/session_20251111_164529_7b43906f", "timestamp": "2025-11-11T11:15:30.455903Z", "level": "info", "event": "FAISS index updated"}
{"k": 5, "fetch_k": 20, "lambda_mult": 0.5, "timestamp": "2025-11-11T11:15:30.457607Z", "level": "info", "event": "Using MMR search"}
{"timestamp": "2025-11-11T11:15:30.461254Z", "level": "info", "event": "Running in LOCAL mode: .env loaded"}
{"timestamp": "2025-11-11T11:15:30.462076Z", "level": "info", "event": "Loaded GROQ_API_KEY from individual env var"}
{"timestamp": "2025-11-11T11:15:30.462795Z", "level": "info", "event": "Loaded GOOGLE_API_KEY from individual env var"}
{"keys": {"GROQ_API_KEY": "gsk_tf...", "GOOGLE_API_KEY": "AIzaSy..."}, "timestamp": "2025-11-11T11:15:30.463422Z", "level": "info", "event": "API keys loaded"}
{"config_keys": ["embedding_model", "retriever", "llm"], "timestamp": "2025-11-11T11:15:30.467262Z", "level": "info", "event": "YAML config loaded"}
{"provider": "google", "model": "gemini-2.0-fla

Q2: What percentage of respondents are using RAG in some form?
A2: A substantial 70% of AI engineers surveyed are using RAG in some form to enhance their applications. This highlights RAG as a mainstream technique for improving accuracy and relevance in AI outputs. It also signifies RAG's essential role in production AI systems.

--------------------------------------------------------------------------------



{"added": 1, "index": "faiss_index/session_20251111_164536_1bdce860", "timestamp": "2025-11-11T11:15:37.147384Z", "level": "info", "event": "FAISS index updated"}
{"k": 5, "fetch_k": 20, "lambda_mult": 0.5, "timestamp": "2025-11-11T11:15:37.148453Z", "level": "info", "event": "Using MMR search"}
{"timestamp": "2025-11-11T11:15:37.151569Z", "level": "info", "event": "Running in LOCAL mode: .env loaded"}
{"timestamp": "2025-11-11T11:15:37.152115Z", "level": "info", "event": "Loaded GROQ_API_KEY from individual env var"}
{"timestamp": "2025-11-11T11:15:37.152693Z", "level": "info", "event": "Loaded GOOGLE_API_KEY from individual env var"}
{"keys": {"GROQ_API_KEY": "gsk_tf...", "GOOGLE_API_KEY": "AIzaSy..."}, "timestamp": "2025-11-11T11:15:37.153297Z", "level": "info", "event": "API keys loaded"}
{"config_keys": ["embedding_model", "retriever", "llm"], "timestamp": "2025-11-11T11:15:37.157285Z", "level": "info", "event": "YAML config loaded"}
{"provider": "google", "model": "gemini-2.0-fla

Q3: How often are most respondents updating their models?
A3: More than 50% of respondents update their AI models at least monthly. This rapid iteration pace points to the dynamic nature of AI deployments and the need to keep models aligned with evolving data and user requirements.

--------------------------------------------------------------------------------



In [10]:
from langsmith.evaluation import evaluate, LangChainStringEvaluator

# Dataset to evaluate against, and the built-in "cot_qa" string evaluator
dataset_name = "AgenticAIReportGoldens"
qa_evaluator = [LangChainStringEvaluator("cot_qa")]

# Evaluate the RAG function on the dataset. The metadata records the retrieval
# settings, which match the function's default arguments.
experiment_results = evaluate(
    answer_ai_report_question,
    data=dataset_name,
    evaluators=qa_evaluator,
    experiment_prefix="test-agenticAIReport-qa-rag",
    metadata=dict(
        variant="RAG with FAISS and AI Engineering Report",
        chunk_size=1000,
        chunk_overlap=200,
        k=5,
    ),
)

  from .autonotebook import tqdm as notebook_tqdm


View the evaluation results for experiment: 'test-agenticAIReport-qa-rag-a61f523b' at:
https://smith.langchain.com/o/51176af9-8474-4702-ac30-3e6cc89332ab/datasets/07a3bcd5-90fd-4dd6-a28b-37dee5ea89dc/compare?selectedSessions=c38c3d59-c21d-4977-bbcb-0ee8ea3c3492




0it [00:00, ?it/s]{"timestamp": "2025-11-11T11:17:01.847050Z", "level": "info", "event": "Running in LOCAL mode: .env loaded"}
{"timestamp": "2025-11-11T11:17:01.847826Z", "level": "info", "event": "Loaded GROQ_API_KEY from individual env var"}
{"timestamp": "2025-11-11T11:17:01.848130Z", "level": "info", "event": "Loaded GOOGLE_API_KEY from individual env var"}
{"keys": {"GROQ_API_KEY": "gsk_tf...", "GOOGLE_API_KEY": "AIzaSy..."}, "timestamp": "2025-11-11T11:17:01.848516Z", "level": "info", "event": "API keys loaded"}
{"config_keys": ["embedding_model", "retriever", "llm"], "timestamp": "2025-11-11T11:17:01.850940Z", "level": "info", "event": "YAML config loaded"}
{"session_id": "session_20251111_164701_ea3446dd", "temp_dir": "data/session_20251111_164701_ea3446dd", "faiss_dir": "faiss_index/session_20251111_164701_ea3446dd", "sessionized": true, "timestamp": "2025-11-11T11:17:01.851730Z", "level": "info", "event": "ChatIngestor initialized"}
{"uploaded": "The 2025 AI Engineering Repo

## Custom Correctness Evaluator

This section defines an LLM-as-a-judge evaluator that grades each RAG answer for semantic and factual alignment with its reference answer.


In [11]:
from langsmith.schemas import Run, Example
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate

def _parse_judge_response(response_text: str) -> tuple:
    """Pull the ``Reasoning:`` and ``Verdict:`` fields out of the judge's reply.

    Returns a ``(reasoning, verdict)`` pair; a field that is absent from the
    reply comes back as an empty string.
    """
    reasoning = ""
    verdict = ""
    for raw_line in response_text.splitlines():
        # Tolerate indentation and markdown emphasis such as "**Verdict:** CORRECT".
        line = raw_line.strip().lstrip("*").strip()
        if line.startswith("Reasoning:"):
            reasoning = line[len("Reasoning:"):].strip(" *")
        elif line.startswith("Verdict:"):
            verdict = line[len("Verdict:"):].strip(" *")
    return reasoning, verdict


def _verdict_to_score(verdict: str) -> int:
    """Map the judge's verdict text to 1 (correct) or 0 (anything else)."""
    normalized = verdict.upper()
    # "INCORRECT" contains the substring "CORRECT", so a bare substring test
    # would score every negative verdict as a pass. Exclude it explicitly.
    return 1 if "CORRECT" in normalized and "INCORRECT" not in normalized else 0


def correctness_evaluator(run: Run, example: Example) -> dict:
    """
    Custom LLM-as-a-Judge evaluator for correctness.

    Correctness means how well the actual model output matches the reference
    output in terms of factual accuracy, coverage, and meaning.

    Args:
        run: The Run object containing the actual outputs (read from "answer").
        example: The Example object containing the question ("question") and
            the expected output ("answer").

    Returns:
        dict with:
            - "key": always "correctness"
            - "score": 1 when the judge's verdict is CORRECT, otherwise 0
              (including a missing verdict or an error during judging)
            - "reasoning": the judge's reasoning, or the error message
            - "comment": the raw verdict line, or the error message
    """
    # Guard against missing dicts so a run without outputs is graded (as an
    # empty answer) instead of raising before the judge is even called.
    actual_output = (run.outputs or {}).get("answer", "")
    expected_output = (example.outputs or {}).get("answer", "")
    input_question = (example.inputs or {}).get("question", "")

    # Define the evaluation prompt
    eval_prompt = ChatPromptTemplate.from_messages([
        ("system", """You are an evaluator whose job is to judge correctness.

Correctness means how well the actual model output matches the reference output in terms of factual accuracy, coverage, and meaning.

- If the actual output matches the reference output semantically (even if wording differs), it should be marked correct.
- If the output misses key facts, introduces contradictions, or is factually incorrect, it should be marked incorrect.

Do not penalize for stylistic or formatting differences unless they change meaning."""),
        ("human", """<example>
<input>
{input}
</input>

<output>
Expected Output: {expected_output}

Actual Output: {actual_output}
</output>
</example>

Please grade the following agent run given the input, expected output, and actual output.
Focus only on correctness (semantic and factual alignment).

Respond with:
1. A brief reasoning (1-2 sentences)
2. A final verdict: either "CORRECT" or "INCORRECT"

Format your response as:
Reasoning: [your reasoning]
Verdict: [CORRECT or INCORRECT]""")
    ])

    # Judge model; temperature 0 keeps the grading as repeatable as possible.
    llm = ChatGoogleGenerativeAI(
        model="gemini-2.5-pro",
        temperature=0
    )

    # Create chain and invoke
    chain = eval_prompt | llm

    try:
        response = chain.invoke({
            "input": input_question,
            "expected_output": expected_output,
            "actual_output": actual_output
        })

        content = response.content
        # Fall back to the string form if the content is not already plain text.
        response_text = content if isinstance(content, str) else str(content)

        reasoning, verdict = _parse_judge_response(response_text)

        return {
            "key": "correctness",
            "score": _verdict_to_score(verdict),
            "reasoning": reasoning,
            "comment": f"Verdict: {verdict}"
        }

    except Exception as e:
        error_message = f"Error during evaluation: {str(e)}"
        # Same keys as the success path so consumers can rely on one shape.
        return {
            "key": "correctness",
            "score": 0,
            "reasoning": error_message,
            "comment": error_message
        }


### Run Evaluation with Custom Correctness Evaluator


In [12]:
# Run evaluation with the custom correctness evaluator
from langsmith.evaluation import evaluate

# Grade the RAG function with the custom LLM-as-a-judge correctness evaluator
dataset_name = "AgenticAIReportGoldens"
evaluators = [correctness_evaluator]

# Run evaluation; metadata records the judge model and the retrieval settings
experiment_results = evaluate(
    answer_ai_report_question,
    data=dataset_name,
    evaluators=evaluators,
    experiment_prefix="agenticAIReport-correctness-eval",
    description="Evaluating RAG system with custom correctness evaluator (LLM-as-a-Judge)",
    metadata=dict(
        variant="RAG with FAISS and AI Engineering Report",
        evaluator="custom_correctness_llm_judge",
        model="gemini-2.5-pro",
        chunk_size=1000,
        chunk_overlap=200,
        k=5,
    ),
)

print("\nEvaluation completed! Check the LangSmith UI for detailed results.")


View the evaluation results for experiment: 'agenticAIReport-correctness-eval-e1ea3751' at:
https://smith.langchain.com/o/51176af9-8474-4702-ac30-3e6cc89332ab/datasets/07a3bcd5-90fd-4dd6-a28b-37dee5ea89dc/compare?selectedSessions=4df111c0-534f-4181-bb46-c9ea4492440f




0it [00:00, ?it/s]{"timestamp": "2025-11-11T11:27:01.627457Z", "level": "info", "event": "Running in LOCAL mode: .env loaded"}
{"timestamp": "2025-11-11T11:27:01.628501Z", "level": "info", "event": "Loaded GROQ_API_KEY from individual env var"}
{"timestamp": "2025-11-11T11:27:01.628903Z", "level": "info", "event": "Loaded GOOGLE_API_KEY from individual env var"}
{"keys": {"GROQ_API_KEY": "gsk_tf...", "GOOGLE_API_KEY": "AIzaSy..."}, "timestamp": "2025-11-11T11:27:01.629291Z", "level": "info", "event": "API keys loaded"}
{"config_keys": ["embedding_model", "retriever", "llm"], "timestamp": "2025-11-11T11:27:01.642581Z", "level": "info", "event": "YAML config loaded"}
{"session_id": "session_20251111_165701_293ff9b4", "temp_dir": "data/session_20251111_165701_293ff9b4", "faiss_dir": "faiss_index/session_20251111_165701_293ff9b4", "sessionized": true, "timestamp": "2025-11-11T11:27:01.645806Z", "level": "info", "event": "ChatIngestor initialized"}
{"uploaded": "The 2025 AI Engineering Repo


Evaluation completed! Check the LangSmith UI for detailed results.





### Optional: Combine Multiple Evaluators

You can use multiple evaluators together to get different perspectives on your RAG system's performance.


In [None]:
# Example: Combine custom correctness evaluator with LangChain's built-in evaluators
from langsmith.evaluation import evaluate, LangChainStringEvaluator

# Evaluator mix: the custom LLM-as-a-judge first, then the chain-of-thought QA evaluator
combined_evaluators = [correctness_evaluator, LangChainStringEvaluator("cot_qa")]

# Run evaluation with multiple evaluators
# Uncomment to run:
# experiment_results_combined = evaluate(
#     answer_ai_report_question,
#     data=dataset_name,
#     evaluators=combined_evaluators,
#     experiment_prefix="agenticAIReport-multi-eval",
#     description="Evaluating RAG system with multiple evaluators",
#     metadata={
#         "variant": "RAG with FAISS",
#         "evaluators": "correctness + cot_qa",
#         "chunk_size": 1000,
#         "chunk_overlap": 200,
#         "k": 5,
#     },
# )
