In [1]:
import sys
import os
from pathlib import Path
import asyncio
from concurrent.futures import ThreadPoolExecutor

# Make the project's `src` folder (a sibling of the notebook's working directory) importable.
parent_dir = str(Path.cwd().parent)
project_root = os.path.join(parent_dir, 'src')
sys.path.insert(0, project_root)

# Load the project configuration for the "local" environment and display it.
from utils import Config

config = Config(environment="local")
print(config)

2025-08-02 09:25:45,908 - root - INFO - ✅ Loaded environment file for 'local' environment
2025-08-02 09:25:45,908 - root - INFO - 🔧 Logging configured at level: DEBUG
2025-08-02 09:25:45,908 - root - INFO - ✅ All required configuration loaded for 'local' environment


Config(log_level='DEBUG', environment='local', openai_api_key=None, openai_model=None, openai_api_type='azure', azure_openai_endpoint='https://aoai-zvbgv7oohofri.cognitiveservices.azure.com/', azure_openai_api_key=***, azure_openai_model='gpt-4o', azure_openai_deployment='gpt-4o', azure_openai_api_version='2024-12-01-preview', azure_openai_embedding_endpoint='https://aoai-zvbgv7oohofri.cognitiveservices.azure.com/', azure_openai_embedding_api_key=***, azure_openai_embedding_model='text-embedding-3-small', azure_openai_embedding_deployment='text-embedding-3-small', azure_openai_embedding_api_version='2024-12-01-preview', chroma_db_path='/data/chroma_db', serp_api_key=***, project_root=WindowsPath('c:/Users/aprilhazel/Source/sk_mcp_demo/mcp_rag'))


# 🎨 RAG Architecture - Mermaid Diagram

```mermaid
graph TD
    %% Data Sources
    A("📄 Data Source<br/>Products CSV<br/>Contoso Catalog") --> B("📝 Text Processing<br/>Chunking & Structure")
    
    %% Embedding Generation  
    B --> C("🧠 Embedding Model<br/>text-embedding-3-small<br/>1536 dimensions")
    
    %% Vector Storage
    C --> D("💾 Vector Store<br/>ChromaDB<br/>product_collection")
    
    %% Query Processing
    E("❓ User Query<br/>Search Question") --> F("🔄 Query Embedding<br/>text-embedding-3-small")
    
    %% Similarity Search
    F --> G("🔍 Similarity Search<br/>Cosine Distance<br/>Top-K Results")
    D --> G
    
    %% Generation
    G --> H("🤖 Chat Model<br/>GPT-4o<br/>Response Generation")
    E --> H
    H --> I("💬 Final Response<br/>Grounded Answer")
    
    %% Styling with black text
    classDef dataSource fill:#e1f5fe,stroke:#0288d1,stroke-width:2px,color:#000000
    classDef processing fill:#f3e5f5,stroke:#7b1fa2,stroke-width:2px,color:#000000
    classDef storage fill:#e8f5e8,stroke:#388e3c,stroke-width:2px,color:#000000
    classDef query fill:#fff3e0,stroke:#f57400,stroke-width:2px,color:#000000
    classDef output fill:#fce4ec,stroke:#c2185b,stroke-width:2px,color:#000000
    
    class A dataSource
    class B,C,F processing
    class D,G storage
    class E query
    class H,I output
```

## 🔄 RAG Process Flow

1. **📊 Data Ingestion**: Products CSV → Text processing & chunking
2. **🧠 Embedding**: Product descriptions → Vector embeddings (1536-dim)
3. **💾 Storage**: Embeddings → ChromaDB product collection
4. **🔍 Query**: User question → Query embedding → Similarity search
5. **📋 Retrieval**: Top-K similar products retrieved as context
6. **🤖 Generation**: Product context + Query → LLM → Final response

**Key Components:**
- **Data Source**: Contoso Products CSV with product details
- **Embedding Model**: Azure OpenAI text-embedding-3-small
- **Vector Database**: ChromaDB with persistent storage
- **Chat Model**: Azure OpenAI GPT-4o
- **Collection**: Single product_collection for product data

Creating and testing the LLM chat component

In [None]:
# Azure OpenAI Chat Client
from openai import AzureOpenAI

# Create Azure OpenAI chat client using config values
aoai_chat_client = AzureOpenAI(
    api_key=config.azure_openai_api_key,
    api_version=config.azure_openai_api_version,
    azure_endpoint=config.azure_openai_endpoint
)

# Smoke-test the deployment with a tiny prompt before building on top of it
response = aoai_chat_client.chat.completions.create(
    model=config.azure_openai_deployment,
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Tell me a one liner joke about AI."}
    ],
    max_tokens=150,
    temperature=0.7
)

print("✅ Azure OpenAI Chat Client created successfully!")
print(f"📍 Endpoint: {config.azure_openai_endpoint}")
print(f"🤖 Model: {config.azure_openai_model}")
print(f"🚀 Deployment: {config.azure_openai_deployment}")
print(f"📋 API Version: {config.azure_openai_api_version}")
# `message.content` is Optional in the API response (it can be None),
# so fall back to an empty string instead of calling .strip() on None.
print(f"💬 Response: {(response.choices[0].message.content or '').strip()}")

2025-08-02 09:25:47,811 - openai._base_client - DEBUG - Request options: {'method': 'post', 'url': '/deployments/gpt-4o/chat/completions', 'headers': {'api-key': '<redacted>'}, 'files': None, 'idempotency_key': 'stainless-python-retry-fe962563-1e99-477e-a9be-c8bc179c8146', 'json_data': {'messages': [{'role': 'system', 'content': 'You are a helpful assistant.'}, {'role': 'user', 'content': 'Tell me a one liner joke about AI.'}], 'model': 'gpt-4o', 'max_tokens': 150, 'temperature': 0.7}}
2025-08-02 09:25:47,811 - openai._base_client - DEBUG - Sending HTTP Request: POST https://aoai-zvbgv7oohofri.cognitiveservices.azure.com/openai/deployments/gpt-4o/chat/completions?api-version=2024-12-01-preview
2025-08-02 09:25:47,819 - httpcore.connection - DEBUG - connect_tcp.started host='aoai-zvbgv7oohofri.cognitiveservices.azure.com' port=443 local_address=None timeout=5.0 socket_options=None
2025-08-02 09:25:48,042 - httpcore.connection - DEBUG - connect_tcp.complete return_value=<httpcore._backen

✅ Azure OpenAI Chat Client created successfully!
📍 Endpoint: https://aoai-zvbgv7oohofri.cognitiveservices.azure.com/
🤖 Model: gpt-4o
🚀 Deployment: gpt-4o
📋 API Version: 2024-12-01-preview
💬 Response: Sure! Here's one:

"I asked AI to write me a joke, but it kept debugging the punchline!"


Creating the ChromaDB vector database, embedding, and storing data

Populated with contoso products (csv source: https://github.com/Azure-Samples/contoso-chat/blob/main/data/product_info/products.csv)

In [3]:
import chromadb
from chromadb.utils import embedding_functions
from pathlib import Path

# ChromaDB embedding function backed by the Azure OpenAI embedding deployment
aoai_embedding_function = embedding_functions.OpenAIEmbeddingFunction(
    api_key=config.azure_openai_embedding_api_key,
    api_base=config.azure_openai_embedding_endpoint,
    api_type=config.openai_api_type,
    api_version=config.azure_openai_embedding_api_version,
    model_name=config.azure_openai_embedding_model,
    deployment_id=config.azure_openai_embedding_deployment
)
# https://docs.trychroma.com/docs/collections/manage-collections
# NOTE(review): str.lstrip('./') strips any leading '.' and '/' characters (a character
# set, not the literal prefix './'), so values such as '../data' or '.cache' would also be
# altered. Left unchanged so the existing on-disk location keeps resolving — confirm the
# configured value never starts that way.
chroma_db_path = Path(config.project_root/config.chroma_db_path.lstrip('./')/'chroma_db')
chroma_client = chromadb.PersistentClient(path=str(chroma_db_path))
print(f"ChromaDB path: {chroma_db_path}")
# Create or get the collection. Any failure propagates as-is; the previous
# `try/except ... raise e` wrapper only re-raised the same exception.
products_collection = chroma_client.get_or_create_collection(
    name="product_collection", embedding_function=aoai_embedding_function)

# See the first records in the collection
products_collection.peek()

2025-08-02 09:25:53,820 - chromadb.telemetry.product.posthog - INFO - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
2025-08-02 09:25:53,860 - chromadb.config - DEBUG - Starting component System
2025-08-02 09:25:53,860 - chromadb.config - DEBUG - Starting component Posthog


ChromaDB path: c:\Users\aprilhazel\Source\sk_mcp_demo\mcp_rag\data\chroma_db\chroma_db


{'ids': ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10'],
 'embeddings': array([[ 1.41727021e-02,  1.49021791e-02,  1.75986402e-02, ...,
         -3.55945830e-03,  1.71296895e-02, -5.08524954e-06],
        [ 2.44422704e-02, -4.82865900e-04,  1.69588719e-02, ...,
         -2.43603578e-03, -1.88854206e-02, -1.77452192e-02],
        [ 8.51014908e-03,  1.44666352e-03, -2.56910156e-02, ...,
         -1.57975033e-02,  2.48387661e-02,  1.56492870e-02],
        ...,
        [ 1.25831766e-02, -1.96192153e-02,  2.62994990e-02, ...,
         -1.26358811e-02, -5.32314507e-03,  2.76961643e-02],
        [-4.46882518e-03,  9.49968025e-03,  7.98494089e-03, ...,
         -1.32762492e-02, -1.66004524e-02, -1.12474570e-02],
        [ 1.46424361e-02,  2.95980796e-02, -3.25996466e-02, ...,
         -4.83122095e-02, -4.44525824e-04, -3.22342403e-02]],
       shape=(10, 1536)),
 'documents': ['ID: 1, Name: TrailMaster X4 Tent, Price: $250.0, Category: Tents, Brand: OutdoorLiving, Description: Unveiling th

# Test cases using the product collection

- **Test case 1**: User asks a question that goes beyond internal context: What is the best footwear I can buy for hiking?
- **Test case 2**: User asks a specific question about internal data: Tell me more about our RainGuard Hiking Jacket product.


In [4]:
# Test case 1: a question that goes beyond the internal product context.
test1_user_query = "What is the best footwear I can buy for hiking?"
# Test case 2: a specific question about internal product data.
test2_user_query = "Tell me more about our RainGuard Hiking Jacket product."

Let's do a chroma search

In [5]:
async def search_chroma(collection, query, n_results=5):
    """Search a ChromaDB collection for `query` without blocking the event loop.

    The blocking ChromaDB calls run in a worker thread via ``asyncio.to_thread``.

    Args:
        collection: Collection object exposing ``name``, ``count()`` and ``query()``.
        query: Query text, passed as the single entry of ``query_texts``.
        n_results: Maximum number of matches to request from the collection.

    Returns:
        A tuple ``(structured_context, chroma_info)``:
          - ``structured_context``: list of dicts with keys ``search_order`` (1-based),
            ``content``, ``citation``, ``metadata`` and ``content_id``.
          - ``chroma_info``: dict with ``db_path``, ``collection_name`` and
            ``collection_count``.
        On any exception the function does not raise; it returns
        ``([], {'error': <message>})`` so callers can always unpack two values.

    Note:
        Reads the module-level ``chroma_db_path`` to build the citation text.
    """

    def _sync_search():
        """Blocking ChromaDB lookup; executed in a worker thread."""
        try:
            # Describe where the results come from (used in citations)
            chroma_info = {
                'db_path': str(chroma_db_path),
                'collection_name': collection.name,
                'collection_count': collection.count()
            }

            # Search the collection with the query
            search_results = collection.query(
                query_texts=[query],
                n_results=n_results,
                include=['documents', 'metadatas', 'distances']
            )

            # Results are nested one list per query text; we sent exactly one query.
            context_contents = search_results['documents'][0] if search_results['documents'] else []
            context_metadata = search_results['metadatas'][0] if search_results['metadatas'] else []
            # Tolerate a missing/empty 'ids' entry instead of raising IndexError.
            context_ids = (search_results.get('ids') or [[]])[0]

            structured_context = []
            # Iterate over documents (not zip with metadata) so a missing metadata
            # list cannot silently drop every retrieved document.
            for i, doc in enumerate(context_contents):
                # Fall back to an index-based ID / empty metadata when absent
                doc_id = context_ids[i] if i < len(context_ids) else f"doc_{i}"
                metadata = context_metadata[i] if i < len(context_metadata) else None

                citation = f"[Source: ChromaDB | Path: {chroma_info['db_path']} | Collection: {chroma_info['collection_name']} | content ID: {doc_id}]"

                structured_context.append({
                    'search_order': i + 1,
                    'content': doc,
                    'citation': citation,
                    'metadata': metadata,
                    'content_id': doc_id
                })

            return structured_context, chroma_info

        except Exception as e:
            # Best-effort: always return a tuple to avoid unpacking errors in callers
            return [], {'error': str(e)}

    # Run the blocking search in the default thread pool. This avoids
    # asyncio.get_event_loop() inside a coroutine and the cost of creating and
    # tearing down a dedicated ThreadPoolExecutor on every call.
    return await asyncio.to_thread(_sync_search)

# Test the async search function
search_results1, chroma_info1 = await search_chroma(products_collection, test1_user_query, n_results=5)
search_results2, chroma_info2 = await search_chroma(products_collection, test2_user_query, n_results=5)


print("=" * 50)
print(f"🎯 Search Chroma Results Test 1 - {test1_user_query}")
print(f"Found {len(search_results1)} results\n")
if len(search_results1) >= 0:
    for result in search_results1:
        print(f"--- Result #{result['search_order']} ---")
        print(f"Content: {result['content'][:100]}...")
        print(f"Citation: {result['citation']}")
print("=" * 50)
print(f"🎯 Search Chroma Results Test 2 - {test2_user_query}")
print(f"Found {len(search_results2)} results\n")
if len(search_results2) >= 0:
    for result in search_results2:
        print(f"--- Result #{result['search_order']} ---")
        print(f"Content: {result['content'][:100]}...")
        print(f"Citation: {result['citation']}")
print("=" * 50)

2025-08-02 09:25:54,192 - openai._base_client - DEBUG - Request options: {'method': 'post', 'url': '/embeddings', 'headers': {'api-key': '<redacted>'}, 'files': None, 'idempotency_key': 'stainless-python-retry-aec06b6c-d3a3-4693-8d59-39c0b0e78eda', 'post_parser': <function Embeddings.create.<locals>.parser at 0x000001E0C53B28E0>, 'json_data': {'input': ['What is the best footwear I can buy for hiking?'], 'model': 'text-embedding-3-small', 'encoding_format': 'base64'}}
2025-08-02 09:25:54,197 - openai._base_client - DEBUG - Sending HTTP Request: POST https://aoai-zvbgv7oohofri.cognitiveservices.azure.com/openai/deployments/text-embedding-3-small/embeddings?api-version=2024-12-01-preview
2025-08-02 09:25:54,198 - httpcore.connection - DEBUG - connect_tcp.started host='aoai-zvbgv7oohofri.cognitiveservices.azure.com' port=443 local_address=None timeout=5.0 socket_options=None
2025-08-02 09:25:54,338 - httpcore.connection - DEBUG - connect_tcp.complete return_value=<httpcore._backends.sync.

🎯 Search Chroma Results Test 1 - What is the best footwear I can buy for hiking?
Found 5 results

--- Result #1 ---
Content: ID: 11, Name: TrailWalker Hiking Shoes, Price: $110.0, Category: Hiking Footwear, Brand: TrekReady, ...
Citation: [Source: ChromaDB | Path: c:\Users\aprilhazel\Source\sk_mcp_demo\mcp_rag\data\chroma_db\chroma_db | Collection: product_collection | content ID: 11]
--- Result #2 ---
Content: ID: 4, Name: TrekReady Hiking Boots, Price: $140.0, Category: Hiking Footwear, Brand: TrekReady, Des...
Citation: [Source: ChromaDB | Path: c:\Users\aprilhazel\Source\sk_mcp_demo\mcp_rag\data\chroma_db\chroma_db | Collection: product_collection | content ID: 4]
--- Result #3 ---
Content: ID: 18, Name: TrekStar Hiking Sandals, Price: $70.0, Category: Hiking Footwear, Brand: TrekReady, De...
Citation: [Source: ChromaDB | Path: c:\Users\aprilhazel\Source\sk_mcp_demo\mcp_rag\data\chroma_db\chroma_db | Collection: product_collection | content ID: 18]
--- Result #4 ---
Content: ID: 10

And, now a web search

First, we need to formulate appropriate web search queries given the user query and any internal context

In [6]:
from typing import List, Optional
from pydantic import BaseModel, Field
from datetime import datetime

# Pydantic models used as the structured-output schema for query generation.
# Plain comments are used rather than class docstrings so that no extra
# "description" text ends up in the JSON schema passed as `response_format`.
class GeneratedSearchQuery(BaseModel):
    # One web-search query proposed by the LLM
    priority_rank: int  # used as the sort key when queries are executed
    search_query: str   # text sent to the search engine
    purpose: str        # short explanation of what the query should find

class GeneratedSearchQueries(BaseModel):
    # Container for zero or more generated queries
    queries: List[GeneratedSearchQuery] = Field(default_factory=list)
    # Populated with the exception text when query generation fails.
    # Optional[str] (not `str`) because the default is None.
    error: Optional[str] = None

async def get_web_search_query(llm_chat_client: AzureOpenAI, deployment_name, user_query, internal_context=None):
    """Ask the LLM which web searches (if any) are needed to answer `user_query`.

    The blocking OpenAI call runs in a worker thread via ``asyncio.to_thread``.

    Args:
        llm_chat_client: Azure OpenAI client used for the structured-output call.
        deployment_name: Deployment name passed as ``model``.
        user_query: The user's original question.
        internal_context: Internal search results to show the model; interpolated
            into the prompt with its string representation.

    Returns:
        A ``GeneratedSearchQueries`` instance. ``queries`` is empty when the model
        proposes none; on any exception the function does not raise and instead
        returns an empty ``queries`` list with ``error`` set to the exception text.
    """

    def _sync_query_generation():
        """Blocking OpenAI API call; executed in a worker thread."""
        # Today's date is included in the prompt text
        current_date = datetime.now().strftime("%B %d, %Y")
        system_message = f"""You are an assistant whose job is to transform a user's internal chatbot question plus the internal search results into one or more actionable web-search queries.

Inputs: 
  - today's date is: {current_date}
  - user_question (string): the original question the user posed to the chatbot
  - internal_results (array of search results with citations)

Steps: 
1. Intent Analysis
  - Determine what the user really wants, including any ambiguous or implied needs.
2. Context Review
  - Examine internal_results:
    - What sub-topics are already covered?
    - Where are the gaps or outdated pieces?
    - Is there jargon, acronyms or product names that need clarification?
3. External-Search Planning
  - For each gap or elaboration, craft a concise web search query.
  - Rank queries numerically by prioritization: highest (essential to answer), secondary, exploratory.
  - Aim for queries that return authoritative, up-to-date, and user-friendly results.
  - If the user includes and exact match to an internal product name and is asking for internal product information, do not generate a query. Internal context should be used to answer questions about internal products.
  - If the user asks questions about similar products or general information that is not specific to internal products, generate one more more queries.
  - When queries are generated, generate 1 to 3 queries, each with a clear purpose.
  - Generated search queries should include any relevant information from the internal context to optimize search results.

Output Format:
Return a JSON object with a "queries" array, each query object containing:
  - "priority_rank": integer (1, 2, 3, 4...)
  - "search_query": string
  - "purpose": brief description of what this query aims to find

Example:
{{
  "queries": [
    {{
      "priority_rank": 1,
      "search_query": "specific search terms",
      "purpose": "why this search helps"
    }}
  ]
}}
"""

        user_message = f"""User Query: {user_query}
Internal Context: {internal_context}

Should we search the web for additional information or is the internal context fully sufficient?"""

        try:
            # Structured-output call: the SDK parses the reply into GeneratedSearchQueries
            response = llm_chat_client.chat.completions.parse(
                model=deployment_name,
                messages=[
                    {"role": "system", "content": system_message},
                    {"role": "user", "content": user_message}
                ],
                max_tokens=400,
                temperature=0.4,
                response_format=GeneratedSearchQueries
            )

            # Get parsed response; treat a missing/empty parse as "no queries"
            parsed_response = response.choices[0].message.parsed
            if not parsed_response or not parsed_response.queries:
                return GeneratedSearchQueries(queries=[])
            return GeneratedSearchQueries(queries=parsed_response.queries)

        except Exception as e:
            # Best-effort fallback: empty result carrying the error text
            return GeneratedSearchQueries(queries=[], error=str(e))

    # Run the blocking call in the default thread pool. This avoids
    # asyncio.get_event_loop() inside a coroutine and the cost of creating and
    # tearing down a dedicated ThreadPoolExecutor on every call.
    return await asyncio.to_thread(_sync_query_generation)

# Demo Test Cases
generated_queries_test1 = await get_web_search_query(
    llm_chat_client=aoai_chat_client,
    deployment_name=config.azure_openai_deployment,
    user_query=test1_user_query,
    internal_context=search_results1
)
generated_queries_test2 = await get_web_search_query(
    llm_chat_client=aoai_chat_client,
    deployment_name=config.azure_openai_deployment,
    user_query=test2_user_query,
    internal_context=search_results2
)
print("=" * 50)
print(f"🎯 Generated Search Queries Test 1 - {test1_user_query}")
print(generated_queries_test1.queries)
print("=" * 50)
print(f"🎯 Generated Search Queries Test 2 - {test2_user_query}")
print(generated_queries_test2.queries)
print("=" * 50)


2025-08-02 09:25:55,174 - openai._base_client - DEBUG - Request options: {'method': 'post', 'url': '/deployments/gpt-4o/chat/completions', 'headers': {'X-Stainless-Helper-Method': 'chat.completions.parse', 'api-key': '<redacted>'}, 'files': None, 'idempotency_key': 'stainless-python-retry-d8a4999c-0e0a-4d41-abd0-4ae419d82b07', 'post_parser': <function Completions.parse.<locals>.parser at 0x000001E0B4086B60>, 'json_data': {'messages': [{'role': 'system', 'content': 'You are an assistant whose job is to transform a user\'s internal chatbot question plus the internal search results into one or more actionable web-search queries.\n\nInputs: \n  - today\'s date is: August 02, 2025\n  - user_question (string): the original question the user posed to the chatbot\n  - internal_results (array of search results with citations)\n\nSteps: \n1. Intent Analysis\n  - Determine what the user really wants, including any ambiguous or implied needs.\n2. Context Review\n  - Examine internal_results:\n    

🎯 Generated Search Queries Test 1 - What is the best footwear I can buy for hiking?
[GeneratedSearchQuery(priority_rank=1, search_query='best hiking footwear comparison 2025', purpose='To find a comprehensive comparison of the best hiking footwear options available in 2025, including user reviews and expert opinions.'), GeneratedSearchQuery(priority_rank=2, search_query='top-rated hiking shoes and boots for long treks', purpose='To identify highly-rated hiking shoes and boots suitable for long-distance trekking, focusing on comfort and durability.'), GeneratedSearchQuery(priority_rank=3, search_query='hiking footwear recommendations for various terrains', purpose='To gather recommendations for hiking footwear optimized for different terrains and weather conditions.')]
🎯 Generated Search Queries Test 2 - Tell me more about our RainGuard Hiking Jacket product.
[]


Create helper function to process Serp API Bing Search results

In [7]:
from typing import Any, Dict, List

def transform_serpapi_bing_results(
    serp_query: str,
    serp_response: Dict[str, Any],
    n_results: int = 5,
    includes: List = None
) -> List[Dict[str, Any]]:
    """
    Normalize SerpAPI Bing JSON into a flat list of:
      {
        'query': str,
        'content': str,
        'citation': str,
        'metadata': dict,
        'content_id': Any
      }

    Args:
        serp_query: The search text that produced `serp_response`; copied into
            every item as 'query'.
        serp_response: Decoded JSON response.
        n_results: Maximum number of usable items to keep per list section
            ("ads", "organic_results"). Entries lacking required fields are
            skipped and do not count toward this limit.
        includes: Section names to process, any of "answer_box", "ads",
            "organic_results". None (the default) means ["organic_results"].

    Returns:
        The normalized items, in the order answer box, ads, organic results.
    """
    # --- Setup defaults & locals once ---
    if includes is None:
        includes = ["organic_results"]
    engine_name = "Bing"
    results: List[Dict[str, Any]] = []

    def add_item(query: str, content: str, citation: str, metadata: Dict[str, Any], cid: Any):
        results.append({
            "query": query,
            "content": content,
            "citation": citation,
            "metadata": metadata,
            "content_id": cid,
        })

    # --- 1) Answer-box (single dict) ---
    if "answer_box" in includes:
        box = serp_response.get("answer_box")
        if box:
            links = [src["link"] for src in (box.get("sources") or []) if "link" in src]
            citation = (
                f"[Source: {engine_name} Answer Box | "
                f"Links: {', '.join(links)}]"
            )
            add_item(
                query=serp_query,
                content=box.get("snippet", ""),
                citation=citation,
                # NOTE(review): the box "type" is stored under the key
                # "highlighted_snippets" — confirm this naming is intended.
                metadata={"highlighted_snippets": box.get("type")},
                cid=1
            )

    # --- 2) Ads ---
    if "ads" in includes:
        kept = 0
        # `or []` tolerates a section that is present but null.
        for idx, ad in enumerate(serp_response.get("ads") or [], start=1):
            # Filter first, then cap: unusable entries must not consume the quota.
            if kept >= n_results:
                break
            desc = ad.get("description")
            if not desc:
                continue
            pos = ad.get("position", idx)
            citation = (
                f"[Source: {engine_name} Ads | "
                f"Link: {ad.get('tracking_link','')} | "
                f"Position: {pos}]"
            )
            add_item(
                query=serp_query,
                content=desc,
                citation=citation,
                metadata={
                    "title": ad.get("title"),
                    "displayed_link": ad.get("displayed_link")
                },
                cid=pos
            )
            kept += 1

    # --- 3) Organic results ---
    if "organic_results" in includes:
        kept = 0
        for idx, org in enumerate(serp_response.get("organic_results") or [], start=1):
            if kept >= n_results:
                break
            snippet = org.get("snippet")
            link = org.get("link")
            # Both a snippet and a link are required for a citable result
            if not (snippet and link):
                continue
            pos = org.get("position", idx)
            citation = (
                f"[Source: {engine_name} Search | "
                f"Link: {link} | "
                f"Position: {pos}]"
            )
            add_item(
                query=serp_query,
                content=snippet,
                citation=citation,
                metadata={
                    "title": org.get("title", ""),
                    "displayed_link": org.get("displayed_link", ""),
                    "snippet_highlighted_words": org.get("snippet_highlighted_words", []),
                },
                cid=pos
            )
            kept += 1
    return results


Asynchronously execute the top n web queries

In [None]:
import aiohttp

async def search_serpapi_bing(generated_queries: GeneratedSearchQueries, n_results=5):
    """
    Perform web searches using SerpAPI Bing for the generated queries.

    All queries are executed concurrently over one shared HTTP session, in
    ascending ``priority_rank`` order.

    Args:
        generated_queries: Container whose ``queries`` are executed.
        n_results: Maximum results kept per query (passed on to
            ``transform_serpapi_bing_results``).

    Returns:
        A tuple ``(all_results, serp_info)``:
          - ``all_results``: flat list of result dicts; each gets a sequential
            ``search_order`` and a ``content_id`` of the form ``web_<n>``. A query
            that fails yields one placeholder with ``content`` None, the error
            text in ``citation``/``metadata`` and ``content_id`` ``error_<rank>``.
          - ``serp_info``: dict with ``search_engine``, ``api_provider``,
            ``total_queries`` and ``results_count``.
    """
    # SerpAPI configuration
    SERPAPI_KEY = config.serp_api_key
    BASE_URL = "https://serpapi.com/search"
    # Prefix that marks error placeholders; the renumbering step below skips them.
    ERROR_ID_PREFIX = "error_"

    def build_info(total_queries: int, results_count: int) -> dict:
        """Build the search-info dict with the same keys on every code path."""
        return {
            'search_engine': 'Bing',
            'api_provider': 'SerpAPI',
            'total_queries': total_queries,
            'results_count': results_count
        }

    async def single_search(session, query_obj: GeneratedSearchQuery):
        """Perform a single search query asynchronously"""
        serp_query = query_obj.search_query
        params = {
            "engine": "bing",
            # Exclude two sites from the results
            "q": f"{serp_query} -site:ell.stackexchange.com -site:www.tenforums.com",
            "api_key": SERPAPI_KEY,
            "mkt": "en-us",  # Specify market if needed
            "cc": "US",  # Specify country if needed
            # Safe search is requested as "on".
            # NOTE(review): confirm the accepted safeSearch values and whether the
            # result-count parameter for the Bing engine is "num" — verify against
            # the SerpApi Bing parameter documentation.
            "safeSearch": "on",
            "num": n_results * 2  # Request extra in case of filtering
        }

        try:
            async with session.get(BASE_URL, params=params) as response:
                response.raise_for_status()
                json_data = await response.json()
                return transform_serpapi_bing_results(
                    serp_query=serp_query, serp_response=json_data, n_results=n_results
                )
        except Exception as e:
            # Best-effort: one failing query must not abort the others
            print(f"Error searching for '{query_obj.search_query}': {e}")
            return [{
                "query": query_obj.search_query,
                "content": None,
                "citation": f"[Source: Bing Search | Error: {str(e)}]",
                "metadata": {"search_query": query_obj.search_query, "error": str(e)},
                # Use the prefix the renumbering check looks for, so failures are
                # not relabelled as regular "web_<n>" results.
                "content_id": f"{ERROR_ID_PREFIX}{query_obj.priority_rank}",
            }]

    # Handle empty queries
    total_queries = len(generated_queries.queries)
    if total_queries == 0:
        return [], build_info(total_queries=0, results_count=0)

    # Sort queries by priority rank to ensure proper ordering
    sorted_queries = sorted(generated_queries.queries, key=lambda q: q.priority_rank)

    # Run all searches concurrently over ONE session: reuses the connection pool
    # instead of opening and closing a session per query.
    async with aiohttp.ClientSession() as session:
        search_results_lists = await asyncio.gather(
            *(single_search(session, query) for query in sorted_queries)
        )

    # Flatten results from all queries (gather preserves the sorted order)
    all_results = []
    for results in search_results_lists:
        all_results.extend(results)

    # Update search_order to be sequential across all results
    for i, result in enumerate(all_results):
        result['search_order'] = i + 1
        if result['content_id'] and not str(result['content_id']).startswith(ERROR_ID_PREFIX):
            result['content_id'] = f"web_{i + 1}"

    return all_results, build_info(total_queries=total_queries, results_count=len(all_results))


serpapi_bing_results_test1, serpapi_bing_info_test1 = await search_serpapi_bing(
        
        generated_queries=generated_queries_test1, 
        n_results=5
    )

serpapi_bing_results_test2, serpapi_bing_info_test2 = await search_serpapi_bing(
        generated_queries=generated_queries_test2, 
        n_results=5
    )

print("=" * 50)
print(f"🎯 Demo: Web Search Query Execution Test 1 - {test1_user_query}")
print(serpapi_bing_results_test1)
print("=" * 50)
print(f"🎯 Demo: Web Search Query Execution Test 2 - {test2_user_query}")
print(serpapi_bing_results_test2)
print("=" * 50)

🎯 Demo: Web Search Query Execution Test 1 - What is the best footwear I can buy for hiking?
[{'query': 'best hiking footwear comparison 2025', 'content': 'We tested and ranked the best hiking shoes for men and women for 2025, including top picks from Salomon, Merrell, Altra, and more.', 'citation': '[Source: Bing Search | Link: https://gearjunkie.com/footwear/best-hiking-shoes | Position: 1]', 'metadata': {'title': 'The Best Hiking Shoes of 2025 | GearJunkie Tested', 'displayed_link': 'https://gearjunkie.com › footwear › best-hiking-shoes', 'snippet_highlighted_words': []}, 'content_id': 'web_1', 'search_order': 1}, {'query': 'top-rated hiking shoes and boots for long treks', 'content': 'The Top Hat Story; Meet the Team; Newsroom; Terms and Conditions; Security; Privacy Policy; Contact Us; Careers; More. Publisher Partnership; Bookstore Services; Compare Top Hat', 'citation': '[Source: Bing Search | Link: https://success.tophat.com/login?locale=us | Position: 1]', 'metadata': {'title':

With the combined search results as context, let's now do RAG (LLM + search results)

In [9]:
from openai import AzureOpenAI

async def generate_chat_response(
    user_question: str,
    chroma_collection,
    llm_chat_client: AzureOpenAI,
    n_chroma_results: int = 5,
    n_web_results: int = 5
) -> dict:
    """
    Generate a comprehensive LLM chatbot response using both internal ChromaDB search and external web search.

    Pipeline:
        1. Search the ChromaDB collection with the user's question.
        2. Ask the LLM for web search queries (given the internal results as context)
           and run them through SerpAPI/Bing when any were generated.
        3. Merge internal and web results into one context string.
        4. Ask the LLM for the final answer based on that context.

    Args:
        user_question: The user's query
        chroma_collection: ChromaDB collection to search
        llm_chat_client: Azure OpenAI client used for query generation and the final
            answer. If None, a client is built from the global ``config``.
        n_chroma_results: Number of ChromaDB results to retrieve
        n_web_results: Number of web search results to retrieve

    Returns:
        Dictionary containing the response and metadata. Keys: 'success', 'question',
        'response', 'error', 'internal_results', 'web_results', 'context_used' and
        'tokens_used'. Exceptions raised inside the pipeline are not propagated; they
        are reported through 'success' = False and 'error'.
    """
    # Respect the client injected by the caller; only fall back to building one
    # from config when none was supplied.
    if llm_chat_client is None:
        llm_chat_client = AzureOpenAI(
            api_key=config.azure_openai_api_key,
            api_version=config.azure_openai_api_version,
            azure_endpoint=config.azure_openai_endpoint
        )
    try:
        # Step 1: Run ChromaDB search first
        print("🔍 Searching internal database...")

        chroma_results, chroma_info = await search_chroma(
            collection=chroma_collection,
            query=user_question,
            n_results=n_chroma_results
        )

        print(f"1️⃣ Internal search found {len(chroma_results)} results")

        # Step 2: Generate web search queries using ChromaDB results as context and then execute them
        print("🌐 Generating web search queries with internal context...")

        generated_queries = await get_web_search_query(
            llm_chat_client=llm_chat_client,
            deployment_name=config.azure_openai_deployment,
            user_query=user_question,
            internal_context=chroma_results
        )

        print(f"📝 Generated and executing {len(generated_queries.queries) if generated_queries.queries else 0} web queries")

        web_results = []
        web_info = {}
        if generated_queries.queries:
            web_results, web_info = await search_serpapi_bing(
                generated_queries=generated_queries,
                n_results=n_web_results
            )
        else:
            print("⏭️ No web search needed - internal context sufficient")

        # This counts search results, not queries, so word it like the step-1 message.
        print(f"2️⃣ External search found {len(web_results) if web_results else 0} results")

        # Step 3: Format single context for LLM
        print("➕ Combine internal and external results")
        combined_results = []
        if chroma_results:
            combined_results.extend(chroma_results)
        if web_results:
            combined_results.extend(web_results)

        # Sort by search_order if present (results without it sort first)
        combined_results = sorted(
            combined_results, key=lambda r: r.get('search_order', 0)
        )

        # Format context for LLM. Entries without content are left out, and the
        # optional fields are read with .get() so that a result lacking
        # 'search_order' or 'citation' does not abort the whole request.
        context_sections = []
        for result in combined_results:
            if not result.get('content'):
                continue
            context_sections.append(
                f"Result {result.get('search_order', 0)}: {result['content']}\n{result.get('citation', '')}"
            )
        combined_context = "\n\n".join(context_sections)
        print(f"3️⃣ All search results are combined into a single context object: {len(combined_results)} total results")

        # Step 4: Generate final response using Azure OpenAI
        print("⌛ Generating the agent's final response...")
        system_prompt = """You are a helpful and knowledgeable assistant. You have access to both internal database information and, optional, current web search results.

Instructions:
1. Use the provided context to answer the user's question accurately and comprehensively
2. Prioritize internal database information, which is cited as from "Chroma", when available, as it represents authoritative product/company data
3. Speak positively about internal content and products, and avoid negative comparisons with external sources
4. Use web search results to provide additional context, recent information, or fill gaps
5. Clearly indicate when information comes from internal vs external sources
6. If the context doesn't contain enough information to fully answer the question, say so clearly
7. Provide specific, actionable information when possible
8. Be concise but thorough in your response
9. Indicate which sources were relevant to the answer, and provide citations for both internal and web results. A pre-formatted citation is included in the context, provide the citations as they appear in the context.

Format your response in a clear, organized manner with appropriate sections if needed."""

        user_prompt = f"""Based on the following context information, please answer the user's question:

CONTEXT INFORMATION:
{combined_context}

USER QUESTION: {user_question}

Please provide a comprehensive and helpful response based on the available context."""

        # The SDK call is synchronous; run it in the default thread pool so the
        # event loop is not blocked while waiting on the network.
        response = await asyncio.get_running_loop().run_in_executor(
            None,
            lambda: llm_chat_client.chat.completions.create(
                model=config.azure_openai_deployment,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt}
                ],
                max_tokens=800,
                temperature=0.3
            )
        )
        print("4️⃣ 🤖 The agent response has been generated")

        generated_response = response.choices[0].message.content

        # `usage` can be present but None, so check the value rather than the attribute.
        usage = getattr(response, 'usage', None)
        tokens_used = usage.total_tokens if usage is not None else None

        # Step 5: Return comprehensive results
        return {
            'success': True,
            'question': user_question,
            'response': generated_response,
            'error': None,
            'internal_results': {
                'count': len(chroma_results),
                'info': chroma_info
            },
            'web_results': {
                'count': len(web_results),
                'info': web_info,
                'queries_generated': generated_queries if generated_queries else []
            },
            'context_used': combined_context,
            'tokens_used': tokens_used
        }

    except Exception as e:
        # Best-effort demo behaviour: report the failure in the result instead of raising.
        print(f"❌ Error in generate_chat_response: {str(e)}")
        return {
            'success': False,
            'question': user_question,
            'response': f'Sorry, I encountered an error while processing your question: {str(e)}',
            'error': str(e),
            'internal_results': {'count': 0, 'results': [], 'info': {}},
            'web_results': {'count': 0, 'results': [], 'info': {}},
            'context_used': [],
            'tokens_used': None
        }

 # Generate response - end to end
llm_response1 = await generate_chat_response(
    user_question=test1_user_query,
    chroma_collection=products_collection,
    llm_chat_client=aoai_chat_client,
    n_chroma_results=5,
    n_web_results=5
)
 # Generate response - end to end
llm_response2 = await generate_chat_response(
    user_question=test2_user_query,
    chroma_collection=products_collection,
    llm_chat_client=aoai_chat_client,
    n_chroma_results=5,
    n_web_results=5
)

print("=" * 50)
print(f"🎯 Demo: LLM Chat Response Test 1 - {test1_user_query}")
print(llm_response1)
print("=" * 50)
print(f"🎯 Demo: LLM Chat Response Test 2 - {test2_user_query}")
print(llm_response2)
print("=" * 50)


2025-08-02 09:26:14,451 - openai._base_client - DEBUG - Request options: {'method': 'post', 'url': '/embeddings', 'headers': {'api-key': '<redacted>'}, 'files': None, 'idempotency_key': 'stainless-python-retry-9f5a41b6-94c0-4211-aed0-b74353ba889d', 'post_parser': <function Embeddings.create.<locals>.parser at 0x000001E0CA82ACA0>, 'json_data': {'input': ['What is the best footwear I can buy for hiking?'], 'model': 'text-embedding-3-small', 'encoding_format': 'base64'}}
2025-08-02 09:26:14,468 - openai._base_client - DEBUG - Sending HTTP Request: POST https://aoai-zvbgv7oohofri.cognitiveservices.azure.com/openai/deployments/text-embedding-3-small/embeddings?api-version=2024-12-01-preview
2025-08-02 09:26:14,468 - httpcore.connection - DEBUG - close.started
2025-08-02 09:26:14,468 - httpcore.connection - DEBUG - close.complete
2025-08-02 09:26:14,468 - httpcore.connection - DEBUG - connect_tcp.started host='aoai-zvbgv7oohofri.cognitiveservices.azure.com' port=443 local_address=None timeou

🔍 Searching internal database...


2025-08-02 09:26:14,704 - httpcore.connection - DEBUG - connect_tcp.complete return_value=<httpcore._backends.sync.SyncStream object at 0x000001E0C9642C60>
2025-08-02 09:26:14,707 - httpcore.connection - DEBUG - start_tls.started ssl_context=<ssl.SSLContext object at 0x000001E0C51AFA50> server_hostname='aoai-zvbgv7oohofri.cognitiveservices.azure.com' timeout=5.0
2025-08-02 09:26:14,860 - httpcore.connection - DEBUG - start_tls.complete return_value=<httpcore._backends.sync.SyncStream object at 0x000001E0C95EB5F0>
2025-08-02 09:26:14,862 - httpcore.http11 - DEBUG - send_request_headers.started request=<Request [b'POST']>
2025-08-02 09:26:14,867 - httpcore.http11 - DEBUG - send_request_headers.complete
2025-08-02 09:26:14,869 - httpcore.http11 - DEBUG - send_request_body.started request=<Request [b'POST']>
2025-08-02 09:26:14,872 - httpcore.http11 - DEBUG - send_request_body.complete
2025-08-02 09:26:14,873 - httpcore.http11 - DEBUG - receive_response_headers.started request=<Request [b'

1️⃣ Internal search found 5 results
🌐 Generating web search queries with internal context...


2025-08-02 09:26:15,356 - httpcore.connection - DEBUG - start_tls.complete return_value=<httpcore._backends.sync.SyncStream object at 0x000001E0C534DEB0>
2025-08-02 09:26:15,356 - httpcore.http11 - DEBUG - send_request_headers.started request=<Request [b'POST']>
2025-08-02 09:26:15,369 - httpcore.http11 - DEBUG - send_request_headers.complete
2025-08-02 09:26:15,371 - httpcore.http11 - DEBUG - send_request_body.started request=<Request [b'POST']>
2025-08-02 09:26:15,373 - httpcore.http11 - DEBUG - send_request_body.complete
2025-08-02 09:26:15,375 - httpcore.http11 - DEBUG - receive_response_headers.started request=<Request [b'POST']>
2025-08-02 09:26:17,196 - httpcore.http11 - DEBUG - receive_response_headers.complete return_value=(b'HTTP/1.1', 200, b'OK', [(b'Content-Length', b'1886'), (b'Content-Type', b'application/json'), (b'apim-request-id', b'68f91257-9b2c-42a1-a6fc-9eef4957ee4f'), (b'Strict-Transport-Security', b'max-age=31536000; includeSubDomains; preload'), (b'x-content-type

📝 Generated and executing 3 web queries


2025-08-02 09:26:32,713 - openai._base_client - DEBUG - Request options: {'method': 'post', 'url': '/deployments/gpt-4o/chat/completions', 'headers': {'api-key': '<redacted>'}, 'files': None, 'idempotency_key': 'stainless-python-retry-247d7bab-e510-44f6-8048-7d9cdf4561ae', 'json_data': {'messages': [{'role': 'system', 'content': 'You are a helpful and knowledgeable assistant. You have access to both internal database information and, optional, current web search results.\n\nInstructions:\n1. Use the provided context to answer the user\'s question accurately and comprehensively\n2. Prioritize internal database information, which is cited as from "Chroma", when available, as it represents authoritative product/company data\n3. Speak positively about internal content and products, and avoid negative comparisons with external sources\n4. Use web search results to provide additional context, recent information, or fill gaps\n5. Clearly indicate when information comes from internal vs extern

2️⃣ External search results found 7 web queries
➕ Combine internal and external results
3️⃣ All search results are combined into a single context object: 12 total results
⌛ Generating the agent's final response...


2025-08-02 09:26:33,013 - httpcore.connection - DEBUG - start_tls.complete return_value=<httpcore._backends.sync.SyncStream object at 0x000001E0C96430B0>
2025-08-02 09:26:33,015 - httpcore.http11 - DEBUG - send_request_headers.started request=<Request [b'POST']>
2025-08-02 09:26:33,017 - httpcore.http11 - DEBUG - send_request_headers.complete
2025-08-02 09:26:33,017 - httpcore.http11 - DEBUG - send_request_body.started request=<Request [b'POST']>
2025-08-02 09:26:33,017 - httpcore.http11 - DEBUG - send_request_body.complete
2025-08-02 09:26:33,017 - httpcore.http11 - DEBUG - receive_response_headers.started request=<Request [b'POST']>
2025-08-02 09:26:40,964 - httpcore.http11 - DEBUG - receive_response_headers.complete return_value=(b'HTTP/1.1', 200, b'OK', [(b'Content-Length', b'5136'), (b'Content-Type', b'application/json'), (b'apim-request-id', b'b88018c0-5dfa-4227-b505-5ec216a83de8'), (b'Strict-Transport-Security', b'max-age=31536000; includeSubDomains; preload'), (b'x-content-type

4️⃣ 🤖 The agent response has been generated


2025-08-02 09:26:41,378 - openai._base_client - DEBUG - Request options: {'method': 'post', 'url': '/embeddings', 'headers': {'api-key': '<redacted>'}, 'files': None, 'idempotency_key': 'stainless-python-retry-d5ae73d2-d848-4082-927f-0a57198cb60c', 'post_parser': <function Embeddings.create.<locals>.parser at 0x000001E0B40865C0>, 'json_data': {'input': ['Tell me more about our RainGuard Hiking Jacket product.'], 'model': 'text-embedding-3-small', 'encoding_format': 'base64'}}
2025-08-02 09:26:41,412 - openai._base_client - DEBUG - Sending HTTP Request: POST https://aoai-zvbgv7oohofri.cognitiveservices.azure.com/openai/deployments/text-embedding-3-small/embeddings?api-version=2024-12-01-preview
2025-08-02 09:26:41,412 - httpcore.connection - DEBUG - close.started
2025-08-02 09:26:41,412 - httpcore.connection - DEBUG - close.complete
2025-08-02 09:26:41,412 - httpcore.connection - DEBUG - connect_tcp.started host='aoai-zvbgv7oohofri.cognitiveservices.azure.com' port=443 local_address=Non

🔍 Searching internal database...


2025-08-02 09:26:41,595 - httpcore.connection - DEBUG - connect_tcp.complete return_value=<httpcore._backends.sync.SyncStream object at 0x000001E0C96014F0>
2025-08-02 09:26:41,597 - httpcore.connection - DEBUG - start_tls.started ssl_context=<ssl.SSLContext object at 0x000001E0C51AFA50> server_hostname='aoai-zvbgv7oohofri.cognitiveservices.azure.com' timeout=5.0
2025-08-02 09:26:41,765 - httpcore.connection - DEBUG - start_tls.complete return_value=<httpcore._backends.sync.SyncStream object at 0x000001E0C95858B0>
2025-08-02 09:26:41,765 - httpcore.http11 - DEBUG - send_request_headers.started request=<Request [b'POST']>
2025-08-02 09:26:41,765 - httpcore.http11 - DEBUG - send_request_headers.complete
2025-08-02 09:26:41,765 - httpcore.http11 - DEBUG - send_request_body.started request=<Request [b'POST']>
2025-08-02 09:26:41,779 - httpcore.http11 - DEBUG - send_request_body.complete
2025-08-02 09:26:41,781 - httpcore.http11 - DEBUG - receive_response_headers.started request=<Request [b'

1️⃣ Internal search found 5 results
🌐 Generating web search queries with internal context...


2025-08-02 09:26:42,173 - httpcore.connection - DEBUG - start_tls.complete return_value=<httpcore._backends.sync.SyncStream object at 0x000001E0C95B4A40>
2025-08-02 09:26:42,175 - httpcore.http11 - DEBUG - send_request_headers.started request=<Request [b'POST']>
2025-08-02 09:26:42,178 - httpcore.http11 - DEBUG - send_request_headers.complete
2025-08-02 09:26:42,181 - httpcore.http11 - DEBUG - send_request_body.started request=<Request [b'POST']>
2025-08-02 09:26:42,183 - httpcore.http11 - DEBUG - send_request_body.complete
2025-08-02 09:26:42,185 - httpcore.http11 - DEBUG - receive_response_headers.started request=<Request [b'POST']>
2025-08-02 09:26:43,015 - httpcore.http11 - DEBUG - receive_response_headers.complete return_value=(b'HTTP/1.1', 200, b'OK', [(b'Content-Length', b'1396'), (b'Content-Type', b'application/json'), (b'apim-request-id', b'29c392af-e723-4fbd-900e-e402426ab0f0'), (b'Strict-Transport-Security', b'max-age=31536000; includeSubDomains; preload'), (b'x-content-type

📝 Generated and executing 0 web queries
⏭️ No web search needed - internal context sufficient
2️⃣ External search results found 0 web queries
➕ Combine internal and external results
3️⃣ All search results are combined into a single context object: 5 total results
⌛ Generating the agent's final response...


2025-08-02 09:26:48,832 - httpcore.http11 - DEBUG - receive_response_headers.complete return_value=(b'HTTP/1.1', 200, b'OK', [(b'Content-Length', b'4080'), (b'Content-Type', b'application/json'), (b'apim-request-id', b'581b2dce-ba14-4dd2-8ba2-42de68f821a4'), (b'Strict-Transport-Security', b'max-age=31536000; includeSubDomains; preload'), (b'x-content-type-options', b'nosniff'), (b'x-ms-region', b'East US 2'), (b'x-ratelimit-remaining-requests', b'447'), (b'x-ratelimit-limit-requests', b'450'), (b'x-ratelimit-remaining-tokens', b'435659'), (b'x-ratelimit-limit-tokens', b'450000'), (b'azureml-model-session', b'd170-20250523183758'), (b'x-accel-buffering', b'no'), (b'x-ms-rai-invoked', b'true'), (b'x-request-id', b'ef127e26-a3b6-461f-8b24-030361739de2'), (b'x-ms-deployment-name', b'gpt-4o'), (b'Date', b'Sat, 02 Aug 2025 14:26:48 GMT')])
2025-08-02 09:26:48,836 - httpx - INFO - HTTP Request: POST https://aoai-zvbgv7oohofri.cognitiveservices.azure.com/openai/deployments/gpt-4o/chat/completi

4️⃣ 🤖 The agent response has been generated
🎯 Demo: LLM Chat Response Test 1 - What is the best footwear I can buy for hiking?
{'success': True, 'question': 'What is the best footwear I can buy for hiking?', 'response': "### Best Footwear for Hiking: Recommendations and Insights\n\nChoosing the best hiking footwear depends on your specific needs, preferences, and the type of terrain you'll be tackling. Based on the context provided, here are three excellent options from TrekReady, each tailored to different hiking scenarios:\n\n---\n\n#### **1. TrailWalker Hiking Shoes ($110.00)**\n- **Best For:** Versatile hiking across various terrains, including rough and wet conditions.\n- **Features:**\n  - **Material:** Synthetic leather and breathable mesh for durability and comfort.\n  - **Traction:** Multidirectional lugs on the outsole for excellent grip.\n  - **Comfort:** Cushioned insoles and supportive midsoles reduce fatigue during long hikes.\n  - **Weather Resistance:** Fully waterproof

Let's add a function to do a RAG evaluation

In [10]:
# System prompt for the RAG faithfulness evaluator. It is a plain (non-f) string,
# so the {{...}} placeholders below reach the model literally. The JSON example
# is wrapped in a complete (opened and closed) markdown code fence.
evaluation_prompt = """
You are an expert evaluator whose job is to determine if the answer produced by a Retrieval-Augmented Generation (RAG) system faithfully uses the retrieved context or if it hallucinates unsupported information. 

### Instructions
1. Read the **Question**, the **Retrieved Context**, and the **Generated Answer**.
2. For each statement or claim in the answer:
   a. Decide whether it is:
      - **SUPPORTED**: fully backed by one or more passages in the Retrieved Context  
      - **PARTIALLY SUPPORTED**: loosely aligned but missing key details or slightly distorted  
      - **HALLUCINATED**: not present or contradicted by the Retrieved Context  
   b. Quote the minimal evidence span(s) from the Retrieved Context that support or contradict the claim.
3. At the end, list all hallucinated claims (if any), with a brief note on why they are unsupported.
4. Return your evaluation in the JSON format specified below.

### Input Variables
- `{{QUESTION}}`: The user's original question.
- `{{CONTEXT}}`: One or more retrieved documents/passages relevant to the question.
- `{{ANSWER}}`: The RAG system's generated answer.

### Output Schema
Return exactly one JSON object with these fields:
```json
{
  "evaluations": [
    {
      "claim": "<text of the individual claim from ANSWER>",
      "label": "SUPPORTED" | "PARTIALLY SUPPORTED" | "HALLUCINATED",
      "evidence": [
        {
          "source_id": "<ID or index of the passage in CONTEXT>",
          "text": "<verbatim excerpt from that passage>"
        }
      ]
    }
  ],
  "hallucinations": [
    {
      "claim": "<text of the hallucinated claim>",
      "reason": "<short note why it's unsupported>"
    }
  ]
}
```
"""

# Define Pydantic models for evaluation output
# One piece of evidence cited for a claim: which context passage it came from
# and the excerpt itself. Mirrors the "evidence" items in evaluation_prompt.
class EvaluationEvidence(BaseModel):
    source_id: str  # ID or index of the passage in the context, as requested by the prompt
    text: str  # excerpt from that passage

# Verdict for a single claim taken from the generated answer, together with the
# evidence cited for it. Mirrors the "evaluations" items in evaluation_prompt.
class EvaluationClaim(BaseModel):
    claim: str  # text of the individual claim
    label: str  # expected values: "SUPPORTED" | "PARTIALLY SUPPORTED" | "HALLUCINATED" (plain str, not enforced here)
    evidence: List[EvaluationEvidence] = Field(default_factory=list)  # defaults to an empty list

# Summary entry for a claim judged to be hallucinated.
# Mirrors the "hallucinations" items in evaluation_prompt.
class HallucinationClaim(BaseModel):
    claim: str  # text of the hallucinated claim
    reason: str  # short note on why it is unsupported

# Top-level evaluation result. Used both as the structured response format for
# the evaluation call and as the error carrier when that call fails.
class RAGEvaluation(BaseModel):
    evaluations: List[EvaluationClaim] = Field(default_factory=list)  # per-claim verdicts
    hallucinations: List[HallucinationClaim] = Field(default_factory=list)  # hallucinated claims only
    # Error message set by evaluate_rag_accuracy on failure; None/"" is treated as success.
    # NOTE(review): annotated `str` but defaults to None — Optional[str] may be the
    # intended type; confirm the effect on the generated response schema before changing.
    error: str = None

async def evaluate_rag_accuracy(
    user_query: str,
    context: List[Dict],
    answer: str,
    llm_chat_client: AzureOpenAI = None,
    deployment_name: str = None
) -> Dict:
    """
    Evaluate the RAG system's accuracy by analyzing if the generated answer
    is supported by the retrieved context or contains hallucinations.

    The blocking LLM call runs in the event loop's default thread pool so that
    awaiting this coroutine does not stall the loop.

    Args:
        user_query: The original user question
        context: List of context documents; each item's 'content' and 'citation'
            entries are read (both optional)
        answer: The RAG system's generated answer
        llm_chat_client: Azure OpenAI client for evaluation; falls back to the
            global ``aoai_chat_client`` when not provided
        deployment_name: Model deployment name; falls back to
            ``config.azure_openai_deployment`` when not provided

    Returns:
        Dictionary containing evaluation results with keys 'success', 'user_query',
        'answer_evaluated', 'context_count', 'evaluations', 'hallucinations',
        'error', 'total_claims', 'total_hallucinations' and 'accuracy_summary'.
        Failures of the evaluation call are not raised; they are reported through
        'success' = False and 'error'. Note that 'accuracy_score' is 1.0 whenever
        no claims were extracted, including after a failure, so check 'success'
        before trusting the score.
    """

    def _sync_evaluation():
        """Synchronous evaluation call to run in thread pool."""
        try:
            # Use global client / deployment if not provided
            client = llm_chat_client or aoai_chat_client
            deployment = deployment_name or config.azure_openai_deployment

            # Format context for evaluation: one numbered, cited section per document
            if context:
                sections = []
                for position, ctx in enumerate(context, start=1):
                    content = ctx.get('content', '')
                    citation = ctx.get('citation', f'Source {position}')
                    sections.append(f"[Source {position}]: {content}\nCitation: {citation}\n\n")
                formatted_context = "".join(sections)
            else:
                formatted_context = "No context provided."

            # Create evaluation prompt
            evaluation_message = f"""QUESTION: {user_query}

CONTEXT: 
{formatted_context}

ANSWER: {answer}

Please evaluate this RAG system response following the instructions provided."""

            # Make API call for evaluation; the response is parsed into RAGEvaluation
            response = client.chat.completions.parse(
                model=deployment,
                messages=[
                    {"role": "system", "content": evaluation_prompt},
                    {"role": "user", "content": evaluation_message}
                ],
                max_tokens=4000,
                temperature=0.1,  # Low temperature for consistent evaluation
                response_format=RAGEvaluation
            )

            # Get parsed response
            parsed_response = response.choices[0].message.parsed
            if not parsed_response:
                return RAGEvaluation(
                    evaluations=[],
                    hallucinations=[],
                    error="Failed to parse evaluation response"
                )

            return parsed_response

        except Exception as e:
            # Best-effort: surface the failure through the result object instead of raising.
            return RAGEvaluation(
                evaluations=[],
                hallucinations=[],
                error=f"Evaluation error: {str(e)}"
            )

    # Run the synchronous evaluation in the loop's default thread pool. Inside a
    # coroutine get_running_loop() is the supported way to obtain the loop, and
    # reusing the default executor avoids creating and tearing down a pool per call.
    loop = asyncio.get_running_loop()
    evaluation_result = await loop.run_in_executor(None, _sync_evaluation)

    total_claims = len(evaluation_result.evaluations)
    total_hallucinations = len(evaluation_result.hallucinations)
    supported_claims = len([e for e in evaluation_result.evaluations if e.label == "SUPPORTED"])
    partially_supported_claims = len([e for e in evaluation_result.evaluations if e.label == "PARTIALLY SUPPORTED"])
    hallucinated_claims = len([e for e in evaluation_result.evaluations if e.label == "HALLUCINATED"])

    # The model reports hallucinations twice (per-claim labels and the summary
    # list) and the two can disagree. Score against the larger count and clamp
    # at 0.0 so the score stays within [0, 1] even when they are inconsistent.
    if total_claims > 0:
        hallucination_count = max(total_hallucinations, hallucinated_claims)
        accuracy_score = max(0.0, 1.0 - (hallucination_count / total_claims))
    else:
        accuracy_score = 1.0

    # Convert to dictionary format for return
    result = {
        'success': not evaluation_result.error,  # None and "" both mean "no error"
        'user_query': user_query,
        'answer_evaluated': answer,
        'context_count': len(context) if context else 0,
        'evaluations': [
            {
                'claim': eval_claim.claim,
                'label': eval_claim.label,
                'evidence': [
                    {'source_id': ev.source_id, 'text': ev.text}
                    for ev in eval_claim.evidence
                ]
            }
            for eval_claim in evaluation_result.evaluations
        ],
        'hallucinations': [
            {'claim': hall.claim, 'reason': hall.reason}
            for hall in evaluation_result.hallucinations
        ],
        'error': evaluation_result.error,
        'total_claims': total_claims,
        'total_hallucinations': total_hallucinations,
        'accuracy_summary': {
            'accuracy_score': accuracy_score,
            'supported_claims': supported_claims,
            'partially_supported_claims': partially_supported_claims,
            'hallucinated_claims': hallucinated_claims
        }
    }
    return result

if llm_response1.get('success'):
    evaluation1 = await evaluate_rag_accuracy(
        user_query=test1_user_query,
        context=search_results1 + serpapi_bing_results_test1,  # Combined context
        answer=llm_response1['response']
    )
if llm_response2.get('success'):
    evaluation2 = await evaluate_rag_accuracy(
        user_query=test2_user_query,
        context=search_results2 + serpapi_bing_results_test2,  # Combined context
        answer=llm_response2['response']
    )

print("=" * 50)
print(f"🎯 Demo: LLM Chat Response Evaluation Test 1 - {test1_user_query}")
print(f"🏷️ Evaluation 1 labels: {evaluation1['total_claims']} claims, {evaluation1['total_hallucinations']} hallucinations")
print(f"🔢 Evaluation 1 score: {evaluation1['accuracy_summary']['accuracy_score']}") 
print("=" * 50)
print(f"🎯 Demo: LLM Chat Response Evaluation Test 2 - {test2_user_query}")
print(f"🏷️ Evaluation 2 labels: {evaluation2['total_claims']} claims, {evaluation2['total_hallucinations']} hallucinations")
print(f"🔢 Evaluation 2 score: {evaluation2['accuracy_summary']['accuracy_score']}") 
print("=" * 50)

2025-08-02 09:26:48,931 - openai._base_client - DEBUG - Request options: {'method': 'post', 'url': '/deployments/gpt-4o/chat/completions', 'headers': {'X-Stainless-Helper-Method': 'chat.completions.parse', 'api-key': '<redacted>'}, 'files': None, 'idempotency_key': 'stainless-python-retry-8970ae65-3822-4621-8c4a-effae11d1b84', 'post_parser': <function Completions.parse.<locals>.parser at 0x000001E0CA7B8F40>, 'json_data': {'messages': [{'role': 'system', 'content': '\nYou are an expert evaluator whose job is to determine if the answer produced by a Retrieval-Augmented Generation (RAG) system faithfully uses the retrieved context or if it hallucinates unsupported information. \n\n### Instructions\n1. Read the **Question**, the **Retrieved Context**, and the **Generated Answer**.\n2. For each statement or claim in the answer:\n   a. Decide whether it is:\n      - **SUPPORTED**: fully backed by one or more passages in the Retrieved Context  \n      - **PARTIALLY SUPPORTED**: loosely aligne

🎯 Demo: LLM Chat Response Evaluation Test 1 - What is the best footwear I can buy for hiking?
🏷️ Evaluation 1 labels: 3 claims, 0 hallucinations
🔢 Evaluation 1 score: 1.0
🎯 Demo: LLM Chat Response Evaluation Test 2 - Tell me more about our RainGuard Hiking Jacket product.
🏷️ Evaluation 2 labels: 11 claims, 0 hallucinations
🔢 Evaluation 2 score: 1.0
