Set up the Python import path (so the notebook works when run from either ./sk_mcp_demo or ./sk_mcp_demo/mcp_rag)

In [3]:
import sys
import os
import asyncio
import time
from pathlib import Path
def find_mcp_rag_dir(start):
    """Locate the ``mcp_rag`` project directory relative to ``start``.

    Args:
        start: Directory to search from (``pathlib.Path`` or path string).

    Returns:
        ``start / 'mcp_rag'`` when ``start`` contains an entry named
        ``mcp_rag`` (notebook launched from the repository root); otherwise
        the nearest directory named ``mcp_rag`` among ``start`` and its
        ancestors; ``None`` when neither exists.
    """
    start = Path(start)
    # Running from the root folder (sk_mcp_demo)
    if 'mcp_rag' in os.listdir(start):
        return start / 'mcp_rag'
    # Running from the mcp_rag folder or one of its subfolders
    for candidate in (start, *start.parents):
        if candidate.name == 'mcp_rag':
            return candidate
    return None


# Detect and set up proper path to src directory
current_dir = Path.cwd()
mcp_rag_dir = find_mcp_rag_dir(current_dir)
if mcp_rag_dir is None:
    # Nothing matched. Keep the previous fallback value (filesystem root) so
    # `project_root` is still a string, but say so instead of failing later
    # with an unexplained ImportError.
    print(
        f"Warning: no 'mcp_rag' directory found in or above {current_dir}; "
        "imports from the src directory will likely fail"
    )
    mcp_rag_dir = Path(current_dir.anchor)
project_root = os.path.join(str(mcp_rag_dir), 'src')

# Add src directory to path if it's not already there
if project_root not in sys.path:
    sys.path.insert(0, project_root)
    print(f"Added {project_root} to Python path")

In [4]:
# config
from utils import McpConfig
# Build the settings object for the "local" environment and echo it as a quick sanity check.
# NOTE(review): `config` also carries the embedding API key used further down; confirm that
# printing it does not expose secrets before sharing the notebook with outputs.
config = McpConfig(environment="local")
print(config)

2025-08-08 08:24:48,449 - root - INFO - ✅ Loaded environment file for 'local': c:\Users\aprilhazel\Source\sk_mcp_demo\mcp_rag\.env.local




Creating the ChromaDB vector database, embedding, and storing data

In [5]:
import chromadb
from chromadb.utils import embedding_functions

# ChromaDB embedding function, configured from the Azure OpenAI embedding settings in `config`
embedding_settings = dict(
    api_key=config.azure_openai_embedding_api_key,
    api_base=config.azure_openai_embedding_endpoint,
    api_type=config.openai_api_type,
    api_version=config.azure_openai_embedding_api_version,
    model_name=config.azure_openai_embedding_model,
    deployment_id=config.azure_openai_embedding_deployment,
)
aoai_embedding_function = embedding_functions.OpenAIEmbeddingFunction(**embedding_settings)

In [6]:
from pathlib import Path

# Resolve the configured ChromaDB location against the project root.
# pathlib collapses a leading "./" on its own, so no manual stripping is needed.
# (str.lstrip('./') removes any leading '.' and '/' *characters*, which would
# turn ".chroma/db" into "chroma/db" and "../data" into "data".)
# An absolute `config.chroma_db_path` is used as-is.
chroma_db_path = Path(config.project_root) / config.chroma_db_path
chroma_client = chromadb.PersistentClient(path=str(chroma_db_path))
print(f"ChromaDB path: {chroma_db_path}")

ChromaDB path: c:\Users\aprilhazel\Source\sk_mcp_demo\mcp_rag\data\chroma_db


Create (or get) a collection within the database

In [7]:
# Create or get the collection. Embeddings for added documents and for queries
# are produced by `aoai_embedding_function`. Any failure propagates unchanged.
products_collection = chroma_client.get_or_create_collection(
    name="product_collection",
    embedding_function=aoai_embedding_function,
)

# Last expression of the cell: display the collections known to this client
chroma_client.list_collections()

[Collection(name=product_collection)]

In [8]:
# Delete data (if you want to start fresh)
# NOTE(review): the filter targets records whose `name` metadata differs from the empty
# string; whether records without a `name` are also removed depends on ChromaDB's `$ne`
# semantics — confirm if a guaranteed full wipe is required.
non_empty_name_filter = {"name": {"$ne": ""}}
products_collection.delete(where=non_empty_name_filter)

In [9]:
# https://docs.trychroma.com/docs/collections/manage-collections
# Preview up to the first 10 records (10 is also peek's default limit)
products_collection.peek(limit=10)

{'ids': [],
 'embeddings': array([], dtype=float64),
 'documents': [],
 'uris': None,
 'included': ['metadatas', 'documents', 'embeddings'],
 'data': None,
 'metadatas': []}

Import contoso products (csv source: https://github.com/Azure-Samples/contoso-chat/blob/main/data/product_info/products.csv)

In [10]:
import pandas as pd

# Read the CSV file
# NOTE(review): 'products.csv' is resolved against the current working directory, while the
# first cell supports launching from more than one folder — confirm where the file lives.
products_df = pd.read_csv('products.csv')

# Prepare data for ChromaDB insertion: one document, one metadata dict and one id per product
contents = []
metadatas = []
ids = []

for _, row in products_df.iterrows():
    # ChromaDB ids are strings; the same value is stored in the metadata
    product_id = str(row['id'])

    # Document text (this is what gets embedded): every column flattened into one string
    content_text = f"ID: {row['id']}, Name: {row['name']}, Price: ${row['price']}, Category: {row['category']}, Brand: {row['brand']}, Description: {row['description']}"

    # Metadata with id, name, price, category, and brand (description lives only in the document text)
    metadata = {
        'id': product_id,
        'name': row['name'],
        'price': float(row['price']),
        'category': row['category'],
        'brand': row['brand']
    }

    contents.append(content_text)
    metadatas.append(metadata)
    ids.append(product_id)  # Use the id column as ChromaDB id

# Upsert data into ChromaDB collection (re-running the cell overwrites records with the same id)
products_collection.upsert(
    documents=contents,
    metadatas=metadatas,
    ids=ids
)

print(f"Successfully upserted {len(contents)} products into the ChromaDB collection.")
print(f"Collection now contains {products_collection.count()} contents.")

Successfully upserted 20 products into the ChromaDB collection.
Collection now contains 20 contents.


Take a peek

In [11]:
# https://docs.trychroma.com/docs/collections/manage-collections
# Preview up to the first 10 records now that the products are loaded (10 is also peek's default limit)
products_collection.peek(limit=10)

{'ids': ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10'],
 'embeddings': array([[ 1.42184244e-02,  1.49091445e-02,  1.76068656e-02, ...,
         -3.54808965e-03,  1.70855671e-02, -5.18944262e-06],
        [ 2.44422704e-02, -4.82865900e-04,  1.69588719e-02, ...,
         -2.43603578e-03, -1.88854206e-02, -1.77452192e-02],
        [ 8.51014908e-03,  1.44666352e-03, -2.56910156e-02, ...,
         -1.57975033e-02,  2.48387661e-02,  1.56492870e-02],
        ...,
        [ 1.25831766e-02, -1.96192153e-02,  2.62994990e-02, ...,
         -1.26358811e-02, -5.32314507e-03,  2.76961643e-02],
        [-4.46882518e-03,  9.49968025e-03,  7.98494089e-03, ...,
         -1.32762492e-02, -1.66004524e-02, -1.12474570e-02],
        [ 1.46424361e-02,  2.95980796e-02, -3.25996466e-02, ...,
         -4.83122095e-02, -4.44525824e-04, -3.22342403e-02]],
       shape=(10, 1536)),
 'documents': ['ID: 1, Name: TrailMaster X4 Tent, Price: $250.0, Category: Tents, Brand: OutdoorLiving, Description: Unveiling th

# Test cases using the product collection

- **Test case 1**: User asks a question that goes beyond internal context: What is the best footwear I can buy for hiking?
- **Test case 2**: User asks a specific question about internal data: Tell me more about our RainGuard Hiking Jacket product.


In [12]:
# One query per test case: the first goes beyond the internal product data,
# the second asks about a specific product by name.
test1_user_query, test2_user_query = (
    "What is the best footwear I can buy for hiking?",
    "Tell me more about our RainGuard Hiking Jacket product.",
)

Let's do a chroma search

In [13]:
def _first_result_row(search_results, key):
    """Return the per-document list stored under ``key`` for the first (only) query text, or ``[]``."""
    rows = search_results.get(key)
    if not rows:
        return []
    return list(rows[0] or [])


def search_chroma(collection, query, n_results=5, db_path=None):
    """Search the ChromaDB collection for the given query.

    Args:
        collection: Collection-like object exposing ``name``, ``count()`` and
            ``query(query_texts=..., n_results=..., include=...)``.
        query: Text to search for (sent as a single-element ``query_texts``).
        n_results: Maximum number of hits to request.
        db_path: Database location shown in citations. Defaults to the
            notebook-level ``chroma_db_path`` when omitted.

    Returns:
        A ``(structured_context, chroma_info)`` tuple.
        ``structured_context`` is a list of dicts, one per hit in result order,
        with keys ``search_order`` (1-based), ``content``, ``citation``,
        ``metadata`` and ``context_id``.
        ``chroma_info`` holds ``db_path``, ``collection_name`` and
        ``collection_count``.
        On any exception the function returns ``([], {'error': str(exc)})``
        instead of raising, so callers can always unpack two values.
    """
    try:
        # Get ChromaDB information
        resolved_db_path = chroma_db_path if db_path is None else db_path
        chroma_info = {
            'db_path': str(resolved_db_path),
            'collection_name': collection.name,
            'collection_count': collection.count()
        }

        # Search the collection with the query
        search_results = collection.query(
            query_texts=[query],
            n_results=n_results,
            include=['documents', 'metadatas', 'distances']
        )

        # Results are nested one list per query text; only one query is sent.
        context_contents = _first_result_row(search_results, 'documents')
        context_metadata = _first_result_row(search_results, 'metadatas')
        context_ids = _first_result_row(search_results, 'ids')

        structured_context = []
        for i, doc in enumerate(context_contents):
            # Iterate over the documents themselves (not zip with metadatas) so
            # a missing or short metadata list never drops a hit.
            metadata = context_metadata[i] if i < len(context_metadata) else None
            # Get content ID, using index as fallback
            doc_id = context_ids[i] if i < len(context_ids) else f"doc_{i}"

            # Create formatted citation
            citation = f"[Source: ChromaDB | Path: {chroma_info['db_path']} | Collection: {chroma_info['collection_name']} | content ID: {doc_id}]"

            # Create structured result object
            structured_context.append({
                'search_order': i + 1,
                'content': doc,
                'citation': citation,
                'metadata': metadata,
                'context_id': doc_id
            })

        return structured_context, chroma_info

    except Exception as e:
        # Always return a tuple to avoid unpacking errors
        return [], {'error': str(e)}
    
# Test the search function
search_query = test1_user_query
# Pass the same variable that is printed below so the two can never drift apart
search_results, chroma_info = search_chroma(products_collection, search_query, n_results=5)
print("###############################")
print("🔍 Search Results:")
print(f"Query: {search_query}")
print(f"ChromaDB Info: {chroma_info}")

# search_chroma returns an empty list on failure (with the message under
# chroma_info['error']) or when nothing matched, so test for a non-empty list;
# `len(...) >= 0` was always true and made the else branch unreachable.
if search_results:
    print(f"Found {len(search_results)} results\n")
    for result in search_results:
        print(f"--- Result #{result['search_order']} ---")
        print(f"content: {result['content'][:100]}...")
        print(f"Citation: {result['citation']}")
else:
    print(f"Error or no results: {chroma_info.get('error', 'no matching documents')}")

###############################
🔍 Search Results:
Query: What is the best footwear I can buy for hiking?
ChromaDB Info: {'db_path': 'c:\\Users\\aprilhazel\\Source\\sk_mcp_demo\\mcp_rag\\data\\chroma_db', 'collection_name': 'product_collection', 'collection_count': 20}
Found 5 results

--- Result #1 ---
content: ID: 11, Name: TrailWalker Hiking Shoes, Price: $110.0, Category: Hiking Footwear, Brand: TrekReady, ...
Citation: [Source: ChromaDB | Path: c:\Users\aprilhazel\Source\sk_mcp_demo\mcp_rag\data\chroma_db | Collection: product_collection | content ID: 11]
--- Result #2 ---
content: ID: 4, Name: TrekReady Hiking Boots, Price: $140.0, Category: Hiking Footwear, Brand: TrekReady, Des...
Citation: [Source: ChromaDB | Path: c:\Users\aprilhazel\Source\sk_mcp_demo\mcp_rag\data\chroma_db | Collection: product_collection | content ID: 4]
--- Result #3 ---
content: ID: 18, Name: TrekStar Hiking Sandals, Price: $70.0, Category: Hiking Footwear, Brand: TrekReady, De...
Citation: [Source: Chrom