# Building a Vector Database with ChromaDB

This notebook implements a vector database using ChromaDB to store and efficiently retrieve the embeddings we generated in the previous step. ChromaDB is a lightweight, embedded vector database that works well for RAG applications and doesn't require any external services.

In [1]:
# Import required libraries
import os
import json
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm.notebook import tqdm # For progress bars in Jupyter
import logging 
import time # For timing the main execution
from datetime import datetime

In [2]:
# Import ChromaDB
import chromadb

In [3]:
# For query embedding and testing
from sentence_transformers import SentenceTransformer
import torch # For checking CUDA availability for SentenceTransformer

In [4]:
# ---------------------------------------------------------------------------
# Notebook configuration: every path and tunable value lives in this cell.
# ---------------------------------------------------------------------------

# Inputs: folder written by generate_embeddings.ipynb and the JSON file inside it
# that holds each chunk together with its pre-computed embedding and metadata.
EMBEDDINGS_DIR = "../data/embeddings"
CHUNKS_WITH_EMBEDDINGS_FILE = os.path.join(EMBEDDINGS_DIR, "chunks_with_embeddings.json")

# Outputs: where ChromaDB persists its files and where log files are written.
CHROMA_DB_DIR = "../data/chroma_db"
LOGS_DIR = "../logs"

# ChromaDB collection that receives the chunks.
COLLECTION_NAME = "uchicago_ms_applied_ds"

# Sentence Transformer used to embed queries; it has to be the same model that
# produced the document embeddings, otherwise the vectors are not comparable.
QUERY_EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"

# Number of records sent to ChromaDB per add call (tune to available memory).
CHROMA_ADD_BATCH_SIZE = 100

In [5]:
# Make sure the ChromaDB storage folder and the log folder exist before anything writes to them
for _output_dir in (CHROMA_DB_DIR, LOGS_DIR):
    Path(_output_dir).mkdir(parents=True, exist_ok=True)

In [6]:
# Setup logging: every record goes both to a timestamped file under LOGS_DIR and to the notebook output.
log_file = Path(LOGS_DIR) / f"vector_database_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"
logging.basicConfig(
    level=logging.INFO,
    # %(name)s is used instead of %(module)s: for code run from notebook cells the
    # module field is an opaque numeric id, which makes the log hard to read.
    format='%(asctime)s - %(levelname)s - %(name)s - %(funcName)s - %(message)s',
    handlers=[
        logging.FileHandler(log_file, encoding='utf-8'),
        logging.StreamHandler()
    ],
    # Without force=True, basicConfig does nothing once the root logger already has
    # handlers, so re-running this cell would never attach the new log file.
    force=True,
)
logger = logging.getLogger(__name__)

In [7]:
def load_data_for_chroma(chunks_with_embeddings_filepath_str):
    """
    Load chunks, their pre-computed embeddings, and metadata from the JSON file.
    Prepares data in the format required by ChromaDB.

    Args:
        chunks_with_embeddings_filepath_str: Path to a JSON file holding a list of
            objects. Each object is read for 'page_content' (or 'content'),
            'embedding' and an optional 'metadata' mapping.

    Returns:
        A tuple (ids, embeddings, documents, metadatas) of equally long lists, or
        (None, None, None, None) when the file is missing, is not valid JSON, is
        not a JSON list, or contains no usable item.

    Items without content or without a non-empty embedding are skipped with a
    warning, as are items that are not JSON objects. IDs are always strings and
    unique within the returned list; a repeated ID gets a '_dup<n>' suffix.
    """
    failure = (None, None, None, None)

    chunks_file = Path(chunks_with_embeddings_filepath_str)
    if not chunks_file.exists():
        logger.error(f"Data file not found: {chunks_file}. Please run the embedding generation script first.")
        return failure

    try:
        with open(chunks_file, "r", encoding="utf-8") as f:
            loaded_data = json.load(f)

        # Iterating a dict or a string would silently walk keys/characters, so insist on a list.
        if not isinstance(loaded_data, list):
            logger.error(f"Expected a JSON list of chunks in {chunks_file}, got {type(loaded_data).__name__}.")
            return failure
        logger.info(f"Successfully loaded {len(loaded_data)} items from {chunks_file}")

        ids_list = []
        embeddings_list = []
        documents_list = []
        metadatas_list = []
        seen_ids = set()  # IDs already emitted, used to keep them unique

        for i, item in enumerate(loaded_data):
            if not isinstance(item, dict):
                logger.warning(f"Skipping item at index {i}: expected a JSON object, got {type(item).__name__}.")
                continue

            content_key = 'page_content' if 'page_content' in item else 'content'
            doc_content = item.get(content_key, "")
            embedding_vector = item.get('embedding')

            # An empty embedding is as unusable as a missing one.
            if not doc_content or not embedding_vector:
                logger.warning(f"Skipping item at index {i} due to missing content or embedding.")
                continue

            # "metadata": null (or any non-object) must not abort the whole load.
            raw_metadata = item.get('metadata')
            if not isinstance(raw_metadata, dict):
                raw_metadata = {}

            # Ensure metadata values are Chroma-compatible (str, int, float, bool);
            # anything else is stored as its string form. Building a new dict also
            # leaves the loaded item untouched.
            metadata_dict = {
                key: value if isinstance(value, (str, int, float, bool)) else str(value)
                for key, value in raw_metadata.items()
            }

            # Use 'chunk_id' from metadata if available, otherwise generate one.
            # The raw value is inspected so a null chunk_id triggers the fallback
            # instead of becoming the literal ID "None".
            chunk_id = raw_metadata.get('chunk_id')
            if chunk_id is None or chunk_id == "":
                chunk_id = f"chunk_{metadata_dict.get('filename', 'unknown')}_{metadata_dict.get('chunk_number', i)}"
            chunk_id = str(chunk_id)  # IDs are passed to ChromaDB as strings

            # Keep IDs unique: one repeated ID would otherwise fail or drop a record at insert time.
            unique_id = chunk_id
            duplicate_counter = 1
            while unique_id in seen_ids:
                unique_id = f"{chunk_id}_dup{duplicate_counter}"
                duplicate_counter += 1
            if unique_id != chunk_id:
                logger.warning(f"Duplicate ID '{chunk_id}' at index {i}; stored as '{unique_id}'.")
            seen_ids.add(unique_id)

            ids_list.append(unique_id)
            embeddings_list.append(embedding_vector) # Chroma expects list of lists for embeddings
            documents_list.append(doc_content)
            metadatas_list.append(metadata_dict)

        if not ids_list: # If all items were skipped
            logger.error("No valid data to load for ChromaDB.")
            return failure

        logger.info(f"Prepared {len(ids_list)} items for ChromaDB.")
        logger.info(f"Sample ID: {ids_list[0]}")
        logger.info(f"Sample Document (start): {documents_list[0][:100]}...")
        logger.info(f"Sample Metadata: {metadatas_list[0]}")

        return ids_list, embeddings_list, documents_list, metadatas_list

    except json.JSONDecodeError as e:
        logger.error(f"Error decoding JSON from {chunks_file}: {e}")
        return failure
    except Exception as e:
        logger.error(f"Unexpected error loading data for ChromaDB: {e}", exc_info=True)
        return failure

In [8]:
def initialize_chroma_client_and_collection(db_path_str, collection_name_str):
    """
    Open a persistent ChromaDB client and fetch (or create) the named collection.

    The collection is requested with the "cosine" HNSW space, i.e. distances are
    1 - cosine_similarity, so a lower distance means a closer match.

    Args:
        db_path_str: Folder in which ChromaDB keeps its persistent files.
        collection_name_str: Name of the collection to get or create.

    Returns:
        (client, collection) on success, (None, None) if anything raised.
    """
    storage_dir = Path(db_path_str)
    # Sentence Transformer embeddings are usually compared by cosine similarity.
    index_settings = {"hnsw:space": "cosine"}

    try:
        chroma_client = chromadb.PersistentClient(path=str(storage_dir))
        logger.info(f"Initialized ChromaDB client with persistent storage at: {storage_dir}")

        target_collection = chroma_client.get_or_create_collection(
            name=collection_name_str,
            metadata=index_settings,
        )
        logger.info(f"Successfully got or created collection: '{collection_name_str}' with cosine distance.")
    except Exception as init_error:
        logger.error(f"Error initializing ChromaDB client or collection: {init_error}", exc_info=True)
        return None, None
    else:
        return chroma_client, target_collection

In [9]:
def add_data_to_chroma_collection(collection_instance, ids_list, embeddings_list, documents_list, metadatas_list, batch_size=CHROMA_ADD_BATCH_SIZE):
    """
    Add data (with pre-computed embeddings) to the ChromaDB collection in batches.

    Args:
        collection_instance: Collection object exposing add(), count() and name.
        ids_list: Record IDs.
        embeddings_list: One embedding vector per ID.
        documents_list: One document text per ID.
        metadatas_list: One metadata mapping per ID.
        batch_size: Number of records per add() call; must be positive.

    Returns:
        True when every batch was submitted without an exception, False when the
        inputs are unusable (no collection, no data, mismatched lengths,
        non-positive batch size) or when adding raised.
    """
    if collection_instance is None or not ids_list:
        logger.error("ChromaDB collection not initialized or no data to add.")
        return False

    if not (len(ids_list) == len(embeddings_list) == len(documents_list) == len(metadatas_list)):
        logger.error("Mismatch in lengths of IDs, embeddings, documents, or metadatas lists.")
        return False

    try:
        # A negative step yields an empty range, which previously reported success
        # without adding a single record; reject it (and zero) explicitly.
        if batch_size <= 0:
            logger.error(f"Invalid batch_size {batch_size!r}: it must be a positive integer.")
            return False

        num_items = len(ids_list)
        for batch_start in tqdm(range(0, num_items, batch_size), desc="Adding data to ChromaDB"):
            # Slicing clamps at the end of the list, so the last batch may be shorter.
            batch_stop = batch_start + batch_size
            collection_instance.add(
                ids=ids_list[batch_start:batch_stop],
                embeddings=embeddings_list[batch_start:batch_stop],
                documents=documents_list[batch_start:batch_stop],
                metadatas=metadatas_list[batch_start:batch_stop]
            )
        logger.info(f"Successfully added/updated {num_items} documents in ChromaDB collection '{collection_instance.name}'.")
        logger.info(f"Collection now contains {collection_instance.count()} items.")
        return True
    except Exception as e:
        logger.error(f"Error adding data to ChromaDB: {e}", exc_info=True)
        return False

In [10]:
def initialize_query_embedding_model(model_name=QUERY_EMBEDDING_MODEL_NAME):
    """
    Load the Sentence Transformer that turns query text into embeddings.

    The model has to match the one used for the document embeddings, otherwise
    query and document vectors are not comparable.

    Args:
        model_name: Sentence Transformer model identifier.

    Returns:
        The loaded model, moved to CUDA when torch reports it as available and to
        the CPU otherwise; None if loading raised.
    """
    logger.info(f"Initializing Sentence Transformer model for queries: {model_name}")
    try:
        encoder = SentenceTransformer(model_name)
        if torch.cuda.is_available():
            target_device = "cuda"
        else:
            target_device = "cpu"
        encoder.to(target_device)
        logger.info(f"Query embedding model '{model_name}' initialized on device: {target_device}")
    except Exception as load_error:
        logger.error(f"Error loading query embedding model '{model_name}': {load_error}", exc_info=True)
        return None
    else:
        return encoder

In [11]:
def query_chroma_collection(collection_instance, query_text_str, query_model_instance, top_n_results=5, metadata_filter_dict=None):
    """
    Embed a query and search the ChromaDB collection.

    Args:
        collection_instance: Collection object exposing query().
        query_text_str: Natural-language query to embed.
        query_model_instance: Model exposing encode(); expected to be the Sentence
            Transformer used for the document embeddings.
        top_n_results: Maximum number of hits to request.
        metadata_filter_dict: Optional metadata filter passed as `where`. None and
            an empty dict both mean "no filter".

    Returns:
        The result dictionary returned by collection.query() (requested with
        metadatas, documents and distances), or None if the collection or model is
        missing or the query raised.
    """
    if collection_instance is None or query_model_instance is None:
        logger.error("ChromaDB collection or query model not initialized.")
        return None

    try:
        # show_progress_bar=False: a one-sentence query needs no "Batches" bar in the output.
        query_embedding_vector = query_model_instance.encode(
            query_text_str, show_progress_bar=False
        ).tolist() # Chroma expects list

        logger.info(f"Querying collection for: \"{query_text_str}\"")

        # An empty filter dict means "no filter"; pass None rather than {} because
        # an empty where-clause is rejected by some ChromaDB versions.
        where_clause = metadata_filter_dict or None
        if where_clause:
            logger.info(f"Applying metadata filter: {where_clause}")

        results_dict = collection_instance.query(
            query_embeddings=[query_embedding_vector], # Query embeddings must be a list of lists
            n_results=top_n_results,
            where=where_clause, # Optional metadata filter
            include=['metadatas', 'documents', 'distances'] # Include distances for relevance
        )
        return results_dict
    except Exception as e:
        logger.error(f"Error querying ChromaDB: {e}", exc_info=True)
        return None

In [12]:
def display_query_search_results(query_results_dict, original_query_str):
    """
    Log the hits of a ChromaDB query in a human-readable layout.

    Args:
        query_results_dict: Result dictionary from collection.query(); only the
            first query's hits (index 0 of each field) are shown.
        original_query_str: Query text, used in the header lines only.

    Returns:
        None. All output goes through the module logger.
    """
    has_hits = (
        query_results_dict is not None
        and query_results_dict.get("ids")
        and query_results_dict["ids"][0]
    )
    if not has_hits:
        logger.info(f"No results found for query: \"{original_query_str}\"")
        return

    separator = "-" * 80
    logger.info(f"\n--- Search Results for Query: \"{original_query_str}\" ---")

    for position, doc_id in enumerate(query_results_dict["ids"][0]):
        document_text = query_results_dict["documents"][0][position]
        metadata_info = query_results_dict["metadatas"][0][position]
        if query_results_dict.get("distances"):
            distance_score = query_results_dict["distances"][0][position]
        else:
            distance_score = None

        logger.info(separator)
        logger.info(f"Result #{position+1} - ID: {doc_id}")
        if distance_score is not None:
            # The collection is set up with cosine distance, so similarity is
            # 1 - distance and a higher value is the better match. (With an L2
            # index the raw distance would be the figure to read instead.)
            relevance_score = 1 - distance_score
            logger.info(f"Relevance (1 - Cosine Distance): {relevance_score:.4f} (Distance: {distance_score:.4f})")

        source_label = metadata_info.get('source_file', metadata_info.get('filename', 'Unknown'))
        logger.info(f"  Source: {source_label}")
        for label, field in (("Title", 'title'), ("Category", 'category'), ("Main Heading", 'main_heading')):
            logger.info(f"  {label}: {metadata_info.get(field, 'N/A')}")
        logger.info(f"  Content Snippet: {document_text[:250]}...")
    logger.info(separator)


In [13]:
def build_retrieval_context(query_str, top_k_docs=3, filter_options_dict=None, 
                           collection_inst=None, query_embed_model_inst=None): # Added collection and model as params
    """
    Retrieve relevant context strings for a given query, to be used in a RAG system.

    Args:
        query_str: Natural-language query.
        top_k_docs: Number of chunks to retrieve.
        filter_options_dict: Optional metadata filter forwarded to the query.
        collection_inst: ChromaDB collection to search.
        query_embed_model_inst: Model used to embed the query.

    Returns:
        A list of strings. Normally one "Context:\\n<text>\\n[Source: ...]\\n" entry
        per retrieved chunk; a single-element list with an explanatory message when
        the collection/model is missing or nothing was found.
    """
    if collection_inst is None or query_embed_model_inst is None:
        logger.error("Collection or query embedding model not provided for context retrieval.")
        return ["Error: Vector DB or query model not initialized."]

    query_search_results = query_chroma_collection(
        collection_inst, 
        query_str, 
        query_embed_model_inst, 
        top_n_results=top_k_docs, 
        metadata_filter_dict=filter_options_dict
    )
    
    if query_search_results is None or not query_search_results.get("documents") or not query_search_results["documents"][0]:
        return ["No relevant information found in the knowledge base."]

    retrieved_docs = query_search_results["documents"][0]
    metadatas_field = query_search_results.get("metadatas")
    retrieved_metas = metadatas_field[0] if metadatas_field else []

    context_strings_list = []
    for i, doc_text in enumerate(retrieved_docs):
        # A chunk may come back without metadata; treat that as an empty mapping.
        meta_info = (retrieved_metas[i] if i < len(retrieved_metas) else None) or {}
        source_filename = meta_info.get('source_file', meta_info.get('filename', "Unknown source"))
        doc_title = meta_info.get('title', "")

        # Keep only the final path component, splitting on both '/' and '\\'.
        # Stored paths may be Windows-style; Path(...).name would not split those
        # on POSIX and would put the whole absolute path into the citation.
        source_name = str(source_filename).replace("\\", "/").rstrip("/").rsplit("/", 1)[-1]

        source_ref = f"[Source: {source_name}" + (f", Title: {doc_title}" if doc_title else "") + "]"
        context_strings_list.append(f"Context:\n{doc_text}\n{source_ref}\n")
        
    return context_strings_list

In [14]:
# Driver cell: load the prepared chunks, make sure the collection is populated,
# then run a few sample queries against it and log what comes back.
logger.info("--- Starting Vector Database Creation and Testing Process ---")
db_creation_start_time = time.time()

# 1. Load pre-computed embeddings, documents, and metadata
ids, embeddings, documents, metadatas = load_data_for_chroma(CHUNKS_WITH_EMBEDDINGS_FILE)

if ids: # Check if data was loaded successfully
    # 2. Initialize ChromaDB client and collection
    chroma_client_instance, chroma_collection_instance = initialize_chroma_client_and_collection(
        CHROMA_DB_DIR, COLLECTION_NAME
    )

    # Compare against None explicitly instead of relying on the object's truthiness.
    if chroma_collection_instance is not None:
        # 3. Add data to the collection
        # Check if the collection is empty before adding to avoid duplicates if script is re-run
        # Note: initialize_chroma_client_and_collection can be modified to delete/recreate if needed
        collection_ready = True
        existing_item_count = chroma_collection_instance.count()
        if existing_item_count == 0:
            logger.info(f"Collection '{COLLECTION_NAME}' is empty. Populating with new data.")
            # Keep the outcome: a failed load must not be followed by "successful" query tests.
            collection_ready = add_data_to_chroma_collection(
                chroma_collection_instance, ids, embeddings, documents, metadatas
            )
        else:
            logger.info(f"Collection '{COLLECTION_NAME}' already contains {existing_item_count} items. Skipping data addition.")
            logger.info("To re-populate, delete the CHROMA_DB_DIR or modify initialize_chroma_client_and_collection to delete existing collections.")
            if existing_item_count != len(ids):
                # Typical after an interrupted earlier run or a regenerated embeddings file.
                logger.warning(
                    f"Collection holds {existing_item_count} items but the embeddings file provides {len(ids)}; "
                    "the collection may be stale or only partially populated."
                )

        # 4. Initialize the embedding model for queries
        query_model = initialize_query_embedding_model(QUERY_EMBEDDING_MODEL_NAME)

        if query_model is None:
            logger.error("Query embedding model could not be initialized. Query testing skipped.")
        elif not collection_ready:
            logger.error("Adding data to the collection failed. Query testing skipped.")
        else:
            # 5. Test some queries
            sample_test_queries = [
                "What are the core courses for the MS in Applied Data Science?",
                "Tell me about the faculty specializing in machine learning.",
                "What are the admission requirements?",
                "How is the capstone project structured?"
            ]
            for test_q in sample_test_queries:
                results = query_chroma_collection(chroma_collection_instance, test_q, query_model, top_n_results=3)
                display_query_search_results(results, test_q)

            # Test with metadata filter (example)
            if metadatas: # Check if metadatas list is not empty
                sample_category = metadatas[0].get('category')
                if sample_category:
                    logger.info(f"\n--- Testing Query with Metadata Filter (category: {sample_category}) ---")
                    filtered_query = "capstone project"
                    filter_criteria = {"category": sample_category}
                    # Example of more complex filter: {"$and": [{"category": "education"}, {"title": {"$contains": "Online"}}]}

                    filtered_results = query_chroma_collection(
                        chroma_collection_instance, filtered_query, query_model,
                        top_n_results=2, metadata_filter_dict=filter_criteria
                    )
                    display_query_search_results(filtered_results, f"{filtered_query} (filtered by category: {sample_category})")
                else:
                    logger.info("Skipping metadata filter test as no sample category found in the first chunk.")

            # 6. Test the context retrieval function
            logger.info("\n--- Testing Context Retrieval Function ---")
            retrieved_contexts = build_retrieval_context(
                "What are the core courses?",
                top_k_docs=2,
                collection_inst=chroma_collection_instance, # Pass instances
                query_embed_model_inst=query_model           # Pass instances
            )
            for i, ctx in enumerate(retrieved_contexts):
                logger.info(f"\nRetrieved Context #{i+1}:\n{ctx}")
    else:
        logger.error("ChromaDB collection could not be initialized. Process halted.")
else:
    logger.error(f"Failed to load data from {CHUNKS_WITH_EMBEDDINGS_FILE}. Process halted.")

db_creation_end_time = time.time()
elapsed_processing_time = db_creation_end_time - db_creation_start_time
logger.info(f"--- Vector Database Process Completed in {elapsed_processing_time:.2f} seconds ---")


2025-05-12 20:31:58,318 - INFO - 3026463495 - <module> - --- Starting Vector Database Creation and Testing Process ---
2025-05-12 20:32:01,876 - INFO - 1859704975 - load_data_for_chroma - Successfully loaded 11532 items from ..\data\embeddings\chunks_with_embeddings.json
2025-05-12 20:32:01,901 - INFO - 1859704975 - load_data_for_chroma - Prepared 11532 items for ChromaDB.
2025-05-12 20:32:01,903 - INFO - 1859704975 - load_data_for_chroma - Sample ID: 11th-hour-project_0
2025-05-12 20:32:01,904 - INFO - 1859704975 - load_data_for_chroma - Sample Document (start): 11th Hour Project Software and Data Hub...
2025-05-12 20:32:01,906 - INFO - 1859704975 - load_data_for_chroma - Sample Metadata: {'title': '11th Hour Project Software and Data Hub – DSI', 'original_url': 'https://datascience.uchicago.edu/11th-hour-project', 'category': '11th-hour-project', 'processing_date': '2025-05-12 13:20:18', 'source_file': 'C:\\Users\\alen.pavlovic\\Documents\\GitLab\\gen-ai-midterm-project\\data\\markdo

Adding data to ChromaDB:   0%|          | 0/116 [00:00<?, ?it/s]

2025-05-12 20:32:20,213 - INFO - 994437601 - add_data_to_chroma_collection - Successfully added/updated 11532 documents in ChromaDB collection 'uchicago_ms_applied_ds'.
2025-05-12 20:32:20,219 - INFO - 994437601 - add_data_to_chroma_collection - Collection now contains 11532 items.
2025-05-12 20:32:20,219 - INFO - 2929584852 - initialize_query_embedding_model - Initializing Sentence Transformer model for queries: all-MiniLM-L6-v2
2025-05-12 20:32:20,223 - INFO - SentenceTransformer - __init__ - Use pytorch device_name: cpu
2025-05-12 20:32:20,224 - INFO - SentenceTransformer - __init__ - Load pretrained SentenceTransformer: all-MiniLM-L6-v2
2025-05-12 20:32:23,177 - INFO - 2929584852 - initialize_query_embedding_model - Query embedding model 'all-MiniLM-L6-v2' initialized on device: cpu


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-05-12 20:32:23,307 - INFO - 3705718800 - query_chroma_collection - Querying collection for: "What are the core courses for the MS in Applied Data Science?"
2025-05-12 20:32:23,359 - INFO - 3860143155 - display_query_search_results - 
--- Search Results for Query: "What are the core courses for the MS in Applied Data Science?" ---
2025-05-12 20:32:23,361 - INFO - 3860143155 - display_query_search_results - --------------------------------------------------------------------------------
2025-05-12 20:32:23,362 - INFO - 3860143155 - display_query_search_results - Result #1 - ID: education-masters-programs-ms-in-applied-data-science-our-students_5
2025-05-12 20:32:23,364 - INFO - 3860143155 - display_query_search_results - Relevance (1 - Cosine Distance): 0.8033 (Distance: 0.1967)
2025-05-12 20:32:23,365 - INFO - 3860143155 - display_query_search_results -   Source: C:\Users\alen.pavlovic\Documents\GitLab\gen-ai-midterm-project\data\markdown_processed\education\education-masters-progr

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-05-12 20:32:23,419 - INFO - 3705718800 - query_chroma_collection - Querying collection for: "Tell me about the faculty specializing in machine learning."
2025-05-12 20:32:23,423 - INFO - 3860143155 - display_query_search_results - 
--- Search Results for Query: "Tell me about the faculty specializing in machine learning." ---
2025-05-12 20:32:23,424 - INFO - 3860143155 - display_query_search_results - --------------------------------------------------------------------------------
2025-05-12 20:32:23,424 - INFO - 3860143155 - display_query_search_results - Result #1 - ID: news-2020-cdac-summer-lab-kicks-off-with-37-student-researchers_2
2025-05-12 20:32:23,425 - INFO - 3860143155 - display_query_search_results - Relevance (1 - Cosine Distance): 0.6581 (Distance: 0.3419)
2025-05-12 20:32:23,425 - INFO - 3860143155 - display_query_search_results -   Source: C:\Users\alen.pavlovic\Documents\GitLab\gen-ai-midterm-project\data\markdown_processed\news\news-2020-cdac-summer-lab-kicks-off

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-05-12 20:32:23,464 - INFO - 3705718800 - query_chroma_collection - Querying collection for: "What are the admission requirements?"
2025-05-12 20:32:23,468 - INFO - 3860143155 - display_query_search_results - 
--- Search Results for Query: "What are the admission requirements?" ---
2025-05-12 20:32:23,469 - INFO - 3860143155 - display_query_search_results - --------------------------------------------------------------------------------
2025-05-12 20:32:23,469 - INFO - 3860143155 - display_query_search_results - Result #1 - ID: education-masters-programs-ms-in-applied-data-science-faqs_17
2025-05-12 20:32:23,470 - INFO - 3860143155 - display_query_search_results - Relevance (1 - Cosine Distance): 0.6327 (Distance: 0.3673)
2025-05-12 20:32:23,470 - INFO - 3860143155 - display_query_search_results -   Source: C:\Users\alen.pavlovic\Documents\GitLab\gen-ai-midterm-project\data\markdown_processed\education\education-masters-programs-ms-in-applied-data-science-faqs.md
2025-05-12 20:32:2

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-05-12 20:32:23,520 - INFO - 3705718800 - query_chroma_collection - Querying collection for: "How is the capstone project structured?"
2025-05-12 20:32:23,522 - INFO - 3860143155 - display_query_search_results - 
--- Search Results for Query: "How is the capstone project structured?" ---
2025-05-12 20:32:23,523 - INFO - 3860143155 - display_query_search_results - --------------------------------------------------------------------------------
2025-05-12 20:32:23,523 - INFO - 3860143155 - display_query_search_results - Result #1 - ID: capstone-projects_0
2025-05-12 20:32:23,524 - INFO - 3860143155 - display_query_search_results - Relevance (1 - Cosine Distance): 0.8184 (Distance: 0.1816)
2025-05-12 20:32:23,524 - INFO - 3860143155 - display_query_search_results -   Source: C:\Users\alen.pavlovic\Documents\GitLab\gen-ai-midterm-project\data\markdown_processed\capstone-projects\capstone-projects.md
2025-05-12 20:32:23,525 - INFO - 3860143155 - display_query_search_results -   Title: C

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-05-12 20:32:23,621 - INFO - 3705718800 - query_chroma_collection - Querying collection for: "capstone project"
2025-05-12 20:32:23,622 - INFO - 3705718800 - query_chroma_collection - Applying metadata filter: {'category': '11th-hour-project'}
2025-05-12 20:32:24,305 - INFO - 3860143155 - display_query_search_results - 
--- Search Results for Query: "capstone project (filtered by category: 11th-hour-project)" ---
2025-05-12 20:32:24,306 - INFO - 3860143155 - display_query_search_results - --------------------------------------------------------------------------------
2025-05-12 20:32:24,307 - INFO - 3860143155 - display_query_search_results - Result #1 - ID: 11th-hour-project_2
2025-05-12 20:32:24,308 - INFO - 3860143155 - display_query_search_results - Relevance (1 - Cosine Distance): 0.2619 (Distance: 0.7381)
2025-05-12 20:32:24,309 - INFO - 3860143155 - display_query_search_results -   Source: C:\Users\alen.pavlovic\Documents\GitLab\gen-ai-midterm-project\data\markdown_processe

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-05-12 20:32:24,445 - INFO - 3705718800 - query_chroma_collection - Querying collection for: "What are the core courses?"
2025-05-12 20:32:24,449 - INFO - 3026463495 - <module> - 
Retrieved Context #1:
Context:
Core Courses (6)
You will complete six core courses toward your Master’s in Applied Data Science degree. Core courses allow you to build your theoretical data science knowledge and practice applying this theory to examine real-world business problems.
Elective Courses (4)
[Source: education-masters-programs-in-person-program.md, Title: In-Person Program – DSI]

2025-05-12 20:32:24,450 - INFO - 3026463495 - <module> - 
Retrieved Context #2:
Context:
Core Courses (6)
You will complete six core courses toward your Master’s in Applied Data Science degree. Core courses allow you to build your theoretical data science knowledge and practice applying this theory to examine real-world business problems.
Elective Courses (4)
[Source: education-masters-programs-ms-in-applied-data-scie