## Create Embeddings & Store Them in a Vector Database

In [None]:
import chromadb
from chromadb.utils import embedding_functions
from sentence_transformers import SentenceTransformer
from typing import List, Dict
import uuid

# Import the markdown processing functions
from markdown_processor_debug import process_all_markdown_files


def create_embeddings_and_store(chunks: List[Dict[str, str]], collection_name: str = "fasthtml_docs"):
    """Embed text chunks and store them in a Chroma collection.

    Each chunk's ``'text'`` is stored as the document and its ``'source'`` is
    stored as metadata. Embeddings are computed by the collection's own
    embedding function (``all-MiniLM-L6-v2``), which is also what the
    collection uses to embed query texts.

    Args:
        chunks: Dicts that each provide a ``'text'`` and a ``'source'`` key.
        collection_name: Name of the Chroma collection to write to. The
            collection is created if missing and reused otherwise.

    Returns:
        The Chroma collection holding the chunks. It is returned even when
        ``chunks`` is empty, in which case nothing is written.

    Raises:
        KeyError: If a chunk lacks the ``'text'`` or ``'source'`` key.
    """
    # Initialize Chroma client
    client = chromadb.Client()

    embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")
    # get_or_create: calling this function twice in one process (e.g. re-running
    # a notebook cell) must not fail because the collection already exists.
    collection = client.get_or_create_collection(name=collection_name, embedding_function=embedding_function)

    # Prepare data for Chroma
    documents = [chunk['text'] for chunk in chunks]
    metadatas = [{"source": chunk['source']} for chunk in chunks]

    # Derive IDs from the content so that storing the same chunks again
    # overwrites the existing entries instead of duplicating them. The
    # occurrence counter keeps IDs unique when one source repeats a text.
    ids = []
    occurrences = {}
    for source, text in zip((meta["source"] for meta in metadatas), documents):
        key = (source, text)
        occurrence = occurrences.get(key, 0)
        occurrences[key] = occurrence + 1
        ids.append(str(uuid.uuid5(uuid.NAMESPACE_URL, f"{source}\n{occurrence}\n{text}")))

    # Chroma rejects empty batches, so only write when there is data.
    # No explicit embeddings are passed: the collection's embedding function
    # computes them, which avoids loading the same model a second time.
    if documents:
        collection.upsert(
            documents=documents,
            metadatas=metadatas,
            ids=ids
        )

    print(f"Added {len(documents)} chunks to Chroma DB collection '{collection_name}'")

    return collection

if __name__ == "__main__":
    # Script entry point: chunk the markdown docs, index them in Chroma, then
    # run one sample query as a smoke test of the stored embeddings.
    directory = './docs/DocsFasthtMl'
    base_url = 'https://docs.fastht.ml'

    print(f"Processing directory: {directory}")
    print(f"Base URL: {base_url}")

    all_chunks = process_all_markdown_files(directory, base_url)
    print(f"Total chunks extracted: {len(all_chunks)}")

    if all_chunks:
        # Create embeddings and store in Chroma DB
        collection = create_embeddings_and_store(all_chunks)

        # Example query to test the embeddings
        query = "What is FastHTML?"
        # Never request more results than the collection holds: a corpus
        # that produced a single chunk would otherwise trigger an error
        # (older Chroma) or a warning (newer Chroma).
        results = collection.query(
            query_texts=[query],
            n_results=min(2, collection.count())
        )

        print("\nExample query results:")
        # Results are nested per query text; index 0 is the single query above.
        for rank, (document, metadata) in enumerate(zip(results['documents'][0], results['metadatas'][0]), start=1):
            print(f"\nResult {rank}:")
            print(f"Source: {metadata['source']}")
            print(f"Text: {document[:200]}...")  # Print first 200 characters
    else:
        print("No chunks were extracted. Please check your markdown files.")