In [1]:
import re
from typing import List, Dict
import markdown
from bs4 import BeautifulSoup
import os

In [2]:

def extract_sections(markdown_content: str) -> List[Dict[str, str]]:
    """
    Extract sections from markdown content.
    """
    # Convert markdown to HTML
    html = markdown.markdown(markdown_content)
    soup = BeautifulSoup(html, 'html.parser')
    
    sections = []
    current_section = {"title": "Introduction", "content": ""}
    
    for element in soup.find_all(['h1', 'h2', 'h3', 'p']):
        if element.name in ['h1', 'h2', 'h3']:
            if current_section["content"].strip():
                sections.append(current_section)
            current_section = {"title": element.text.strip(), "content": ""}
        else:
            current_section["content"] += element.text + "\n"
    
    if current_section["content"].strip():
        sections.append(current_section)
    
    return sections

def create_chunks(text: str, chunk_size: int = 300, overlap: int = 50) -> List[str]:
    """
    Split text into overlapping chunks.
    """
    words = text.split()
    chunks = []
    for i in range(0, len(words), chunk_size - overlap):
        chunk = " ".join(words[i:i + chunk_size])
        chunks.append(chunk)
    return chunks

def process_markdown_file(file_path: str, base_url: str) -> List[Dict[str, str]]:
    """
    Process a markdown file and return a list of chunks with their sources.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
        
        if not content.strip():
            print(f"Warning: File {file_path} is empty.")
            return []
        
        sections = extract_sections(content)
        result = []
        
        for section in sections:
            section_id = re.sub(r'\W+', '-', section['title'].lower())
            url = f"{base_url}#{section_id}"
            
            chunks = create_chunks(section['content'])
            
            for chunk in chunks:
                result.append({
                    "source": url,
                    "text": chunk
                })
        
        return result
    except Exception as e:
        print(f"Error processing file {file_path}: {str(e)}")
        return []

def process_all_markdown_files(directory: str, base_url: str) -> List[Dict[str, str]]:
    """
    Process all markdown files in a directory and its subdirectories.
    """
    all_chunks = []
    
    if not os.path.exists(directory):
        print(f"Error: Directory {directory} does not exist.")
        return all_chunks
    
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith('.md'):
                file_path = os.path.join(root, file)
                relative_path = os.path.relpath(file_path, directory)
                file_url = f"{base_url}/{relative_path.replace('.md', '.html')}"
                
                print(f"Processing file: {file_path}")
                chunks = process_markdown_file(file_path, file_url)
                all_chunks.extend(chunks)
    
    if not all_chunks:
        print(f"Warning: No chunks were extracted from {directory}.")
    
    return all_chunks



In [3]:
# Example usage
if __name__ == "__main__":
    directory = './docs'
    base_url = 'https://docs.fastht.ml'
    
    print(f"Processing directory: {directory}")
    print(f"Base URL: {base_url}")
    
    all_chunks = process_all_markdown_files(directory, base_url)
    print(f"Total chunks extracted: {len(all_chunks)}")
    
    if all_chunks:
        print("Example chunk:")
        print(all_chunks[0])
    else:
        print("No chunks were extracted.")

Processing directory: ./docs
Base URL: https://docs.fastht.ml
Processing file: ./docs/index-html.md
Processing file: ./docs/index-html-md.md
Processing file: ./docs/ref/defining-xt-component.md
Processing file: ./docs/ref/live-reload-html-md.md
Processing file: ./docs/ref/live-reload-html.md
Processing file: ./docs/ref/defining-xt-component-md.md
Processing file: ./docs/ref/defining-xt-component-html.md
Processing file: ./docs/.ipynb_checkpoints/index-html-md-checkpoint.md
Processing file: ./docs/.ipynb_checkpoints/index-html-checkpoint.md
Processing file: ./docs/api/js-html.md
Processing file: ./docs/api/pico-html-md.md
Processing file: ./docs/api/components-html-md.md
Processing file: ./docs/api/core-html.md
Processing file: ./docs/api/xtend-html.md
Processing file: ./docs/api/xtend-html-md.md
Processing file: ./docs/api/pico-html.md
Processing file: ./docs/api/oauth-html-md.md
Processing file: ./docs/api/core-html-md.md
Processing file: ./docs/api/fastapp-html-md.md
Processing file:

In [6]:
print(all_chunks[1:3])

[{'source': 'https://docs.fastht.ml/index-html.html#installation', 'text': 'Since fasthtml is a Python library, you can install it with: pip install python-fasthtml In the near future, we hope to add component libraries that can likewise be installed via pip.'}, {'source': 'https://docs.fastht.ml/index-html.html#usage', 'text': 'For a minimal app, create a file “main.py” as follows: ``` from fasthtml.common import * app,rt = fast_app() @rt(\'/\') def get(): return Div(P(\'Hello World!\'), hx_get="/change") serve() ``` Running the app with python main.py prints out a link to your running app: http://localhost:5001. Visit that link in your browser and you should see a page with the text “Hello World!”. Congratulations, you’ve just created your first FastHTML app! Adding interactivity is surprisingly easy, thanks to HTMX. Modify the file to add this function: @rt(\'/change\') def get(): return P(\'Nice to be here!\') You now have a page with a clickable element that changes the text when 

### create_embeddings_and_store

In [None]:
import chromadb
from chromadb.utils import embedding_functions
from sentence_transformers import SentenceTransformer
from typing import List, Dict
import uuid

# Import the markdown processing functions
from markdown_processor_debug import process_all_markdown_files

In [None]:
def create_embeddings_and_store(chunks: List[Dict[str, str]], collection_name: str = "fasthtml_docs"):
    # Initialize Chroma client
    client = chromadb.Client()

    # Create a collection
    embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")
    collection = client.create_collection(name=collection_name, embedding_function=embedding_function)

    # Initialize the embedding model
    model = SentenceTransformer('all-MiniLM-L6-v2')

    # Prepare data for Chroma
    documents = []
    metadatas = []
    ids = []

    for chunk in chunks:
        documents.append(chunk['text'])
        metadatas.append({"source": chunk['source']})
        ids.append(str(uuid.uuid4()))  # Generate a unique ID for each chunk

    # Create embeddings
    embeddings = model.encode(documents)

    # Add to Chroma DB
    collection.add(
        documents=documents,
        embeddings=embeddings.tolist(),
        metadatas=metadatas,
        ids=ids
    )

    print(f"Added {len(documents)} chunks to Chroma DB collection '{collection_name}'")

    return collection

if __name__ == "__main__":
    # Process markdown files and get chunks
    directory = './docs/DocsFasthtMl'
    base_url = 'https://docs.fastht.ml'
    
    print(f"Processing directory: {directory}")
    print(f"Base URL: {base_url}")
    
    all_chunks = process_all_markdown_files(directory, base_url)
    print(f"Total chunks extracted: {len(all_chunks)}")

    if all_chunks:
        # Create embeddings and store in Chroma DB
        collection = create_embeddings_and_store(all_chunks)

        # Example query to test the embeddings
        query = "What is FastHTML?"
        results = collection.query(
            query_texts=[query],
            n_results=2
        )

        print("\nExample query results:")
        for i, (document, metadata) in enumerate(zip(results['documents'][0], results['metadatas'][0])):
            print(f"\nResult {i+1}:")
            print(f"Source: {metadata['source']}")
            print(f"Text: {document[:200]}...")  # Print first 200 characters
    else:
        print("No chunks were extracted. Please check your markdown files.")