In [1]:
import os
import re
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_text_splitters import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter

# Path to the specific docs folder inside the cloned repo.
# This is a relative path, so it is resolved against the current working
# directory of the kernel, not against the notebook file's location.
DOCS_PATH = "./mkdocs/docs"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def clean_mkdocs_content(text):
    """
    Cleans raw markdown text from MkDocs specific artifacts.

    Steps, in order:
      1. Normalise Windows (CRLF) line endings to LF so the patterns below
         behave the same regardless of how the file was read.
      2. Drop a YAML front-matter block, but only when it opens on the very
         first line of the text (an empty block is removed too).
      3. Rewrite admonition / collapsible marker lines such as
         ``!!! note "Title"``, ``??? tip`` or ``???+ info inline "Title"``:
         a marker with a non-empty title becomes ``Title:`` (indentation
         kept); a marker without a title is removed. The indented body of
         the admonition is left untouched.
      4. Collapse runs of three or more newlines down to exactly two.

    Args:
        text: Markdown source of a single document, as a ``str``.

    Returns:
        The cleaned Markdown text.
    """
    # 0. Normalise line endings; every pattern below is written against '\n'.
    text = text.replace('\r\n', '\n')

    # 1. Remove YAML front matter. \A pins the match to the start of the text
    #    so a '---' horizontal rule further down is never treated as metadata.
    #    The body group is lazily optional ('??') so that an empty block
    #    ('---' directly followed by '---') is matched without swallowing
    #    content up to some later '---' line.
    text = re.sub(
        r'\A---[ \t]*\n(?:.*?\n)??---[ \t]*(?:\n|\Z)',
        '',
        text,
        count=1,
        flags=re.DOTALL,
    )

    # 2. Rewrite admonition marker lines. The pattern is anchored to a whole
    #    line so that '!!!' inside ordinary prose ("Stop!!! now") is left alone.
    def _rewrite_marker(match):
        title = match.group('title')
        if title:
            return f"{match.group('indent')}{title}:"
        # No title (or an explicitly empty one): the marker carries no text.
        return ''

    text = re.sub(
        r'^(?P<indent>[ \t]*)(?:!!!|\?\?\?\+?)[ \t]*'   # marker: !!!, ??? or ???+
        r'[\w-]+(?:[ \t]+[\w-]+)*'                      # type plus optional extra classes
        r'(?:[ \t]+"(?P<title>.*?)")?[ \t]*$',          # optional quoted title
        _rewrite_marker,
        text,
        flags=re.MULTILINE,
    )

    # 3. Standardize whitespace (remove excessive newlines)
    text = re.sub(r'\n{3,}', '\n\n', text)

    return text

print("Cleaning function defined.")

Cleaning function defined.


In [3]:
# 1. Load all .md files (guard against a missing directory)
# os.path.isdir is used instead of os.path.exists so that a regular file sitting
# at DOCS_PATH is also rejected here, matching the "Directory not found" message
# instead of failing later inside the loader.
if os.path.isdir(DOCS_PATH):
    loader = DirectoryLoader(DOCS_PATH, glob="**/*.md", loader_cls=TextLoader, loader_kwargs={'encoding': 'utf-8'})
    raw_documents = loader.load()
else:
    print(f"Directory not found: {DOCS_PATH!r}. Skipping loading documents.")
    raw_documents = []

# 2. Apply Cleaning (only if documents were loaded)
if raw_documents:
    # The documents were loaded just above in this same cell, so cleaning them
    # in place is safe: re-running the cell always starts from fresh text.
    for doc in raw_documents:
        doc.page_content = clean_mkdocs_content(doc.page_content)

    # 3. Define Headers to Split On (The Strategy)
    # We want to keep sections together; each tuple is
    # (markdown header prefix, metadata key stored on the resulting chunk).
    headers_to_split_on = [
        ("#", "Header 1"),
        ("##", "Header 2"),
        ("###", "Header 3"),
    ]

    markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

    # 4. Process documents
    md_header_splits = []
    for doc in raw_documents:
        # Split the document based on headers
        splits = markdown_splitter.split_text(doc.page_content)

        # Add the file path metadata back to these chunks (important for citations!)
        # The header splitter only receives the text, so the source has to be
        # copied over from the original document by hand.
        for split in splits:
            split.metadata["source"] = doc.metadata.get("source")

        md_header_splits.extend(splits)

    # 5. Secondary Split (Refinement)
    # Sometimes a header section is still too long for the embedding model.
    # We do a secondary split purely on character count to ensure they fit.
    chunk_size = 500      # maximum characters per chunk
    chunk_overlap = 50    # characters shared between neighbouring chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

    # Final Chunks
    final_chunks = text_splitter.split_documents(md_header_splits)
else:
    # Nothing was loaded: keep the downstream names defined so later cells
    # can still reference them.
    md_header_splits = []
    final_chunks = []

print(f"Total documents loaded: {len(raw_documents)}")
print(f"Total chunks created: {len(final_chunks)}")
print("--- Example Chunk ---")
# Show the sixth chunk as a sample, when that many chunks exist.
if len(final_chunks) > 5:
    print(final_chunks[5].page_content)
    print(final_chunks[5].metadata)
else:
    print("Not enough chunks available to show an example.")

Total documents loaded: 19
Total chunks created: 736
--- Example Chunk ---
whenever anything in the configuration file, documentation directory, or theme
directory changes.  
Open the `docs/index.md` document in your text editor of choice, change the
initial heading to `MkLorum`, and save your changes. Your browser will
auto-reload and you should see your updated documentation immediately.  
Now try editing the configuration file: `mkdocs.yml`. Change the
[`site_name`][site_name] setting to `MkLorum` and save the file.  
```yaml
site_name: MkLorum
```
{'Header 1': 'Getting Started with MkDocs', 'Header 2': 'Creating a new project', 'source': 'mkdocs\\docs\\getting-started.md'}


In [4]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma

# 1. Initialize Embedding Model (Deliverable 3)
# "all-MiniLM-L6-v2" is fast, free, and runs locally.
print("Loading embedding model...")
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# 2. Create Vector DB (Deliverable 4)
# Embeds every chunk in `final_chunks` and persists the index on disk.
CHROMA_PERSIST_DIR = "./chroma_db"
CHROMA_COLLECTION = "mkdocs_collection"

print("Creating Vector DB... this might take a minute...")

# Chroma.from_documents adds to a persisted collection that already exists, so
# re-running this cell would store every chunk a second time and similarity
# search would start returning duplicate hits. Drop whatever a previous run
# left behind so the collection always mirrors the current `final_chunks`.
# NOTE: this deliberately deletes the existing "mkdocs_collection" contents.
Chroma(
    collection_name=CHROMA_COLLECTION,
    embedding_function=embedding_model,
    persist_directory=CHROMA_PERSIST_DIR,
).delete_collection()

if final_chunks:
    vector_db = Chroma.from_documents(
        documents=final_chunks,
        embedding=embedding_model,
        persist_directory=CHROMA_PERSIST_DIR,
        collection_name=CHROMA_COLLECTION,
    )
    print(f"✅ Vector DB Created and Saved to {CHROMA_PERSIST_DIR}")
else:
    # The loading cell tolerates a missing docs folder and yields no chunks.
    # Keep `vector_db` defined (as an empty store) so later cells still run,
    # but make it obvious that nothing was indexed.
    vector_db = Chroma(
        collection_name=CHROMA_COLLECTION,
        embedding_function=embedding_model,
        persist_directory=CHROMA_PERSIST_DIR,
    )
    print("No chunks available: created an empty Vector DB. Check DOCS_PATH and re-run the loading cell.")

Loading embedding model...


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Creating Vector DB... this might take a minute...
✅ Vector DB Created and Saved to ./chroma_db


In [5]:
# Smoke test: run one similarity search against the freshly built index
query = "How do I change the theme?"
results = vector_db.similarity_search(query, k=3)

print("\n--- Test Query Results ---")
for rank, hit in enumerate(results, start=1):
    preview = hit.page_content[:200]  # only the first 200 characters are shown
    source = hit.metadata.get('source')
    print(f"\nResult {rank}:")
    print(f"Content: {preview}...")
    print(f"Source: {source}")


--- Test Query Results ---

Result 1:
Content: Altering a theme to suit your needs.  
---  
If you would like to make a few tweaks to an existing theme, there is no need
to create your own theme from scratch. For minor tweaks which only require
so...
Source: mkdocs\docs\user-guide\customizing-your-theme.md

Result 2:
Content: > is required for the theme.  
[Customizing Your Theme]: ../user-guide/customizing-your-theme.md#using-the-theme-custom_dir
[custom_dir]: ../user-guide/configuration.md#custom_dir
[name]: ../user-guid...
Source: mkdocs\docs\dev-guide\themes.md

Result 3:
Content: <p class="card-text">
There's a stack of good looking <a href="user-guide/choosing-your-theme">themes</a> available for MkDocs.
Choose between the built in themes:
<a href="user-guide/choosing-your-th...
Source: mkdocs\docs\index.md


In [5]:
import google.generativeai as genai
import os
import getpass

# 1. Setup - prefer the environment variable, fall back to a hidden prompt.
# SECURITY: never embed an API key literal in the notebook, not even as a
# "placeholder" to compare against. Any key that has ever been saved in a
# notebook must be treated as leaked and revoked/rotated in the provider console.
env_key = (os.environ.get("GOOGLE_API_KEY") or "").strip()
if not env_key:
    print("No valid GOOGLE_API_KEY found in environment.")
    # Prompt the user to securely enter the API key (hidden input)
    key = getpass.getpass("Enter your Google Generative AI API key (input hidden): ").strip()
    if not key:
        raise RuntimeError("No API key provided. Set the GOOGLE_API_KEY environment variable or enter a key when prompted.")
    # Store it for the rest of this kernel session only (nothing is written to disk here).
    os.environ["GOOGLE_API_KEY"] = key
    env_key = key

genai.configure(api_key=env_key)

# 2. List all available models
print("Searching for available models...")
try:
    for m in genai.list_models():
        # Only models that can generate content are relevant here.
        if 'generateContent' in getattr(m, "supported_generation_methods", []):
            print(f"✅ Found: {m.name}")
except Exception as e:
    # Best-effort listing: report the failure instead of aborting the notebook.
    # Provide a clearer error message to help debugging invalid API keys
    print("Failed to list models. Please verify your API key and network connectivity.")
    print("Error details:", e)

No valid GOOGLE_API_KEY found in environment.
Searching for available models...
Failed to list models. Please verify your API key and network connectivity.
Error details: Model.__init__() got an unexpected keyword argument 'thinking'
