# Navigation Extraction Tests

This notebook tests functionality to identify and extract navigation structures from HTML content, focusing on lists of links and other navigation patterns.

In [1]:
import sys
import os
import re
from typing import List, Dict, Optional, Tuple

# Add the parent directory to the path so we can import from the llm_browser package
sys.path.append(os.path.abspath('..'))

In [2]:
# Import existing utilities
from bs4 import BeautifulSoup
import json

# Define a function to extract navigation elements from HTML
def extract_navigation(html: str) -> List[Dict]:
    """
    Extract navigation structures from HTML.
    
    Looks for common navigation patterns including:
    - <nav> elements
    - Lists of links (<ul>/<ol> with multiple <li><a> elements)
    - Elements with navigation-related classes
    
    Args:
        html: Raw HTML content
        
    Returns:
        List of dictionaries with navigation items and their structure
    """
    soup = BeautifulSoup(html, "html.parser")
    navigation_sections = []
    
    # Find explicit nav elements
    nav_elements = soup.find_all("nav")
    
    # Find elements with navigation-related classes
    nav_classes = ["menu", "navigation", "nav", "navbar", "sidebar", "toc", "table-of-contents"]
    class_nav_elements = []
    for cls in nav_classes:
        elements = soup.find_all(class_=lambda c: c and cls in c.lower())
        class_nav_elements.extend(elements)
    
    # Find lists that likely represent navigation
    list_elements = []
    for list_tag in soup.find_all(["ul", "ol"]):
        # Count links inside this list
        links = list_tag.find_all("a")
        if len(links) >= 3:  # Arbitrary threshold for navigation lists
            list_elements.append(list_tag)
    
    # Process all potential navigation elements
    for element in set(nav_elements + class_nav_elements + list_elements):
        # Extract navigation items with their hierarchy
        nav_structure = extract_hierarchical_navigation(element)
        
        if nav_structure:  # Only add if we found navigation items
            # Try to determine a title for this navigation section
            title = extract_navigation_title(element)
            
            navigation_sections.append({
                "title": title,
                "element_type": element.name,
                "classes": element.get("class", []),
                "items": nav_structure
            })
    
    return navigation_sections

def extract_hierarchical_navigation(element) -> List[Dict]:
    """
    Extract navigation links from an element, preserving hierarchy.
    
    Args:
        element: BeautifulSoup element to extract navigation from
        
    Returns:
        List of navigation items with their hierarchical structure
    """
    items = []
    
    # Handle direct links in the element
    direct_links = element.find_all("a", href=True, recursive=False)
    for link in direct_links:
        items.append({
            "href": link["href"],
            "text": link.get_text(strip=True),
            "is_active": has_active_marker(link),
            "level": 0
        })
    
    # Handle list items which may contain links
    list_items = element.find_all("li")
    for li in list_items:
        items.extend(process_list_item(li))
    
    return items

def process_list_item(li_element, level=0) -> List[Dict]:
    """
    Process a list item element to extract links and nested structures.
    
    Args:
        li_element: The list item element to process
        level: Current nesting level (for hierarchical navigation)
        
    Returns:
        List of navigation items from this list item and its children
    """
    items = []
    
    # Find direct links in this list item
    links = li_element.find_all("a", href=True, recursive=False)
    for link in links:
        items.append({
            "href": link["href"],
            "text": link.get_text(strip=True),
            "is_active": has_active_marker(link),
            "level": level
        })
    
    # If no direct links, try one level deeper
    if not links:
        deeper_links = li_element.find_all("a", href=True, limit=1)
        for link in deeper_links:
            items.append({
                "href": link["href"],
                "text": link.get_text(strip=True),
                "is_active": has_active_marker(link),
                "level": level
            })
    
    # Look for nested lists (submenus)
    nested_lists = li_element.find_all(["ul", "ol"], recursive=False)
    for nested_list in nested_lists:
        nested_items = nested_list.find_all("li")
        for nested_item in nested_items:
            items.extend(process_list_item(nested_item, level=level+1))
            
    # Check for details/summary structures (common in modern navigation)
    details = li_element.find_all("details", recursive=False)
    for detail in details:
        # Extract the summary text (often the parent menu item)
        summary = detail.find("summary")
        if summary:
            summary_text = summary.get_text(strip=True)
            items.append({
                "text": summary_text,
                "is_category": True,
                "level": level
            })
        
        # Extract links within the details
        detail_lists = detail.find_all(["ul", "ol"])
        for detail_list in detail_lists:
            detail_items = detail_list.find_all("li")
            for detail_item in detail_items:
                items.extend(process_list_item(detail_item, level=level+1))
    
    return items

def has_active_marker(element) -> bool:
    """
    Check if an element has indicators of being the active/current page.
    
    Args:
        element: The element to check
        
    Returns:
        True if the element appears to be marked as active
    """
    # Check for common active classes
    active_classes = ["active", "current", "selected", "highlight"]
    classes = element.get("class", [])
    if any(cls in classes or f"{cls}-item" in classes for cls in active_classes):
        return True
    
    # Check for aria-current attribute
    if element.get("aria-current"):
        return True
    
    return False

def extract_navigation_title(element) -> Optional[str]:
    """
    Try to find a title for the navigation element.
    
    Args:
        element: Navigation element
        
    Returns:
        Title string or None if no title found
    """
    # Try to find heading directly before the element
    prev_element = element.find_previous_sibling()
    while prev_element:
        if prev_element.name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
            return prev_element.get_text(strip=True)
        prev_element = prev_element.find_previous_sibling()
    
    # Look for heading within the element (often in nav sections)
    heading = element.find(["h1", "h2", "h3", "h4", "h5", "h6"])
    if heading:
        return heading.get_text(strip=True)
    
    # Look for elements with title-like classes
    title_classes = ["title", "heading", "header", "nav-title"]
    for cls in title_classes:
        title_elem = element.find(class_=lambda c: c and cls in c.lower())
        if title_elem:
            return title_elem.get_text(strip=True)
    
    # Infer from element attributes or parent container
    if element.get("aria-label"):
        return element.get("aria-label")
    
    # If the element has an ID, use it as a fallback
    if element.get("id"):
        # Convert id like "main-navigation" to "Main Navigation"
        id_text = element.get("id").replace("-", " ").replace("_", " ")
        return id_text.title()
    
    return None

def format_navigation_as_markdown(navigation_sections: List[Dict]) -> str:
    """
    Format extracted navigation as a markdown document.
    
    Args:
        navigation_sections: List of navigation sections with their items
        
    Returns:
        Markdown representation of the navigation
    """
    if not navigation_sections:
        return "No navigation found"
        
    markdown = "# Site Navigation\n\n"
    
    for section in navigation_sections:
        # Add section header
        title = section.get("title") or "Navigation"
        markdown += f"## {title}\n\n"
        
        # Add items
        for item in section["items"]:
            # Indent based on level
            indent = "  " * item.get("level", 0)
            
            # Format as category or link
            if item.get("is_category"):
                markdown += f"{indent}- **{item['text']}**\n"
            elif "href" in item:
                text = item["text"]
                href = item["href"]
                active_marker = " (current)" if item.get("is_active") else ""
                markdown += f"{indent}- [{text}]({href}){active_marker}\n"
            else:
                markdown += f"{indent}- {item['text']}\n"
        
        markdown += "\n"
    
    return markdown

## Test with Example HTML

Let's test our navigation extraction with the example HTML from daisyUI.

In [3]:
# Example HTML from daisyUI with a menu structure
daisy_ui_nav_html = """
<ul class="menu w-full px-4 py-0">
   <!--[--><!--[-->
   <li data-sveltekit-preload-data="">
      <!--[!--><!--]--> <!--[-->
      <a href="/docs/v5/" class="group    ">
         <!--[-->
         <span>
            <!---->
            <svg class="text-blue-500 size-5" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 48 48">
               <g fill="currentColor">
                  <defs>
                     <path id="1733501816477-2951362_a" d="M0 0h48v48H0V0z"></path>
                  </defs>
                  <clipPath id="1733501816477-2951362_b">
                     <use xlink:href="#1733501816477-2951362_a" overflow="visible"></use>
                  </clipPath>
                  <path clip-path="url(#1733501816477-2951362_b)" d="M40 8H8c-2.21 0-3.98 1.79-3.98 4L4 36c0 2.21 1.79 4 4 4h32c2.21 0 4-1.79 4-4V12c0-2.21-1.79-4-4-4zM17 30h-2.4l-5.1-7v7H7V18h2.5l5 7v-7H17v12zm10-9.49h-5v2.24h5v2.51h-5v2.23h5V30h-8V18h8v2.51zM41 28c0 1.1-.9 2-2 2h-8c-1.1 0-2-.9-2-2V18h2.5v9.01h2.25v-7.02h2.5v7.02h2.25V18H41v10z"></path>
               </g>
            </svg>
            <!---->
         </span>
         <!--]--> 
         <span>
            <!---->daisyUI 5 release notes<!---->
         </span>
         <!--[!--><!--]--> <!--[!--><!--]-->
      </a>
      <!--]-->
   </li>
   <!--]--><!--[-->
   <li data-sveltekit-preload-data="">
      <!--[--><!--[-->
      <details id="disclosure-docs" open="">
         <summary class="group">
            <!--[-->
            <span>
               <!---->
               <svg class="text-lime-500 size-5" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 48 48">
                  <g fill="currentColor">
                     <path d="m13.3 15 3-3 .7.7-2.2 2.2 2.2 2.3-.7.7zm12 12 2.2-2.2-2.2-2.3.7-.7 3 3-3 3zm-11.6 0 3-3-3-3.1.7-.7 3.7 3.8-3.7 3.7z"></path>
                     <path d="M24 44q-4.2 0-7.85-1.575Q12.5 40.85 9.8 38.15q-2.7-2.7-4.275-6.35Q4 28.15 4 24q0-4.2 1.575-7.85Q7.15 12.5 9.85 9.8q2.7-2.7 6.35-4.275Q19.85 4 24 4q4.2 0 7.85 1.575Q35.5 7.15 38.2 9.85q2.7 2.7 4.275 6.35Q44 19.85 44 24q0 4.2-1.575 7.85-1.575 3.65-4.275 6.35-2.7 2.7-6.35 4.275Q28.2 44 24 44zm0-3.4q6.85 0 11.725-4.875T40.6 24q0-6.85-4.875-11.725T24 7.4q-6.85 0-11.725 4.875T7.4 24q0 6.85 4.875 11.725T24 40.6zm0-16.6z"></path>
                  </g>
               </svg>
               <!---->
            </span>
            <!--]--> 
            <span>
               <!---->Docs<!---->
            </span>
            <svg class="inline-block ml-auto -mr-1 {open ? 'rotate-90' : 'rotate-0'} size-4 transition-transform duration-200" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 20 20" fill="currentColor" aria-hidden="true">
               <path fill-rule="evenodd" d="M7.21 14.77a.75.75 0 01.02-1.06L11.168 10 7.23 6.29a.75.75 0 111.04-1.08l4.5 4.25a.75.75 0 010 1.08l-4.5 4.25a.75.75 0 01-1.06-.02z" clip-rule="evenodd"></path>
            </svg>
         </summary>
         <ul class="menu w-full pl-4 mt-1">
            <!--[--><!--[-->
            <li data-sveltekit-preload-data="">
               <!--[!--><!--]--> <!--[-->
               <a href="/docs/install/" class="group active">
                  <!--[-->
                  <span>
                     <!---->
                     <svg class="text-red-500 size-5" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24">
                        <path fill="currentColor" d="M14.6 16.6l4.6-4.6-4.6-4.6L16 6l6 6-6 6zm-5.2 0L4.8 12l4.6-4.6L8 6l-6 6 6 6z"></path>
                     </svg>
                     <!---->
                  </span>
                  <!--]--> 
                  <span>
                     <!---->Install<!---->
                  </span>
                  <!--[!--><!--]--> <!--[!--><!--]-->
               </a>
               <!--]-->
            </li>
            <!--]--><!--[-->
            <li data-sveltekit-preload-data="">
               <!--[--><!--[-->
               <details id="disclosure-components">
                  <summary class="group">
                     <!--[-->
                     <span>
                        <!---->
                        <svg class="text-teal-500 size-5" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 48 48">
                           <g fill="currentColor">
                              <path d="M7 40q-1.2 0-2.1-.9Q4 38.2 4 37V11q0-1.2.9-2.1Q5.8 8 7 8h34q1.2 0 2.1.9.9.9.9 2.1v26q0 1.2-.9 2.1-.9.9-2.1.9zm0-3h34V11H7v26z"></path>
                           </g>
                        </svg>
                        <!---->
                     </span>
                     <!--]--> 
                     <span>
                        <!---->Components<!---->
                     </span>
                     <svg class="inline-block ml-auto -mr-1 {open ? 'rotate-90' : 'rotate-0'} size-4 transition-transform duration-200" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 20 20" fill="currentColor" aria-hidden="true">
                        <path fill-rule="evenodd" d="M7.21 14.77a.75.75 0 01.02-1.06L11.168 10 7.23 6.29a.75.75 0 111.04-1.08l4.5 4.25a.75.75 0 010 1.08l-4.5 4.25a.75.75 0 01-1.06-.02z" clip-rule="evenodd"></path>
                     </svg>
                  </summary>
                  <ul class="menu w-full pl-4 mt-1">
                     <!--[--><!--[-->
                     <li data-sveltekit-preload-data="">
                        <!--[!--><!--]--> <!--[-->
                        <a href="/components/accordion/" class="group    ">
                           <!--]--> 
                           <span>
                              <!---->Accordion<!---->
                           </span>
                           <!--[!--><!--]--> <!--[!--><!--]-->
                        </a>
                        <!--]-->
                     </li>
                     <!--]--><!--[-->
                     <li data-sveltekit-preload-data="">
                        <!--[!--><!--]--> <!--[-->
                        <a href="/components/alert/" class="group    ">
                           <!--]--> 
                           <span>
                              <!---->Alert<!---->
                           </span>
                           <!--[!--><!--]--> <!--[!--><!--]-->
                        </a>
                        <!--]-->
                     </li>
                  </ul>
               </details>
               <!--]-->
            </li>
         </ul>
      </details>
      <!--]-->
   </li>
   <!--]--><!--[-->
   <li data-sveltekit-preload-data="">
      <!--[!--><!--]--> <!--[-->
      <a href="/theme-generator/" class="group    ">
         <!--[-->
         <span>
            <!---->
            <svg class="text-pink-500 size-5" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 48 48">
               <g fill="currentColor">
                  <path d="M24 44q-4.2 0-7.85-1.575Q12.5 40.85 9.8 38.15q-2.7-2.7-4.275-6.35Q4 28.15 4 24q0-4.2 1.575-7.85Q7.15 12.5 9.85 9.8q2.7-2.7 6.35-4.275Q19.85 4 24 4q4.2 0 7.85 1.575Q35.5 7.15 38.2 9.85q2.7 2.7 4.275 6.35Q44 19.85 44 24q0 4.2-1.575 7.85-1.575 3.65-4.275 6.35-2.7 2.7-6.35 4.275Q28.2 44 24 44zm0-20q0-2.9 1.55-4.45Q27.1 18 30 18q1.85 0 2.6-1t.25-2.6q-.3-1.95-.2-2.925.1-.975 1.55-2.425 1.35 1.35 2.1 1.8.75.45 2.05 1.15 1.3.7 1.8 1.625.5.925.5 2.425 0 4.4-3.05 7.45-3.05 3.05-7.45 3.05-2.9 0-4.45-1.55Q24 24 24 24zm-9 5.35q4.05 0 6.95 2.9 2.9 2.9 2.9 6.95 0 1.45-.125 1.925-.125.475-.825 1.175-1.35-1.35-2.375-1.875Q20.5 33.9 18.65 33.9q-1.9 0-2.6-1.05-.7-1.05-.25-2.55.3-1.05.275-2.65-.025-1.6-1.075-2.65z"></path>
               </g>
            </svg>
            <!---->
         </span>
         <!--]--> 
         <span>
            <!---->Theme Generator<!---->
         </span>
         <!--[!--><!--]--> <!--[!--><!--]-->
      </a>
      <!--]-->
   </li>
</ul>
"""

# Test the navigation extraction
navigation_sections = extract_navigation(daisy_ui_nav_html)

# Display the results as JSON
print(json.dumps(navigation_sections, indent=2))

# Format as markdown
markdown_nav = format_navigation_as_markdown(navigation_sections)
print("\n\nNavigation as Markdown:\n")
print(markdown_nav)

[
  {
    "title": null,
    "element_type": "ul",
    "classes": [
      "menu",
      "w-full",
      "px-4",
      "py-0"
    ],
    "items": [
      {
        "href": "/docs/v5/",
        "text": "daisyUI 5 release notes",
        "is_active": false,
        "level": 0
      },
      {
        "href": "/docs/install/",
        "text": "Install",
        "is_active": true,
        "level": 0
      },
      {
        "text": "Docs",
        "is_category": true,
        "level": 0
      },
      {
        "href": "/docs/install/",
        "text": "Install",
        "is_active": true,
        "level": 1
      },
      {
        "href": "/components/accordion/",
        "text": "Accordion",
        "is_active": false,
        "level": 1
      },
      {
        "text": "Components",
        "is_category": true,
        "level": 1
      },
      {
        "href": "/components/accordion/",
        "text": "Accordion",
        "is_active": false,
        "level": 2
      },
      {
       

## Test with Different Navigation Patterns

Let's test with other common navigation patterns.

In [None]:
# Bootstrap-style navigation
bootstrap_nav_html = """
<nav class="navbar navbar-expand-lg navbar-light bg-light">
  <div class="container-fluid">
    <a class="navbar-brand" href="#">Navbar</a>
    <button class="navbar-toggler" type="button" data-bs-toggle="collapse" data-bs-target="#navbarNav" aria-controls="navbarNav" aria-expanded="false" aria-label="Toggle navigation">
      <span class="navbar-toggler-icon"></span>
    </button>
    <div class="collapse navbar-collapse" id="navbarNav">
      <ul class="navbar-nav">
        <li class="nav-item">
          <a class="nav-link active" aria-current="page" href="#">Home</a>
        </li>
        <li class="nav-item">
          <a class="nav-link" href="#">Features</a>
        </li>
        <li class="nav-item">
          <a class="nav-link" href="#">Pricing</a>
        </li>
        <li class="nav-item">
          <a class="nav-link disabled" href="#" tabindex="-1" aria-disabled="true">Disabled</a>
        </li>
      </ul>
    </div>
  </div>
</nav>
"""

# Extract and display Bootstrap navigation
bootstrap_nav = extract_navigation(bootstrap_nav_html)
bootstrap_markdown = format_navigation_as_markdown(bootstrap_nav)
print("\nBootstrap Navigation:\n")
print(bootstrap_markdown)

In [None]:
# Sidebar-style navigation
sidebar_nav_html = """
<div class="sidebar">
  <h3>Documentation</h3>
  <ul class="menu">
    <li><a href="/getting-started">Getting Started</a></li>
    <li class="has-submenu">
      <a href="/api">API Reference</a>
      <ul class="submenu">
        <li><a href="/api/authentication">Authentication</a></li>
        <li><a href="/api/endpoints">Endpoints</a></li>
        <li><a href="/api/errors">Error Handling</a></li>
      </ul>
    </li>
    <li><a href="/examples">Examples</a></li>
    <li><a href="/faq" class="active">FAQ</a></li>
  </ul>
</div>
"""

# Extract and display sidebar navigation
sidebar_nav = extract_navigation(sidebar_nav_html)
sidebar_markdown = format_navigation_as_markdown(sidebar_nav)
print("\nSidebar Navigation:\n")
print(sidebar_markdown)

## Integration with Browser Functionality

Let's test how we would integrate navigation extraction with the browser functionality.

In [None]:
def enhance_markdown_with_navigation(html_content: str, markdown_content: str) -> str:
    """
    Enhance markdown content with navigation extracted from HTML.
    
    Args:
        html_content: Original HTML content
        markdown_content: Markdown content converted from HTML
        
    Returns:
        Enhanced markdown with navigation section
    """
    # Extract navigation
    navigation_sections = extract_navigation(html_content)
    
    # If no navigation found, return original markdown
    if not navigation_sections:
        return markdown_content
    
    # Format navigation as markdown
    navigation_markdown = format_navigation_as_markdown(navigation_sections)
    
    # Add navigation section to the top of the markdown
    enhanced_markdown = f"{navigation_markdown}\n\n---\n\n{markdown_content}"
    
    return enhanced_markdown

# Test with a simple HTML document
test_html = """
<html>
<head>
    <title>Test Page</title>
</head>
<body>
    <nav>
        <ul>
            <li><a href="/">Home</a></li>
            <li><a href="/about">About</a></li>
            <li><a href="/contact">Contact</a></li>
        </ul>
    </nav>
    <main>
        <h1>Welcome to the Test Page</h1>
        <p>This is a test page with navigation.</p>
    </main>
</body>
</html>
"""

# Simulate markdown conversion (simplified)
from bs4 import BeautifulSoup
soup = BeautifulSoup(test_html, "html.parser")
main_content = soup.find("main")
basic_markdown = f"# {soup.title.text}\n\n{main_content.get_text()}\n"

# Enhance with navigation
enhanced_content = enhance_markdown_with_navigation(test_html, basic_markdown)
print(enhanced_content)

## Implementation Notes

To integrate this navigation extraction functionality into the llm-browser:

1. Add to `utils/html.py`:
   - `extract_navigation(html)` function to identify navigation structures
   - `format_navigation_as_markdown(navigation)` function for rendering

2. Modify `server.py`:
   - Update `browse_url()` to extract navigation from HTML
   - Add a `include_navigation` parameter (default: True)
   - Include navigation in the markdown output when requested

3. Update `cli.py`:
   - Add a `--no-navigation` flag to disable navigation extraction

This implementation will enhance the llm-browser by:
1. Providing structured site navigation to help understand document relationships
2. Making it easier to explore multi-page documentation
3. Keeping the original navigation structure intact for better context

The navigation extraction algorithm can handle various patterns including:
- Traditional navigation menus in `<nav>` elements
- Menu structures built with lists (like in the daisyUI example)
- Accordions and nested navigation (via `<details>` and nested lists)
- Bootstrap-style and other common UI framework navigation patterns

In [7]:
async def test_with_url(url: str):
    """Test navigation extraction with a real website URL."""
    import httpx

    # Fetch the URL content
    async with httpx.AsyncClient() as client:
        response = await client.get(url)
        html_content = response.text

    # Extract and display navigation
    navigation = extract_navigation(html_content)
    markdown_nav = format_navigation_as_markdown(navigation)
    print(f"Navigation from {url}:\n")
    print(markdown_nav)

In [9]:
await test_with_url("https://orm.drizzle.team/docs/get-started-sqlite")

Navigation from https://orm.drizzle.team/docs/get-started-sqlite:

# Site Navigation

## libsql

- [Get started](/docs/get-started)
- [Why Drizzle?](/docs/overview)
- [Guides](/docs/guides)
- [Tutorials](/docs/tutorials)
- [Latest releases](/docs/latest-releases)
- [Gotchas](/docs/gotchas)
- [Schema](/docs/sql-schema-declaration)
- [Database connection](/docs/connect-overview)
- [Query data](/docs/data-querying)
- [Migrations](/docs/migrations)
- [Indexes & Constraints](/docs/indexes-constraints)
- [Sequences](/docs/sequences)
- [Views](/docs/views)
- [Schemas](/docs/schemas)
- [Row-Level Security (RLS)](/docs/rls)
- [Relations](/docs/relations)
- [Overview](/docs/kit-overview)
- [generate](/docs/drizzle-kit-generate)
- [migrate](/docs/drizzle-kit-migrate)
- [push](/docs/drizzle-kit-push)
- [pull](/docs/drizzle-kit-pull)
- [export](/docs/drizzle-kit-export)
- [check](/docs/drizzle-kit-check)
- [up](/docs/drizzle-kit-up)
- [studio](/docs/drizzle-kit-studio)
- [Custom migrations](/docs/k