# Content Filtering Tests

This notebook tests the enhanced content filtering functionality of llm-browser.

In [None]:
import sys
import os
import asyncio
import httpx
from bs4 import BeautifulSoup
from typing import Dict, List, Optional

# Add the parent directory to the path so we can import from the llm_browser package
sys.path.append(os.path.abspath('..'))

In [None]:
# Import the enhanced utility functions
from llm_browser.utils.html import (
    extract_main_content,
    html_to_markdown,
    extract_navigation,
    format_navigation_as_markdown,
    find_github_source_link
)
from llm_browser.utils.url import github_url_to_raw, is_github_url

## Helper Functions for Testing

In [None]:
async def fetch_html(url: str) -> Optional[str]:
    """Download *url* and return the response body as text.

    The request is sent with a desktop-browser User-Agent header, a 30 second
    timeout, and redirect following enabled. Any exception raised while
    fetching (including a non-success status reported by
    ``raise_for_status``) is printed and turned into a ``None`` result, so
    callers only need a truthiness check.
    """
    request_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    }

    try:
        async with httpx.AsyncClient() as http_client:
            reply = await http_client.get(
                url,
                headers=request_headers,
                timeout=30.0,
                follow_redirects=True,
            )
            reply.raise_for_status()
            # Read the body inside the try so decoding problems are reported too.
            page_text = reply.text
    except Exception as err:
        print(f"Error fetching {url}: {err}")
        return None
    return page_text

def print_section(title, content, max_length=1000):
    """Print *content* under a banner made of *title* between two 80-char rules.

    Content longer than *max_length* characters is cut at that length and
    followed by an ellipsis plus a note stating the full character count;
    shorter content is printed unchanged.
    """
    rule = '=' * 80
    print(f"\n{rule}\n{title}\n{rule}\n")

    total_chars = len(content)
    if total_chars <= max_length:
        print(content)
        return

    shown = content[:max_length]
    print(f"{shown}...\n\n[truncated - {total_chars} characters total]")
        
async def test_content_extraction(url: str, content_priorities: Optional[List[str]] = None):
    """Fetch *url* and print what each content-extraction strategy produces.

    Steps, in order:

    1. Fetch the page; stop with a message if the fetch fails.
    2. If ``is_github_url(url)`` is true, print the converted raw URL.
    3. Print any GitHub source link found in the page HTML.
    4. Print the extracted navigation structure, if there is one.
    5. For every entry in *content_priorities*, print a text preview of
       ``extract_main_content`` and a preview of ``html_to_markdown``.

    Args:
        url: Page to fetch and analyse.
        content_priorities: Priority modes to try. ``None`` (the default)
            selects ``["auto", "main", "article", "largest", "dense"]``.

    Returns:
        None. All results are printed.
    """
    # A None sentinel is used instead of a list literal default so that no
    # mutable object is shared between calls.
    if content_priorities is None:
        content_priorities = ["auto", "main", "article", "largest", "dense"]

    html_content = await fetch_html(url)
    if not html_content:
        print(f"Failed to fetch {url}")
        return

    print(f"\n\n{'#' * 80}\nTESTING URL: {url}\n{'#' * 80}\n")

    # Check if this is a GitHub URL
    if is_github_url(url):
        raw_url = github_url_to_raw(url)
        print_section("GitHub Raw URL", raw_url)

    # Check for GitHub source links
    github_link = find_github_source_link(html_content)
    if github_link:
        print_section("Found GitHub Source Link",
                      f"Text: {github_link['text']}\nURL: {github_link['href']}\nRaw URL: {github_link.get('raw_url')}")

    # Extract and print navigation
    nav_sections = extract_navigation(html_content)
    if nav_sections:
        nav_markdown = format_navigation_as_markdown(nav_sections)
        print_section("Navigation Structure", nav_markdown, 2000)

    # Test different content extraction strategies
    for priority in content_priorities:
        main_content_html = extract_main_content(html_content, content_priority=priority)
        main_content_text = BeautifulSoup(main_content_html, "html.parser").get_text(strip=True)
        print_section(f"Extracted Content ({priority.upper()} priority)",
                      f"[Characters: {len(main_content_html)} HTML / {len(main_content_text)} text]\n\n{main_content_text[:500]}...",
                      800)

        # Markdown conversion starts again from the full page HTML (not from
        # main_content_html) and applies the same priority itself.
        markdown_content = html_to_markdown(
            html_content,
            content_priority=priority,
            strip_comments=True,
            strip_ads=True
        )
        print_section(f"Markdown ({priority.upper()} priority)", markdown_content[:1000] + "...", 1200)

## Test Various Website Types

In [None]:
# Test a documentation page (Svelte docs) that is expected to carry a GitHub edit link.
# No priorities are passed, so every default content-priority mode is exercised.
await test_content_extraction("https://svelte.dev/docs/introduction")

In [None]:
# Test a GitHub repository page directly: a blob URL exercises the
# is_github_url / github_url_to_raw branch before the regular HTML extraction.
await test_content_extraction("https://github.com/sveltejs/kit/blob/main/documentation/docs/10-getting-started/10-introduction.md")

In [None]:
# Test a page with a <main> tag (presumably; the markup is not verified here),
# which is the case the "main" priority is aimed at.
await test_content_extraction("https://web.dev/articles/semantic-html")

In [None]:
# Test a page with articles and comments: a Hacker News discussion thread,
# used to see how extraction copes with comment-heavy markup.
await test_content_extraction("https://news.ycombinator.com/item?id=32556068")

In [None]:
# Test a page with navigation and a sidebar; the "Navigation Structure"
# section of the output is the part of interest for this URL.
await test_content_extraction("https://react.dev/learn")

## Test Content Density Analysis

In [None]:
async def analyze_element_density(url: str, top_n=10):
    """Rank the container elements of a page by three content-density scores.

    The page is fetched and parsed; every ``div``, ``section``, ``article``,
    ``main``, ``table`` and ``aside`` element holding at least 100 characters
    of text is measured and scored. For each of the three scores a table of
    the *top_n* best elements is printed, followed by a 300-character excerpt
    of the winner.

    Args:
        url: Page to fetch and analyse.
        top_n: Number of rows to print per score table.

    Returns:
        None. All results are printed.
    """
    html_content = await fetch_html(url)
    if not html_content:
        print(f"Failed to fetch {url}")
        return

    soup = BeautifulSoup(html_content, "html.parser")

    # Find substantial elements
    elements = []
    for tag_name in ["div", "section", "article", "main", "table", "aside"]:
        for element in soup.find_all(tag_name):
            text = element.get_text(strip=True)
            text_length = len(text)
            if text_length < 100:  # Skip tiny elements
                continue

            html_length = len(str(element))
            link_count = len(element.find_all("a"))

            # Calculate metrics
            # text_density: share of the element's serialized HTML that is text.
            text_density = text_length / html_length if html_length > 0 else 0
            # With no links at all the ratio is undefined; half the text
            # length is used as the stand-in value.
            text_to_link_ratio = text_length / link_count if link_count > 0 else text_length * 0.5

            # Three weightings of the same metrics: size-heavy, density-heavy,
            # and a blend of the two.
            size_score = (text_length * 0.6) + (text_density * 20) + (text_to_link_ratio * 0.1)
            density_score = (text_density * 50) + (text_to_link_ratio * 0.2) + (text_length * 0.01)
            balanced_score = (text_length * 0.3) + (text_density * 30) + (text_to_link_ratio * 0.15)

            # Label built from tag, id and classes, capped at 50 characters so
            # it fits the table column.
            identifier = f"{tag_name} - {element.get('id', '')} {' '.join(element.get('class', []))}"[:50]
            elements.append({
                "element": element,
                "identifier": identifier,
                "text_length": text_length,
                "html_length": html_length,
                "link_count": link_count,
                "text_density": text_density,
                "text_to_link_ratio": text_to_link_ratio,
                "size_score": size_score,
                "density_score": density_score,
                "balanced_score": balanced_score
            })

    print(f"Analyzed {len(elements)} elements on {url}\n")

    # Nothing passed the size filter: there is no ranking to print, and
    # indexing the winner below would fail.
    if not elements:
        print("No elements with at least 100 characters of text were found.")
        return

    # Report top elements by each score
    for score_type in ["size_score", "density_score", "balanced_score"]:
        print(f"\nTop {top_n} elements by {score_type}:")
        top_elements = sorted(elements, key=lambda x: x[score_type], reverse=True)[:top_n]

        print(f"{'Rank':<5}{'Element':<52}{'Text Len':<10}{'Density':<10}{'Link Ratio':<12}{'Score':<10}")
        print("-" * 100)

        for i, element in enumerate(top_elements, 1):
            print(f"{i:<5}{element['identifier']:<52}{element['text_length']:<10}{element['text_density']:.3f}   {element['text_to_link_ratio']:<12.1f}{element[score_type]:<10.1f}")

        # Show text excerpt from top element; the slice is empty when top_n
        # selects no rows, so guard the index.
        if top_elements:
            top_text = top_elements[0]['element'].get_text(strip=True)
            print(f"\nExcerpt from top element:\n{top_text[:300]}...")

In [None]:
# Analyze density on different page types, starting with a comment-heavy
# Hacker News thread (top_n is left at its default of 10 rows per score).
await analyze_element_density("https://news.ycombinator.com/item?id=32556068")

In [None]:
# Run the same density analysis on an article-style page for comparison.
await analyze_element_density("https://web.dev/articles/semantic-html")

## Test GitHub Raw Content Integration

In [None]:
async def test_github_raw(url: str):
    """Show how *url* maps to raw GitHub content, directly or via a page link.

    The page itself is fetched first; a failed fetch ends the test. When the
    URL is recognised by ``is_github_url`` it is converted with
    ``github_url_to_raw`` and a sample of the raw content is printed. The page
    HTML is then searched for a GitHub source link; if one is found its text
    and URL are printed, and when it carries a ``raw_url`` a sample of that
    content is printed too.
    """
    page_html = await fetch_html(url)
    if not page_html:
        print(f"Failed to fetch {url}")
        return

    print(f"Testing GitHub integration for: {url}\n")

    # Case 1: the URL itself points at GitHub.
    if is_github_url(url):
        direct_raw_url = github_url_to_raw(url)
        print(f"Direct conversion to raw URL:\n{direct_raw_url}\n")

        direct_raw_text = await fetch_html(direct_raw_url)
        if direct_raw_text:
            print_section("Raw content sample", direct_raw_text[:1000])

    # Case 2: the page links to its source on GitHub.
    source_link = find_github_source_link(page_html)
    if not source_link:
        print("No GitHub source link found in this page.")
        return

    print("\nFound GitHub source link in page:\n")
    print(f"Link text: {source_link['text']}")
    print(f"Link URL: {source_link['href']}")

    linked_raw_url = source_link.get('raw_url')
    if not linked_raw_url:
        return

    print(f"Converted raw URL: {linked_raw_url}")

    linked_raw_text = await fetch_html(linked_raw_url)
    if linked_raw_text:
        print_section("Source raw content sample", linked_raw_text[:1000])

In [None]:
# Test different GitHub scenarios: first a direct GitHub blob URL, which is
# meant to take the is_github_url conversion branch.
await test_github_raw("https://github.com/sveltejs/kit/blob/main/documentation/docs/10-getting-started/10-introduction.md")

In [None]:
# A docs page that is not itself on GitHub: any raw content here has to come
# from a GitHub source link found inside the page.
await test_github_raw("https://svelte.dev/docs/introduction")

## Test Complete Content Processing Pipeline

In [None]:
def _prepend_navigation(html_content, body):
    """Return *body* prefixed with the page's navigation markdown, if there is any."""
    nav_sections = extract_navigation(html_content)
    if not nav_sections:
        return body
    nav_markdown = format_navigation_as_markdown(nav_sections)
    if not nav_markdown:
        return body
    return f"{nav_markdown}\n---\n\n{body}"


async def process_url(url: str, prefer_raw=True, include_navigation=True,
                      content_priority="auto", strip_comments=True, strip_ads=True):
    """Run the complete fetch-and-convert pipeline for *url* and print the result.

    Sources are tried in this order, and the first that yields content wins:

    1. If *prefer_raw* is set and ``is_github_url(url)`` is true, the raw file
       behind the URL, headed by the file name taken from the URL.
    2. If *prefer_raw* is set and the page contains a GitHub source link with
       a ``raw_url``, that raw file, optionally prefixed with the page
       navigation. A title is added only when the assembled text does not
       already start with ``"# "``.
    3. Otherwise the page HTML converted with ``html_to_markdown``, optionally
       prefixed with the page navigation and always headed by a title.

    Args:
        url: Page to process.
        prefer_raw: Try raw GitHub content before falling back to HTML.
        include_navigation: Prepend the page's navigation markdown.
        content_priority: Forwarded to ``html_to_markdown``.
        strip_comments: Forwarded to ``html_to_markdown``.
        strip_ads: Forwarded to ``html_to_markdown``.

    Returns:
        None. At most the first 2000 characters of the result are printed.
    """
    # Imported unconditionally at the top of the function. An import inside a
    # branch makes the name function-local, so the HTML fallback below would
    # hit UnboundLocalError whenever that branch had not run.
    from llm_browser.utils.html import extract_title

    print(f"Processing URL: {url}")
    print(f"Options: prefer_raw={prefer_raw}, include_navigation={include_navigation}, ")
    print(f"         content_priority={content_priority}, strip_comments={strip_comments}, strip_ads={strip_ads}\n")

    # Handle direct GitHub URLs
    if is_github_url(url) and prefer_raw:
        raw_url = github_url_to_raw(url)
        print(f"Direct GitHub URL detected, using raw URL: {raw_url}")
        raw_content = await fetch_html(raw_url)
        if raw_content:
            # Extract filename from URL (last path segment, query string dropped)
            filename = url.split("/")[-1].split("?")[0]
            content = f"# {filename}\n\n{raw_content}"
            print_section("Processed content (from direct GitHub URL)", content[:2000])
            return

    # For non-GitHub URLs or if raw fetch failed
    html_content = await fetch_html(url)
    if not html_content:
        print(f"Failed to fetch {url}")
        return

    # Check for GitHub source links
    if prefer_raw:
        github_link = find_github_source_link(html_content)
        if github_link and github_link.get("raw_url"):
            raw_url = github_link["raw_url"]
            print(f"Found GitHub source link, using raw URL: {raw_url}")
            raw_content = await fetch_html(raw_url)

            if raw_content:
                content = raw_content

                # Extract navigation if requested
                if include_navigation:
                    content = _prepend_navigation(html_content, content)

                # Add title if needed; the check runs on the assembled text,
                # i.e. after any navigation has been prepended.
                if not content.startswith("# "):
                    title = extract_title(html_content) or raw_url.split("/")[-1]
                    content = f"# {title}\n\n{content}"

                print_section("Processed content (from GitHub source link)", content[:2000])
                return

    # If no GitHub raw content, process with HTML to markdown
    title = extract_title(html_content) or url

    # Convert to markdown with filtering options
    markdown_content = html_to_markdown(
        html_content,
        content_priority=content_priority,
        strip_comments=strip_comments,
        strip_ads=strip_ads
    )

    # Extract navigation if requested
    if include_navigation:
        markdown_content = _prepend_navigation(html_content, markdown_content)

    # Add title
    content = f"# {title}\n\n{markdown_content}"

    print_section("Processed content (from HTML)", content[:2000])

In [None]:
# Test different URLs with our processing pipeline. All options are left at
# their defaults (prefer_raw=True, include_navigation=True, content_priority="auto").
await process_url("https://svelte.dev/docs/introduction")

In [None]:
# Direct GitHub blob URL with default options: prefer_raw=True means the raw
# file is tried before the rendered HTML page.
await process_url("https://github.com/sveltejs/kit/blob/main/documentation/docs/10-getting-started/10-introduction.md")

In [None]:
# Test with different content extraction strategies: content_priority="main" first.
await process_url("https://web.dev/articles/semantic-html", content_priority="main")

In [None]:
# Same page with content_priority="largest", to compare against other strategies.
await process_url("https://web.dev/articles/semantic-html", content_priority="largest")

In [None]:
# Test with navigation disabled: no navigation markdown is prepended to the output.
await process_url("https://react.dev/learn", include_navigation=False)