# Browser Integration Tests

This notebook tests integration of GitHub link detection with the browser functionality.

In [1]:
import sys
import os
import asyncio
from typing import Dict, Optional

# Add the parent directory to the path so we can import from the llm_browser package
sys.path.append(os.path.abspath('..'))

In [ ]:
# Import the url conversion and link extraction functions from previous notebooks
def github_url_to_raw(github_url: str) -> str:
    """
    Convert a GitHub URL (edit, blob, or tree) to its raw.githubusercontent.com equivalent.
    
    Args:
        github_url: A GitHub URL pointing to a file
        
    Returns:
        URL for the raw content version
    """
    # Handle different GitHub URL patterns
    if "github.com" not in github_url:
        return github_url
        
    # Extract query parameters to preserve them
    url_parts = github_url.split('?', 1)
    base_url = url_parts[0]
    query = f"?{url_parts[1]}" if len(url_parts) > 1 else ""
    
    # Handle edit links
    if "/edit/" in base_url:
        # Convert /edit/ to /raw/
        raw_url = base_url.replace("/edit/", "/raw/")
        return raw_url + query
        
    # Handle blob links
    elif "/blob/" in base_url:
        # Replace github.com with raw.githubusercontent.com and /blob/ with /
        raw_url = base_url.replace("github.com", "raw.githubusercontent.com")
        raw_url = raw_url.replace("/blob/", "/")
        return raw_url + query
        
    # Handle tree links (often used in "Edit this page" links)
    elif "/tree/" in base_url:
        # Replace github.com with raw.githubusercontent.com and /tree/ with /
        raw_url = base_url.replace("github.com", "raw.githubusercontent.com")
        raw_url = raw_url.replace("/tree/", "/")
        return raw_url + query
        
    return github_url

## Simulating the Browser Server

Let's simulate the modified `browse_url` function to handle GitHub links.

In [ ]:
# Import necessary packages
import httpx
from bs4 import BeautifulSoup
from llm_browser.utils.html import html_to_markdown, extract_title
from llm_browser.utils.url import normalize_url, is_valid_url

# Define a function to find GitHub source links
def find_github_source_link(html: str) -> Optional[Dict]:
    """
    Find the most likely GitHub source link in an HTML document.
    
    Args:
        html: Raw HTML content
        
    Returns:
        Dictionary with link information or None if no suitable link found
    """
    soup = BeautifulSoup(html, "html.parser")
    links = []

    for a_tag in soup.find_all("a", href=True):
        href = a_tag["href"]
        text = a_tag.get_text(strip=True)
        
        # Skip javascript links and empty links
        if not href or not text or href.startswith("javascript:"):
            continue
            
        # Create a link data dictionary
        link_data = {
            "href": href,
            "text": text,
            "is_github": False,
            "is_edit_link": False,
            "raw_url": None
        }
                
        # Check for GitHub links
        if "github.com" in href:
            link_data["is_github"] = True
            
            if "/edit/" in href or "/blob/" in href or "/tree/" in href:
                link_data["raw_url"] = github_url_to_raw(href)
                
        # Check for common edit link texts
        edit_phrases = ["edit this page", "edit on github", "contribute to this page", "view source"]
        link_data["is_edit_link"] = any(phrase in text.lower() for phrase in edit_phrases)
            
        links.append(link_data)
    
    # First, look for GitHub links that explicitly mention editing/viewing source
    for link in links:
        if link["is_github"] and link["is_edit_link"] and link["raw_url"]:
            return link
    
    # Next, look for any GitHub edit/blob/tree links
    for link in links:
        if link["is_github"] and link["raw_url"]:
            return link
    
    return None

# Simulate the browser functionality
async def enhanced_browse_url(url: str, prefer_raw: bool = False) -> str:
    """
    Enhanced version of browse_url that can detect and follow GitHub raw links.
    
    Args:
        url: The URL to fetch
        prefer_raw: Whether to automatically follow raw links when found
        
    Returns:
        Markdown content from the URL
    """
    # Validate URL
    if not is_valid_url(url):
        return f"Invalid URL: {url}"
        
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    headers = {"User-Agent": user_agent}
    
    try:
        async with httpx.AsyncClient() as client:
            # Fetch the original URL
            response = await client.get(url, headers=headers, timeout=30.0, follow_redirects=True)
            response.raise_for_status()
            html_content = response.text
            
            # Check if there's a GitHub source link
            github_link = find_github_source_link(html_content)
            used_raw = False
            
            # If we found a GitHub link with raw URL and prefer_raw is True, fetch the raw content instead
            if github_link and github_link["raw_url"] and prefer_raw:
                raw_url = github_link["raw_url"]
                raw_response = await client.get(raw_url, headers=headers, timeout=30.0, follow_redirects=True)
                
                if raw_response.status_code == 200:
                    # For raw content, we'll assume it's already in a good format (markdown or plain text)
                    content = raw_response.text
                    used_raw = True
                    
                    # Add a title if it's not already there
                    if not content.startswith("#"):
                        # Extract filename from the URL
                        filename = raw_url.split("/")[-1]
                        content = f"# {filename}\n\n{content}"
                    
                    return f"### Content from {url} (using raw version: {raw_url})\n\n{content}"
            
            # If we didn't use raw content, process the HTML
            if not used_raw:
                title = extract_title(html_content) or url
                markdown_content = html_to_markdown(html_content)
                content = f"# {title}\n\n{markdown_content}"
                
                # If we found a GitHub link but didn't use it, mention it
                if github_link and github_link["raw_url"]:
                    content = f"### Content from {url}\n\nNote: Raw version available at: {github_link['raw_url']}\n\n{content}"
                else:
                    content = f"### Content from {url}\n\n{content}"
                    
                return content
    except Exception as e:
        return f"Error fetching {url}: {str(e)}"

## Test with Real URLs

Let's test our enhanced browser with real documentation pages that have GitHub edit links.

In [4]:
# Test URLs
test_urls = [
    # A URL with a GitHub edit link
    "https://svelte.dev/docs/introduction",
    
    # A direct GitHub URL
    "https://github.com/sveltejs/kit/blob/main/documentation/docs/10-getting-started/10-introduction.md"
]

# Test with prefer_raw=False first
print("\n=== Testing with prefer_raw=False ===\n")
for url in test_urls:
    print(f"Fetching: {url}")
    result = await enhanced_browse_url(url, prefer_raw=False)
    # Just print the first 500 characters to avoid overwhelming output
    print(f"Result (first 500 chars): {result[:500]}...\n")

# Test with prefer_raw=True
print("\n=== Testing with prefer_raw=True ===\n")
for url in test_urls:
    print(f"Fetching: {url}")
    result = await enhanced_browse_url(url, prefer_raw=True)
    # Just print the first 500 characters to avoid overwhelming output
    print(f"Result (first 500 chars): {result[:500]}...\n")


=== Testing with prefer_raw=False ===

Fetching: https://svelte.dev/docs/introduction
Result (first 500 chars): ### Content from https://svelte.dev/docs/introduction

Note: Raw version available at: https://github.com/sveltejs/svelte/raw/main/documentation/docs/01-introduction/01-overview.md

# Overview • Docs • Svelte

SvelteIntroduction

# Overview

 

### On this page

 

Svelte is a framework for building user interfaces on the web. It uses a compiler to turn declarative components written in HTML, CSS and JavaScript...

App

```
<script>
	function greet() {
		alert('Welcome to Svelte!');
	}
</script>...

Fetching: https://github.com/sveltejs/kit/blob/main/documentation/docs/10-getting-started/10-introduction.md
Result (first 500 chars): ### Content from https://github.com/sveltejs/kit/blob/main/documentation/docs/10-getting-started/10-introduction.md

Note: Raw version available at: https://raw.githubusercontent.com/sveltejs/vite-plugin-svelte/main/docs/config.md#hot

# kit/docum

## Implementation Notes

Based on our testing, here's the final integration plan for llm-browser:

1. Add to `utils/url.py`:
   - `github_url_to_raw(github_url)` function

2. Add to `utils/html.py`:
   - Enhance `extract_links()` to detect GitHub links
   - Add `find_github_source_link()` function

3. Modify `server.py`:
   - Update `browse_url()` to check for GitHub links
   - Add a `prefer_raw` parameter with default False
   - Optionally fetch the raw content when a GitHub link is found

4. Update `cli.py`:
   - Add a `--prefer-raw` flag to the browser command
   - Pass this value to the server configuration

This implementation will seamlessly enhance the llm-browser to provide cleaner content for AI processing when documentation pages have GitHub source links available.