# Link Extraction Tests

This notebook tests functionality to extract and analyze links from HTML content, focusing on GitHub links.

In [None]:
import sys
import os
import re
from typing import List, Dict, Optional

# Add the parent directory to the path so we can import from the llm_browser package
sys.path.append(os.path.abspath('..'))

In [None]:
# Import existing utilities
from bs4 import BeautifulSoup

# github_url_to_raw is copied from the other notebook and defined inline here (it is not actually imported)
def github_url_to_raw(github_url: str) -> str:
    """
    Convert a GitHub file URL (edit or blob view) into a URL for the raw file.

    Blob URLs are moved to the raw.githubusercontent.com host with the
    ``/blob/`` segment dropped. Edit URLs stay on github.com with ``/edit/``
    swapped for ``/raw/``. Any query string is carried over unchanged.

    Args:
        github_url: A GitHub URL pointing to a file

    Returns:
        URL for the raw content version, or the input unchanged when it is
        not recognised as a GitHub edit/blob URL
    """
    # Split off the query string first so that nothing inside it can be
    # mistaken for the host or for a path marker.
    base_url, separator, query_string = github_url.partition("?")
    query = f"?{query_string}" if separator else ""

    if "github.com" not in base_url:
        return github_url

    # The view type is whichever marker comes first in the path; a later
    # "/edit/" or "/blob/" is just a directory name inside the repository.
    edit_pos = base_url.find("/edit/")
    blob_pos = base_url.find("/blob/")

    if edit_pos != -1 and (blob_pos == -1 or edit_pos < blob_pos):
        # count=1 so only the view marker is rewritten, never a directory
        # that happens to be called "edit".
        return base_url.replace("/edit/", "/raw/", 1) + query

    if blob_pos != -1:
        raw_url = base_url.replace("github.com", "raw.githubusercontent.com", 1)
        return raw_url.replace("/blob/", "/", 1) + query

    return github_url

## Enhanced Link Extraction

Let's create an enhanced version of the `extract_links` function that detects GitHub links and other special link types.

In [None]:
def extract_links_enhanced(html: str) -> List[Dict]:
    """
    Extract all links from an HTML document with enhanced metadata.

    Anchors with an empty href, no visible text, or a ``javascript:`` href
    are skipped.

    Args:
        html: Raw HTML content

    Returns:
        List of dictionaries, one per kept link, with the keys ``href``,
        ``text``, ``is_github``, ``is_github_edit``, ``is_github_blob``,
        ``raw_url`` (None unless the link is a GitHub edit/blob link),
        ``attributes`` (every tag attribute except href) and ``is_edit_link``
    """
    # Loop-invariant, so built once instead of once per anchor.
    edit_phrases = ("edit this page", "edit on github", "contribute to this page")

    soup = BeautifulSoup(html, "html.parser")
    links = []

    for a_tag in soup.find_all("a", href=True):
        href = a_tag["href"]
        text = a_tag.get_text(strip=True)

        # Skip empty links and javascript links. URL schemes are
        # case-insensitive and may be preceded by whitespace.
        if not href or not text or href.strip().lower().startswith("javascript:"):
            continue

        lowered_text = text.lower()
        link_data = {
            "href": href,
            "text": text,
            "is_github": False,
            "is_github_edit": False,
            "is_github_blob": False,
            "raw_url": None,
            # href is already stored separately, so it is left out here
            "attributes": {
                attr_name: attr_value
                for attr_name, attr_value in a_tag.attrs.items()
                if attr_name != "href"
            },
            # True when the visible text reads like an "edit this page" prompt
            "is_edit_link": any(phrase in lowered_text for phrase in edit_phrases),
        }

        # Classify on the part before the query string only, so a GitHub URL
        # embedded in a query parameter (e.g. a share link) is not mistaken
        # for a link to GitHub itself.
        location = href.split("?", 1)[0]
        if "github.com" in location:
            link_data["is_github"] = True

            # The first marker in the path decides the view type; a later
            # "/edit/" or "/blob/" is only a directory name.
            edit_pos = location.find("/edit/")
            blob_pos = location.find("/blob/")
            if edit_pos != -1 and (blob_pos == -1 or edit_pos < blob_pos):
                link_data["is_github_edit"] = True
            elif blob_pos != -1:
                link_data["is_github_blob"] = True

            if link_data["is_github_edit"] or link_data["is_github_blob"]:
                link_data["raw_url"] = github_url_to_raw(href)

        links.append(link_data)

    return links

def find_github_source_link(html: str) -> Optional[Dict]:
    """
    Pick the link most likely to point at the page's source on GitHub.

    Candidates are tried in three tiers, and the first link (in document
    order) of the first tier that has any match is returned:

    1. GitHub edit/blob links whose text also reads like an edit prompt
    2. any GitHub edit/blob link
    3. any GitHub link at all

    Args:
        html: Raw HTML content

    Returns:
        The link dictionary produced by extract_links_enhanced, or None if
        the document has no GitHub link
    """
    candidates = extract_links_enhanced(html)

    def targets_file(entry: Dict) -> bool:
        return entry["is_github_edit"] or entry["is_github_blob"]

    # Ordered from most to least specific
    tiers = (
        lambda entry: targets_file(entry) and entry["is_edit_link"],
        targets_file,
        lambda entry: entry["is_github"],
    )

    for accepts in tiers:
        for entry in candidates:
            if accepts(entry):
                return entry

    return None

## Test with Example HTML

Let's test our enhanced link extraction with the example HTML.

In [None]:
# Test with example HTML.
# The fixture holds three anchors:
#   - a GitHub /blob/ link whose path contains parentheses and a "+" and
#     which carries a ?plain=1 query string
#   - a GitHub /edit/ link whose anchor also wraps an <svg> icon, so its
#     visible text is surrounded by whitespace
#   - a plain non-GitHub link
# Both GitHub anchors use the text "Edit this page on GitHub".
test_html = """
<div>
    <a target="_blank" rel="noopener, noreferrer" class="link" href="https://github.com/saadeghi/daisyui/blob/v5/packages/docs/src/routes/(routes)/docs/install/sveltekit/+page.md?plain=1">Edit this page on GitHub</a>
</div>
<p>Some other content</p>
<a class="svelte-72zfh9" href="https://github.com/sveltejs/kit/edit/main/documentation/docs/10-getting-started/10-introduction.md">
    <svg class="svelte-r1j42q" width="20" height="20"><use href="#edit"></use></svg>
    Edit this page on GitHub
</a>
<a href="https://example.com">Regular link</a>
"""

# Run the extractor over the sample markup and print a report for each link
all_links = extract_links_enhanced(test_html)

print(f"Found {len(all_links)} links:")
for i, link in enumerate(all_links, start=1):
    # Collect this link's lines first, then emit them in one go
    report = [
        f"\nLink {i}:",
        f"  Text: {link['text']}",
        f"  URL: {link['href']}",
    ]

    # GitHub-specific details are only shown for GitHub links
    if link["is_github"]:
        report.append("  Is GitHub: Yes")
        if link["is_github_edit"]:
            report.append("  Is GitHub Edit: Yes")
        if link["is_github_blob"]:
            report.append("  Is GitHub Blob: Yes")
        if link["raw_url"]:
            report.append(f"  Raw URL: {link['raw_url']}")

    if link["is_edit_link"]:
        report.append("  Is Edit Link: Yes")

    if link["attributes"]:
        report.append(f"  Attributes: {link['attributes']}")

    print("\n".join(report))

## Test Finding the Source Link

Let's test our function to find the most likely GitHub source link.

In [None]:
source_link = find_github_source_link(test_html)

# Report the selected link, or say so when nothing matched
if not source_link:
    print("No GitHub source link found.")
else:
    summary = [
        "Found GitHub source link:",
        f"  Text: {source_link['text']}",
        f"  URL: {source_link['href']}",
    ]
    # The raw URL is only present for edit/blob links
    if source_link["raw_url"]:
        summary.append(f"  Raw URL: {source_link['raw_url']}")
    print("\n".join(summary))

## Test with More Complex HTML

Let's create a more complex test case with mixed link types.

In [None]:
# A full documentation-style page mixing several link types:
#   - relative navigation links in the header
#   - a GitHub /blob/ link ("View source on GitHub") wrapping an <svg> icon
#   - a direct raw.githubusercontent.com link
#   - a javascript: link
#   - a GitHub /edit/ link ("Contribute to this page")
#   - a link to the repository root in the footer
complex_html = """
<html>
<head>
    <title>Documentation Page</title>
</head>
<body>
    <header>
        <nav>
            <a href="/">Home</a>
            <a href="/docs">Docs</a>
            <a href="/about">About</a>
        </nav>
    </header>
    <main>
        <h1>Getting Started</h1>
        <p>This is some documentation about getting started with our project.</p>
        <div class="edit-links">
            <a target="_blank" rel="noopener, noreferrer" class="github-link" href="https://github.com/example/repo/blob/main/docs/getting-started.md">
                <svg width="16" height="16"><path d="M..."></path></svg>
                View source on GitHub
            </a>
            <a href="https://raw.githubusercontent.com/example/repo/main/docs/getting-started.md">Raw version</a>
        </div>
        <div class="content">
            <p>More documentation content here...</p>
            <a href="javascript:void(0)" onclick="showMore()">Show more</a>
        </div>
        <div class="contribute">
            <a class="edit-link" href="https://github.com/example/repo/edit/main/docs/getting-started.md">
                Contribute to this page
            </a>
        </div>
    </main>
    <footer>
        <a href="https://github.com/example/repo">GitHub Repository</a>
    </footer>
</body>
</html>
"""

# Look for the best GitHub source link in the larger page and report it
source_link = find_github_source_link(complex_html)

if not source_link:
    print("No GitHub source link found.")
else:
    details = [
        "Found GitHub source link:",
        f"  Text: {source_link['text']}",
        f"  URL: {source_link['href']}",
    ]
    # Only edit/blob links carry a raw URL
    if source_link["raw_url"]:
        details.append(f"  Raw URL: {source_link['raw_url']}")
    print("\n".join(details))

## Integration Notes

To integrate this functionality into the llm-browser:

1. Add the `github_url_to_raw` function to `utils/url.py`
2. Enhance `extract_links` in `utils/html.py` to detect GitHub links, as `extract_links_enhanced` does in this notebook
3. Add a `find_github_source_link` function to `utils/html.py`
4. Modify the `browse_url` function in `server.py` to:
   - Detect if a GitHub source link exists
   - Offer the option to follow the raw version
   - Add a flag to the CLI to automatically prefer raw content when available

This approach would allow users to automatically get cleaner, raw content versions when viewing documentation sites that link to GitHub sources.