# GitHub URL Conversion Tests

This notebook tests functionality to convert GitHub URLs to raw content URLs.

In [None]:
import os
import re
import sys

# Make the llm_browser package importable by appending the parent directory to sys.path
sys.path.append(os.path.abspath(os.pardir))

In [None]:
# Import existing utilities
from llm_browser.utils.url import normalize_url, is_valid_url

# Define our new GitHub URL conversion function.
#
# Anchored pattern for a GitHub file-view URL: an optional scheme, the
# github.com host (optionally prefixed with "www."), "/<owner>/<repo>", a
# "blob" or "edit" route segment, then everything else (ref, file path and any
# query string or fragment) captured verbatim so it can be copied through.
_GITHUB_FILE_URL = re.compile(
    r"(?P<prefix>(?:https?:)?//)?"
    r"(?P<host>(?:www\.)?github\.com)"
    r"(?P<repo>/[^/?#]+/[^/?#]+)"
    r"/(?P<route>blob|edit)/"
    r"(?P<rest>.*)",
    re.DOTALL,
)


def github_url_to_raw(github_url: str) -> str:
    """
    Convert a GitHub file URL (edit or blob) to a raw-content URL.

    Only the route segment that directly follows ``/<owner>/<repo>`` on the
    github.com host is rewritten, so later path components that happen to be
    named ``blob`` or ``edit`` (and any ``github.com`` text appearing in the
    path or query of another site) are left untouched.

    * ``.../<owner>/<repo>/blob/<rest>`` becomes
      ``raw.githubusercontent.com/<owner>/<repo>/<rest>``.
    * ``.../<owner>/<repo>/edit/<rest>`` keeps its host and becomes
      ``.../<owner>/<repo>/raw/<rest>``.
      NOTE(review): presumably GitHub serves the raw file from this ``/raw/``
      route -- confirm before relying on it.

    The scheme (if any), query string and fragment are preserved as given.

    Args:
        github_url: A URL that may point to a file view on GitHub

    Returns:
        The raw-content URL, or ``github_url`` unchanged when it is not a
        GitHub blob/edit URL (including already-raw and non-GitHub URLs).
    """
    match = _GITHUB_FILE_URL.fullmatch(github_url)
    if match is None:
        return github_url

    # The scheme group is optional; scheme-less input stays scheme-less.
    prefix = match["prefix"] or ""
    repo = match["repo"]
    rest = match["rest"]

    if match["route"] == "edit":
        # Edit links stay on the original host; only the route segment changes.
        return f"{prefix}{match['host']}{repo}/raw/{rest}"

    # Blob links move to the raw content host and drop the route segment.
    return f"{prefix}raw.githubusercontent.com{repo}/{rest}"

## Test Cases

Let's test our URL conversion function with GitHub blob and edit URLs, plus an already-raw URL and a non-GitHub URL that should both pass through unchanged.

In [None]:
# Test case 1: blob URL that carries a query parameter
url1 = "https://github.com/saadeghi/daisyui/blob/v5/packages/docs/src/routes/(routes)/docs/install/sveltekit/+page.md?plain=1"
print(f"Original: {url1}")
# end="\n\n" emits the blank separator line after the converted URL
print(f"Converted: {github_url_to_raw(url1)}", end="\n\n")

In [None]:
# Test case 2: edit URL
url2 = "https://github.com/sveltejs/kit/edit/main/documentation/docs/10-getting-started/10-introduction.md"
print(f"Original: {url2}")
# end="\n\n" emits the blank separator line after the converted URL
print(f"Converted: {github_url_to_raw(url2)}", end="\n\n")

In [None]:
# Test case 3: URL that is already raw - expected to come back unchanged
url3 = "https://raw.githubusercontent.com/sveltejs/kit/main/documentation/docs/10-getting-started/10-introduction.md"
print(f"Original: {url3}")
# end="\n\n" emits the blank separator line after the converted URL
print(f"Converted: {github_url_to_raw(url3)}", end="\n\n")

In [None]:
# Test case 4: URL outside GitHub - expected to come back unchanged
url4 = "https://example.com/some/path"
print(f"Original: {url4}")
# end="\n\n" emits the blank separator line after the converted URL
print(f"Converted: {github_url_to_raw(url4)}", end="\n\n")

## Testing with BS4 for Link Extraction

Now let's test extracting GitHub links from HTML.

In [None]:
from bs4 import BeautifulSoup

def extract_github_links(html: str):
    """Collect GitHub edit/blob anchors found in an HTML document.

    An anchor qualifies when its ``href`` contains "github.com" together with
    either "/blob/" or "/edit/".

    Args:
        html: HTML markup to scan

    Returns:
        A list of dicts, one per qualifying anchor in document order, with
        keys "href" (the original link), "text" (the anchor text with
        surrounding whitespace stripped) and "raw_url" (the result of
        ``github_url_to_raw`` for that link).
    """
    def _is_github_file_link(href):
        # Same substring-based filter: GitHub host plus a blob or edit route.
        return "github.com" in href and any(
            marker in href for marker in ("/blob/", "/edit/")
        )

    anchors = BeautifulSoup(html, "html.parser").find_all("a", href=True)
    return [
        {
            "href": anchor["href"],
            "text": anchor.get_text(strip=True),
            "raw_url": github_url_to_raw(anchor["href"]),
        }
        for anchor in anchors
        if _is_github_file_link(anchor["href"])
    ]

In [None]:
# Sample markup: one blob link, one edit link, and an unrelated paragraph
test_html = """
<div>
    <a target="_blank" rel="noopener, noreferrer" class="link" href="https://github.com/saadeghi/daisyui/blob/v5/packages/docs/src/routes/(routes)/docs/install/sveltekit/+page.md?plain=1">Edit this page on GitHub</a>
</div>
<p>Some other content</p>
<a class="svelte-72zfh9" href="https://github.com/sveltejs/kit/edit/main/documentation/docs/10-getting-started/10-introduction.md">
    <svg class="svelte-r1j42q" width="20" height="20"><use href="#edit"></use></svg>
    Edit this page on GitHub
</a>
"""

github_links = extract_github_links(test_html)

# One report per extracted link, each followed by a blank separator line
for i, link in enumerate(github_links, start=1):
    print(
        f"Link {i}:",
        f"  Text: {link['text']}",
        f"  Original URL: {link['href']}",
        f"  Raw URL: {link['raw_url']}",
        sep="\n",
        end="\n\n",
    )

## Integration with llm-browser

Notes on how to integrate this functionality into the main application:

1. Add the `github_url_to_raw` function to `url.py`
2. Enhance `extract_links` in `html.py` to detect GitHub links
3. Update `browse_url` in `server.py` to optionally follow raw URLs
4. Add a CLI option to always prefer raw content

This would allow the browser to automatically fetch the raw content version when encountering GitHub links.