In [22]:
from url_to_llm_text.get_llm_ready_text import url_to_llm_text
import asyncio
import os

In [23]:
async def scrape_and_save(url: str, filename: str, output_dir: str = "scraped-webpages"):
    """
    Scrape the content from the given URL and save it as a ``.txt`` file.

    Args:
        url (str): The URL to scrape.
        filename (str): Base name of the output file, without extension.
            The text is written to ``<output_dir>/<filename>.txt``.
        output_dir (str): Directory that receives the file. It is created
            if it does not exist. Defaults to ``"scraped-webpages"``.

    Returns:
        str: The text returned by ``url_to_llm_text`` for ``url``.
    """
    llm_text = await url_to_llm_text(url)

    # Create directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    filepath = os.path.join(output_dir, f"{filename}.txt")

    # Write as UTF-8 explicitly: scraped pages routinely contain non-ASCII
    # characters, and the platform default encoding may not be able to
    # represent them.
    with open(filepath, 'w', encoding='utf-8') as file:
        file.write(llm_text)
    return llm_text

In [24]:
# Scrape the Transformers Trainer reference page and keep the returned text in `result`.
result = await scrape_and_save(
    url="https://huggingface.co/docs/transformers/v4.53.1/en/main_classes/trainer",
    filename="huggingface_transformers_trainer_doc"
)

  for attr, value in list(new_attrs.items()):


In [25]:
# Scrape the TRL SFTTrainer page; `result` is overwritten with this page's text.
# NOTE(review): the filename says "transformers" although the URL points at the TRL docs.
result = await scrape_and_save(
    url="https://huggingface.co/docs/trl/v0.19.1/en/sft_trainer",
    filename="huggingface_transformers_sfttrainer_doc"
)

In [30]:
# Scrape the Transformers tokenizer reference page; `result` is overwritten again.
result = await scrape_and_save(
    url="https://huggingface.co/docs/transformers/v4.53.1/en/main_classes/tokenizer",
    filename="huggingface_transformers_tokenizer_doc"
)

In [26]:
import re
import os
from pathlib import Path
from url_to_llm_text.get_llm_ready_text import url_to_llm_text
from urllib.parse import urlparse
import asyncio

class HuggingFaceDocFormatter:
    """Integrated scraper and formatter for Hugging Face documentation.

    The formatter holds a table of regex substitutions grouped by category
    (``self.patterns``) and applies them, in a fixed order, to the raw text of
    a scraped documentation page to produce Markdown-like output. It also
    derives an output filename from the page content and URL.
    """

    def __init__(self):
        # Build the pattern table once per instance.
        self.patterns = self._initialize_patterns()

    def _initialize_patterns(self):
        """Initialize regex patterns for formatting.

        Returns:
            dict: Maps a category name to a list of tuples. Each tuple is
            ``(pattern, replacement)`` or ``(pattern, replacement, flags)``.
        """
        return {
            'cleanup': [
                (r'Hugging Face: https://huggingface\.co/.*?Sign Up: https://huggingface\.co/join', ''),
                (r'🏡 View all docs.*?timm\s*Search documentation', ''),
                (r'mainv\d+\.\d+\.\d+.*?EN\s*', ''),
                (r'\d+,?\d*: https://github\.com/huggingface/\w+', ''),
                (r'Join the Hugging Face community.*?to get started', ''),
                (r'\n{3,}', '\n\n'),
                # MULTILINE so that `$` matches at the end of every line;
                # without it only the final line's trailing whitespace is
                # stripped.
                (r'[ \t]+$', '', re.MULTILINE),
            ],
            'headers': [
                (r'(\w+) documentation\s+([\w\s-]+?)(?:\s+\1|\s*$)', r'# \1 Documentation - \2'),
                (r': https://huggingface\.co/docs/[\w/.-]+#[\w-]+ ([A-Z][A-Za-z\s&-]+)(?=\n|$)', r'## \1'),
                (r': https://huggingface\.co/docs/[\w/.-]+ ([A-Z][A-Za-z\s&-]+)(?=\n|$)', r'### \1'),
            ],
            'code_blocks': [
                (r'Copied\s*(```\w*\n.*?```)', r'\1'),
                (r'Copied\s*([^`\n]+(?:\n[^`\n]+)*?)(?=\n\n|\n[A-Z]|\Z)', r'```python\n\1\n```'),
            ],
            'classes': [
                (r'class (trl|transformers)\. (\w+)', r'### `\1.\2`'),
                (r'< source >: https://github\.com/huggingface/[\w/.-]+', ''),
            ],
            'lists': [
                (r'^\s*\*\s+', '- ', re.MULTILINE),
                (r'Important attributes:\s*\*\s*', '\n**Important attributes:**\n\n- '),
                (r'\*\s*(\w+)\s*—\s*', r'- **\1** — '),
            ],
            'links': [
                (r': https://huggingface\.co/docs/[\w/.-]+', ''),
                (r'https://img\.shields\.io/badge/[^\s]+', ''),
            ]
        }

    def _apply_patterns(self, content, pattern_category):
        """Apply a category of regex patterns.

        Args:
            content (str): Text to transform.
            pattern_category (str): Key into ``self.patterns``. An unknown
                key applies no substitutions.

        Returns:
            str: ``content`` after every substitution of the category has
            been applied in list order.
        """
        patterns = self.patterns.get(pattern_category, [])
        for pattern_info in patterns:
            if len(pattern_info) == 2:
                pattern, replacement = pattern_info
                flags = 0
            else:
                pattern, replacement, flags = pattern_info
            content = re.sub(pattern, replacement, content, flags=flags)
        return content

    def _extract_title_from_content(self, content):
        """Extract a suitable title from the documentation content.

        The candidate patterns are tried in order and the first match wins.
        A two-group match is returned as ``"<group1>_<group2>"``, a one-group
        match as the group itself.

        Returns:
            str: The extracted title, or ``"huggingface_doc"`` when no
            pattern matches.
        """
        # Try to find the main documentation title
        title_patterns = [
            r'(\w+) documentation\s+([\w\s-]+)',
            r'# ([\w\s-]+)',
            r'^([A-Z][A-Za-z\s&-]+)(?=\n|$)',
        ]

        for pattern in title_patterns:
            match = re.search(pattern, content, re.MULTILINE)
            if match:
                if len(match.groups()) == 2:
                    return f"{match.group(1)}_{match.group(2)}"
                else:
                    return match.group(1)

        return "huggingface_doc"

    def _url_to_filename(self, url, content):
        """Generate a filename (without extension) from URL and content.

        Args:
            url (str): Page URL; its last two path components (ignoring
                ``docs`` and ``en``) are appended as context.
            content (str): Raw page text used to extract a title.

        Returns:
            str: Lower-case, underscore-separated filename stem.
        """
        # Extract title from content
        title = self._extract_title_from_content(content)

        # Clean up title for filename
        filename = re.sub(r'[^\w\s-]', '', title.lower())
        filename = re.sub(r'[-\s]+', '_', filename)
        filename = filename.strip('_')

        # Add URL-based context if needed
        parsed = urlparse(url)
        path_parts = [p for p in parsed.path.split('/') if p and p not in ['docs', 'en']]

        if path_parts:
            url_context = '_'.join(path_parts[-2:])  # Last 2 parts
            url_context = re.sub(r'[^\w]', '_', url_context)
            # Skip the suffix when the title already contains it.
            if url_context not in filename:
                filename = f"{filename}_{url_context}"

        return filename

    def format_documentation(self, content):
        """Format the raw documentation content.

        Args:
            content (str): Raw scraped page text.

        Returns:
            str: The formatted text with leading/trailing whitespace removed.
        """
        # Apply all formatting patterns
        for category in ['cleanup', 'headers', 'code_blocks', 'classes', 'lists', 'links']:
            content = self._apply_patterns(content, category)

        # Format parameter lists
        content = re.sub(r'\( ([^)]+) \)', self._format_parameters, content)

        # Wrap class definitions in code blocks
        content = re.sub(
            r'(### `\w+\.\w+`.*?)(?=###|\Z)',
            self._format_class_block,
            content,
            flags=re.DOTALL
        )

        # Add proper spacing around headers
        content = re.sub(r'(#{1,6}[^\n]+)', r'\n\1\n', content)
        content = re.sub(r'\n{3,}', '\n\n', content)

        return content.strip()

    def _format_parameters(self, match):
        """Format parameter lists.

        Used as a ``re.sub`` callback. Parameter strings longer than 100
        characters with more than three comma-separated entries are put one
        per line; everything else is returned on a single line.
        """
        param_string = match.group(1)
        if len(param_string) > 100:
            params = [p.strip() for p in param_string.split(',') if p.strip()]
            if len(params) > 3:
                formatted_params = '\n'.join([f"    {param}" for param in params])
                return f"(\n{formatted_params}\n)"
        return f"({param_string})"

    def _format_class_block(self, match):
        """Format class definition blocks.

        Used as a ``re.sub`` callback. Finds the first line that starts with
        ``(`` and contains ``)``, extends it until the parentheses balance,
        and wraps those lines in a ``python`` code fence. The block is
        returned unchanged when no such line exists.
        """
        block = match.group(1)
        lines = block.split('\n')

        # Find the class signature
        for i, line in enumerate(lines):
            if line.strip().startswith('(') and ')' in line:
                # This looks like a class signature, wrap it in code block
                code_start = i
                code_end = i

                # Find the end of the signature
                paren_count = 0
                for j in range(i, len(lines)):
                    paren_count += lines[j].count('(') - lines[j].count(')')
                    if paren_count <= 0:
                        code_end = j
                        break

                # Wrap the signature in code block
                new_lines = (
                    lines[:code_start] +
                    ['```python'] +
                    lines[code_start:code_end + 1] +
                    ['```'] +
                    lines[code_end + 1:]
                )
                return '\n'.join(new_lines)

        return block

async def scrape_and_format_docs(urls, output_dir="scraped-webpages"):
    """
    Download, format and store a batch of Hugging Face documentation pages.

    Each URL is processed independently: a failure is reported on stdout and
    recorded as ``None`` without stopping the remaining URLs.

    Args:
        urls (list): URLs to scrape.
        output_dir (str): Folder that receives the generated ``.md`` files;
            created when missing.

    Returns:
        dict: URL -> path of the written file as a string, or ``None`` if
        that URL could not be processed.
    """
    os.makedirs(output_dir, exist_ok=True)

    doc_formatter = HuggingFaceDocFormatter()
    saved_paths = {}

    for page_url in urls:
        print(f"Processing: {page_url}")

        try:
            page_text = await url_to_llm_text(page_url)

            # The filename is derived from the raw text, before formatting.
            stem = doc_formatter._url_to_filename(page_url, page_text)
            markdown = doc_formatter.format_documentation(page_text)

            target = Path(output_dir) / f"{stem}.md"
            target.write_text(markdown, encoding='utf-8')

            saved_paths[page_url] = str(target)
            print(f"  ✅ Saved to: {target.name}")
        except Exception as err:
            print(f"  ❌ Error processing {page_url}: {err}")
            saved_paths[page_url] = None

    return saved_paths

# Convenience function for single URL
async def scrape_and_format_single(url, output_dir="scraped-webpages"):
    """Run the batch scraper on one URL and return that URL's result entry."""
    outcome = await scrape_and_format_docs([url], output_dir)
    saved_path = outcome[url]
    return saved_path

In [28]:
import re
import os
from pathlib import Path
from url_to_llm_text.get_llm_ready_text import url_to_llm_text
from urllib.parse import urlparse
import asyncio

class HuggingFaceDocFormatter:
    """Improved formatter for Hugging Face documentation.

    Holds a table of regex substitutions grouped by category
    (``self.patterns``) plus a few dedicated passes (class signatures,
    parameter descriptions, method sections) and applies them in a fixed
    order in ``format_documentation``. Also derives an output filename from
    the page content and URL.
    """

    def __init__(self):
        # Build the pattern table once per instance.
        self.patterns = self._initialize_patterns()

    def _initialize_patterns(self):
        """Initialize regex patterns for formatting.

        Returns:
            dict: Maps a category name to a list of tuples. Each tuple is
            ``(pattern, replacement)`` or ``(pattern, replacement, flags)``;
            ``replacement`` may be a string or a callable.
        """
        return {
            'cleanup': [
                # Remove navigation and header clutter
                (r'Hugging Face: https://huggingface\.co/.*?Sign Up: https://huggingface\.co/join', ''),
                (r'🏡 View all docs.*?timm\s*Search documentation', ''),
                (r'mainv\d+\.\d+\.\d+.*?doc-builder-html[A-Z]+\s*', ''),
                (r'\d+,?\d*: https://github\.com/huggingface/\w+', ''),
                (r'Join the Hugging Face community.*?to get started', ''),
                (r'You are viewing v\d+\.\d+\.\d+ version\..*?is available\.', ''),
                # Clean up navigation breadcrumbs
                (r'←[^→]*→', ''),
                # Remove excessive whitespace
                (r'\n{3,}', '\n\n'),
                # MULTILINE so that `$` matches at the end of every line;
                # without it only the final line's trailing whitespace is
                # stripped.
                (r'[ \t]+$', '', re.MULTILINE),
            ],
            'main_headers': [
                # Main documentation title
                (r'(\w+) documentation\s+([\w\s-]+?)(?:\s+\1|\s*$)', r'# \1 Documentation - \2'),
                # Major section headers (only if they start a new paragraph)
                (r'\n\s*([A-Z][A-Za-z\s&-]+)\n\s*\n', r'\n## \1\n\n'),
            ],
            'class_definitions': [
                # Class definitions with proper formatting
                (r'class (trl|transformers)\. (\w+)', r'## `\1.\2`'),
                # Remove source links
                (r'< source >: https://github\.com/huggingface/[\w/.-]+', ''),
            ],
            'code_formatting': [
                # Handle "Copied" code blocks
                (r'Copied\s*```(\w*)\n(.*?)```', r'```\1\n\2```'),
                # Handle simple code examples with "Copied"
                (r'Copied\s*([^`\n]+(?:\n[^`\n]*)*?)(?=\n\n|\n[A-Z]|\Z)', self._format_simple_code),
            ],
            'links_cleanup': [
                # Remove link references that aren't useful
                (r': https://huggingface\.co/docs/[\w/.-]+#[\w-]+', ''),
                (r': https://huggingface\.co/docs/[\w/.-]+', ''),
                (r'https://img\.shields\.io/badge/[^\s]+', ''),
            ],
            'lists': [
                # Fix bullet points
                (r'^\s*\*\s+', '- ', re.MULTILINE),
                # Format important attributes
                (r'Important attributes:\s*\n\s*-\s*', '\n**Important attributes:**\n\n- '),
            ],
        }

    def _format_simple_code(self, match):
        """Format simple code blocks that don't have triple backticks.

        Used as a ``re.sub`` callback. The captured text is fenced as Python
        when it contains any of a few code-like tokens; otherwise the text
        is returned stripped and unfenced.
        """
        code = match.group(1).strip()

        # Check if it looks like Python code
        if any(keyword in code for keyword in ['from ', 'import ', 'def ', 'class ', '=', '(', ')']):
            return f'```python\n{code}\n```'
        else:
            return code

    def _apply_patterns(self, content, pattern_category):
        """Apply a category of regex patterns.

        Args:
            content (str): Text to transform.
            pattern_category (str): Key into ``self.patterns``. An unknown
                key applies no substitutions.

        Returns:
            str: ``content`` after every substitution of the category has
            been applied in list order.
        """
        patterns = self.patterns.get(pattern_category, [])
        for pattern_info in patterns:
            if len(pattern_info) == 2:
                pattern, replacement = pattern_info
                flags = 0
            elif len(pattern_info) == 3:
                pattern, replacement, flags = pattern_info
            else:
                # Malformed entry: ignore it rather than fail the whole pass.
                continue

            # re.sub accepts both string and callable replacements, so no
            # separate branch is needed for callables.
            content = re.sub(pattern, replacement, content, flags=flags)
        return content

    def _extract_title_from_content(self, content):
        """Extract a suitable title from the documentation content.

        The candidate patterns are tried in order and the first match wins.
        A two-group match is returned as ``"<group1>_<group2>"``, a one-group
        match as the group itself.

        Returns:
            str: The extracted title, or ``"huggingface_doc"`` when no
            pattern matches.
        """
        title_patterns = [
            r'(\w+) documentation\s+([\w\s-]+)',
            r'# ([\w\s-]+)',
            r'^([A-Z][A-Za-z\s&-]+)(?=\n|$)',
        ]

        for pattern in title_patterns:
            match = re.search(pattern, content, re.MULTILINE)
            if match:
                if len(match.groups()) == 2:
                    return f"{match.group(1)}_{match.group(2)}"
                else:
                    return match.group(1)

        return "huggingface_doc"

    def _url_to_filename(self, url, content):
        """Generate a filename (without extension) from URL and content.

        Args:
            url (str): Page URL; its last two path components (ignoring
                ``docs`` and ``en``) are appended as context.
            content (str): Raw page text used to extract a title.

        Returns:
            str: Lower-case, underscore-separated filename stem.
        """
        title = self._extract_title_from_content(content)
        filename = re.sub(r'[^\w\s-]', '', title.lower())
        filename = re.sub(r'[-\s]+', '_', filename)
        filename = filename.strip('_')

        # Add URL context
        parsed = urlparse(url)
        path_parts = [p for p in parsed.path.split('/') if p and p not in ['docs', 'en']]

        if path_parts:
            url_context = '_'.join(path_parts[-2:])
            url_context = re.sub(r'[^\w]', '_', url_context)
            # Skip the suffix when the title already contains it.
            if url_context not in filename:
                filename = f"{filename}_{url_context}"

        return filename

    def _format_class_signature(self, content):
        """Format class signatures with proper parameter formatting.

        For every ``## `pkg.Class` `` header followed by a parenthesised
        parameter list, the list is replaced by a fenced ``python`` block
        that shows ``pkg.Class(...)``. Lists with more than three parameters
        or more than 100 characters are written one parameter per line.
        """
        # Find class signatures with parameters
        class_pattern = r'(## `\w+\.\w+`.*?)\(\s*([^)]+)\s*\)'

        def format_signature(match):
            class_header = match.group(1)
            params = match.group(2)

            # Use the real dotted name from the header in the code block
            # instead of a fixed placeholder. The header always starts with
            # the name because class_pattern requires it; the fallback only
            # guards against a future pattern change.
            name_match = re.match(r'## `(\w+\.\w+)`', class_header)
            qualified_name = name_match.group(1) if name_match else 'class_name'

            # Clean up parameters: collapse whitespace runs and drop the
            # padding that sits just inside the parentheses.
            params = re.sub(r'\s+', ' ', params).strip()
            param_list = [p.strip() for p in params.split(',') if p.strip()]

            if len(param_list) > 3 or len(params) > 100:
                # Format as multi-line
                formatted_params = '\n'.join([f'    {param},' for param in param_list])
                signature = f'{class_header}\n\n```python\n{qualified_name}(\n{formatted_params}\n)\n```'
            else:
                # Keep as single line
                signature = f'{class_header}\n\n```python\n{qualified_name}({params})\n```'

            return signature

        return re.sub(class_pattern, format_signature, content, flags=re.DOTALL)

    def _clean_parameter_descriptions(self, content):
        """Clean up parameter description formatting.

        Rewrites ``name (type) — description`` entries as Markdown bullets of
        the form ``- **name** (`type`) — description``.
        """
        # Fix parameter list formatting
        content = re.sub(
            r'(\w+)\s+\(([^)]+)\)\s*—\s*(.+?)(?=\n\s*\w+\s+\(|\n\n|\Z)',
            r'- **\1** (`\2`) — \3',
            content,
            flags=re.DOTALL
        )

        return content

    def _format_method_sections(self, content):
        """Format method sections without making them all headers.

        A line holding a single lower-case/underscore identifier, followed by
        a blank line and a capitalised description longer than 20 characters,
        becomes a ``###`` subsection. Other matches are left untouched.
        """
        # Only make actual method definitions into subsections, not every mention
        method_pattern = r'\n\s*(\w+)\n\s*\n\s*([A-Z].*?)\n'

        def replace_method(match):
            method_name = match.group(1)
            description = match.group(2)

            # Only format as subsection if it looks like a method definition
            if re.match(r'^[a-z_]+$', method_name) and len(description) > 20:
                return f'\n### {method_name}\n\n{description}\n'
            else:
                return match.group(0)

        return re.sub(method_pattern, replace_method, content)

    def format_documentation(self, content):
        """Main formatting method.

        Args:
            content (str): Raw scraped page text.

        Returns:
            str: The formatted text with leading/trailing whitespace removed.
        """
        # Pattern-table passes; the order matters because later categories
        # operate on the output of earlier ones.
        for category in (
            'cleanup',
            'main_headers',
            'class_definitions',
            'code_formatting',
            'links_cleanup',
            'lists',
        ):
            content = self._apply_patterns(content, category)

        # Format class signatures
        content = self._format_class_signature(content)

        # Clean parameter descriptions
        content = self._clean_parameter_descriptions(content)

        # Format method sections more carefully
        content = self._format_method_sections(content)

        # Final cleanup
        content = re.sub(r'\n{3,}', '\n\n', content)
        # Fix double headers such as "## ## Title". Whitespace between the
        # two hash runs is required so that an ordinary "###" heading is not
        # itself treated as a double header and flattened to "##".
        content = re.sub(r'#+[ \t]+#+', '##', content)

        # Add proper spacing around headers
        content = re.sub(r'(#{1,3}[^\n]+)', r'\n\1\n', content)
        content = re.sub(r'\n{3,}', '\n\n', content)

        return content.strip()

# Main functions remain the same
async def scrape_and_format_docs(urls, output_dir="scraped-webpages"):
    """Scrape and format multiple Hugging Face documentation URLs.

    Every URL is handled on its own; an exception for one URL is printed and
    stored as ``None`` so the loop carries on with the next one.

    Args:
        urls (list): URLs to scrape.
        output_dir (str): Destination folder for the ``.md`` files; created
            when missing.

    Returns:
        dict: URL -> written file path as a string, or ``None`` on failure.
    """
    os.makedirs(output_dir, exist_ok=True)
    doc_formatter = HuggingFaceDocFormatter()
    saved_paths = {}

    for page_url in urls:
        print(f"Processing: {page_url}")

        try:
            page_text = await url_to_llm_text(page_url)

            # Name the file from the raw text, then format that same text.
            stem = doc_formatter._url_to_filename(page_url, page_text)
            markdown = doc_formatter.format_documentation(page_text)

            target = Path(output_dir) / f"{stem}.md"
            target.write_text(markdown, encoding='utf-8')

            saved_paths[page_url] = str(target)
            print(f"  ✅ Saved to: {target.name}")
        except Exception as err:
            print(f"  ❌ Error processing {page_url}: {err}")
            saved_paths[page_url] = None

    return saved_paths

async def scrape_and_format_single(url, output_dir="scraped-webpages"):
    """Process one URL through the batch scraper and return its result entry."""
    outcome = await scrape_and_format_docs([url], output_dir)
    saved_path = outcome[url]
    return saved_path

In [29]:
# Example usage - multiple URLs
# The same three pages that the earlier cells scraped one at a time.
urls = [
    "https://huggingface.co/docs/transformers/v4.53.1/en/main_classes/trainer",
    "https://huggingface.co/docs/trl/v0.19.1/en/sft_trainer",
    "https://huggingface.co/docs/transformers/v4.53.1/en/main_classes/tokenizer"
]

# Top-level await: this line relies on the notebook's running event loop.
results = await scrape_and_format_docs(urls)

# Check results
# A falsy value (None) marks a URL whose processing raised an exception.
for url, filepath in results.items():
    if filepath:
        print(f"✅ {url} → {filepath}")
    else:
        print(f"❌ Failed: {url}")

Processing: https://huggingface.co/docs/transformers/v4.53.1/en/main_classes/trainer
  ✅ Saved to: transformers_trainer_transformers_main_classes_trainer.md
Processing: https://huggingface.co/docs/trl/v0.19.1/en/sft_trainer
  ✅ Saved to: trl_supervised_fine_tuning_trainer_trl_v0_19_1_sft_trainer.md
Processing: https://huggingface.co/docs/transformers/v4.53.1/en/main_classes/tokenizer
  ✅ Saved to: transformers_tokenizer_transformers_main_classes_tokenizer.md
✅ https://huggingface.co/docs/transformers/v4.53.1/en/main_classes/trainer → scraped-webpages/transformers_trainer_transformers_main_classes_trainer.md
✅ https://huggingface.co/docs/trl/v0.19.1/en/sft_trainer → scraped-webpages/trl_supervised_fine_tuning_trainer_trl_v0_19_1_sft_trainer.md
✅ https://huggingface.co/docs/transformers/v4.53.1/en/main_classes/tokenizer → scraped-webpages/transformers_tokenizer_transformers_main_classes_tokenizer.md
