In [None]:
# Install dependencies (if not already installed)
# %pip install -e .

In [None]:
import sys
sys.path.insert(0, '..')

import json
import asyncio
from pathlib import Path

# Import scraper components
from src.scrapers import HackTricksScraper, OWASPScraper
from src.processors import ContentCleaner, FormatConverter, Deduplicator, QualityChecker
from src.processors.data_validator import DataValidator
from src.processors.data_augmenter import DataAugmenter
from src.utils.config import ScrapingConfig
from src.generators.qa_generator import QAGenerator

## 1. Configuration

In [None]:
# Build the scraping configuration that the scraper example below receives.
config = ScrapingConfig(
    delay_between_requests=2.0,  # echoed below with an "s" suffix, i.e. seconds
    concurrent_requests=3,
    timeout=30,  # NOTE(review): presumably seconds as well — confirm against ScrapingConfig
    respect_robots_txt=True,
)

# Echo the key values back so the reader can confirm what is in effect.
print("Config loaded:")  # plain literal: nothing to interpolate on this line
print(f"  Delay: {config.delay_between_requests}s")
print(f"  Concurrent requests: {config.concurrent_requests}")

## 2. Scraping Examples

In [None]:
# Example: Scrape HackTricks (limited for demo)
async def scrape_demo(limit: int = 5):
    """Collect the first few items yielded by the HackTricks scraper.

    Args:
        limit: Maximum number of items to collect. Defaults to 5 so the
            demo stays short; a value of zero or less collects nothing.

    Returns:
        A list holding the ``to_dict()`` form of each collected item, in the
        order the scraper yielded them, with at most ``limit`` entries.
    """
    items = []
    if limit <= 0:
        # Nothing requested, so do not even open the scraper.
        return items

    async with HackTricksScraper(config=config) as scraper:
        async for item in scraper.scrape():
            items.append(item.to_dict())

            # The list length is the running count; stop once the limit is hit.
            if len(items) >= limit:
                break

        return items

# Run the scraper (uncomment to execute)
# scraped_items = await scrape_demo()
# print(f"Scraped {len(scraped_items)} items")

## 3. Content Processing

In [None]:
# Run a small HTML document through ContentCleaner and report what came back.
sample_html = """
<html>
<body>
    <nav>Menu items here</nav>
    <main>
        <h1>SQL Injection Tutorial</h1>
        <p>SQL injection is a code injection technique.</p>
        <pre><code class="language-sql">SELECT * FROM users WHERE id = '1' OR '1'='1'</code></pre>
    </main>
    <footer>Footer content</footer>
</body>
</html>
"""

cleaner = ContentCleaner(preserve_code_blocks=True, preserve_links=True, normalize_whitespace=True)
result = cleaner.clean_html(sample_html)

# Summary of the cleaned result.
print(f"Title: {result.title}")
print(f"Text length: {len(result.text)}")
print(f"Code blocks: {len(result.code_blocks)}")

# Show details of the first code block only, and only when one exists.
if result.code_blocks:
    first_block = result.code_blocks[0]
    print(f"  Language: {first_block['language']}")
    print(f"  Code: {first_block['code'][:100]}...")

## 4. Q&A Generation

In [None]:
# Feed a short reference text to QAGenerator and preview the resulting pairs.
sample_content = """
SQL Injection is a type of injection attack that makes it possible to execute malicious SQL statements. 
These statements control a database server behind a web application.

Common techniques include:
1. Union-based SQL injection - uses UNION SQL operator
2. Error-based SQL injection - forces the database to generate an error
3. Blind SQL injection - asks the database true/false questions

Example payload:
' OR '1'='1' --

Prevention:
- Use parameterized queries
- Input validation
- Least privilege principle
"""

qa_generator = QAGenerator()
qa_pairs = qa_generator.generate_from_content(
    content=sample_content, title="SQL Injection", category="web_security"
)

print(f"Generated {len(qa_pairs)} Q&A pairs:")

# Preview at most the first three pairs, truncating each field to 100 characters.
for number, pair in enumerate(qa_pairs[:3], start=1):
    print(f"\n--- Q&A #{number} ---")
    print(f"Q: {pair['instruction'][:100]}...")
    print(f"A: {pair['output'][:100]}...")

## 5. Data Validation

In [None]:
# Validate a handful of samples against minimum-length rules.
validator = DataValidator(
    min_instruction_length=20,
    min_output_length=50,
)

# One complete sample followed by two deliberately deficient ones, so the
# report below has something to flag.
samples = [
    {
        "instruction": "What is SQL injection and how can it be prevented?",
        "output": "SQL injection is a code injection technique that exploits security vulnerabilities. Prevention includes using parameterized queries and input validation.",
        "category": "web_security"
    },
    {
        "instruction": "Short",  # Too short
        "output": "OK",  # Too short
    },
    {
        "instruction": "Explain XSS attacks",
        # Missing output
    },
]

report = validator.validate_dataset(samples)

# Print the aggregate figures exposed by the returned report object.
print("Validation Report:")  # plain literal: nothing to interpolate on this line
print(f"  Valid: {report.valid_samples}/{report.total_samples}")
print(f"  Errors: {report.total_errors}")
print(f"  Warnings: {report.total_warnings}")
print(f"  Avg Quality Score: {report.avg_quality_score:.2f}")

## 6. Data Augmentation

In [None]:
# Pass one sample through DataAugmenter and list the instructions it returns.
original_sample = {
    "instruction": "Explain how to use nmap to scan for open ports",
    "output": "Nmap is a powerful network scanning tool. To scan for open ports, use: nmap -sS target_ip",
    "category": "network_security",
    "difficulty": "beginner"
}

augmenter = DataAugmenter()
augmented = augmenter.augment_sample(original_sample)

print(f"Original: {original_sample['instruction']}")
print(f"\nAugmented versions ({len(augmented)}):")

# Each instruction is truncated to 80 characters for display.
for variant in augmented:
    print(f"  - {variant['instruction'][:80]}...")

## 7. Format Conversion

In [None]:
# Build one record in Alpaca format (the default) and pretty-print it as JSON.
converter = FormatConverter()

alpaca_sample = converter.to_alpaca(
    instruction="How to detect SQL injection vulnerabilities?",
    output="Use tools like sqlmap, manual testing with payloads, and code review.",
    input_text="Given a web application with user input forms",
    category="web_security",
    difficulty="intermediate",
)

print("Alpaca Format:")
# Dump the model first, then serialise the result with 2-space indentation.
alpaca_payload = alpaca_sample.model_dump()
print(json.dumps(alpaca_payload, indent=2))

In [None]:
# Build one conversation in ShareGPT format and pretty-print it as JSON.
sharegpt_sample = converter.to_sharegpt(
    user_message="How do I perform a penetration test on a web application?",
    assistant_message="A web application penetration test involves several phases: reconnaissance, scanning, exploitation, and reporting...",
    system_message="You are a cybersecurity expert specializing in penetration testing.",
)

print("ShareGPT Format:")
# Dump using field aliases, then serialise with 2-space indentation.
sharegpt_payload = sharegpt_sample.model_dump(by_alias=True)
print(json.dumps(sharegpt_payload, indent=2))

## 8. Export to Training Formats

In [None]:
from src.processors.dataset_exporter import DatasetExporter

# Two hand-written records that make up the example dataset.
buffer_overflow_record = {
    "instruction": "What is a buffer overflow attack?",
    "input": "",
    "output": "A buffer overflow occurs when a program writes more data to a buffer than it can hold...",
    "category": "vulnerability",
    "difficulty": "intermediate",
}
xss_record = {
    "instruction": "Explain cross-site scripting (XSS)",
    "input": "",
    "output": "XSS is a client-side code injection attack where malicious scripts are injected into web pages...",
    "category": "web_security",
    "difficulty": "beginner",
}
dataset = [buffer_overflow_record, xss_record]

# Exporter configured with an output directory one level above the notebook.
exporter = DatasetExporter(output_dir="../data/exports")

# Export to Axolotl format
# files = exporter.export(dataset, format_name="axolotl", filename="security_demo")
# print(f"Exported files: {files}")

## 9. CLI Usage Examples

```bash
# Scrape all sources
security-scraper scrape --all --limit 100

# Scrape specific sources
security-scraper scrape -s hacktricks -s owasp

# Process scraped data
security-scraper process -i data/raw -o data/processed

# Generate dataset
security-scraper generate -i data/processed -f alpaca --split

# Quality check
security-scraper quality -i data/dataset/train.json

# Augment dataset
security-scraper augment -i data/dataset/train.json -m 2.0

# Export for fine-tuning
security-scraper export -i data/dataset -f axolotl

# Full pipeline
security-scraper run --all --limit 100 -f alpaca
```

## 10. Advanced: Custom Scraper

In [None]:
from src.scrapers.base_scraper import BaseScraper, ScrapedItem
from typing import AsyncIterator

class CustomSecurityScraper(BaseScraper):
    """Minimal template showing how to build a scraper on top of ``BaseScraper``."""

    SOURCE_NAME = "custom_source"
    BASE_URL = "https://example-security-site.com"

    async def scrape(self) -> AsyncIterator[ScrapedItem]:
        """Yield a ``ScrapedItem`` for every URL whose fetch returned content.

        URLs come from ``get_urls_to_scrape``; a URL whose ``fetch_page``
        result is falsy is skipped without yielding anything.
        """
        for page_url in await self.get_urls_to_scrape():
            page_html = await self.fetch_page(page_url)
            if page_html:
                # Clean the raw HTML, then package the extracted parts.
                cleaned = self.content_cleaner.clean_html(page_html)
                yield ScrapedItem(
                    url=page_url,
                    title=cleaned.title,
                    content=cleaned.text,
                    code_blocks=cleaned.code_blocks,
                    headers=cleaned.headers,
                    metadata={'source': self.SOURCE_NAME},
                )

    async def get_urls_to_scrape(self):
        """Return the URLs to visit; this template returns an empty list."""
        return []

print("Custom scraper class defined successfully!")