# Versnellingsplan Kennisbank URL Extractor Test

This notebook tests URL extraction from the Versnellingsplan Kennisbank website.

In [None]:
import asyncio
import logging
import nest_asyncio
import os
import sys
from IPython.display import Markdown

# Apply nest_asyncio to allow nested event loops (required for Jupyter)
nest_asyncio.apply()

## Setup Environment and Paths

In [None]:
# Make the directory one level above the current working directory importable,
# so project modules can be imported from this notebook.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if project_root not in sys.path:
    sys.path.append(project_root)

# Route INFO-and-above records through the root handler with a timestamped
# layout, then create the logger used by the cells below.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger('scraper-test')

## Import Crawl4AI (Install if Missing)

In [ ]:
# Import crawl4ai, installing it on the fly if it is not available yet.
try:
    import crawl4ai
    logger.info("Crawl4AI imported successfully.")
except ImportError as e:
    logger.error(f"Failed to import Crawl4AI: {e}")
    logger.info("Installing Crawl4AI...")

    # Local imports: only needed on this fallback path.
    import importlib
    import subprocess

    # Install with the interpreter that is running this kernel. A bare
    # `!pip install` uses whichever `pip` is first on PATH, which may belong
    # to a different environment, leaving the import below still failing.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "crawl4ai"])

    # Make the import system notice the freshly installed package.
    importlib.invalidate_caches()
    import crawl4ai
    # NOTE(review): crawl4ai may need an extra post-install step to fetch
    # browser binaries — confirm against its installation docs.

## URL Extraction Function

In [ ]:
async def extract_kennisbank_urls(url="https://www.versnellingsplan.nl/kennisbank/"):
    """Extract article links from the Versnellingsplan Kennisbank page.

    Args:
        url (str): The URL of the Kennisbank page. It is also the base
            against which relative hrefs are resolved.

    Returns:
        list: List of dictionaries with 'title' and 'url' keys, where 'url'
            is absolute. An empty list is returned if the crawl fails; the
            error is logged rather than raised.
    """
    # Function-scope import so this cell does not depend on another cell's
    # import list.
    from urllib.parse import urljoin

    link_selector = 'a.elementor-post__thumbnail__link'

    try:
        logger.info(f"Extracting URLs from {url}")

        # Create AsyncWebCrawler instance
        async with crawl4ai.AsyncWebCrawler() as crawler:
            page = await crawler.arun(url=url)

            # NOTE(review): `wait_for` and `soup` are assumed to exist on the
            # object returned by `arun` — confirm against the installed
            # crawl4ai version. If they do not, the AttributeError is caught
            # below and this function returns [].
            # Wait for the content links to appear (timeout=10000).
            await page.wait_for(link_selector, timeout=10000)

            # Extract the links
            links = page.soup.select(link_selector)

            results = []
            for link in links:
                # Per-link guard: one malformed anchor must not discard the
                # links that were already collected.
                try:
                    href = link.get('href')
                    if not href:
                        continue

                    # Resolve against the page URL. urljoin leaves absolute
                    # URLs untouched and makes relative ones absolute. The
                    # loop uses its own names so the `url` parameter stays
                    # available as the base.
                    link_url = urljoin(url, href)

                    # Prefer an embedded title element, then the anchor's
                    # title attribute, then the URL itself.
                    title_elem = link.select_one('.elementor-post__title')
                    title = title_elem.text.strip() if title_elem else link.get('title', link_url)

                    results.append({"title": title, "url": link_url})
                except Exception as e:
                    logger.warning(f"Error processing link: {e}")

            logger.info(f"Extracted {len(results)} URLs")
            return results

    except Exception as e:
        # Best-effort contract: callers receive an empty list on any failure.
        logger.error(f"Error extracting URLs: {e}")
        return []

## Format URLs as Markdown Links

In [None]:
def format_as_markdown_links(url_data):
    """Render link records as a markdown bullet list.

    Args:
        url_data (list): List of dictionaries with 'title' and 'url' keys

    Returns:
        str: One "- [title](url)" line per record that has a non-empty URL,
            joined with newlines, or "No URLs found." when url_data is empty
            or None. A missing title is rendered as "Untitled".
    """
    if not url_data:
        return "No URLs found."

    # Pair each record's label with its target, then drop records without a URL.
    pairs = (
        (entry.get("title", "Untitled"), entry.get("url", ""))
        for entry in url_data
    )
    return "\n".join(
        f"- [{label}]({target})" for label, target in pairs if target
    )

## Run URL Extraction

In [None]:
async def main():
    """Extract the Kennisbank links, print them, and return them for display.

    Returns:
        Markdown: The link list under a heading, as an IPython Markdown object.
    """
    # Crawl with the default Kennisbank URL and convert the records to bullets.
    links = await extract_kennisbank_urls()
    listing = format_as_markdown_links(links)

    # Plain-text view: heading line followed by the raw markdown source.
    print("Extracted URLs (Plain Text):", listing, sep="\n")

    # Rendered view: returned so the caller can display it.
    heading = "### Extracted URLs (Rendered):\n"
    return Markdown(heading + listing)

# Run the main function. Top-level `await` is notebook syntax; since this is
# the cell's last expression, the Markdown object returned by main() becomes
# the cell output.
await main()

## Detailed Analysis of a Single URL

In [None]:
async def analyze_single_url(url_index=0):
    """Analyze a single URL from the extracted list.

    The Kennisbank listing is re-extracted on every call, then the page at
    position ``url_index`` is fetched and inspected.

    Args:
        url_index (int): Index of the URL in the extracted list. Negative
            values count from the end, as with normal list indexing.

    Returns:
        dict: On success, the keys 'title', 'url', 'content_preview' and
            'pdf_links'. On any failure, a dict with a single 'error' key.
    """
    # Extract URLs
    url_data = await extract_kennisbank_urls()

    # Check both bounds: an out-of-range negative index would otherwise raise
    # IndexError below, outside the try block, instead of returning the
    # documented error dict.
    if not url_data or not (-len(url_data) <= url_index < len(url_data)):
        return {"error": "URL not found"}

    # Get the URL to analyze
    target = url_data[url_index]
    target_url = target.get("url")
    target_title = target.get("title")

    logger.info(f"Analyzing URL: {target_title} - {target_url}")

    try:
        # NOTE(review): AsyncHTMLCrawler and XPathSelector are not imported in
        # any cell visible in this notebook. Unless they are defined
        # elsewhere, the NameError is caught below and this function returns
        # {"error": ...}. Confirm which module provides them.
        # Create crawler instance
        crawler = AsyncHTMLCrawler()

        # Fetch the page
        response = await crawler.fetch(target_url)

        if not response.ok:
            return {"error": f"Failed to fetch {target_url}: {response.status}"}

        # Extract metadata
        title_selector = XPathSelector("//h1", attr="text")
        content_selector = XPathSelector("//article//div[contains(@class, 'content')]", attr="html")

        page_title = await title_selector.extract(response)
        content = await content_selector.extract(response)

        # Extract any PDF links
        pdf_selector = XPathSelector("//a[contains(@href, '.pdf')]", attr={"url": "href", "text": "text"})
        pdf_links = await pdf_selector.extract_all(response)

        return {
            "title": page_title,
            "url": target_url,
            # First 500 characters followed by "..."; fixed text when empty.
            "content_preview": content[:500] + "..." if content else "No content found",
            "pdf_links": pdf_links
        }

    except Exception as e:
        # Report failures as data so the notebook cell still shows a result.
        logger.error(f"Error analyzing URL: {e}")
        return {"error": str(e)}

# Run the analysis for the first URL (change index as needed).
# Top-level `await` is notebook syntax. The bare `analysis` on the last line
# makes the result dict the cell output.
analysis = await analyze_single_url(0)
analysis