In [194]:
import aiohttp
import asyncio
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import nest_asyncio

# Apply the nest_asyncio patch to allow nested event loops in Jupyter
nest_asyncio.apply()

# Asynchronous URL fetching with retry logic
async def fetch_urls(url, session, retries=3):
    attempt = 0
    while attempt < retries:
        try:
            async with session.get(url, timeout=15) as response:
                if response.status != 200:
                    return set()  # Return an empty set if the page doesn't load
                content = await response.text()
                soup = BeautifulSoup(content, "html.parser")
                urls = set(
                    urljoin(url, link['href'])
                    for link in soup.find_all('a', href=True)
                    if urlparse(urljoin(url, link['href'])).scheme in ('http', 'https')
                )
                if attempt > 0:
                    print(f"Successfully fetched URL: {url} on attempt {attempt + 1}")
                return urls
        except (aiohttp.ClientError, asyncio.TimeoutError) as e:
            attempt += 1
            if attempt < retries:
                await asyncio.sleep(2 ** attempt)  # Exponential backoff
        except Exception as e:
            return set()

    print(f"Failed to fetch URL {url} after {retries} attempts.")
    return set()

# Check if URL should be allowed based on blacklist/whitelist mode
def is_url_allowed(url, black_list_data):
    url_list = black_list_data["list"]
    is_blacklist_mode = black_list_data["black_list"]
    
    parsed_url = urlparse(url)
    base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"

    if is_blacklist_mode:
        # Blacklist mode: block URLs matching any blacklist entry
        for entry in url_list:
            entry_parsed = urlparse(entry)
            entry_base = f"{entry_parsed.scheme}://{entry_parsed.netloc}"
            if entry_base == base_url and url.startswith(entry):
                return False
            # Specifically check if the entry is a path that should be blocked
            if entry_parsed.netloc == parsed_url.netloc and urlparse(entry).path == parsed_url.path:
                return False
    else:
        # Whitelist mode: only allow URLs matching any whitelist entry
        return any(
            url.startswith(entry) or f"{urlparse(entry).scheme}://{urlparse(entry).netloc}" == base_url
            for entry in url_list
        )
    
    return True

# Asynchronous scraping with blacklist/whitelist and depth handling
async def scrape_urls(url, session, max_depth, current_depth=0, visited=None, black_list_data=None):
    if visited is None:
        visited = {}

    if current_depth > max_depth:
        return visited  # Stop recursion if depth limit exceeded

    if not is_url_allowed(url, black_list_data):
        return visited  # Skip URL if not allowed

    visited[url] = current_depth  # Store URL with its depth

    # Extract URLs from the current page
    urls = await fetch_urls(url, session)

    tasks = []
    for new_url in urls:
        if new_url not in visited and is_url_allowed(new_url, black_list_data):
            # Continue scraping at the next depth level if within depth limit
            if current_depth < max_depth:
                tasks.append(scrape_urls(new_url, session, max_depth, current_depth + 1, visited, black_list_data))

    # Await all the tasks concurrently
    await asyncio.gather(*tasks)
    return visited

# Entry point for asynchronous scraping
async def start_scraping(url, depth, black_list_data=None):
    async with aiohttp.ClientSession() as session:
        found_urls = await scrape_urls(url, session, depth, black_list_data=black_list_data)
    return found_urls

In [198]:
starting_url = "https://docs.wire.com"
depth_limit = 2
whitelist = {"list": ["https://docs.wire.com"], "black_list": False}

# Start scraping
scraped_urls = await start_scraping(starting_url, depth_limit, whitelist)

print("******************")
for url, depth in scraped_urls.items():
    print(f"Depth {depth}: {url}")
print(f"Total URLs found: {len(scraped_urls)}")

Successfully fetched URL: https://docs.wire.com/how-to/install/restund.html on attempt 2
Successfully fetched URL: https://docs.wire.com/how-to/install/ingress.html on attempt 2
Successfully fetched URL: https://docs.wire.com/how-to/install/prod-intro.html#what-will-not-be-installed-by-default on attempt 2
Successfully fetched URL: https://docs.wire.com/how-to/install/infrastructure-configuration.html#security on attempt 2
Successfully fetched URL: https://docs.wire.com/how-to/install/web-app-settings.html#enforce-desktop-application-only on attempt 2
Successfully fetched URL: https://docs.wire.com/how-to/install/monitoring.html on attempt 2
Successfully fetched URL: https://docs.wire.com/how-to/install/helm-prod.html#preparing-to-install-charts-from-the-internet-with-helm on attempt 2
Successfully fetched URL: https://docs.wire.com/how-to/install/logging.html#introduction on attempt 2
Successfully fetched URL: https://docs.wire.com/how-to/install/ansible-authentication.html on attempt