In [41]:
import aiohttp
import asyncio
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import nest_asyncio

# Apply the nest_asyncio patch to allow nested event loops in Jupyter
nest_asyncio.apply()

# Asynchronous URL fetching with retry logic
async def fetch_urls(url, session, retries=3):
    """Download ``url`` and return the absolute http(s) links found on the page.

    Args:
        url: Address of the page to download; also used as the base when
            resolving relative ``href`` values.
        session: Object whose ``get`` method returns an async context manager
            yielding the response (an ``aiohttp.ClientSession`` in this file).
        retries: Maximum number of attempts when the request raises
            ``aiohttp.ClientError`` or ``asyncio.TimeoutError``.

    Returns:
        A set of absolute URLs whose scheme is ``http`` or ``https``. The set
        is empty when the response status is not 200, when every attempt
        failed, or when an unexpected error occurred while handling the page.
    """
    # aiohttp documents ClientTimeout as the type of the ``timeout`` argument;
    # total=15 keeps the 15-second overall limit.
    request_timeout = aiohttp.ClientTimeout(total=15)

    attempt = 0
    while attempt < retries:
        try:
            async with session.get(url, timeout=request_timeout) as response:
                if response.status != 200:
                    return set()  # Return an empty set if the page doesn't load
                content = await response.text()
                soup = BeautifulSoup(content, "html.parser")

                urls = set()
                for link in soup.find_all('a', href=True):
                    try:
                        # Resolve once and reuse the result for the scheme check.
                        absolute = urljoin(url, link['href'])
                        scheme = urlparse(absolute).scheme
                    except ValueError:
                        # A single malformed href (e.g. a broken IPv6 literal)
                        # must not throw away every other link on the page.
                        continue
                    if scheme in ('http', 'https'):
                        urls.add(absolute)

                if attempt > 0:
                    print(f"Successfully fetched URL: {url} on attempt {attempt + 1}")
                return urls
        except (aiohttp.ClientError, asyncio.TimeoutError):
            attempt += 1
            if attempt < retries:
                await asyncio.sleep(2 ** attempt)  # Exponential backoff
        except Exception as e:
            # Not retryable (e.g. a decoding problem): report it instead of
            # failing silently, then give up on this page.
            print(f"Unexpected error while processing {url}: {e!r}")
            return set()

    print(f"Failed to fetch URL {url} after {retries} attempts.")
    return set()

# Check if URL should be allowed based on blacklist/whitelist mode
def is_url_allowed(url, black_list_data):
    """Decide whether ``url`` passes the configured blacklist/whitelist.

    Args:
        url: The URL to check.
        black_list_data: ``None`` (no filtering, everything is allowed) or a
            mapping with two keys: ``"list"``, an iterable of URL strings, and
            ``"black_list"``, a flag that selects blacklist mode when truthy
            and whitelist mode otherwise.

    Returns:
        ``True`` when the URL may be visited, ``False`` otherwise.

        * Blacklist mode: the URL is rejected when it shares scheme and host
          with an entry and starts with that entry, or when it has the same
          host and exactly the same path as an entry.
        * Whitelist mode: the URL is accepted when it starts with an entry or
          shares scheme and host with an entry. An empty list accepts nothing.
    """
    # scrape_urls/start_scraping default this argument to None; treat that as
    # "no filter" instead of failing on the subscript below.
    if black_list_data is None:
        return True

    url_list = black_list_data["list"]
    is_blacklist_mode = black_list_data["black_list"]

    parsed_url = urlparse(url)
    base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"

    if is_blacklist_mode:
        # Blacklist mode: block URLs matching any blacklist entry
        for entry in url_list:
            entry_parsed = urlparse(entry)  # parsed once, reused by both checks
            entry_base = f"{entry_parsed.scheme}://{entry_parsed.netloc}"
            if entry_base == base_url and url.startswith(entry):
                return False
            # Same host and identical path, regardless of scheme
            if entry_parsed.netloc == parsed_url.netloc and entry_parsed.path == parsed_url.path:
                return False
        return True

    # Whitelist mode: only allow URLs matching any whitelist entry
    for entry in url_list:
        # NOTE(review): this is a plain string-prefix test, so an entry such as
        # "https://host" also matches "https://host.example.org" — confirm
        # whether that is intended.
        if url.startswith(entry):
            return True
        entry_parsed = urlparse(entry)
        if f"{entry_parsed.scheme}://{entry_parsed.netloc}" == base_url:
            return True
    return False

# Asynchronous scraping with blacklist/whitelist and depth handling
async def scrape_urls(url, session, max_depth, current_depth=0, visited=None, black_list_data=None):
    """Recursively crawl from ``url`` and record every allowed page reached.

    Args:
        url: Page to process in this call.
        session: Session object handed through to ``fetch_urls``.
        max_depth: Deepest level (the start page is level 0) that is recorded.
        current_depth: Level of ``url``; callers normally leave the default.
        visited: Dict shared by the whole crawl, mapping URL -> depth. A new
            dict is created when omitted.
        black_list_data: Filter configuration handed to ``is_url_allowed``.

    Returns:
        The ``visited`` dict (the same object for every recursive call).
    """
    if visited is None:
        visited = {}

    if current_depth > max_depth:
        return visited  # Stop recursion if depth limit exceeded

    if not is_url_allowed(url, black_list_data):
        return visited  # Skip URL if not allowed

    visited[url] = current_depth  # Store URL with its depth

    # A page at the depth limit is recorded but not downloaded: none of its
    # links would be followed, so the request would be wasted.
    if current_depth >= max_depth:
        return visited

    # Extract URLs from the current page
    urls = await fetch_urls(url, session)

    tasks = []
    for new_url in urls:
        if new_url in visited or not is_url_allowed(new_url, black_list_data):
            continue
        # Claim the URL now, before any await, so that sibling tasks running
        # concurrently cannot schedule (and fetch) the same page again.
        visited[new_url] = current_depth + 1
        tasks.append(scrape_urls(new_url, session, max_depth, current_depth + 1, visited, black_list_data))

    # Await all the tasks concurrently
    await asyncio.gather(*tasks)
    return visited

# Entry point for asynchronous scraping
async def start_scraping(url, depth, black_list_data=None):
    """Crawl from ``url`` inside a freshly opened aiohttp session.

    Args:
        url: Page where the crawl begins.
        depth: Value passed to ``scrape_urls`` as its ``max_depth``.
        black_list_data: Filter configuration forwarded to ``scrape_urls``.

    Returns:
        Whatever ``scrape_urls`` returns for the crawl.
    """
    # The session is closed when the ``async with`` block is left, which
    # happens after the crawl result has been computed.
    async with aiohttp.ClientSession() as http_session:
        return await scrape_urls(url, http_session, depth, black_list_data=black_list_data)

In [42]:
# Crawl configuration: seed page, maximum link depth, and the URL filter.
starting_url = "https://docs.wire.com"
depth_limit = 2
# "black_list": False selects whitelist mode in is_url_allowed, so only URLs
# matching an entry of "list" are kept.
whitelist = {"list": ["https://docs.wire.com"], "black_list": False}

# Run the crawl (top-level await, as supported in notebook cells).
scraped_urls = await start_scraping(starting_url, depth_limit, whitelist)

# Report each recorded URL with the depth stored for it, then the total count.
print("******************")
for url, depth in scraped_urls.items():
    print(f"Depth {depth}: {url}")
print(f"Total URLs found: {len(scraped_urls)}")

Successfully fetched URL: https://docs.wire.com/developer/developer/features.html on attempt 2
Successfully fetched URL: https://docs.wire.com/developer/reference/user/connection.html#connection-backend-internals on attempt 2
Successfully fetched URL: https://docs.wire.com/developer/reference/provisioning/scim-token.html on attempt 2
Successfully fetched URL: https://docs.wire.com/understand/legalhold.html on attempt 2
Successfully fetched URL: https://docs.wire.com/developer/reference/config-options.html on attempt 2
Successfully fetched URL: https://docs.wire.com/developer/developer/processes.html on attempt 2
Successfully fetched URL: https://docs.wire.com/understand/searchability.html on attempt 2
Successfully fetched URL: https://docs.wire.com/understand/restund.html#what-is-it-used-for on attempt 2
Successfully fetched URL: https://docs.wire.com/understand/api-client-perspective/index.html on attempt 2
Successfully fetched URL: https://docs.wire.com/developer/reference/user/rich-

In [43]:
import requests
from bs4 import BeautifulSoup
import re
from datetime import datetime

# Regular expression patterns for various date formats
# Each entry is applied independently to the page text. Numeric day/month
# order cannot be told apart by a regex, so MM-DD-YYYY and DD-MM-YYYY (and
# the slash variants) share a single pattern each instead of being listed
# twice, which only produced duplicate matches.
date_patterns = [
    r'\b\d{4}-\d{2}-\d{2}\b',            # Matches YYYY-MM-DD (e.g., 2024-09-20)
    r'\b\d{4}/\d{2}/\d{2}\b',            # Matches YYYY/MM/DD (e.g., 2024/09/20)
    r'\b\d{2}-\d{2}-\d{4}\b',            # Matches MM-DD-YYYY or DD-MM-YYYY (e.g., 09-20-2024, 20-09-2024)
    r'\b\d{2}/\d{2}/\d{4}\b',            # Matches MM/DD/YYYY or DD/MM/YYYY (e.g., 09/20/2024, 20/09/2024)
    r'\b\d{1,2}\s+[A-Za-z]+\s+\d{4}\b',  # Matches 20 September 2024
    r'\b[A-Za-z]+\s+\d{1,2},\s+\d{4}\b', # Matches September 20, 2024
    r'\b\d{1,2}\s+[A-Za-z]{3}\s+\d{4}\b', # Matches 20 Sep 2024 (short month)
    r'\b[A-Za-z]{3}\s+\d{1,2},\s+\d{4}\b', # Matches Sep 20, 2024 (short month)
    r'\b\d{1,2}-[A-Za-z]{3}-\d{4}\b',    # Matches 20-Sep-2024 (short month)
    r'\b[A-Za-z]{3}-\d{1,2}-\d{4}\b',    # Matches Sep-20-2024 (short month)
    r'\b[A-Za-z]+,\s+\d{1,2}\s+[A-Za-z]+\s+\d{4}\b', # Matches Friday, 20 September 2024 (with weekday)
    r'\b[A-Za-z]+,\s+[A-Za-z]+\s+\d{1,2},\s+\d{4}\b', # Matches Friday, September 20, 2024 (with weekday)
    r'\b\d{4}\.\d{2}\.\d{2}\b',          # Matches YYYY.MM.DD (e.g., 2024.09.20)
    r'\b\d{2}\.\d{2}\.\d{4}\b',          # Matches DD.MM.YYYY (e.g., 20.09.2024)
    r'\b\d{4}\s+[A-Za-z]{3}\s+\d{1,2}\b',# Matches 2024 Sep 20
    r'\b\d{4}\s+[A-Za-z]+\s+\d{1,2}\b',  # Matches 2024 September 20
    r'\b\d{1,2}\s+[A-Za-z]+,\s+\d{4}\b', # Matches 20 September, 2024 (comma after day)
    r'\b[A-Za-z]+\s+\d{1,2}\s+\d{4}\b'   # Matches September 20 2024 (no comma)
]


def find_all_dates_with_regex(text, patterns=None):
    """Collect the distinct date-like substrings of ``text``.

    Args:
        text: Text to scan. ``None`` or an empty string yields an empty list.
        patterns: Optional iterable of regular expressions to apply. When
            omitted, the module-level ``date_patterns`` list is used.

    Returns:
        A list of matched strings in first-seen order (patterns are applied
        in the order given). Each distinct string appears once, even when
        several overlapping patterns match it or it occurs repeatedly.
    """
    if not text:
        return []
    if patterns is None:
        patterns = date_patterns

    dates = []
    for pattern in patterns:
        dates.extend(re.findall(pattern, text))  # all matches for this pattern

    # dict.fromkeys drops repeats while keeping insertion order.
    return list(dict.fromkeys(dates))

def parse_date(date_str):
    """Parse ``date_str`` with the first layout that accepts it.

    The layouts are tried from top to bottom, so an ambiguous numeric date
    such as ``03-04-2024`` is read month-first before day-first is attempted.

    Returns:
        A ``datetime`` on success, or ``None`` when no layout matches.
    """
    candidate_layouts = (
        '%Y-%m-%d',        # 2024-09-20
        '%Y/%m/%d',        # 2024/09/20
        '%m-%d-%Y',        # 09-20-2024
        '%m/%d/%Y',        # 09/20/2024
        '%d-%m-%Y',        # 20-09-2024
        '%d/%m/%Y',        # 20/09/2024
        '%d %B %Y',        # 20 September 2024
        '%B %d, %Y',       # September 20, 2024
        '%d %b %Y',        # 20 Sep 2024
        '%b %d, %Y',       # Sep 20, 2024
        '%d-%b-%Y',        # 20-Sep-2024
        '%b-%d-%Y',        # Sep-20-2024
        '%A, %d %B %Y',    # Friday, 20 September 2024
        '%A, %B %d, %Y',   # Friday, September 20, 2024
        '%Y.%m.%d',        # 2024.09.20
        '%d.%m.%Y',        # 20.09.2024
        '%Y %b %d',        # 2024 Sep 20
        '%Y %B %d',        # 2024 September 20
        '%d %B, %Y',       # 20 September, 2024
        '%B %d %Y',        # September 20 2024
    )

    for layout in candidate_layouts:
        try:
            parsed = datetime.strptime(date_str, layout)
        except ValueError:
            # This layout does not fit; move on to the next one.
            continue
        else:
            return parsed
    return None

def get_latest_date(dates):
    """Pick the most recent parseable date out of ``dates``.

    Each string is run through ``parse_date``; strings it cannot parse are
    ignored.

    Returns:
        The latest date formatted as ``YYYY-MM-DD``, or the string
        ``"No valid date found"`` when nothing could be parsed.
    """
    attempts = (parse_date(raw) for raw in dates)
    usable = [moment for moment in attempts if moment is not None]
    if not usable:
        return "No valid date found"
    return max(usable).strftime('%Y-%m-%d')

def get_title_and_date(url, timeout=15):
    """Download ``url`` and extract its title and the latest date in its text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.
            Without a timeout a stalled server would block the call
            indefinitely.

    Returns:
        A dict with the keys ``'url'``, ``'title'`` and ``'date'``.
        ``'title'`` is ``"No title found"`` when the page has no usable
        ``<title>`` text; ``'date'`` is whatever ``get_latest_date`` returns.
        When the request fails, both ``'title'`` and ``'date'`` are
        ``"Error"`` and a message is printed.
    """
    try:
        # Send an HTTP GET request to the URL, bounded by the timeout
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Check for request errors

        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(response.content, 'html.parser')

        # ``.string`` is None when <title> is empty or contains nested tags,
        # so fall back to the placeholder in that case as well.
        title_text = soup.title.string if soup.title else None
        title = title_text if title_text else "No title found"

        # Extract all text content to search for a date pattern
        text_content = soup.get_text(separator=' ', strip=True)

        # Use regex to find all dates in the page's text content
        dates = find_all_dates_with_regex(text_content)

        # Get the latest date found
        latest_date = get_latest_date(dates)

        return {'url': url, 'title': title, 'date': latest_date}

    except requests.exceptions.RequestException as e:
        # Timeouts are RequestException subclasses and are reported here too.
        print(f"Error fetching {url}: {e}")
        return {'url': url, 'title': "Error", 'date': "Error"}

In [44]:
# One {'url', 'title', 'date'} record per crawled URL (iterating the dict yields its keys).
results = list(map(get_title_and_date, scraped_urls))

In [45]:
# Dump the raw list of {'url', 'title', 'date'} records built by get_title_and_date.
print(results)

[{'url': 'https://docs.wire.com', 'title': 'Welcome to Wire’s documentation! — Wire 0.0.4 documentation', 'date': 'No valid date found'}, {'url': 'https://docs.wire.com/how-to/administrate/index.html', 'title': 'Administration — Wire 0.0.4 documentation', 'date': 'No valid date found'}, {'url': 'https://docs.wire.com/how-to/install/index.html', 'title': 'Installation — Wire 0.0.4 documentation', 'date': 'No valid date found'}, {'url': 'https://docs.wire.com/understand/index.html', 'title': 'Reference — Wire 0.0.4 documentation', 'date': 'No valid date found'}, {'url': 'https://docs.wire.com/developer/index.html', 'title': 'Notes for developers — Wire 0.0.4 documentation', 'date': '2021-02-16'}, {'url': 'https://docs.wire.com/security-responses/index.html', 'title': 'Security responses — Wire 0.0.4 documentation', 'date': '2023-01-19'}, {'url': 'https://docs.wire.com#welcome-to-wire-s-documentation', 'title': 'Welcome to Wire’s documentation! — Wire 0.0.4 documentation', 'date': 'No val

In [49]:
# Values get_title_and_date stores in 'date' when no real date is available:
# the "nothing parsed" placeholder and the marker used for failed requests.
# Neither may count as a valid date.
missing_date_markers = {"No valid date found", "Error"}
valid_date_count = sum(1 for result in results if result['date'] not in missing_date_markers)

# Guard against an empty result list, which would otherwise divide by zero.
if results:
    print(f"Percentage of valid dates found: {valid_date_count / len(results):.2%}")
else:
    print("Percentage of valid dates found: n/a (no results)")

Percentage of valid dates found: 45.31%
