In [1]:
import time
from concurrent.futures import ThreadPoolExecutor
from typing import Dict, List, Optional, Tuple, Union
from urllib.parse import urldefrag, urljoin, urlparse

import pandas as pd
import requests
from bs4 import BeautifulSoup
from orb.scraper.utils import spoof_request
from tqdm import tqdm

In [2]:
# Root URL of the site under test; a later cell uses it both as the crawl
# starting point and (via its netloc) as the domain the crawl is limited to.
BASE_URL = 'https://www.westsussexconnecttosupport.org/'

In [3]:
def check_for_404_pages(url_dict, timeout: float = 30) -> List[Tuple[str, str]]:
    """Find links that are missing or unreachable.

    Each distinct link is requested once. A link is reported as broken when the
    response status is 404, when the response body contains the text
    'page not found' (case-insensitive), or when the request itself fails
    (connection error, timeout, ...).

    Args:
        url_dict: Mapping of an origin URL to an iterable of links found on it.
        timeout: Seconds to wait for a response before the request is treated
            as failed. Without a timeout a single stalled server would block
            the whole check indefinitely.

    Returns:
        A list of ``(origin_url, link)`` tuples, one for every occurrence of a
        broken link, in iteration order of ``url_dict``.
    """
    not_found_pages = []

    # Verdict per link. The same link is usually listed under many origins
    # (navigation, footer, ...), so it only needs to be requested once.
    is_broken_by_link = {}

    for origin_url, links in tqdm(url_dict.items(), desc="Checking for 'Page not found'"):
        for link in links:
            if link not in is_broken_by_link:
                try:
                    response = requests.get(link, timeout=timeout)
                    is_broken = (
                        response.status_code == 404
                        or 'page not found' in response.text.lower()
                    )
                except requests.RequestException:
                    # Unreachable links are reported alongside real 404s.
                    is_broken = True
                is_broken_by_link[link] = is_broken

            if is_broken_by_link[link]:
                not_found_pages.append((origin_url, link))

    return not_found_pages



In [6]:
def get_internal_links_recursive(url, base_domain, visited=None, depth=0, max_depth=3):
    """
    Recursively fetch internal links from the given URL within the specified domain and build a dictionary tree.

    A link is internal when its host equals ``base_domain`` or is a subdomain
    of it. Each page is fetched at most once per crawl: URLs that differ only
    in their ``#fragment`` share one fetch, and a URL whose request failed is
    not retried when it is linked again.

    Args:
        url (str): The starting URL.
        base_domain (str): The base domain to limit crawling (a host name, e.g.
            the ``netloc`` of the start URL).
        visited (set): A set to keep track of visited URLs. It is updated in
            place with the fragment-less form of every URL that was attempted.
        depth (int): Current depth of recursion.
        max_depth (int): Maximum depth to recurse; pages at depths
            ``0..max_depth`` inclusive are fetched.

    Returns:
        dict: A dictionary tree representing the links. Keys are the absolute
        internal links found on the page (fragments kept as written); each
        value is the tree for that link, or ``{}`` when the link was already
        visited, lies beyond ``max_depth``, or could not be fetched.
    """
    if visited is None:
        visited = set()

    # A fragment only names a position inside a page, so 'page#1' and 'page#2'
    # are the same document and must not be fetched twice.
    page_url, _fragment = urldefrag(url)

    # Check if we've reached the maximum depth or already visited the URL
    if depth > max_depth or page_url in visited:
        return {}

    # Mark before fetching: a URL whose request fails would otherwise be
    # requested again every time another page links to it.
    visited.add(page_url)

    try:
        response = spoof_request(url=page_url, use_proxies=False)
        soup = BeautifulSoup(response.text, 'html.parser')
    except requests.RequestException as e:
        print(f"Error accessing {url}: {e}")
        return {}

    # Compare host names exactly instead of by substring, so that e.g.
    # 'notexample.com' or 'example.com.evil.org' are not treated as internal
    # to 'example.com'. Parsing via '//' drops any port from base_domain.
    base_host = (urlparse(f"//{base_domain}").hostname or base_domain).lower()

    # Non-navigable schemes, matched case-insensitively.
    excluded_prefixes = ('tel:', 'mailto:', 'email:')

    internal_links = set()  # a set removes duplicate links on the same page
    for anchor in soup.find_all('a', href=True):
        href = anchor.get('href').strip()
        if href.lower().startswith(excluded_prefixes):
            continue
        full_url = urljoin(page_url, href)  # Ensure full URL for relative links
        host = urlparse(full_url).hostname or ''
        if host == base_host or host.endswith('.' + base_host):
            internal_links.add(full_url)

    # Build the dictionary tree
    return {
        link: get_internal_links_recursive(link, base_domain, visited, depth + 1, max_depth)
        for link in internal_links
    }

In [7]:
# Network location (host) of BASE_URL, passed as the domain the crawl is limited to.
domain = urlparse(BASE_URL).netloc
# Crawl from the site root; max_depth=2 overrides the function's default of 3.
# This performs live HTTP requests.
link_tree = get_internal_links_recursive(BASE_URL, domain, max_depth=2)

In [8]:
# Display the nested {url: {child_url: {...}}} tree returned by the crawl.
link_tree

{'https://www.westsussexconnecttosupport.org/s4s/Home/TermsOfUse#3': {},
 'https://www.westsussexconnecttosupport.org/money-advice/introduction/': {'https://www.westsussexconnecttosupport.org/s4s/Home/TermsOfUse#3': {},
  'https://www.westsussexconnecttosupport.org/money-advice/introduction/': {},
  'https://www.westsussexconnecttosupport.org/sharing-updates/get-involved-with-adults-services/': {'https://www.westsussexconnecttosupport.org/accessibility/accessibility-statement/': {},
   'https://www.westsussexconnecttosupport.org/how-to-live-well-with-a-health-condition-or-disability/introduction/': {},
   'https://www.westsussexconnecttosupport.org/care-options/introduction/': {},
   'https://www.westsussexconnecttosupport.org/help-to-stay-safe/': {},
   'https://www.westsussexconnecttosupport.org/living-independently/': {},
   'https://www.westsussexconnecttosupport.org/professional-zone/#2': {},
   'https://www.westsussexconnecttosupport.org/administration/': {},
   'https://www.west