# West Sussex webscraper

Created as a fun project to play around with webscraping and trying to identify dead links on the west sussex connect site

https://www.westsussexconnecttosupport.org/

In [1]:
import pandas as pd
pd.set_option('display.max_colwidth', 100)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
from bs4 import BeautifulSoup
import requests
from tqdm import tqdm
from urllib.parse import urljoin, urlparse
import time

In [2]:
def get_internal_links(url, domain):
    """Fetch all internal links from the given URL within the specified domain."""
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Extract all href links
        links = [a.get('href') for a in soup.find_all('a', href=True) if not (a.get('href').startswith('tel:') or a.get('href').startswith('mailto:') or a.get('href').startswith('Tel:') or a.get('href').startswith('Mailto:') or a.get('href').startswith('email:') or a.get('href').startswith('Email:'))]
        
        # Filter internal links only
        internal_links = []
        for link in links:
            full_url = urljoin(url, link)  # Ensure full URL for relative links
            parsed_url = urlparse(full_url)
            
            # Check if the link is within the same domain
            # if parsed_url.netloc == domain:
            #     internal_links.append(full_url)

            internal_links.append(full_url)
        
        return list(set(internal_links))  # Remove duplicates
    except requests.RequestException as e:
        print(f"Error accessing {url}: {e}")
        return []

def crawl_website(start_url):
    """Crawl the website and create a dictionary of each URL and its internal links. Track external URLs without visiting."""
    domain = urlparse(start_url).netloc
    visited_urls = set()  # Track visited internal URLs to avoid cycles
    visited_external_urls = set() # Track visited external URLs
    url_dict = {}         # Dictionary to store URLs and their internal links
    to_visit = [start_url]  # Start with the root URL
    
    with tqdm(total=len(to_visit), desc="Crawling", unit="page") as pbar:
        while to_visit:
            current_url = to_visit.pop()
            parsed_current_url = urlparse(current_url)

            if parsed_current_url.netloc == domain:
                if current_url not in visited_urls:
                    visited_urls.add(current_url)
                    #print(f"Visiting: {current_url}")
                    
                    # Get internal links on the current page
                    internal_links = get_internal_links(current_url, domain)
                    url_dict[current_url] = internal_links
                    
                    # Add new URLs to visit that haven't been visited
                    for link in internal_links:
                        if link not in visited_urls:
                            to_visit.append(link)

            else:
                # Marking external links as visited with scraping
                if current_url not in visited_external_urls:
                    visited_external_urls.add(current_url)
                    url_dict[current_url] = []
                    #print(f"Visiting external: {current_url}")

            pbar.update(1)
            pbar.total = len(to_visit) + len(visited_urls) + len(visited_external_urls) + 1
            pbar.refresh()
            
            time.sleep(0.5)  # Being polite to the server

    return url_dict

start_url = 'https://www.westsussexconnecttosupport.org/'  # Replace with the website's root URL
url_links = crawl_website(start_url)

Crawling: 16547page [2:25:33,  1.89page/s]                     


In [3]:
def check_for_404_pages(url_dict):
    """Check each URL in the dictionary for 'Page not found' or '404' error messages."""
    not_found_pages = [] # Creating list of not found pages and their originating URL
    
    for origin_url, links in tqdm(url_dict.items(), desc="Checking for 'Page not found'"):
        for link in links:
            try:
                response = requests.get(link)
                if response.status_code == 404 or 'Page not found' in response.text:
                    #tuple_entry = (origin_url, link)
                    not_found_pages.append((origin_url, link))
                    #print(f"404 or 'Page not found' detected at {url}")

                    # Debugging: print types and the tuple itself
                    #print(f"Appending tuple: {tuple_entry}")
                    #print(f"Types - Origin: {type(origin_url)}, Link: {type(link)}")

                    # Check if the appended entry is a tuple
                    #assert isinstance(tuple_entry, tuple), "Error: The appended entry is not a tuple"

            except requests.RequestException as e:
                not_found_pages.append((origin_url,link))
                #print(f"Error accessing {link}: {e}")

        time.sleep(0.5) # Being polite to the server

    # Ensuring not_found_pages is a list of tuples
    # for item in not_found_pages:
    #     if not isinstance(item, tuple) or len(item) != 2:
    #         print("Warning: Item is not a tuple with two elements:", item)
    
    return not_found_pages

not_found_urls = check_for_404_pages(url_links)

Checking for 'Page not found': 100%|██████████| 2516/2516 [2:27:33<00:00,  3.52s/it]  


In [4]:
len(not_found_urls)

265

In [5]:
not_found_urls

[('https://www.westsussexconnecttosupport.org/how-to-live-well-with-a-health-condition-or-disability/sensory-impairment/',
  'https://www.rnib.org.uk/advice/technology-useful-products'),
 ('https://www.westsussexconnecttosupport.org/how-to-live-well-with-a-health-condition-or-disability/autism/',
  'https://www.autism.org.uk/about/diagnosis/adults.aspx'),
 ('https://www.westsussexconnecttosupport.org/how-to-live-well-with-a-health-condition-or-disability/autism/#maincontent',
  'https://www.autism.org.uk/about/diagnosis/adults.aspx'),
 ('https://www.westsussexconnecttosupport.org/s4s/Auth/SignInOidc',
  'https://www.westsussexconnecttosupport.org/RetrievePassword?returnUrl=%2Fconnect%2Fauthorize%2Fcallback%3Fclient_id%3Ds4s-mvc%26response_type%3Dcode%2520id_token%26scope%3Dopenid%2520mvc_app%2520s4s_api%2520umbraco_api%2520offline_access%2520pcg_forms_app_api%26state%3DOpenIdConnect.AuthenticationProperties%253DhKXMkKiqYH9niazGchTcG90izxvX4oiRdbwUm8XQgjKOn0nDefLwAFh5pg7tKaYu565CYfU3xJ0

In [None]:
# Converting the url_links dictionary into a pandas dataframe
data = [(url, internal_link) for url, links in url_links.items() for internal_link in (links if links else [None])]

df = pd.DataFrame(data, columns=["URL", "Internal_Link"])

In [6]:
# Converting the not_found_urls into a pandas dataframe
broken_links = pd.DataFrame(not_found_urls, columns=["Originating_URL", "Flagged_URL"])

In [7]:
broken_links.head()

Unnamed: 0,Originating_URL,Flagged_URL
0,https://www.westsussexconnecttosupport.org/how-to-live-well-with-a-health-condition-or-disabilit...,https://www.rnib.org.uk/advice/technology-useful-products
1,https://www.westsussexconnecttosupport.org/how-to-live-well-with-a-health-condition-or-disabilit...,https://www.autism.org.uk/about/diagnosis/adults.aspx
2,https://www.westsussexconnecttosupport.org/how-to-live-well-with-a-health-condition-or-disabilit...,https://www.autism.org.uk/about/diagnosis/adults.aspx
3,https://www.westsussexconnecttosupport.org/s4s/Auth/SignInOidc,https://www.westsussexconnecttosupport.org/RetrievePassword?returnUrl=%2Fconnect%2Fauthorize%2Fc...
4,https://www.westsussexconnecttosupport.org/s4s/SignUp,https://www.westsussexconnecttosupport.org/s4s/SignUp/Finish


In [8]:
broken_links.to_csv('west_sussex_broken_links.csv', index=False)