# West Sussex webscraper

Created as a fun project to experiment with web scraping and to identify dead links on the West Sussex Connect to Support site.

https://www.westsussexconnecttosupport.org/

### To Do
- Refine the flagging of dead links
    - Investigate why some of the links have the West Sussex gov URL repeated at the end
- Speed up the webcrawler
- Create an output of url trees

### Stretch goals
- Create a front end and make this a site-agnostic tool that can inspect any website and present its output in that front end

In [None]:
import pandas as pd
pd.set_option('display.max_colwidth', 100)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
from bs4 import BeautifulSoup
import requests
from tqdm import tqdm
from urllib.parse import urljoin, urlparse
import time

In [None]:
# Root URL the crawl starts from; it is passed to crawl_website, which uses its
# network location to tell internal pages from external ones.
start_url = 'https://www.westsussexconnecttosupport.org/'  # Replace with the website's root URL

In [None]:
def get_internal_links(url):
    """Fetch every followable hyperlink found on the page at ``url``.

    Despite the name, the result holds internal *and* external links: no
    domain comparison happens here, so callers must do that filtering.

    Each ``href`` is resolved against ``url`` so relative links become
    absolute. Fragments (``#section``) are dropped because they address the
    same document, and anything that does not resolve to an ``http``/``https``
    URL (``tel:``, ``mailto:``, ``javascript:`` ...) is discarded because it
    cannot be requested and would otherwise be reported as a dead link.

    Args:
        url: Absolute URL of the page to download and parse.

    Returns:
        A de-duplicated list of absolute URLs, in no particular order. An
        empty list is returned if the page could not be fetched.
    """
    # Matched against the lower-cased href so 'Tel:', 'MAILTO:' etc. are caught.
    excluded_prefixes = ('tel:', 'mailto:', 'email:')

    try:
        # Without a timeout a single unresponsive server would hang the crawl.
        response = requests.get(url, timeout=10)
    except requests.RequestException as e:
        print(f"Error accessing {url}: {e}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')

    found_links = set()  # A set removes duplicates as we go
    for anchor in soup.find_all('a', href=True):
        href = anchor['href'].strip()
        if not href or href.lower().startswith(excluded_prefixes):
            continue

        try:
            parsed = urlparse(urljoin(url, href))  # Ensure full URL for relative links
        except ValueError:
            # Malformed href (e.g. a broken IPv6 literal): not a usable link.
            continue

        # Only web URLs can be fetched; skip javascript:, data:, sms:, ...
        if parsed.scheme not in ('http', 'https'):
            continue

        # 'page#a' and 'page#b' are the same document, so keep one entry.
        found_links.add(parsed._replace(fragment='').geturl())

    return list(found_links)

def crawl_website(start_url, delay=0.5):
    """Crawl a website and map each discovered URL to the links found on it.

    Pages whose network location matches that of ``start_url`` are downloaded
    and their links followed. Every other URL is treated as external: it is
    recorded with an empty link list but never requested.

    Args:
        start_url: Root URL to start from; its network location defines which
            URLs count as internal.
        delay: Seconds to pause after each page download, to be polite to the
            server. No pause is made for URLs that are not downloaded.

    Returns:
        A dict whose keys are all discovered URLs. Internal URLs map to the
        list of links found on that page; external URLs map to ``[]``.
    """
    domain = urlparse(start_url).netloc
    seen_urls = {start_url}   # Everything ever queued, so no URL is queued twice
    url_dict = {}             # Dictionary to store URLs and the links found on them
    to_visit = [start_url]    # Stack of URLs still to process, starting at the root

    with tqdm(total=len(to_visit), desc="Crawling", unit="page") as pbar:
        while to_visit:
            current_url = to_visit.pop()

            if urlparse(current_url).netloc == domain:
                # Internal page: download it and queue any links not seen before.
                page_links = get_internal_links(current_url)
                url_dict[current_url] = page_links

                for link in page_links:
                    if link not in seen_urls:
                        seen_urls.add(link)
                        to_visit.append(link)

                # Only pause when a request was actually made.
                time.sleep(delay)
            else:
                # External link: record it without scraping.
                url_dict[current_url] = []

            # Every seen URL is either processed or still queued, so the
            # number of seen URLs is the best known total.
            pbar.total = len(seen_urls)
            pbar.update(1)
            pbar.refresh()

    return url_dict

# Run the crawl from the root URL. The result maps every discovered URL to the
# links found on it (external URLs map to an empty list as they are not fetched).
url_links = crawl_website(start_url)

In [None]:
def check_for_404_pages(url_dict):
    """Find links that lead to a missing page.

    A link is flagged when the response status is 404, when the response body
    contains the text 'page not found' (case-insensitive), or when the request
    itself fails (connection error, timeout, unsupported URL, ...).

    Each distinct link is requested only once, however many pages reference
    it, and the outcome is reused for every page that links to it.

    Args:
        url_dict: Mapping of an originating URL to the list of links found on
            that page.

    Returns:
        A list of ``(origin_url, link)`` tuples, one for every occurrence of a
        flagged link, in the iteration order of ``url_dict``.
    """
    not_found_pages = []  # (originating URL, flagged link) pairs
    link_is_broken = {}   # link -> bool, so shared links are fetched only once

    for origin_url, links in tqdm(url_dict.items(), desc="Checking for 'Page not found'"):
        for link in links:
            if link not in link_is_broken:
                try:
                    # Without a timeout one unresponsive server would stall the run.
                    response = requests.get(link, timeout=10)
                    link_is_broken[link] = (
                        response.status_code == 404
                        or 'page not found' in response.text.lower()
                    )
                except requests.RequestException:
                    # An unreachable link is reported the same way as a 404.
                    link_is_broken[link] = True

                time.sleep(0.5)  # Being polite to the server after a real request

            if link_is_broken[link]:
                not_found_pages.append((origin_url, link))

    return not_found_pages

# List of (originating URL, flagged link) tuples for every link that looks dead.
not_found_urls = check_for_404_pages(url_links)

In [None]:
# Number of flagged (originating URL, link) pairs.
len(not_found_urls)

In [None]:
# Display the full list of flagged (originating URL, link) pairs.
not_found_urls

In [None]:
# Flatten the url_links dictionary into one (URL, link) row per link found.
# A URL with no links still gets a single row, with None in the link column.
data = [
    (page_url, linked_url)
    for page_url, page_links in url_links.items()
    for linked_url in (page_links or [None])
]

df = pd.DataFrame(data, columns=["URL", "Internal_Link"])

In [None]:
# Converting the not_found_urls list of (originating URL, flagged link) tuples
# into a pandas dataframe with one row per flagged link occurrence
broken_links = pd.DataFrame(not_found_urls, columns=["Originating_URL", "Flagged_URL"])

In [None]:
# Preview the first rows of the flagged links table.
broken_links.head()

In [None]:
# Save the flagged links without the index column. The path is relative to the
# working directory, and the '../playarea' folder must already exist.
broken_links.to_csv('../playarea/west_sussex_broken_links.csv', index=False)