In [1]:
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

# Function to clean up folder and filenames
def clean_filename(url):
    # Use the URL path as the filename, replacing slashes with underscores
    return urlparse(url).path.replace('/', '_').strip('_')

# Recursive function to scrape a URL and follow links up to depth 4
def scrape_page(url, current_depth, max_depth, parent_directory):
    if current_depth > max_depth:
        return

    # Create a directory for the current URL if it doesn't exist
    directory_name = clean_filename(url)
    directory_path = os.path.join(parent_directory, directory_name)
    os.makedirs(directory_path, exist_ok=True)

    # Fetch the page content
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')

        # Save the main content (paragraphs) to a .txt file
        text_filename = os.path.join(directory_path, f'{directory_name}.txt')
        with open(text_filename, 'w', encoding='utf-8') as file:
            main_content = soup.find_all('p')
            for paragraph in main_content:
                file.write(paragraph.get_text() + '\n')

        print(f'Scraped: {url} (depth {current_depth})')

        # Find all links on the page
        links = soup.find_all('a', href=True)
        for link in links:
            full_url = urljoin(url, link['href'])  # Get the full URL
            # Ensure we are only following internal links (same domain)
            if urlparse(full_url).netloc == urlparse(url).netloc:
                # Recursively scrape the link
                scrape_page(full_url, current_depth + 1, max_depth, directory_path)

    except requests.RequestException as e:
        print(f"Error accessing {url}: {e}")



In [4]:
# Initial settings
root_url = 'https://www.ontario.ca/page/driving-and-roads'
root_directory = 'ontarioRoadSafety'  # Root folder for storing scraped data
max_depth = 2  # Outline how many layers deep to read



In [5]:
# Create the root directory if it doesn't exist
os.makedirs(root_directory, exist_ok=True)


In [6]:
# Start scraping from the root URL
scrape_page(root_url, current_depth=0, max_depth=max_depth, parent_directory=root_directory)


Scraped: https://www.ontario.ca/page/driving-and-roads (depth 0)
Scraped: https://www.ontario.ca/page/driving-and-roads#main-content (depth 1)
Scraped: https://www.ontario.ca/page/driving-and-roads#main-content (depth 2)
Scraped: https://www.ontario.ca/page/government-ontario (depth 2)
Scraped: https://www.ontario.ca/fr/page/conduite-et-routes (depth 2)
Scraped: https://www.ontario.ca/fr/page/conduite-et-routes (depth 2)
Scraped: https://www.ontario.ca/page/arts-and-culture (depth 2)
Scraped: https://www.ontario.ca/page/business-and-economy (depth 2)
Scraped: https://www.ontario.ca/covid (depth 2)
Scraped: https://www.ontario.ca/page/driving-and-roads (depth 2)
Scraped: https://www.ontario.ca/page/education-and-training (depth 2)
Scraped: https://www.ontario.ca/page/environment-and-energy (depth 2)
Scraped: https://www.ontario.ca/page/government (depth 2)
Scraped: https://www.ontario.ca/page/health-care-ontario (depth 2)
Scraped: https://www.ontario.ca/page/home-and-community (depth 2)

KeyboardInterrupt: 