In [2]:
import requests
from bs4 import BeautifulSoup

def scrape_text(url):
    try:
        # Fetch the webpage content
        response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
        response.raise_for_status()  # Raise an error for bad responses

        # Parse the HTML content
        soup = BeautifulSoup(response.text, "html.parser")

        # Extract and clean up text
        text = soup.get_text(separator="\n", strip=True)
        return text

    except requests.exceptions.RequestException as e:
        return f"Error fetching page: {e}"

# Example usage
url = "https://www.dmv.virginia.gov/vehicles/registration/tow-truck-reg"  # Replace with the actual URL
scraped_text = scrape_text(url)
print(scraped_text)  # Print first 500 characters for preview


Register a Tow Truck in Virginia | Virginia Department of Motor Vehicles
Skip to main content
Virginia Department of Motor Vehicles
Utility menu
Online Services
Locations
Moving
Forms
My Account
Search
Search Site
Submit
Main menu
Licenses & IDs
Licenses & IDs
Driver's License
REAL ID
Learner's Permits
ID Cards
Commercial Driver's Licenses
Motorcycle License
Exams & Study Materials
Driver Training
Driver Improvement
Disability Programs
Payment Plan Program
Organ Donation
License Extension
Military Personnel
Moving
Document Guide
Know which documents are required for your driver application before you head to DMV.
Access Guide
Vehicles
Vehicles
Title Your Vehicle or Trailer
Registration
First Time Vehicle Registration
Trailer Registration
Emissions Inspections
Delinquent Property Taxes and Vehicle Registration
Electric Vehicles
Antique Motor Vehicles and Trailers
Trip Permit
Denial of Registrations or Renewal
Receive Notifications - Email and Text Reminders
eNotification Policy
Renew Yo

In [4]:
import requests
from bs4 import BeautifulSoup

# URL to scrape
url = "https://www.dmv.virginia.gov/vehicles/registration/tow-truck-reg"  # Replace with your actual URL

# Fetch the webpage
response = requests.get(url)

# Check if the request was successful
if response.status_code == 200:
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all accordion items
    accordion_items = soup.select('.c-accordion-item')

    # Loop through each accordion item
    for item in accordion_items:
        # Extract the button text (heading)
        button = item.select_one('.c-accordion-item__toggle')
        button_text = button.get_text(strip=True) if button else "Not found"

        # Extract the description inside the first paragraph in drawer-inner
        drawer_inner = item.select_one('.c-accordion-item__drawer-inner p')
        description_text = drawer_inner.get_text(strip=True) if drawer_inner else "Not found"

        # Print results for each accordion item
        print(f"Heading: {button_text}")
        print(f"Description: {description_text}")
        print("-" * 50)

else:
    print(f"Failed to retrieve the page. Status code: {response.status_code}")


Heading: Tow Truck
Description: A for-hire motor vehicle designed to lift, pull or carry another vehicle by means of a hoist or other mechanical apparatus. This includes vehicles designed with a ramp on wheels and a hydraulic lift with a capacity to haul or tow another vehicle, commonly referred to as “flatbeds” or "rollbacks."
--------------------------------------------------
Heading: Wrecker
Description: A motor vehicle that is eligible to be registered as a tow truck, but will only be used in a private capacity. Essentially, a private use tow truck.
--------------------------------------------------
Heading: For-Hire
Description: A motor vehicle designed for transportation of property, and used for such by the owner or lessee, for compensation.
--------------------------------------------------
Heading: Private Use
Description: A motor vehicle designed for the transportation of property, operated as such by the owner or lessee for their convenience, not for compensation.
----------

In [6]:
import os
import requests
from bs4 import BeautifulSoup
import json
from urllib.parse import urlparse
from pathlib import Path

# Base output directory
OUTPUT_DIR = 'dmv_site_data_v2'

def get_path_components(url, base_url):
    """Extract path components from URL after removing base_url"""
    relative_path = url.replace(base_url, '')
    components = [comp for comp in relative_path.strip('/').split('/') if comp]
    return components

def create_nested_directory(url, base_url):
    """Create nested directory structure based on URL path"""
    components = get_path_components(url, base_url)
    
    if len(components) >= 2:
        dir_path = os.path.join(OUTPUT_DIR, components[0], components[1])
    else:
        dir_path = os.path.join(OUTPUT_DIR, 'root')
    
    os.makedirs(dir_path, exist_ok=True)
    return dir_path

def get_safe_filename(url, base_url):
    """Convert last part of URL path to safe filename"""
    components = get_path_components(url, base_url)
    filename = components[-1] if components else 'index'
    return f"{filename}.json"

def fetch_page(url):
    response = requests.get(url)
    return BeautifulSoup(response.text, 'html.parser') if response.status_code == 200 else None

def extract_all_text(soup):
    main_contents = soup.find_all('div', class_='c-wysiwyg')
    return "\n\n".join(content.get_text(separator='\n', strip=True) for content in main_contents) if main_contents else "No content found"


def extract_accordions(soup):
    """Extract accordion items with headings and descriptions"""
    accordions = []
    for item in soup.select('.c-accordion-item'):
        button = item.select_one('.c-accordion-item__toggle')
        button_text = button.get_text(strip=True) if button else "Not found"
        
        drawer_inner = item.select_one('.c-accordion-item__drawer-inner p')
        description = drawer_inner.get_text(strip=True) if drawer_inner else "Not found"
        
        accordions.append({
            'heading': button_text,
            'description': description
        })
    return accordions

def extract_submenu_links(soup):
    submenu_links = {}
    for submenu in soup.select('.c-menu__subnav .c-menu__item a'):
        title = submenu.get_text(strip=True)
        url = submenu.get('href')
        if url and title:
            submenu_links[title] = url
    return submenu_links

def extract_attachments(soup, base_url, dir_path):
    attachments = {}
    attachments_dir = os.path.join(dir_path, 'attachments')
    os.makedirs(attachments_dir, exist_ok=True)

    for link in soup.find_all('a', href=True):
        file_url = link['href']
        if not file_url.startswith('http'):
            file_url = f"{base_url}{file_url}"
        
        if file_url.endswith(('.pdf', '.doc', '.docx', '.xls', '.xlsx')):
            file_name = os.path.basename(file_url)
            file_path = os.path.join(attachments_dir, file_name)
            
            try:
                response = requests.get(file_url, stream=True)
                if response.status_code == 200:
                    with open(file_path, 'wb') as f:
                        for chunk in response.iter_content(1024):
                            f.write(chunk)
                    attachments[file_name] = os.path.join('attachments', file_name)
            except Exception as e:
                print(f"Failed to download {file_url}: {e}")
    return attachments

def save_page_data(data, url, base_url):
    """Save page data to a JSON file in the appropriate nested directory"""
    dir_path = create_nested_directory(url, base_url)
    filename = get_safe_filename(url, base_url)
    filepath = os.path.join(dir_path, filename)
    
    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=4)
    
    print(f"Saved data to {filepath}")
    return dir_path

def main():
    base_url = "https://www.dmv.virginia.gov"
    start_url = f"{base_url}/vehicles/registration/first-reg"
    
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    
    soup = fetch_page(start_url)
    if not soup:
        return
    
    # Process main page
    dir_path = create_nested_directory(start_url, base_url)
    main_page_data = {
        "url": start_url,
        "text_content": extract_all_text(soup),
        "accordions": extract_accordions(soup),
        "attachments": extract_attachments(soup, base_url, dir_path)
    }
    save_page_data(main_page_data, start_url, base_url)
    
    # Process submenu pages
    submenu_links = extract_submenu_links(soup)
    for title, relative_url in submenu_links.items():
        full_url = f"{base_url}{relative_url}"
        submenu_soup = fetch_page(full_url)
        
        if submenu_soup:
            dir_path = create_nested_directory(full_url, base_url)
            submenu_data = {
                "url": full_url,
                "title": title,
                "text_content": extract_all_text(submenu_soup),
                "accordions": extract_accordions(submenu_soup),
                "attachments": extract_attachments(submenu_soup, base_url, dir_path)
            }
            save_page_data(submenu_data, full_url, base_url)

if __name__ == "__main__":
    main()

Saved data to dmv_site_data_v2\vehicles\registration\first-reg.json
Saved data to dmv_site_data_v2\vehicles\registration\first-reg.json
Saved data to dmv_site_data_v2\vehicles\registration\trailer.json
Saved data to dmv_site_data_v2\vehicles\registration\emissions.json
Saved data to dmv_site_data_v2\vehicles\registration\stop-register.json
Saved data to dmv_site_data_v2\vehicles\registration\alternative-vehicles.json
Saved data to dmv_site_data_v2\vehicles\registration\antique.json
Saved data to dmv_site_data_v2\vehicles\registration\temp-permit.json
Saved data to dmv_site_data_v2\vehicles\registration\denials.json
Saved data to dmv_site_data_v2\vehicles\registration\enotification-info.json
Saved data to dmv_site_data_v2\vehicles\registration\enotification-policy.json
Saved data to dmv_site_data_v2\vehicles\registration\mail-renew.json
Saved data to dmv_site_data_v2\vehicles\registration\farm.json
Saved data to dmv_site_data_v2\vehicles\registration\moped.json
Saved data to dmv_site_da

In [2]:
import os
import requests
from bs4 import BeautifulSoup
import json
from urllib.parse import urlparse
from pathlib import Path

# Base output directory
OUTPUT_DIR = 'dmv_site_data_v2'

def get_path_components(url, base_url):
    """Extract path components from URL after removing base_url"""
    relative_path = url.replace(base_url, '')
    components = [comp for comp in relative_path.strip('/').split('/') if comp]
    return components

def create_nested_directory(url, base_url):
    """Create nested directory structure based on URL path"""
    components = get_path_components(url, base_url)
    
    if len(components) >= 2:
        dir_path = os.path.join(OUTPUT_DIR, components[0], components[1])
    else:
        dir_path = os.path.join(OUTPUT_DIR, 'root')
    
    os.makedirs(dir_path, exist_ok=True)
    return dir_path

def get_safe_filename(url, base_url):
    """Convert last part of URL path to safe filename"""
    components = get_path_components(url, base_url)
    filename = components[-1] if components else 'index'
    return f"{filename}.json"

def fetch_page(url):
    response = requests.get(url)
    return BeautifulSoup(response.text, 'html.parser') if response.status_code == 200 else None


def extract_submenu_links(soup):
    submenu_links = {}
    for submenu in soup.select('.c-menu__subnav .c-menu__item a'):
        title = submenu.get_text(strip=True)
        url = submenu.get('href')
        if url and title:
            submenu_links[title] = url
    return submenu_links

def extract_attachments(soup, base_url, dir_path):
    attachments = {}
    attachments_dir = os.path.join(dir_path, 'attachments')
    os.makedirs(attachments_dir, exist_ok=True)

    for link in soup.find_all('a', href=True):
        file_url = link['href']
        if not file_url.startswith('http'):
            file_url = f"{base_url}{file_url}"
        
        if file_url.endswith(('.pdf', '.doc', '.docx', '.xls', '.xlsx')):
            file_name = os.path.basename(file_url)
            file_path = os.path.join(attachments_dir, file_name)
            
            try:
                response = requests.get(file_url, stream=True)
                if response.status_code == 200:
                    with open(file_path, 'wb') as f:
                        for chunk in response.iter_content(1024):
                            f.write(chunk)
                    attachments[file_name] = os.path.join('attachments', file_name)
            except Exception as e:
                print(f"Failed to download {file_url}: {e}")
    return attachments

def save_page_data(data, url, base_url):
    """Save page data to a JSON file in the appropriate nested directory"""
    dir_path = create_nested_directory(url, base_url)
    filename = get_safe_filename(url, base_url)
    filepath = os.path.join(dir_path, filename)
    
    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=4)
    
    print(f"Saved data to {filepath}")
    return dir_path

def extract_all_text(soup):
    """Extract and combine all text content with formatted accordions"""
    main_content = []
    accordion_content = []
    
    # Extract main content blocks
    for content in soup.find_all('div', class_='c-wysiwyg'):
        main_content.append(content.get_text(separator='\n', strip=True))
    
    # Extract and format accordions
    for item in soup.select('.c-accordion-item'):
        heading = item.select_one('.c-accordion-item__toggle')
        description = item.select_one('.c-accordion-item__drawer-inner p')
        
        if heading and description:
            accordion_text = f"[Accordion] {heading.get_text(strip=True)}\n{description.get_text(strip=True)}"
            accordion_content.append(accordion_text)
    
    # Combine all sections with clear separation
    combined = []
    if main_content:
        combined.append("# Main Content\n" + "\n\n".join(main_content))
    if accordion_content:
        combined.append("# Frequently Asked Questions\n" + "\n\n".join(accordion_content))
    
    return "\n\n".join(combined) if combined else "No content found"

def main():
    base_url = "https://www.dmv.virginia.gov"
    start_url = f"{base_url}/vehicles/registration/first-reg"
    
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    
    soup = fetch_page(start_url)
    if not soup:
        return
    
    # Process main page
    dir_path = create_nested_directory(start_url, base_url)
    main_page_data = {
        "url": start_url,
        "text_content": extract_all_text(soup),  # Now contains all text
        "attachments": extract_attachments(soup, base_url, dir_path)
    }
    save_page_data(main_page_data, start_url, base_url)
    
    # Process submenu pages
    submenu_links = extract_submenu_links(soup)
    for title, relative_url in submenu_links.items():
        full_url = f"{base_url}{relative_url}"
        submenu_soup = fetch_page(full_url)
        
        if submenu_soup:
            dir_path = create_nested_directory(full_url, base_url)
            submenu_data = {
                "url": full_url,
                "title": title,
                "text_content": extract_all_text(submenu_soup),  # Combined text
                "attachments": extract_attachments(submenu_soup, base_url, dir_path)
            }
            save_page_data(submenu_data, full_url, base_url)

if __name__ == "__main__":
    main()

Saved data to dmv_site_data_v2\vehicles\registration\first-reg.json
Saved data to dmv_site_data_v2\vehicles\registration\first-reg.json
Saved data to dmv_site_data_v2\vehicles\registration\trailer.json
Saved data to dmv_site_data_v2\vehicles\registration\emissions.json
Saved data to dmv_site_data_v2\vehicles\registration\stop-register.json
Saved data to dmv_site_data_v2\vehicles\registration\alternative-vehicles.json
Saved data to dmv_site_data_v2\vehicles\registration\antique.json
Saved data to dmv_site_data_v2\vehicles\registration\temp-permit.json
Saved data to dmv_site_data_v2\vehicles\registration\denials.json
Saved data to dmv_site_data_v2\vehicles\registration\enotification-info.json
Saved data to dmv_site_data_v2\vehicles\registration\enotification-policy.json
Saved data to dmv_site_data_v2\vehicles\registration\mail-renew.json
Saved data to dmv_site_data_v2\vehicles\registration\farm.json
Saved data to dmv_site_data_v2\vehicles\registration\moped.json
Saved data to dmv_site_da