In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
import time
import urllib3

In [2]:
# Base URL of the paginated services listing; the page index is appended to it
base_url = "https://www.colorado.gov/services?page="

# One dict per scraped service (mainText, mainURL, deptName, deptURL)
data = []

# Loop through each page (0 to 12)
for page in range(13):
    print(f"Scraping page {page}...")
    # A timeout keeps a stalled connection from hanging the notebook forever
    response = requests.get(base_url + str(page), timeout=30)
    response.raise_for_status()

    # Parse the HTML content of the page
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the container holding the service items; find() returns None when
    # the element is absent, so guard before calling find_all() on it
    services_container = soup.find('div', class_='services--items-container')
    if services_container is None:
        print(f"No services container found on page {page}; skipping.")
        continue
    services_items = services_container.find_all('li')

    # Loop through each service item and extract the required information
    for item in services_items:
        # Guard the name div the same way as the department div: either one
        # may be missing, in which case the corresponding fields stay empty
        name_div = item.find('div', class_='views-field-field-service-application-name')
        main_link = name_div.find('a') if name_div else None
        dept_div = item.find('div', class_='views-field-field-service-agency-local-link')
        dept_link = dept_div.find('a') if dept_div else None

        # .get() tolerates an <a> tag that has no href attribute
        main_text = main_link.text.strip() if main_link else ''
        main_url = main_link.get('href', '') if main_link else ''
        dept_name = dept_link.text.strip() if dept_link else ''
        dept_url = dept_link.get('href', '') if dept_link else ''

        data.append({
            "mainText": main_text,
            "mainURL": main_url,
            "deptName": dept_name,
            "deptURL": dept_url
        })

# Create a DataFrame to organize the data
df = pd.DataFrame(data)

Scraping page 0...
Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Scraping page 6...
Scraping page 7...
Scraping page 8...
Scraping page 9...
Scraping page 10...
Scraping page 11...
Scraping page 12...


In [3]:
# Write the scraped table to CSV without the row index column
df.to_csv(path_or_buf='colorado_services.csv', index=False)

# Preview up to the first 500 rows as the cell's output
df.head(n=500)

Unnamed: 0,mainText,mainURL,deptName,deptURL
0,Access College in Colorado planning resources,https://www.mycoloradojourney.com/journey,Department of Higher Education,https://cdhe.colorado.gov/
1,Access the Driver's License and ID Card Center,https://mydmv.colorado.gov/_/,Department of Revenue,https://dmv.colorado.gov/
2,Add emergency contacts to your driver's licens...,https://mydmv.colorado.gov/?Link=EmergencyContact,Department of Revenue,https://revenue.colorado.gov/
3,Apply for an educator license,https://cool.randasolutions.com/Account/Login?...,Department of Education,https://www.cde.state.co.us/
4,Apply for Health First Colorado and Child Heal...,https://www.colorado.gov/hcpf/how-to-apply,Department of Health Care Policy & Financing,https://hcpf.colorado.gov/
...,...,...,...,...
140,"View state wildlife areas by county, hunting, ...",http://cpw.state.co.us/placestogo/parks/Pages/...,Department of Natural Resources,https://dnr.colorado.gov/
141,View travel alerts throughout the state,https://cotrip.org/travelAlerts.htm#?roadId=,Department of Transportation,https://www.codot.gov/
142,View where your tax dollars go with Colorado T...,https://apps.colorado.gov/apps/treasury/taxpay...,Department of Revenue,https://www.colorado.gov/taxtracks/
143,Workers' Compensation Benefits Calculator,https://dowc.cdle.state.co.us/Benefits/,Department of Labor and Employment,https://cdle.colorado.gov/


In [4]:
# Suppress only urllib3's InsecureRequestWarning; every other warning stays visible
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Create "pages" and "pages/HTML" if they don't exist. makedirs builds the
# parent as well, and exist_ok=True makes the call safe to re-run without a
# separate (race-prone) existence check.
os.makedirs(os.path.join("pages", "HTML"), exist_ok=True)

# Function to create a valid filename
def make_valid_filename(name):
    """Return `name` with every non-alphanumeric character replaced by "_"."""
    safe_chars = []
    for ch in name:
        # str.isalnum() decides which characters are kept as-is
        safe_chars.append(ch if ch.isalnum() else "_")
    return "".join(safe_chars)

# Function to fetch a page with retries and SSL verification handling
def fetch_page(url, retries=3, backoff_factor=0.3, timeout=30):
    """Fetch `url` with an HTTP GET, retrying failed attempts with backoff.

    Args:
        url: Address to request.
        retries: Total number of attempts; must be at least 1.
        backoff_factor: Base delay in seconds; the wait after failed attempt
            number i (0-based) is backoff_factor * 2**i.
        timeout: Seconds passed to requests.get as its timeout, so a stalled
            server cannot block the notebook indefinitely.

    Returns:
        The requests.Response of the first attempt that succeeds with a
        non-error status.

    Raises:
        ValueError: If retries is less than 1 (previously the function fell
            through and silently returned None).
        requests.RequestException: The failure from the final attempt.
    """
    if retries < 1:
        raise ValueError("retries must be at least 1")

    for attempt in range(retries):
        try:
            # NOTE(security): verify=False skips TLS certificate validation.
            # It is kept because the notebook deliberately suppresses
            # InsecureRequestWarning for these downloads; do not reuse this
            # helper for anything that needs an authenticated server.
            response = requests.get(url, verify=False, timeout=timeout)
            response.raise_for_status()
            return response
        except requests.RequestException:
            if attempt == retries - 1:
                # Out of attempts: re-raise with the original traceback
                raise
            time.sleep(backoff_factor * (2 ** attempt))

# Download every mainURL in df, storing the page's raw HTML and its extracted text
for _, service in df.iterrows():
    main_text, main_url = service['mainText'], service['mainURL']

    # Output file names are derived from the service's display text
    filename = make_valid_filename(main_text)
    text_file_path = os.path.join("pages", f"{filename}.txt")
    html_file_path = os.path.join("pages/HTML", f"{filename}.html")

    # Fetch only when at least one of the two output files is still missing
    if not (os.path.exists(text_file_path) and os.path.exists(html_file_path)):
        try:
            # fetch_page retries internally and raises once it gives up
            response = fetch_page(main_url)

            # Reduce the page to its visible text, one fragment per line
            page_soup = BeautifulSoup(response.text, 'html.parser')
            raw_text = page_soup.get_text(separator='\n', strip=True)

            # Text copy first, then the untouched HTML
            with open(text_file_path, 'w', encoding='utf-8') as file:
                file.write(raw_text)
            with open(html_file_path, 'w', encoding='utf-8') as file:
                file.write(response.text)

            print(f"Downloaded and saved: {main_text}")
        except requests.RequestException as e:
            # A failed download is reported and the loop moves on
            print(f"Failed to download {main_text} from {main_url}: {e}")

print("All pages downloaded and saved as raw text and HTML.")

Failed to download Correctional Populations Data from https://ors.colorado.gov/ors-corrpops: HTTPSConnectionPool(host='ors.colorado.gov', port=443): Max retries exceeded with url: /ors-corrpops (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x00000200E9C20A70>: Failed to resolve 'ors.colorado.gov' ([Errno 11001] getaddrinfo failed)"))
Failed to download Crime Statistics from https://ors.colorado.gov/ors-crimestats: HTTPSConnectionPool(host='ors.colorado.gov', port=443): Max retries exceeded with url: /ors-crimestats (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x00000200E9C229F0>: Failed to resolve 'ors.colorado.gov' ([Errno 11001] getaddrinfo failed)"))
Failed to download MyUI Employer from https://myuiemployer.coworkforce.com/: 500 Server Error: Internal Server Error for url: https://myuiemployer.coworkforce.com/Views/Error/Index.cshtml?aspxerrorpath=/
Failed to download MyUI+ from https://myui.clouduim.cdle.state.co.us/

Manually download the pages that failed to download above (the next cell lists them):

In [5]:
import pandas as pd
import os

# Load the DataFrame from the CSV file.
# keep_default_na=False keeps empty cells as '' instead of float NaN: the
# scraping cell writes '' for missing links, and a NaN mainText would make
# make_valid_filename() fail (a float is not iterable) and print as "nan".
df = pd.read_csv('colorado_services.csv', keep_default_na=False)

# Function to create a valid filename
def make_valid_filename(name):
    """Map `name` to a file-name stem: alphanumerics kept, all else becomes "_"."""
    def _sanitize(ch):
        # Keep characters that str.isalnum() accepts; replace the rest
        if ch.isalnum():
            return ch
        return "_"

    return "".join(map(_sanitize, name))

# Services whose text or HTML copy is not on disk, as {"name", "url"} dicts
missing_files = []

# Check each service in df against the files expected under pages/
for _, service in df.iterrows():
    main_text = service['mainText']
    main_url = service['mainURL']

    # Same naming scheme as the download step: sanitised display text
    filename = make_valid_filename(main_text)
    text_file_path = os.path.join("pages", filename + ".txt")
    html_file_path = os.path.join("pages/HTML", filename + ".html")

    # A service counts as present only when both copies exist
    both_present = os.path.exists(text_file_path) and os.path.exists(html_file_path)
    if both_present:
        continue
    missing_files.append({"name": main_text, "url": main_url})

# Report what still has to be fetched
print("\nMissing Files:")
for entry in missing_files:
    print(f"Name: {entry['name']}, URL: {entry['url']}")



Missing Files:
Name: Correctional Populations Data, URL: https://ors.colorado.gov/ors-corrpops
Name: Crime Statistics, URL: https://ors.colorado.gov/ors-crimestats
Name: MyUI Employer, URL: https://myuiemployer.coworkforce.com/
Name: MyUI+, URL: https://myui.clouduim.cdle.state.co.us/Claimant/Core/Login.ASPX
Name: Register to vote, URL: https://www.sos.state.co.us/pubs/elections/vote/VoterHome.html?menuheaders=5
Name: Review limited license draw results and preference point status, URL: http://cpw.state.co.us/thingstodo/Pages/DrawResults_PreferencePoints.aspx
Name: Search for an oil and gas facility, URL: https://cogcc.state.co.us/cogis/FacilitySearch.asp
Name: Sign up to receive School Safety Newsletter e-mails, URL: https://cdps-m.state.co.us/mailman/listinfo/schoolsafetycenter
Name: State Demography Office Dashboard, URL: https://dola.colorado.gov/demog_webapps/dashboard.jsf
Name: Access TobaccoFreeCO resources, URL: http://www.tobaccofreeco.org/
Name: Verify a Colorado Professiona

Now get the files that didn't download and add them to the `pages/Downloaded` folder so their raw text can be extracted manually.


In [6]:
# Folder holding the HTML pages that were saved by hand
html_directory = "pages/Downloaded"

# Loop through each HTML file in the directory
for filename in os.listdir(html_directory):
    if not filename.endswith(".html"):
        continue

    html_file_path = os.path.join(html_directory, filename)

    # Name the text file with the same sanitised stem the download and
    # missing-file cells use (e.g. "MyUI+" -> "MyUI_"), so every .txt under
    # pages/ follows one convention. Already-sanitised stems are unchanged.
    stem = os.path.splitext(filename)[0]
    text_file_path = os.path.join("pages", make_valid_filename(stem) + ".txt")

    # Open and parse the HTML file
    with open(html_file_path, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, 'html.parser')

    # Extract raw text from the HTML, one fragment per line
    raw_text = soup.get_text(separator='\n', strip=True)

    # Save the extracted text to a text file
    with open(text_file_path, 'w', encoding='utf-8') as file:
        file.write(raw_text)

    print(f"Extracted and saved text from: {filename}")

print("All HTML files processed and text extracted.")

Extracted and saved text from: Access TobaccoFreeCO resources.html
Extracted and saved text from: MyUI+.html
Extracted and saved text from: Review limited license draw results and preference point status.html
Extracted and saved text from: View parks by activities, facilities, accessibility and conditions.html
Extracted and saved text from: View state wildlife areas by county, hunting, fishing, recreation, and GMU.html
All HTML files processed and text extracted.
