In [1]:
import requests
from bs4 import BeautifulSoup
import csv
import time

# Module-level accumulator: scrape_dataset_details appends one dict per dataset
# to this list, and the CSV export at the end of the notebook reads from it.
data = []

Creating a function to extract all the links for the datasets in the webpage

The links to each dataset are inside <a> elements that do not have any class or id. They are nested inside h3, div and li elements that have specific classes, which I will use to target the <a> elements.

except requests.exceptions.RequestException as e:

This line starts an except block.

requests.exceptions.RequestException is a base class for exceptions raised by the requests library during network requests. This includes various exceptions like:

ConnectionError: Problems with the network connection.
Timeout: Request timed out.
TooManyRedirects: Too many redirects encountered.
HTTPError: HTTP error responses (e.g., 404 Not Found, 500 Server Error).
as e: This assigns the specific exception instance to the variable e. This allows me to access information about the specific error that occurred.

print(f"Error fetching URL: {e}")

This line prints an informative error message to the console.
f-string (formatted string literal) is used to create the message dynamically.
{e} is a placeholder that gets replaced with the string representation of the error object (e).
return []

This line returns an empty list ([]).
If an exception occurs during the request, the function will return an empty list instead of the expected list of links.
This helps to handle the error gracefully and prevent the program from crashing.

In [2]:
def get_dataset_links(url, timeout=30):
  """
  Extracts the href attributes of the <a> tags nested inside
  <li class="dataset-item has-organization"> -> <div class="dataset-content">
  -> <h3 class="dataset-heading"> elements on the given webpage.

  Args:
    url: The URL of the webpage to scrape.
    timeout: Seconds to wait for the server before giving up. Without a
      timeout, requests can block indefinitely on an unresponsive server.

  Returns:
    A list of href strings extracted from the <a> tags. The list is empty
    when the page has no matching links or when the request fails (the
    error is printed, not raised).
  """
  headers = {
      "User-Agent": (
          "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
          "Chrome/91.0.4472.124 Safari/537.36"
      )
  }

  try:
    # The headers were previously built but never sent; pass them explicitly.
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()  # Raise an exception for bad status codes
  except requests.exceptions.RequestException as e:
    print(f"Error fetching URL: {e}")
    return []

  soup = BeautifulSoup(response.content, 'html.parser')

  links = []
  for dataset_item in soup.find_all('li', class_='dataset-item has-organization'):
    dataset_content = dataset_item.find('div', class_='dataset-content')
    if not dataset_content:
      continue
    for dataset_heading in dataset_content.find_all('h3', class_='dataset-heading'):
      for link in dataset_heading.find_all('a'):
        href = link.get('href')
        # Skip anchors without an href so callers never build a ".../None" URL.
        if href:
          links.append(href)

  return links


Create a function to shorten the descriptions:

In [3]:
def shorten_description(description, max_words=30):
  """Return the description cut to max_words words, with '...' appended when it was cut.

  Text that already fits within the limit is returned unchanged.
  """
  tokens = description.split()
  if len(tokens) <= max_words:
    return description
  return ' '.join(tokens[:max_words]) + '...'

Creating a Function to Extract Information from Each Dataset URL:

Parse the Title: Extracts the title of the dataset.
Parse the Description: Extracts the dataset description and passes it to the shorten_description function.
Parse Additional Information: Extracts all other relevant details from the dataset's table.
Since not all datasets contain the same information, the function assigns "Not available" for any missing fields.

In [4]:
def scrape_dataset_details(dataset_url, timeout=30):
  """
  Fetches and parses the details page of a given dataset URL.

  The extracted record is appended to the module-level ``data`` list and is
  also returned. Any field that cannot be found on the page is set to
  "Not available" instead of raising.

  Args:
    dataset_url: The URL of the dataset page to scrape.
    timeout: Seconds to wait for the server before giving up.

  Returns:
    A dictionary containing the extracted dataset details (title, shortened
    description and the metadata table fields), or None if the request
    fails (the error is printed, not raised).
  """
  missing = "Not available"

  try:
    response = requests.get(dataset_url, timeout=timeout)
    response.raise_for_status()  # Raise an exception for bad status codes
  except requests.exceptions.RequestException as e:
    print(f"Error fetching dataset URL: {e}")
    return None

  soup = BeautifulSoup(response.text, 'html.parser')

  # Title: guard against pages without the expected <h1 itemprop="name">.
  title_tag = soup.find('h1', attrs={'itemprop': 'name'})
  dataset_name = title_tag.text.strip() if title_tag else missing

  # Description: start from the fallback so the name is always bound, even
  # when the description <div> or its first <p> is absent.
  short_description = missing
  description_div = soup.find('div', itemprop="description", class_="notes embedded-content")
  if description_div:
    description_p = description_div.find('p')
    if description_p:
      short_description = shorten_description(description_p.text.strip(), max_words=30)

  # Metadata: header/value pairs from the second matching table (index 1).
  table_dict = {}
  tables = soup.find_all('table', class_='table table-striped table-bordered table-condensed')
  if len(tables) < 2:
    # Keep going with an empty table_dict; every field falls back below.
    print("Table 2 not found on the page.")
  else:
    for row in tables[1].find_all('tr'):
      th = row.find('th')  # Find the header
      td = row.find('td')  # Find the corresponding value
      if th and td:
        table_dict[th.get_text(strip=True)] = td.get_text(strip=True)

  # (output key, table header) pairs; order defines the CSV column order.
  table_fields = [
      ('resource_type', "Resource Type"),
      ('metadata_created_date', "Metadata Created Date"),
      ('identifier', "Identifier"),
      ('doi', "DOI"),
      ('publisher', "Publisher"),
      ('maintainer', "Maintainer"),
      ('a_id', "@Id"),
      ('data_last_modified', "Data Last Modified"),
      ('public_access', "Public Access Level"),
      ('data_first_published', "Data First Published"),
  ]

  data_dict = {
      'dataset_name': dataset_name,
      'description': short_description,
  }
  for key, header in table_fields:
    data_dict[key] = table_dict.get(header, missing)

  # Existing notebook cells rely on this side effect to build the CSV.
  data.append(data_dict)
  return data_dict


Loop Through Pages Using Pagination Parameters:
Implement a loop to navigate through the pages using the pagination parameter. This website doesn't return a 404 page after the last page of results; it returns an empty page instead. The loop therefore continues until it reaches a page on which no dataset links are found.

In [5]:
# Walk the paginated search results with get_dataset_links, collecting every
# dataset link until a page yields none.
all_links = []
current_page = 1

while True:
    url = f"https://catalog.data.gov/dataset/?tags=machine-learning&page={current_page}"
    dataset_links = get_dataset_links(url)

    # An empty result marks the end of the listing (or a failed request).
    if not dataset_links:
        break

    all_links.extend(dataset_links)
    current_page += 1
    time.sleep(2)  # Pause between page requests.

In [None]:
# Visit every collected link and scrape the details of its dataset or study.
# The hrefs are joined onto the site root to form the full page URL.
for link in all_links:
    dataset_link = f"https://catalog.data.gov{link}"
    dataset_details = scrape_dataset_details(dataset_link)

In [7]:
# Export the information to a csv file.
import csv

def list_of_dicts_to_csv(data_list, filename):
  """
  Converts a list of dictionaries to a CSV file.

  The header row is the union of all keys across the records, in first-seen
  order, so a record with an extra key does not abort the export. Keys
  missing from a record are written as empty cells.

  Args:
    data_list: A list of dictionaries. If it is empty, nothing is written
      and any existing file at `filename` is left untouched.
    filename: The name of the CSV file to create.
  """
  if not data_list:
    # Bail out before opening the file: opening in 'w' mode would truncate it
    # and there is no first record to take the header from.
    print(f"No records to write; skipping {filename}.")
    return

  # dict.fromkeys de-duplicates while preserving insertion order.
  fieldnames = list(dict.fromkeys(key for record in data_list for key in record))

  with open(filename, 'w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(data_list)

# Write every record collected in `data` to datagov_ml.csv.
list_of_dicts_to_csv(data, 'datagov_ml.csv')