In [2]:
import csv
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver

In [3]:
# Listing page for USDA agency reports; per-page URLs are built from this
base_url = "https://www.usda.gov/media/agency-reports"

# Accumulates one [link URL, article title, source] row per scraped report
all_link_data = list()

# Start the Chrome session used to load the listing pages
driver = webdriver.Chrome()

In [4]:
# Scrape listing pages 1-5 of the agency-reports index, follow every report
# link, and export the collected [link, title, source] rows to a CSV file.
#
# Reads `driver`, `base_url` and `all_link_data` from the earlier cells. The
# driver is shut down when scraping ends, whether it succeeded or raised, so a
# failed run does not leave a Chrome process behind.

# Upper bound (seconds) for each article request, so that a single
# unresponsive server cannot stall the whole run.
REQUEST_TIMEOUT_SECONDS = 30

try:
    for page_number in range(1, 6):  # Pages 1 to 5
        # Define the URL for the current page
        url = f"{base_url}?start_date=08/01/2023&end_date=09/12/2023&page={page_number}"

        # Load the page in the browser and parse whatever HTML it holds once
        # `get` returns.
        # NOTE(review): content injected by scripts after the load event may
        # not be present yet - confirm no explicit wait is needed.
        driver.get(url)
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # Each report title on the listing is a span wrapping the link(s)
        for block in soup.find_all('span', class_='agency-report-title'):
            for link in block.find_all('a'):
                href = link.get('href')

                # Skip anchors without a usable href
                if not href:
                    continue

                # Resolve relative hrefs against the listing page. Plain string
                # concatenation with base_url would turn "/x" into
                # ".../media/agency-reports/x", which is not where it points.
                link_url = urljoin(url, href)

                # Only HTTP(S) targets can be fetched (skips mailto:, javascript:, ...)
                if not link_url.startswith(("http://", "https://")):
                    continue

                # Fetch the article; a network failure skips this one link
                # instead of aborting the whole scrape.
                try:
                    link_response = requests.get(link_url, timeout=REQUEST_TIMEOUT_SECONDS)
                except requests.RequestException as error:
                    print(f"Skipping {link_url}: {error}")
                    continue

                # Only successfully fetched pages (status code 200) are recorded
                if link_response.status_code != 200:
                    continue

                link_soup = BeautifulSoup(link_response.text, 'html.parser')

                # Pages without a <title> element get an empty title
                title_tag = link_soup.title
                article_title = title_tag.string if title_tag is not None else ""

                # Source organization, empty when the page does not carry it
                source_element = link_soup.find('span', class_='attribute contact_organization')
                source = source_element.get_text(strip=True) if source_element else ""

                all_link_data.append([link_url, article_title, source])

        print(f"Scraped data from page {page_number}")
finally:
    # Close the web driver even when scraping raised
    driver.quit()

# Define the CSV file name
csv_file = "links_data.csv"

# Write all the link data to a CSV file. The encoding is explicit so that
# non-ASCII titles do not depend on the platform's default codec.
with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["Link", "Article Title", "Source"])  # Write header row
    writer.writerows(all_link_data)  # Write all link data

print(f"Data has been exported to {csv_file}")

Scraped data from page 1
Scraped data from page 2
Scraped data from page 3
Scraped data from page 4
Scraped data from page 5
Data has been exported to links_data.csv


In [6]:
# Tabulate the scraped rows; the positional columns 0, 1, 2 are relabelled
# in the same order as the names listed here.
df = (
    pd.DataFrame(all_link_data)
    .rename(columns=dict(enumerate(["URL", "Title and ID", "Source"])))
)
df.head()

Unnamed: 0,URL,Title and ID,Source
0,https://www.nass.usda.gov/Publications/Calenda...,Publication | Cotton System Consumption and St...,National Agricultural Statistics Service
1,https://www.nass.usda.gov/Publications/Calenda...,Publication | Fats and Oils: Oilseed Crushings...,National Agricultural Statistics Service
2,https://www.nass.usda.gov/Publications/Calenda...,Publication | Flour Milling Products | ID: cr5...,National Agricultural Statistics Service
3,https://www.nass.usda.gov/Publications/Calenda...,Publication | Grain Crushings and Co-Products ...,National Agricultural Statistics Service
4,https://www.nass.usda.gov/Publications/Calenda...,Publication | Honey Bee Colonies | ID: rn30113...,National Agricultural Statistics Service
