In [12]:
import requests
from bs4 import BeautifulSoup
import json
from datetime import datetime

# Base URL for the UCSD newspaper archives
base_url = "https://library.ucsd.edu/dc/"

# Seconds to wait for a subpage response; without a timeout a stalled
# connection would block the whole scrape indefinitely.
REQUEST_TIMEOUT = 30

# Load the subpage links from the subpage_links.json file
with open("subpage_links.json", "r", encoding="utf-8") as f:
    subpage_links = json.load(f)

# List to store all the PDF links (one {'pdf_url': ...} dict per link)
pdf_links = []

# Loop through all the subpages and scrape the PDF links
for subpage_link in subpage_links:
    # Fetch the subpage. A network failure or HTTP error status on one
    # subpage is reported and skipped so the links collected from the other
    # subpages are still written out at the end.
    try:
        response = requests.get(subpage_link, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f"Skipping subpage {subpage_link}: {exc}")
        continue

    # Create a BeautifulSoup object from the response HTML
    soup = BeautifulSoup(response.content, 'html.parser')

    # Every thumbnail anchor on the page points at one archive object
    for thumbnail_anchor in soup.find_all('a', class_='dams-search-thumbnail-link'):
        # Tag.get returns None for a missing attribute, whereas indexing
        # with ['href'] raises KeyError; skip anchors without a usable href.
        href = thumbnail_anchor.get('href')
        if not href:
            continue

        # Drop the query string, then the "/dc/" path segment that base_url
        # already ends with, leaving the object path relative to base_url.
        object_path = href.split('?')[0].replace("/dc/", "")

        # Construct the absolute URL for the PDF file by appending "/_1.pdf"
        pdf_full_url = f"{base_url}{object_path}/_1.pdf"

        # Record the PDF URL
        pdf_links.append({'pdf_url': pdf_full_url})
        print(f"PDF link found: {pdf_full_url}")

# Write the PDF links to a JSON file
if pdf_links:
    with open('ucsd_pdf_links.json', 'w', encoding="utf-8") as f:
        json.dump(pdf_links, f, indent=4)
else:
    print("No PDF links found.")


PDF link found: https://library.ucsd.edu/dc/object/bb75533932/_1.pdf
PDF link found: https://library.ucsd.edu/dc/object/bb8201865f/_1.pdf
PDF link found: https://library.ucsd.edu/dc/object/bb8884462f/_1.pdf
PDF link found: https://library.ucsd.edu/dc/object/bb9567064f/_1.pdf
PDF link found: https://library.ucsd.edu/dc/object/bb02154457/_1.pdf
PDF link found: https://library.ucsd.edu/dc/object/bb08980448/_1.pdf
PDF link found: https://library.ucsd.edu/dc/object/bb1580644j/_1.pdf
PDF link found: https://library.ucsd.edu/dc/object/bb22291132/_1.pdf
PDF link found: https://library.ucsd.edu/dc/object/bb29117140/_1.pdf
PDF link found: https://library.ucsd.edu/dc/object/bb3594311v/_1.pdf
PDF link found: https://library.ucsd.edu/dc/object/bb4242783m/_1.pdf
PDF link found: https://library.ucsd.edu/dc/object/bb49253868/_1.pdf
PDF link found: https://library.ucsd.edu/dc/object/bb6256451f/_1.pdf
PDF link found: https://library.ucsd.edu/dc/object/bb69390564/_1.pdf
PDF link found: https://library.uc