In [5]:
import json
import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Scrape the UCSD Guardian archive search results, visit every article page
# linked from them, and record the date and PDF URL of each article in
# 'ucsd_article_pages.json'.

# Base URL for the UCSD newspaper archives; the page number and the sort
# order are appended to it for each results page.
base_url = "https://library.ucsd.edu/dc/search?f%5Bcollection_sim%5D%5B%5D=UCSD+Guardian&page="

# Root used to turn relative links found in the scraped HTML into absolute URLs
site_root = "https://library.ucsd.edu"

# Seconds to wait for a response, so a stalled connection cannot hang the run
request_timeout = 30

# List to store all the PDF links, one {'date', 'pdf_url'} dict per article
pdf_links = []

# Loop through all the result pages (1-33) and scrape the PDF links
for page_num in range(1, 34):
    # Construct the URL for the current page
    url = base_url + str(page_num) + "&sort=object_create_dtsi+desc%2C+title_ssi+asc"

    # Fetch the results page; a network or HTTP error skips only this page
    try:
        response = requests.get(url, timeout=request_timeout)
        response.raise_for_status()
    except requests.RequestException as err:
        print(f"Skipping page {page_num}: {err}")
        continue

    # Create a BeautifulSoup object from the response HTML
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find all the article entries on the page
    articles = soup.find_all('div', class_='document-title')

    # Loop through all the articles and extract the PDF link
    for article in articles:
        # Skip entries that carry no usable link to an article page
        link_tag = article.find('a', class_='original-url')
        if link_tag is None or not link_tag.get('href'):
            continue

        # The href may be relative; requests needs an absolute URL
        article_url = urljoin(site_root, link_tag['href'])

        # Fetch the article page; a failure skips only this article
        try:
            article_response = requests.get(article_url, timeout=request_timeout)
            article_response.raise_for_status()
        except requests.RequestException as err:
            print(f"Skipping article {article_url}: {err}")
            continue

        # Create a BeautifulSoup object from the article response HTML
        article_soup = BeautifulSoup(article_response.content, 'html.parser')

        # Find the PDF link on the article page. This must search the fetched
        # article page, not the listing entry it was linked from.
        # NOTE(review): this embed type looks like markup injected by Chrome's
        # PDF viewer; confirm the server-rendered HTML actually contains it.
        embed_tag = article_soup.find('embed', type='application/x-google-chrome-pdf')
        if embed_tag is None:
            continue
        pdf_url = embed_tag.get('original-url')
        if not pdf_url:
            continue

        # Construct the absolute URL for the PDF file (an already absolute
        # value is returned unchanged by urljoin)
        pdf_full_url = urljoin(site_root, pdf_url)

        # Extract the date from the PDF URL: the second-to-last path segment
        # is read as YYYYMMDD
        path_parts = pdf_url.split('/')
        if len(path_parts) < 2:
            continue
        date_str = path_parts[-2]
        year_str = date_str[:4]
        month_str = date_str[4:6]
        day_str = date_str[6:]
        date = f"{year_str}-{month_str}-{day_str}"

        # Create a dictionary with the date and PDF URL
        pdf_info = {
            'date': date,
            'pdf_url': pdf_full_url,
        }

        # Append the dictionary to the list of PDF links
        pdf_links.append(pdf_info)
        print(f"PDF link found: {pdf_full_url}")

# Write the PDF links to a JSON file
if pdf_links:
    with open('ucsd_article_pages.json', 'w', encoding='utf-8') as f:
        json.dump(pdf_links, f, indent=4)
else:
    print("No PDF links found.")


No PDF links found.
