In [55]:
import requests
from bs4 import BeautifulSoup
import json
from datetime import datetime
import UCSD_pdf_dates
import os
import PyPDF2
# Base URL for the UCSD newspaper archives
base_url = "https://library.ucsd.edu/dc/"

# Seconds to wait on any single HTTP request. Without an explicit timeout,
# requests.get() may block forever on a stalled connection.
REQUEST_TIMEOUT = 30

# Load the subpage links from the subpage_links.json file
with open("subpage_links.json", "r") as f:
    subpage_links = json.load(f)

# Accumulates one {'date': 'YYYY_mm_dd', 'pdf_url': <absolute URL>} dictionary
# per object whose page was fetched and whose creation date parsed cleanly.
pdf_links = []

# Loop through all the subpages and scrape the PDF links
for subpage_link in subpage_links:
    try:
        response = requests.get(subpage_link, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.RequestException as e:
        # A single unreachable listing page should not abort the whole crawl.
        print(f"Error: {e}")
        continue

    soup = BeautifulSoup(response.content, 'html.parser')

    # Each search-result thumbnail anchor points at one object page
    pdf_links_on_page = soup.find_all('a', class_='dams-search-thumbnail-link')

    for pdf_link_on_page in pdf_links_on_page:
        try:
            # Relative object path: drop the "/dc/" prefix (base_url already
            # ends with it) and any query string.
            object_path = pdf_link_on_page['href'].replace("/dc/", "").split('?')[0]

            # Absolute URL of the object's landing page (holds the metadata)
            date_url = f"{base_url}{object_path}"
            # Absolute URL of the PDF itself: the landing page plus "/_1.pdf"
            pdf_full_url = f"{date_url}/_1.pdf"

            print(date_url)
            date_response = requests.get(date_url, timeout=REQUEST_TIMEOUT)
            date_response.raise_for_status()
            date_soup = BeautifulSoup(date_response.content, 'html.parser')

            # The date is the first <li> of the <ul class="unstyled"> inside
            # the <dd> that follows the "Creation Date" <dt>.
            date_label = date_soup.find('dt', string='Creation Date')
            if date_label is None:
                raise ValueError("Could not find date element")
            date_text = (
                date_label.find_next_sibling('dd')
                .find('ul', class_='unstyled')
                .find('li')
                .get_text(strip=True)
            )
            if not date_text:
                raise ValueError("Could not find date element")

            # Reformat e.g. "January 30, 2023" as "2023_01_30"; strptime
            # raises ValueError when the text is in any other format.
            formatted_date = datetime.strptime(date_text, '%B %d, %Y').strftime('%Y_%m_%d')
            print(formatted_date)

        except (KeyError, ValueError, AttributeError, requests.RequestException) as e:
            # KeyError: anchor without href; AttributeError: an expected tag
            # is missing from the page; ValueError: date absent or unparsable;
            # RequestException: the object page could not be fetched.
            print(f"Error: {e}")
            continue

        # Record the PDF URL together with its publication date
        pdf_links.append({
            'date': formatted_date,
            'pdf_url': pdf_full_url
        })

# Write the PDF links to a JSON file
if pdf_links:
    with open('UCSD_article_pages.json', 'w') as f:
        json.dump(pdf_links, f, indent=4)
else:
    print("No PDF links found.")

import io

# Directory that receives one text file per issue plus the combined all_text.txt
output_dir = "journal_data/txt/UCSD"

# Load the JSON file
with open("UCSD_article_pages.json", "r") as f:
    pdf_information = json.load(f)

# Create the complete output directory before opening anything inside it;
# creating only "journal_data/txt" leaves the "UCSD" level missing and the
# open() below would fail on a fresh run.
os.makedirs(output_dir, exist_ok=True)

# The context manager guarantees the combined file is flushed and closed even
# if the run is interrupted part-way through.
with open(f"{output_dir}/all_text.txt", "a", encoding="utf-8") as output_file:
    # Loop through each PDF link and extract the text
    for info in pdf_information:
        pdf_full_url = info['pdf_url']
        date = info['date']
        print(date)
        try:
            # Download the PDF; the timeout keeps a stalled connection from
            # hanging the run and raise_for_status() surfaces HTTP errors
            # instead of handing an error page to the PDF parser.
            response = requests.get(pdf_full_url, timeout=60)
            response.raise_for_status()

            # Parse the PDF from memory
            pdf = PyPDF2.PdfReader(io.BytesIO(response.content))

            # Concatenate the text of every page. `or ""` keeps the join safe
            # should a page yield None instead of a string.
            text = "".join(page.extract_text() or "" for page in pdf.pages)

            # Name the per-issue file after the publication date
            if date is not None:
                filename = f"{output_dir}/{date}.txt"
                print(filename)
            else:
                filename = f"{output_dir}/unknown_date.txt"

            # Write once per document, after all pages are collected. Writing
            # inside the page loop would append the growing text to the
            # combined file on every page, duplicating earlier pages.
            with open(filename, "w", encoding="utf-8") as f:
                f.write(text)
            output_file.write(text)
        except Exception as e:
            # Best-effort batch job: report the failure and move on to the
            # next PDF rather than aborting the whole run.
            print(f"Error: {e}")
            continue





https://library.ucsd.edu/dc/object/bb75533932
2023_01_30
https://library.ucsd.edu/dc/object/bb8201865f
2023_01_23
https://library.ucsd.edu/dc/object/bb8884462f
2023_01_17
https://library.ucsd.edu/dc/object/bb9567064f
2023_01_09
https://library.ucsd.edu/dc/object/bb02154457
2022_12_23
https://library.ucsd.edu/dc/object/bb08980448
2022_12_04
https://library.ucsd.edu/dc/object/bb1580644j
2022_11_29
https://library.ucsd.edu/dc/object/bb22291132
2022_11_28
https://library.ucsd.edu/dc/object/bb29117140
2022_11_21
https://library.ucsd.edu/dc/object/bb3594311v
2022_11_14
https://library.ucsd.edu/dc/object/bb4242783m
2022_11_07
https://library.ucsd.edu/dc/object/bb49253868
2022_10_31
https://library.ucsd.edu/dc/object/bb6256451f
2022_10_17
https://library.ucsd.edu/dc/object/bb69390564
2022_10_10
https://library.ucsd.edu/dc/object/bb7621655w
2022_10_03
https://library.ucsd.edu/dc/object/bb82701222
2022_09_26
https://library.ucsd.edu/dc/object/bb89527248
2022_08_11
https://library.ucsd.edu/dc/obj

KeyboardInterrupt: 