In [None]:
# ---------------- Imports ----------------
import os
import requests
import csv
import sys

from datetime import datetime

import yaml

from bs4 import BeautifulSoup
from urllib.parse import urljoin



In [None]:
# ---------------- Config ----------------
# Load the project configuration. The encoding is pinned to UTF-8 so that the
# file is decoded the same way on every platform instead of falling back to
# the locale's default encoding.
with open("../../../config/config.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# Root folder for all data files, derived from config["paths"]["proj_store"]
data_folder = os.path.join(config["paths"]["proj_store"], "data")



In [None]:
# ---------------- Setup ----------------
# Slug of the collection page to scrape. Other slugs noted as alternatives:
#   nprc-oral-histories
#   oral-history-at-the-national-archives
#   veterans-oral-histories
#   assembly-oral-histories
resource_choice = 'veterans-oral-histories'

# Page that lists the PDFs for the chosen collection
base_url = f'https://www.archives.gov/about/history/{resource_choice}'

# Folder that receives the downloaded PDFs; the folder name is the slug with
# hyphens turned into underscores
collection_dir_name = resource_choice.replace("-", "_")
download_dir = f'{data_folder}/raw_data/machine_collected/nara/{collection_dir_name}'
os.makedirs(download_dir, exist_ok=True)

# CSV file, stored next to the PDFs, that holds the scraped metadata
metadata_file = f'{download_dir}/metadata.csv'

def download_file(url, folder, timeout=30):
    """Download ``url`` into ``folder`` and return the path of the saved file.

    The file name is the last ``/``-separated segment of ``url``. The body is
    streamed to a temporary ``<name>.part`` file and only moved to its final
    name once the whole transfer has succeeded, so a failed or interrupted
    download never leaves a truncated file under the final name.

    Args:
        url: Address of the file to download.
        folder: Existing directory in which the file is stored.
        timeout: Value passed to ``requests.get`` as its ``timeout`` argument
            (seconds). Without one, a stalled server would block forever.

    Returns:
        The path of the downloaded file inside ``folder``.

    Raises:
        ValueError: If no file name can be derived from ``url`` (for example
            when it ends with ``/``).
        requests.HTTPError: If the server answers with an error status
            (raised by ``raise_for_status``).
    """
    filename = url.split('/')[-1]
    if not filename:
        # Fail early with a clear message instead of trying to open the folder itself
        raise ValueError(f'Cannot derive a file name from URL: {url!r}')

    local_filename = os.path.join(folder, filename)
    partial_filename = f'{local_filename}.part'
    try:
        with requests.get(url, stream=True, timeout=timeout) as r:
            r.raise_for_status()
            with open(partial_filename, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
        # Publish the file under its final name only after a complete transfer
        os.replace(partial_filename, local_filename)
    finally:
        # After a successful os.replace the partial file is gone and this is a
        # no-op; after a failure it removes the incomplete download.
        if os.path.exists(partial_filename):
            os.remove(partial_filename)

    print(f'Downloaded: {local_filename}')
    return local_filename



In [None]:
# ---------------- Main ----------------
# Scrape the collection page, download every linked PDF, and write one
# metadata row per PDF to a CSV file.

# Fetch the webpage content; the timeout keeps a stalled server from hanging the cell
response = requests.get(base_url, timeout=30)
response.raise_for_status()  # Ensure the request was successful

# Parse the HTML content
soup = BeautifulSoup(response.text, 'html.parser')

# Collect all <p> tags; the ones containing a link to a PDF are the entries
all_paragraphs = soup.find_all('p')

metadata = []

for i, p in enumerate(all_paragraphs):
    link = p.find('a', href=lambda href: href and href.lower().endswith('.pdf'))
    if not link:
        continue

    pdf_url = urljoin(base_url, link['href'])
    pdf_filename = link['href'].split('/')[-1]
    name = link.get_text(strip=True)

    # Use the next <p> as the description when it actually contains text
    description = "No description"
    if i + 1 < len(all_paragraphs):  # Check if there is a next <p>
        # get_text() already includes the text of nested <a> tags, so the
        # paragraph is only read here. Unwrapping its links in place would
        # modify the shared parse tree: if the next paragraph is itself a PDF
        # entry, its link would be gone by the time the loop reaches it and
        # that PDF would be skipped.
        next_text = all_paragraphs[i + 1].get_text(strip=False)
        if next_text.strip():  # Ensure it's not empty or whitespace only
            description = next_text

    # Download the PDF
    downloaded_path = download_file(pdf_url, download_dir)

    # Record metadata
    metadata.append({
        'interviewee_name': name,
        'description': description,
        'file_url': pdf_url,
        'collection_url': base_url,
        'original_file_name': pdf_filename,
        'retrieved_date': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    })

# Write metadata to a CSV file (columns in the order listed below)
fieldnames = [
    'interviewee_name',
    'description',
    'file_url',
    'collection_url',
    'original_file_name',
    'retrieved_date',
]
with open(metadata_file, mode='w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(metadata)

print(f'Metadata written to {metadata_file}')
