In [None]:
# ---------------- Imports ----------------
import os
import sys
from datetime import datetime
from urllib.parse import urljoin

import pandas as pd
import requests
import yaml
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager



In [None]:
# ---------------- Config ----------------
# Read explicitly as UTF-8 instead of relying on the platform's locale default encoding
with open("../../../config/config.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# Root folder under which this notebook writes its metadata CSV and downloaded PDFs
data_folder = os.path.join(config["paths"]["proj_store"], "data")


In [None]:
# ---------------- Setup ----------------
# Set up Chrome options
chrome_options = Options()
chrome_options.binary_location = "/usr/bin/google-chrome"  # Update with your Chrome path
chrome_options.add_argument("--headless")  # Run in headless mode
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

# Automatically manage ChromeDriver
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=chrome_options)

# The driver is deliberately left open on success: a later cell reuses it to visit the
# individual item pages. If this first load fails, close it so that a broken run does
# not leave a headless Chrome process behind.
try:
    # Open the archive landing page
    driver.get('https://www.jfklibrary.org/asset-viewer/archives/rpcv')
    # Wait for content to load
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, 'asset-viewer-container'))
    )
    # Capture the rendered HTML
    html_content = driver.page_source
except Exception:
    driver.quit()
    raise


In [None]:
# Check: preview the first 100 characters of the captured page source
html_content[:100]


In [None]:
# Extract links

def extract_anchor_tags(html_string):
    """Collect every hyperlink of an HTML document into a DataFrame.

    Args:
        html_string: HTML markup to parse.

    Returns:
        A DataFrame with one row per ``<a>`` tag that has an ``href`` attribute and
        two columns: ``href`` (the raw attribute value) and ``text`` (the link's
        visible text with surrounding whitespace stripped). Both columns are present
        even when the document contains no links.
    """
    soup = BeautifulSoup(html_string, 'html.parser')

    # Only <a> tags that actually carry an href are of interest
    anchor_data = [
        {'href': a_tag['href'], 'text': a_tag.get_text(strip=True)}
        for a_tag in soup.find_all('a', href=True)
    ]

    # Explicit columns keep the schema stable for a link-less page; without them the
    # result would have no columns at all and a later df['href'] lookup would raise KeyError.
    return pd.DataFrame(anchor_data, columns=['href', 'text'])



In [None]:
# Extract anchor tags into a DataFrame
# (one row per <a href> found in the landing page captured in `html_content`)
df = extract_anchor_tags(html_content)
display(df.head())


In [None]:
# Keep only the links whose href begins with "RPCV"
subset_df = df.loc[lambda links: links['href'].str.startswith("RPCV")]

display(subset_df.head())


In [None]:
# Check which pages have PDFs
def _find_pdf_link(soup):
    """Return the href of the "Download PDF Transcript" button, or None if there is none."""
    pdf_download_url = None
    for link in soup.find_all("a", class_="button"):
        href = link.get("href")
        if "Download PDF Transcript" in link.get_text(strip=True) and href:
            # No break on purpose: when several buttons match, the last one wins.
            pdf_download_url = href
    return pdf_download_url


def _find_audio_link(driver, soup, url):
    """Return the absolute "Download Audio File" URL of the loaded page, or None.

    ``soup`` is the page as first loaded; ``driver`` must still be on that page because
    the link is read from a fresh copy of its source after waiting on the ``download``
    attribute. This step is best effort: a Selenium failure is reported and yields None.
    """
    # A CSS selector matches the two classes in any order and alongside extra classes,
    # exactly like the selector used for the wait below. find_all(class_="button download-link")
    # would only match that exact attribute string.
    has_audio_button = any(
        "Download Audio File" in link.get_text(strip=True)
        for link in soup.select("a.button.download-link")
    )
    if not has_audio_button:
        return None

    try:
        # NOTE(review): this only waits for a `download` attribute to be present; an
        # attribute that is present but still empty passes the wait and is rejected below.
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "a.button.download-link[download]"))
        )
        # Re-read the page: the markup may have changed since `soup` was parsed
        refreshed_soup = BeautifulSoup(driver.page_source, 'html.parser')
    except WebDriverException:
        # Covers the wait timing out (TimeoutException is a WebDriverException)
        print(f"No valid Audio File link found for: {url}")
        return None

    audio_download_url = None
    for link in refreshed_soup.select("a.button.download-link"):
        if "Download Audio File" in link.get_text(strip=True):
            href = link.get("href")
            # Accept only absolute links whose `download` attribute is non-empty
            if href and href.startswith("http") and link.get("download"):
                audio_download_url = href
    return audio_download_url


def _extract_metadata_fields(soup):
    """Map each labelled field on the page to its value(s), multiple values joined by "; "."""
    field_data = {}
    for field in soup.find_all(class_="field"):
        label_element = field.find(class_="field__label")
        value_elements = field.find_all(class_="field__item")
        if label_element and value_elements:
            field_data[label_element.text.strip()] = "; ".join(
                value.text.strip() for value in value_elements
            )
    return field_data


def check_for_text_in_links_verbose(df, base_url, search_text, driver):
    """Visit every linked item page and keep those whose text contains ``search_text``.

    Args:
        df: DataFrame with an ``href`` column. It is not modified; a copy is used.
        base_url: Prefix prepended to each ``href`` to build the page URL.
        search_text: Text that must occur in the page text for the row to be kept.
        driver: Selenium WebDriver used to load the pages.

    Returns:
        A DataFrame with one row per kept page, holding the input columns plus
        ``full_url``, one column per labelled metadata field found on the page,
        ``pdf_download_url``, ``audio_download_url`` (either may be None) and
        ``retrieved_date`` (today, YYYY-MM-DD). It is empty when no page qualifies.

    A page that Selenium fails to load is reported and skipped, so a single bad page
    does not discard the rows already collected. Note that the PDF link is always
    located by the literal button text "Download PDF Transcript", independently of
    ``search_text``.
    """
    df = df.copy()
    df['full_url'] = base_url + df['href']

    valid_entries = []  # Store only valid rows

    for _, row in df.iterrows():
        url = row['full_url']
        try:
            driver.get(url)  # Load page using Selenium
            soup = BeautifulSoup(driver.page_source, 'html.parser')

            if search_text not in soup.text:
                print(f"Skipping URL (text not found): {url}")
                continue

            print(f"Text found in: {url}")

            row_data = row.to_dict()
            row_data.update(_extract_metadata_fields(soup))
            row_data["pdf_download_url"] = _find_pdf_link(soup)
            row_data["audio_download_url"] = _find_audio_link(driver, soup, url)
            # Add today's date in "retrieved_date" (YYYY-MM-DD format)
            row_data["retrieved_date"] = datetime.today().strftime("%Y-%m-%d")

            valid_entries.append(row_data)

        except WebDriverException as e:
            # Pages are loaded through Selenium, so its exceptions (not those of
            # `requests`) are the ones that can actually occur here.
            print(f"Error accessing {url}: {e}")

    # Convert valid entries into a new DataFrame
    return pd.DataFrame(valid_entries)




In [None]:
# Base URL and text to search
base_url = "https://www.jfklibrary.org/asset-viewer/archives/"
search_text = "Download PDF Transcript"

# Process the DataFrame
try:
    updated_df = check_for_text_in_links_verbose(subset_df, base_url, search_text, driver)
finally:
    # No later cell in view uses the browser, so release the Chrome process here, even
    # if the crawl fails. Re-run the setup cell before re-running this one.
    driver.quit()

display(updated_df.shape)
display(updated_df.head())



In [None]:
# clean up
# 'href' is always present. 'Person(s)' is a field scraped from the item pages, so the
# column only exists if at least one page listed it; its absence must not abort the cell.
metadata_df = updated_df.drop(columns=['href']).drop(columns=['Person(s)'], errors='ignore')

metadata_df = metadata_df.rename(columns={
    "full_url": "item_link",
    "text": "item_link_text",
})

# Normalise headers: lower-case, strip punctuation, spaces -> underscores
metadata_df.columns = metadata_df.columns.str.lower().str.replace(r'[^\w\s]', '', regex=True).str.replace(' ', '_')

# Build the file name the same way download_pdfs_with_custom_names does ('/' -> '_' and
# a single ".pdf" suffix) so this column always matches the file written to disk.
pdf_file_names = metadata_df['digital_identifier'].astype(str).str.replace('/', '_', regex=False)
pdf_file_names = pdf_file_names.where(pdf_file_names.str.endswith('.pdf'), pdf_file_names + '.pdf')
# move to first col
metadata_df.insert(0, 'original_file_name', pdf_file_names)


display(metadata_df.shape)
display(metadata_df.head())



In [None]:
# save
metadata_dir = f"{data_folder}/raw_data/machine_collected/jfk_library/returned_peace_corps_volunteers"
# This folder is otherwise only created by the PDF download step, which runs after this
# cell; create it here so a first run does not fail on a missing directory.
os.makedirs(metadata_dir, exist_ok=True)
metadata_df.to_csv(f"{metadata_dir}/metadata.csv", index=False)


In [None]:
# Download PDFs
def _resolve_pdf_link(row, pdf_text):
    """Return the PDF link for one metadata row, or None if the item page offers none.

    A link already recorded in the row's ``pdf_download_url`` field is used as is.
    Otherwise the item page is fetched and the first ``<a href>`` whose visible text
    contains ``pdf_text`` (case-insensitive) is returned.
    """
    known_link = row.get('pdf_download_url')
    if isinstance(known_link, str) and known_link:
        return known_link

    response = requests.get(row['item_link'], timeout=10)  # Set a timeout
    response.raise_for_status()  # Raise an exception for HTTP errors
    soup = BeautifulSoup(response.text, 'html.parser')

    # No text=True filter: it is a deprecated spelling of string=True and would skip
    # anchors whose label is not a single plain string (e.g. an icon plus text).
    for a_tag in soup.find_all('a', href=True):
        if pdf_text.lower() in a_tag.get_text(strip=True).lower():
            return a_tag['href']
    return None


def download_pdfs_with_custom_names(df, download_folder, base_url, pdf_text):
    """Download the PDF transcript of every row into ``download_folder``.

    Args:
        df: DataFrame with ``item_link`` (item page URL) and ``digital_identifier``
            columns. An optional ``pdf_download_url`` column, when it holds a link,
            is used directly instead of re-fetching the item page.
        download_folder: Target directory; created (with parents) if missing.
        base_url: URL against which relative PDF links are resolved.
        pdf_text: Link text identifying the PDF download link on an item page.

    Returns:
        None. Each PDF is saved as ``<digital_identifier>.pdf`` with any '/' in the
        identifier replaced by '_'. Rows without a PDF link and rows whose HTTP
        requests fail are reported on stdout and skipped.
    """
    os.makedirs(download_folder, exist_ok=True)

    for _, row in df.iterrows():
        url = row['item_link']
        # str() so that a non-string identifier (e.g. a missing value) cannot crash the loop;
        # slashes are replaced to avoid file system issues
        file_name = str(row['digital_identifier']).replace('/', '_')
        if not file_name.endswith(".pdf"):
            file_name += ".pdf"  # Ensure the file has a .pdf extension

        try:
            pdf_link = _resolve_pdf_link(row, pdf_text)
            if not pdf_link:
                print(f"No PDF download link found on: {url}")
                continue

            # Absolute links pass through unchanged; relative ones are resolved against
            # base_url whether or not they start with a slash.
            pdf_link = urljoin(base_url, pdf_link)

            # Download the PDF
            pdf_response = requests.get(pdf_link, timeout=10)
            pdf_response.raise_for_status()  # Ensure the request was successful

            # Save the PDF with the custom file name
            pdf_filepath = os.path.join(download_folder, file_name)
            with open(pdf_filepath, 'wb') as pdf_file:
                pdf_file.write(pdf_response.content)
            print(f"Downloaded: {pdf_filepath} from {url}")

        except requests.RequestException as e:
            print(f"Error accessing {url}: {e}")


# `metadata_df` supplies the `item_link` and `digital_identifier` columns the downloader reads
download_folder = f"{data_folder}/raw_data/machine_collected/jfk_library/returned_peace_corps_volunteers"  # Specify the folder to save PDFs
# NOTE: this rebinds `base_url` to the site root, which is prepended to relative PDF links
base_url = "https://www.jfklibrary.org"  # Update if needed
pdf_text = "Download PDF Transcript"  # Text to identify the PDF download link

download_pdfs_with_custom_names(metadata_df, download_folder, base_url, pdf_text)

