In [None]:
# ---------------- Imports ----------------
import os
import sys
import requests
import pandas as pd
import re

from datetime import datetime

import yaml

from bs4 import BeautifulSoup
from urllib.parse import urljoin
from concurrent.futures import ThreadPoolExecutor, as_completed



In [None]:
# ---------------- Config ----------------
# Index page that lists every oral-history participant
url = 'https://historycollection.jsc.nasa.gov/JSCHistoryPortal/history/oral_histories/participants.htm'

# Load the project settings (the path is resolved against the current working directory)
with open("../../../config/config.yaml", "r") as config_file:
    config = yaml.safe_load(config_file)

# Project data root: <paths.proj_store>/data
data_folder = os.path.join(config["paths"]["proj_store"], "data")

# Directory that receives the metadata CSV and the downloaded HTML files
save_directory = f"{data_folder}/raw_data/machine_collected/jsc_oral_history/"

# Create the output directory up front; a pre-existing directory is not an error
os.makedirs(save_directory, exist_ok=True)

In [None]:
# ---------------- Setup ----------------
# Fetch the participants index page. The timeout keeps a stalled connection
# from hanging the notebook indefinitely.
response = requests.get(url, timeout=30)
response.raise_for_status()  # Ensure the request was successful

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')

# name -> (category, absolute link). Only the first occurrence of a name is
# kept, so each participant ends up with exactly one category.
name_category_map = {}

# State for the walk below:
#   current_category   - most recent bold text accepted as a section heading
#   after_table        - True once a presentation table has been seen and no
#                        new heading has been accepted since
#   first_category_set - True once the very first bold element was consumed
current_category = None
after_table = False
first_category_set = False

# Walk tables and bold elements in document order so that each table is
# attributed to the heading that precedes it.
for element in soup.find_all(['table', 'b', 'strong']):
    is_heading = element.name in ['b', 'strong']

    # The first bold element on the page becomes the initial category
    # unconditionally (even if its text is empty).
    if not first_category_set and is_heading:
        current_category = " ".join(element.get_text(separator=" ", strip=True).split())
        first_category_set = True
        continue

    # Presentation tables hold the participant links
    if element.name == 'table' and element.get('role') == 'presentation':
        after_table = True

        if not current_category:
            continue  # Skip tables if no category has been set yet

        for a_tag in element.find_all('a', href=True):
            # Collapse runs of whitespace/newlines inside the link text
            name = " ".join(a_tag.text.split())

            # Resolve the href against the index page URL. urljoin handles
            # root-relative, page-relative, protocol-relative and already
            # absolute hrefs uniformly.
            link = urljoin(url, a_tag['href'].strip())

            # Only assign a category if the name hasn't been seen before
            if name not in name_category_map:
                name_category_map[name] = (current_category, link)

    # After a table, the FIRST non-empty bold element becomes the new category;
    # later bold elements are ignored until another table has been seen.
    elif after_table and is_heading:
        category_text = " ".join(element.get_text(separator=" ", strip=True).split())

        if category_text:  # Ignore empty tags
            current_category = category_text
            after_table = False  # Reset flag so we don't overwrite it again

# Convert dictionary to DataFrame
df = pd.DataFrame(
    [(cat, name, link) for name, (cat, link) in name_category_map.items()],
    columns=['category', 'targets', 'collection_link']
)


# Function to reformat names (Last, First -> First Last)
def reformat_name(name):
    """Turn a ``"Last, First"`` string into ``"First Last"``.

    The input is split on the exact separator ``", "``. Anything that does
    not yield exactly two pieces is returned unchanged.
    """
    pieces = name.split(", ")
    if len(pieces) != 2:
        return name  # Unexpected shape: leave the name untouched

    last, first = pieces
    return f"{first} {last}"

# "Last, First" -> "First Last", then drop any remaining commas and collapse
# doubled periods. Both replacements are literal, so no regex is needed.
df['targets'] = (
    df['targets']
    .apply(reformat_name)
    .str.replace(',', '', regex=False)
    .str.replace('..', '.', regex=False)
)

# Remove rows without a target (NaN or blank). The explicit .copy() makes the
# filtered frame independent, so the column assignment below does not trigger
# pandas' SettingWithCopyWarning.
df = df.dropna(subset=['targets'])
df = df[df['targets'].str.strip() != ''].copy()

# clean categories: relabel one known stray heading and drop rows filed under
# the page's notice text
df['category'] = df['category'].replace('January 31, 2020', 'JSC Oral History Project')
df = df[df['category'] != 'Please note: Links on this page are active once the Oral History transcript is archived in the JSC History Collection.']


# Display the DataFrame
display(df.shape)
display(df.head())


In [None]:
def generate_full_link(row):
    """Build the absolute URL of a transcript file from a metadata row.

    Args:
        row: mapping-like object (dict or DataFrame row) with the keys
            ``collection_link`` (URL of the page the file was found on) and
            ``original_file_name`` (href of the file as written on that page).

    Returns:
        ``original_file_name`` resolved against ``collection_link``.
    """
    # urljoin already resolves a relative reference against the directory of
    # the base URL, so the base needs no manual trimming. Trimming it by hand
    # would also strip the host from a base URL that has no path component.
    return urljoin(row['collection_link'], row['original_file_name'])


def update_dataframe(df):
    """Add a ``document_link`` column to ``df`` and return the same frame.

    The column holds the result of ``generate_full_link`` for every row.
    ``df`` itself is modified; the return value is that same object.
    """
    document_links = df.apply(generate_full_link, axis=1)
    df['document_link'] = document_links
    return df


def extract_html_links(url):
    """Collect the hrefs of all anchors on a page whose link text mentions "html".

    Args:
        url: address of the page to fetch.

    Returns:
        A list with the ``href`` value (exactly as written in the page, so it
        may be relative) of every ``<a>`` whose text contains ``html`` in any
        casing. An empty list is returned when the request fails.
    """
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # Treat HTTP error statuses as failures
    except requests.RequestException as exc:
        # Still best-effort (the caller gets an empty list), but report the
        # failure so a network problem is not mistaken for "no transcripts".
        print(f"Error fetching {url}: {exc}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')

    # Keep only anchors whose text contains 'html' in any casing
    return [a['href'] for a in soup.find_all('a', href=True) if 'html' in a.text.lower()]


def process_row(row):
    """Expand one metadata row into one dict per transcript link.

    The page at ``row['collection_link']`` is scanned with
    ``extract_html_links``. Each link found yields a copy of the row with the
    link stored under ``original_file_name``. When nothing is found, a single
    copy with ``original_file_name`` set to ``None`` is returned.
    """
    # Fall back to a single None so the row is still represented in the output
    file_names = extract_html_links(row['collection_link']) or [None]
    return [{**row, 'original_file_name': file_name} for file_name in file_names]


def expand_dataframe(df, url_column):
    """Expand ``df`` to one row per transcript link, fetching pages concurrently.

    Every row of ``df`` is passed to ``process_row`` on a thread pool and the
    dicts it returns are gathered into a new DataFrame.

    Args:
        df: frame whose rows are handed to ``process_row``.
        url_column: currently unused; ``process_row`` reads the
            ``collection_link`` key directly. Kept so existing calls keep
            working.

    Returns:
        A new DataFrame built from all dicts returned by ``process_row``,
        in the same order as the rows of ``df``.
    """
    rows = [row for _, row in df.iterrows()]
    expanded_data = []

    with ThreadPoolExecutor(max_workers=24) as executor:
        # executor.map yields results in submission order, whereas
        # as_completed yields them in completion order. Keeping input order
        # makes the resulting frame reproducible from run to run.
        for done, records in enumerate(executor.map(process_row, rows), start=1):
            expanded_data.extend(records)

            # Print update every 100 rows
            if done % 100 == 0:
                print(f"Processed {done} rows...")

    print("Expansion complete.")
    return pd.DataFrame(expanded_data)



In [None]:
# ---------------- Main ----------------
# One row per (participant, transcript link)
expanded_df = expand_dataframe(df, 'collection_link')

# Keep only rows that actually carry a file name: first discard missing
# values, then discard blank strings, then renumber the rows.
has_file_name = expanded_df['original_file_name'].notna()
expanded_df = expanded_df[has_file_name]
is_blank = expanded_df['original_file_name'].str.strip() == ''
expanded_df = expanded_df[~is_blank].reset_index(drop=True)

# Resolve every file name to a full URL (adds the 'document_link' column)
updated_df = update_dataframe(expanded_df)

# Constant label identifying the source collection
updated_df['collection'] = "NASA Johnson Space Center Oral History Project"

# Reduce the file name to its last path component. This happens after
# 'document_link' has been built, which uses the value as found on the page.
updated_df['original_file_name'] = updated_df['original_file_name'].apply(
    lambda file_name: os.path.basename(file_name) if pd.notna(file_name) else file_name
)

# Stamp every row with today's date (YYYY-MM-DD)
retrieval_stamp = datetime.today().strftime('%Y-%m-%d')
updated_df['retrieved_date'] = retrieval_stamp

# Show the size and the first rows of the result
display(updated_df.shape)
display(updated_df.head())

# Persist the metadata next to the downloaded files
updated_df.to_csv(f'{save_directory}/metadata.csv', index=False)



In [None]:
# Loop through each document URL and save its HTML.
# The loop uses its own variable names (document_url, page) so that it does not
# overwrite the notebook-level `url` / `response` used by the setup cell.
for document_url, original_filename in zip(updated_df['document_link'], updated_df['original_file_name']):

    # Replace whatever extension the source file had with ".html"
    base_filename = os.path.splitext(original_filename)[0]
    safe_filename = f"{base_filename}.html"
    file_path = os.path.join(save_directory, safe_filename)

    try:
        # The timeout keeps one stalled download from blocking the whole loop;
        # a timeout is reported like any other request failure.
        page = requests.get(document_url, timeout=30)
        page.raise_for_status()  # Raise an error for bad responses
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {document_url}: {e}")
        continue

    # Save the decoded page text as UTF-8
    with open(file_path, "w", encoding="utf-8") as file:
        file.write(page.text)

    print(f"Saved raw HTML from {document_url} to {file_path}")

