In [None]:
# -----------------------------------------
# Step 1: Install Firefox and Geckodriver
# -----------------------------------------
!pip install selenium pandas beautifulsoup4 geckodriver-autoinstaller -q

!apt-get update > /dev/null
!apt-get install -y firefox > /dev/null # Install Firefox browser

import geckodriver_autoinstaller
# Automatically download/update and install geckodriver
geckodriver_autoinstaller.install()
print("Firefox installed and geckodriver managed.")

# Verify Firefox installation
!which firefox

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.4/9.4 MB[0m [31m37.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m492.9/492.9 kB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[?25hW: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Firefox installed and geckodriver managed.
/usr/bin/firefox


In [None]:
# -----------------------------------------
# Step 2: Import Libraries
# -----------------------------------------
from selenium import webdriver
from selenium.webdriver.firefox.service import Service as FirefoxService
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException, StaleElementReferenceException, ElementClickInterceptedException, WebDriverException
import time
import pandas as pd
from bs4 import BeautifulSoup
import re

# -----------------------------------------
# Step 3: Configure WebDriver (FIREFOX)
# -----------------------------------------
def setup_driver():
    """Create a headless Firefox WebDriver and return it.

    Progress is reported on stdout. If the driver cannot be created, the
    error is printed, a troubleshooting hint is added when the message
    matches a known pattern, and the original exception is re-raised.

    Returns:
        The ``webdriver.Firefox`` instance.

    Raises:
        Exception: whatever ``webdriver.Firefox`` raised.
    """
    firefox_options = FirefoxOptions()
    for flag in ("--headless", "--disable-gpu"):
        firefox_options.add_argument(flag)

    print("Setting up Firefox WebDriver...")
    try:
        browser = webdriver.Firefox(options=firefox_options)
    except Exception as e:
        print(f"Error during Firefox WebDriver setup: {e}")
        # Lower-case once so the substring checks below are case-insensitive.
        reason = str(e).lower()
        missing_binary_markers = (
            "unable to find runner binary",
            "expected firefox binary location",
        )
        if any(marker in reason for marker in missing_binary_markers):
            print("\nHint: Firefox browser installation might have failed or is not in the expected path.")
            print("      - Check output of !apt-get install firefox in the first cell.")
        elif "geckodriver" in reason:
            print("\nHint: Geckodriver (Firefox driver) issue.")
            print("      - Ensure geckodriver_autoinstaller.install() ran successfully in the first cell.")
        raise

    print("WebDriver setup complete (Firefox).")
    return browser

# -----------------------------------------
# Step 4: Function to Get Planet Links from All Pages
# -----------------------------------------
def get_all_planet_links(start_url, max_pages=None):
    """Walk the paginated catalog and collect the detail-page URL of every planet.

    For each catalog page the function scrolls to the bottom repeatedly so
    lazily loaded items get rendered, reads the heading link of every item
    container, and then clicks the "Next" pagination link. It stops when
    ``max_pages`` is reached, when "Next" is missing or disabled, or when an
    error occurs during pagination.

    Args:
        start_url: URL of the first catalog page.
        max_pages: Maximum number of catalog pages to process, or ``None`` to
            continue until pagination ends.

    Returns:
        A sorted list of the unique planet URLs that were collected. Errors
        raised while browsing are printed rather than propagated, so the list
        may be partial.

    Raises:
        Exception: whatever ``setup_driver()`` raises if the browser cannot be
            started (it is called before the protective ``try`` block).
    """
    driver = setup_driver()
    planet_links = []
    page_count = 0

    # Selector for the individual planet containers in the list
    planet_item_container_selector = (By.CSS_SELECTOR, "div#post-list-container div.hds-content-item")
    # Selector for the link *within* each container (target the heading link)
    link_selector_within_item = (By.CSS_SELECTOR, "a.hds-content-item-heading")

    try:
        print(f"Navigating to start URL: {start_url}")
        driver.get(start_url)
        print("Initial page load complete.")
        time.sleep(3)  # Initial pause for basic JS

        while True:
            page_count += 1
            print(f"\nProcessing page {page_count}...")

            # --- SCROLLING LOOP TO HANDLE LAZY LOADING ---
            print("Scrolling down to load all items...")
            last_height = driver.execute_script("return document.body.scrollHeight")
            consecutive_scrolls_no_change = 0
            max_scroll_attempts = 10  # Limit attempts to prevent infinite loops

            for i in range(max_scroll_attempts):
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                # Wait for new content to potentially load
                time.sleep(3)  # Adjust sleep time if needed

                new_height = driver.execute_script("return document.body.scrollHeight")
                if new_height == last_height:
                    consecutive_scrolls_no_change += 1
                    print(f"  Scroll attempt {i+1}: Height unchanged ({new_height}px). Consecutive: {consecutive_scrolls_no_change}")
                else:
                    consecutive_scrolls_no_change = 0  # Reset counter if height changed
                    print(f"  Scroll attempt {i+1}: Height changed ({last_height}px -> {new_height}px).")

                last_height = new_height

                # If height hasn't changed for a few scrolls, assume all content is loaded
                if consecutive_scrolls_no_change >= 3:
                    print("Page height stable after multiple scrolls. Assuming all items loaded.")
                    break
            else:  # Runs only if the loop completed without breaking
                print(f"Reached max scroll attempts ({max_scroll_attempts}). Proceeding...")
            # --- END SCROLLING LOOP ---

            wait_timeout = 20  # very generous, reduce if needed
            print(f"Waiting up to {wait_timeout}s for first planet item container...")
            try:
                WebDriverWait(driver, wait_timeout).until(
                    EC.visibility_of_element_located(planet_item_container_selector)
                )
                print("First planet item container is visible.")
                time.sleep(2)
            except TimeoutException:
                # Best effort: still try to read whatever is on the page below.
                print(f"Timeout waiting for VISIBLE planet item containers on page {page_count} even after scrolling.")

            # --- Find the item containers and read their heading links ---
            print("Attempting to find all planet item containers...")
            items_found_on_page = 0
            try:
                planet_item_elements = driver.find_elements(*planet_item_container_selector)
                items_found_on_page = len(planet_item_elements)
                print(f"Found {items_found_on_page} planet item containers on page {page_count}.")

                page_links = []
                for item in planet_item_elements:
                    try:
                        link_element = item.find_element(*link_selector_within_item)
                        href = link_element.get_attribute('href')
                        # Basic validation: keep only links under the catalog path
                        if href and href.startswith("https://science.nasa.gov/exoplanet-catalog/"):
                            page_links.append(href)
                    except NoSuchElementException:
                        print("  Warning: Found an item container without a heading link inside.")
                    except Exception as link_e:
                        print(f"  Error extracting link from an item: {link_e}")

                print(f"Extracted {len(page_links)} valid planet links from items on page {page_count}.")
                planet_links.extend(page_links)

                # Warn when a page yields fewer than 15 items.
                # NOTE(review): 390 is presumably the approximate number of catalog
                # pages when this was written (to avoid warning on the last page) - confirm.
                if items_found_on_page < 15 and page_count < 390:
                    print(f"  Warning: Found only {items_found_on_page} items. Lazy loading might not be fully complete or page structure changed.")

            except Exception as e:
                print(f"Error finding/processing planet items/links on page {page_count}: {e}")

            # --- PAGINATION LOGIC ---
            if max_pages is not None and page_count >= max_pages:
                print(f"Reached maximum specified pages ({max_pages}). Stopping pagination.")
                break

            try:
                print("Looking for 'Next' button...")
                # Locate the pagination container first
                pagination_nav = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, "nav.hds-pagination"))
                )
                # Find the 'Next' link *within* the pagination container
                next_button = pagination_nav.find_element(By.LINK_TEXT, "Next")

                # Check if the 'Next' button is disabled (might happen on the last page).
                # get_attribute() returns None for a missing attribute, so fall back
                # to "" instead of letting the membership test raise TypeError.
                next_button_classes = next_button.get_attribute("class") or ""
                if "disabled" in next_button_classes or next_button.get_attribute("aria-disabled") == "true":
                    print("'Next' button is disabled. Assuming end of catalog.")
                    break

                print("Found clickable 'Next' button. Scrolling and clicking...")
                driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", next_button)  # Scroll to center it
                time.sleep(1)

                # Click through JavaScript rather than WebElement.click()
                driver.execute_script("arguments[0].click();", next_button)
                print("'Next' button clicked via JS.")
                time.sleep(5)  # Wait for next page to start loading

            except (NoSuchElementException, TimeoutException):
                print("Could not find the 'Next' button using specific selectors. Assuming end of catalog.")
                break
            except ElementClickInterceptedException:
                print("Element click intercepted for 'Next'. Should not happen with JS click, but handling anyway. Stopping.")
                break
            except StaleElementReferenceException:
                print("Next button became stale before clicking. Stopping pagination.")
                break
            except Exception as e:
                print(f"An unexpected error occurred during pagination: {e}")
                break
            # --- END PAGINATION ---

    except Exception as e:
        print(f"An error occurred during the main loop: {e}")
    finally:
        print(f"\nFinished collecting links. Found {len(planet_links)} total links across {page_count} pages.")
        if driver:
            driver.quit()
            print("WebDriver closed.")

    # De-duplicate, drop empty values, and return in a stable (sorted) order.
    unique_links = sorted(set(filter(None, planet_links)))
    print(f"Returning {len(unique_links)} unique planet links.")
    return unique_links
# -----------------------------------------
# Step 5: Function to Scrape Details
# -----------------------------------------


def get_planet_details(driver, planet_url):
    """Scrape one planet detail page into a flat record.

    The page is loaded with ``driver``, parsed with BeautifulSoup, and the
    following is extracted: the planet name (from the first ``<h1>``), an
    inferred host star name, the labelled values in the
    ``smd-acf-grid-layout`` facts grid, and the host star spectral type
    (matched by regex in the page text).

    Args:
        driver: A Selenium WebDriver used to load the page.
        planet_url: URL of the planet detail page.

    Returns:
        dict with the keys ``name``, ``planet_type``, ``mass``,
        ``planet_radius``, ``orbital_period``, ``orbit_distance``,
        ``eccentricity``, ``discovery_method``, ``host_star_type``,
        ``host_star_name`` and ``url``. Fields that could not be read hold a
        ``'Not Found...'`` string. ``mass`` is a float in Earth masses when a
        number and a recognised unit were found. On a timeout or an
        unexpected non-WebDriver error, ``name`` and ``planet_type`` are set
        to an error marker and the record is still returned.

    Raises:
        WebDriverException: for any WebDriver failure other than a timeout,
            so the caller can restart the driver and retry instead of
            recording an error row.
    """
    print(f"  Scraping: {planet_url}")
    details = {  # Initialize necessary/target fields
        'name': 'Not Found', 'planet_type': 'Not Found',
        'mass': 'Not Found',  # Mass will store Earth Masses (numeric or 'Not Found')
        'planet_radius': 'Not Found', 'orbital_period': 'Not Found',
        'orbit_distance': 'Not Found', 'eccentricity': 'Not Found',
        'discovery_method': 'Not Found', 'host_star_type': 'Not Found',
        'host_star_name': 'Not Found', 'url': planet_url
    }
    # Map internal keys to the exact labels found on the page grid
    data_labels_in_grid = {
        'planet_type': 'Planet Type', 'discovery_method': 'Discovery Method',
        'mass': 'Planet Mass', 'planet_radius': 'Planet Radius',
        'orbital_period': 'Orbital Period', 'orbit_distance': 'Orbital Radius',
        'eccentricity': 'Eccentricity',
    }
    # Conversion factor
    JUPITER_MASS_IN_EARTH_MASS = 317.83

    try:
        driver.get(planet_url)
        detail_page_wait_timeout = 35
        WebDriverWait(driver, detail_page_wait_timeout).until(
            EC.visibility_of_element_located((By.TAG_NAME, "h1"))
        )
        time.sleep(2.0)  # Short pause after the title becomes visible

        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # --- Extract Planet Name & Infer Host Star Name ---
        try:
            name_element = soup.find('h1')
            details['name'] = name_element.text.strip() if name_element else 'Not Found'
            if details['name'] != 'Not Found':
                parts = details['name'].split()
                # A trailing single lowercase letter (e.g. "NGTS-13 b") is treated as
                # the planet designation; everything before it is the host star name.
                if len(parts) > 1 and len(parts[-1]) == 1 and parts[-1].islower():
                    details['host_star_name'] = " ".join(parts[:-1])
        except Exception as e:
            print(f"    Error extracting Name/Host Name: {e}")
            details['name'] = 'Error'

        # --- Find the specific container for key PLANET facts ---
        facts_container = soup.find('div', class_='smd-acf-grid-layout')

        if facts_container:
            fact_cells = facts_container.find_all('div', class_='smd-acf-grid-col')
            # Lower-cased label text -> raw value text, for every cell in the grid
            extracted_data_from_grid = {}
            for cell in fact_cells:
                label_tag = cell.find('div', class_='text-bold')
                value_tag_container = label_tag.find_next_sibling('div') if label_tag else None
                if label_tag and value_tag_container:
                    label_text_raw = label_tag.get_text(strip=True).replace(':', '')
                    value_text_raw = value_tag_container.get_text(strip=True)
                    # A value rendered as a list is joined into one comma-separated string
                    list_items = value_tag_container.select('ul > li > span')
                    if list_items:
                        value_text_raw = ', '.join(item.get_text(strip=True) for item in list_items)
                    extracted_data_from_grid[label_text_raw.lower()] = value_text_raw

            # Match extracted data to our desired keys
            for key, target_label in data_labels_in_grid.items():
                matched_value_raw = extracted_data_from_grid.get(target_label.lower(), 'Not Found')
                if not matched_value_raw or matched_value_raw == 'Not Found':
                    continue  # Keep the 'Not Found' default for this field

                cleaned_value = matched_value_raw  # Start with raw value

                # --- Specific Handling for Mass: convert to Earth masses ---
                if key == 'mass':
                    print(f"      Processing Mass field. Raw text: '{matched_value_raw}'")

                    mass_value_final = 'Not Found'  # Default for this field
                    try:
                        # Extract number first
                        number_match = re.search(r'([\d\.]+)', cleaned_value)
                        if number_match:
                            number = float(number_match.group(1))
                            raw_lower = matched_value_raw.lower()
                            # The unit is read from the raw text
                            if 'jupiter' in raw_lower:
                                mass_value_final = number * JUPITER_MASS_IN_EARTH_MASS
                                print(f"        Detected Jupiter unit. Converted value: {mass_value_final}")
                            elif 'earth' in raw_lower:
                                mass_value_final = number  # Already in Earth masses
                                print(f"        Detected Earth unit. Value: {mass_value_final}")
                            else:
                                mass_value_final = 'Not Found (Unit Unknown)'
                                print(f"        Unit unclear in mass text.")
                        else:
                            print(f"        Could not extract number from mass text.")
                    except (ValueError, TypeError, AttributeError) as mass_e:
                        print(f"      Error converting mass value '{cleaned_value}': {mass_e}")
                        mass_value_final = 'Not Found (Conv Error)'

                    details[key] = mass_value_final  # Numeric Earth mass or status string
                    continue  # Skip general cleaning

                # --- General Cleaning for other fields ---
                # NOTE: the unit text is stripped without conversion, so the stored
                # number no longer says whether it was relative to Earth or Jupiter
                # (radius) or expressed in days or years (period).
                if key == 'planet_radius':
                    cleaned_value = re.sub(r'\s*(?:x Earth|x Jupiter|Earths|Jupiters)\s*(?:Mass|Radius)?', '', cleaned_value, flags=re.IGNORECASE).strip()
                    cleaned_value = re.sub(r'\s*\(\s*estimate\s*\)\s*$', '', cleaned_value, flags=re.IGNORECASE).strip()
                elif key in ['orbital_period', 'orbit_distance', 'eccentricity']:
                    cleaned_value = re.sub(r'\s*(Days|Years|AU)', '', cleaned_value, flags=re.IGNORECASE).strip()

                details[key] = cleaned_value if cleaned_value else 'Not Found (Cleaned Empty)'

        else:
            print("    ERROR: Could not find the 'smd-acf-grid-layout' container.")

        # --- Extract Host Star Type ---
        # Look for phrases such as "G-type star" or "M dwarf star": first in up to 15
        # <p> tags of the main content, then in the first 'custom-field' div.
        star_type_found = False
        main_content = soup.find('main', id='main-content')
        search_area = main_content if main_content else soup
        description_tags = search_area.find_all('p', limit=15)
        star_type_pattern = re.compile(r'([OBAFGKM])(?:[- ]?type|\s+dwarf)\s+star', re.IGNORECASE)
        for tag in description_tags:
            text_content = tag.get_text(" ", strip=True)
            match = star_type_pattern.search(text_content)
            if match:
                details['host_star_type'] = match.group(1).upper()
                star_type_found = True
                break
        if not star_type_found:
            description_div = soup.find('div', class_='custom-field')
            if description_div:
                text_content = description_div.get_text(" ", strip=True)
                match = star_type_pattern.search(text_content)
                if match:
                    details['host_star_type'] = match.group(1).upper()

    except TimeoutException:
        # The page title did not appear in time (or the page load timed out):
        # record the failure for this planet and let the caller carry on.
        print(f"    Timeout waiting for H1 on page: {planet_url}")
        details['name'] = 'Timeout Error (H1)'
        details['planet_type'] = 'Timeout Error (H1)'
    except WebDriverException:
        # TimeoutException is a WebDriverException subclass and is handled above,
        # so this clause must come after it. Every other WebDriver failure is
        # re-raised so the caller can restart the driver and retry this URL;
        # swallowing it here would turn every later planet into an error row.
        raise
    except Exception as e:
        print(f"    An unexpected error occurred scraping {planet_url}: {e}")
        details['name'] = 'Scrape Error'
        details['planet_type'] = 'Scrape Error'

    return details

# --- setup_driver and get_all_planet_links (defined above) are used as-is by the main execution logic below ---

# -----------------------------------------
# Step 6: Main Execution Logic
# -----------------------------------------
START_URL = "https://science.nasa.gov/exoplanets/exoplanet-catalog/"
MAX_PAGES_TO_SCRAPE = None # None for all pages
MAX_PLANETS_TO_SCRAPE = None # None for all planets

# Data fields of a get_planet_details() record besides 'url' and 'name'.
# Placeholder rows for failed planets are filled from this list so they have
# exactly the same columns as successfully scraped rows.
DETAIL_FIELDS = [
    'planet_type', 'mass', 'planet_radius', 'orbital_period', 'orbit_distance',
    'eccentricity', 'discovery_method', 'host_star_type', 'host_star_name',
]


def _failed_record(url, status):
    """Build a placeholder row for a planet that could not be scraped."""
    record = {'url': url, 'name': status}
    for field in DETAIL_FIELDS:
        record[field] = status
    return record


# --- Part 1: Get all planet links ---
all_planet_links = get_all_planet_links(START_URL, max_pages=MAX_PAGES_TO_SCRAPE)

# --- Limit if testing ---
if MAX_PLANETS_TO_SCRAPE is not None:
    print(f"\nLimiting scraping to the first {MAX_PLANETS_TO_SCRAPE} planets found.")
    all_planet_links = all_planet_links[:MAX_PLANETS_TO_SCRAPE]

# --- Part 2: Scrape details for each planet (with driver restart logic) ---
if all_planet_links:
    planet_data = []
    detail_driver = None # Created lazily; None means "needs (re)start"
    max_retries = 1 # Allow one restart attempt per planet
    planets_processed_since_restart = 0
    planets_per_restart = 50 # How many planets to process before proactively restarting driver
    driver_setup_failed = False # Set when a new driver cannot be started at all

    try:
        for i, link in enumerate(all_planet_links):
            print(f"\nProcessing planet {i+1}/{len(all_planet_links)}")
            retries = 0
            success = False

            # Proactive restart logic
            if detail_driver and planets_processed_since_restart >= planets_per_restart:
                print(f"Proactively restarting driver after {planets_processed_since_restart} planets...")
                try:
                    detail_driver.quit()
                except Exception as quit_e:
                    print(f"  Error quitting driver before proactive restart: {quit_e}")
                detail_driver = None
                planets_processed_since_restart = 0

            while not success and retries <= max_retries:
                if detail_driver is None:
                    print("  Driver is not running. Setting up new driver...")
                    try:
                        detail_driver = setup_driver() # Setup Firefox driver
                        planets_processed_since_restart = 0 # Reset counter on new driver
                    except Exception as setup_e:
                        print(f"  FATAL: Failed to set up driver: {setup_e}. Skipping remaining planets.")
                        driver_setup_failed = True
                        break # Leave the retry loop; the planet loop stops just below

                try:
                    # Attempt to get details
                    details = get_planet_details(detail_driver, link)
                    planet_data.append(details)
                    success = True # Mark as success for this planet
                    planets_processed_since_restart += 1 # Increment counter
                    time.sleep(1.5)

                except (WebDriverException, TimeoutException) as e: # Driver crashes or timeouts raised by get_planet_details
                    print(f"  Encountered error on attempt {retries+1} for {link}: {type(e).__name__}")
                    retries += 1
                    print("  Attempting to quit potentially unstable driver...")
                    try:
                        detail_driver.quit()
                    except Exception as quit_e:
                        print(f"    Error quitting driver after exception: {quit_e}")
                    detail_driver = None # Signal that driver needs restart
                    if retries <= max_retries:
                        print(f"  Will retry (attempt {retries}/{max_retries}) after restarting driver.")
                        time.sleep(5) # Wait a bit before restarting driver

                except KeyboardInterrupt:
                    print("\nScraping interrupted by user.")
                    raise # Re-raise to stop script (the finally below still closes the driver)
                except Exception as e: # Catch other unexpected errors in get_planet_details
                    print(f"  Unexpected error during get_planet_details for {link}: {e}")
                    planet_data.append(_failed_record(link, 'Unexpected Scrape Error'))
                    success = True # Mark as 'processed' (with error) to move to next planet

            if driver_setup_failed:
                # Without a working browser no further planet can be scraped:
                # stop here and save what has been collected so far.
                break

            if not success: # If retries failed
                print(f"  Failed to process {link} after {max_retries} retries. Marking as failed.")
                planet_data.append(_failed_record(link, 'Retry Failed'))

    finally:
        # --- Final cleanup (also runs on KeyboardInterrupt) ---
        if detail_driver:
            try:
                detail_driver.quit()
                print("\nFinal WebDriver closed after scraping details.")
            except Exception as final_quit_e:
                print(f"\nError during final driver quit: {final_quit_e}")

    # --- Part 3: Create DataFrame and Save ---
    if planet_data:
        df = pd.DataFrame(planet_data)

        cols_order = [
            'name',
            'host_star_name',
            'url',
            # Target Label
            'planet_type',
            # Key Features (directly scraped)
            'mass',
            'planet_radius',
            'orbital_period',
            'orbit_distance',
            'eccentricity',
            # Contextual Features (directly scraped)
            'discovery_method',
            'host_star_type',
        ]

        # Preferred columns first (when present), then any remaining columns.
        actual_cols = [col for col in cols_order if col in df.columns]
        actual_cols += [col for col in df.columns if col not in actual_cols]
        df = df[actual_cols]

        print("\nSample of scraped data:")
        print(df.head())
        print(f"\nTotal planets processed (incl. errors): {len(df)}")

        output_filename = "nasa_exoplanet_catalog.csv"
        df.to_csv(output_filename, index=False, encoding='utf-8')
        print(f"\nData saved to {output_filename}")

    else:
        print("\nNo planet data was successfully scraped.")
else:
    print("\nNo planet links were found. Cannot proceed to scrape details.")

print("\nScript finished.")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  Scraping: https://science.nasa.gov/exoplanet-catalog/ngts-13-b/
      Processing Mass field. Raw text: '4.84 Jupiters'
        Detected Jupiter unit. Converted value: 1538.2972

Processing planet 4887/5862
  Scraping: https://science.nasa.gov/exoplanet-catalog/ngts-14-a-b/
      Processing Mass field. Raw text: '29.24036 Earths'
        Detected Earth unit. Value: 29.24036

Processing planet 4888/5862
  Scraping: https://science.nasa.gov/exoplanet-catalog/ngts-15-b/
      Processing Mass field. Raw text: '0.751 Jupiters'
        Detected Jupiter unit. Converted value: 238.69033

Processing planet 4889/5862
  Scraping: https://science.nasa.gov/exoplanet-catalog/ngts-16-b/
      Processing Mass field. Raw text: '0.667 Jupiters'
        Detected Jupiter unit. Converted value: 211.99261

Processing planet 4890/5862
  Scraping: https://science.nasa.gov/exoplanet-catalog/ngts-17-b/
      Processing Mass field. Raw text: '0.76