In [5]:
import json
import re
import time
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [6]:
def _text_or_none(tag):
    """Return the stripped text of a parsed tag, or None when the tag is missing."""
    return tag.text.strip() if tag is not None else None


def _find_in(parent, *args, **kwargs):
    """Call ``parent.find(...)`` but tolerate a missing parent by returning None."""
    return parent.find(*args, **kwargs) if parent is not None else None


def scrape_launch_data(url):
    """Scrape one launch detail page into a nested dictionary.

    Args:
        url: Address of the launch detail page to download and parse.

    Returns:
        dict: Launch fields found in the main card (``mission_status``,
        ``title``, ``launch_time``, ``description``, ``videos``,
        ``article_link``) when that card exists, plus ``rocket_data`` (dict),
        ``mission_data`` (dict) and ``location``. Elements that cannot be
        found on the page yield ``None`` (or an empty dict) instead of
        raising.

    Raises:
        requests.exceptions.RequestException: If the request times out, fails,
            or the server answers with an HTTP error status.
    """
    # A timeout keeps a stalled connection from hanging a long crawl, and the
    # status check stops an error page from being parsed into an "empty" record.
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    html_content = response.content

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')

    # Initialize dictionary to store the scraped data
    data = {}

    # Extract title, launch time, descriptions, and video/article
    section = soup.find("section", class_="card section--center white mdl-grid mdl-grid--no-spacing mdl-shadow--6dp")

    if section is not None:
        # Extract status and title
        data["mission_status"] = _text_or_none(section.find("h6", class_="status"))
        data["title"] = _text_or_none(section.find("h4", class_="mdl-card__title-text"))

        # Extract launch time: prefer the dedicated span, otherwise fall back
        # to the cell that carries the "Launch Time" label.
        launch_time_tag = section.find("span", id="localized")
        if launch_time_tag is not None:
            data["launch_time"] = launch_time_tag.text.strip()
        else:
            strong_tag = section.find("strong", string="Launch Time")
            launch_time_cell = strong_tag.find_parent("div") if strong_tag is not None else None
            if launch_time_cell is not None:
                # Use stripped_strings to get clean text, then drop the label itself
                launch_time_text = ''.join(launch_time_cell.stripped_strings)
                data["launch_time"] = launch_time_text.replace("Launch Time", "").strip()
            else:
                data["launch_time"] = None

        # Extract description (the supporting-text div may be absent)
        supporting_text = section.find("div", class_="mdl-card__supporting-text")
        data["description"] = _text_or_none(_find_in(supporting_text, "p"))

        # Extract video links; the key is always present so every record has
        # the same shape, even when the page offers no videos.
        video_select = section.find("select", id="videos")
        if video_select is not None:
            data["videos"] = {
                opt.text.strip(): opt.get("data-url")
                for opt in video_select.find_all("option")
                if opt.get("data-url")
            }
        else:
            data["videos"] = {}

        # Extract article link: the last anchor whose text does not mention
        # "watch" wins. Anchors without an href are skipped rather than
        # raising KeyError.
        data["article_link"] = None
        for link in section.find_all("a"):
            href = link.get("href")
            if href and "watch" not in link.text.strip().lower():
                data["article_link"] = href

    # Extract Rocket Information
    rocket_data = {}
    rocket_section = soup.find("section", class_="card section--center mdl-grid mdl-grid--no-spacing mdl-shadow--6dp")
    if rocket_section is not None:
        # Rocket Name
        header_section = rocket_section.find("header", class_="image_header")
        rocket_name_div = _find_in(header_section, "div", class_="mdl-card__title-text")
        rocket_data["rocket_name"] = _text_or_none(_find_in(rocket_name_div, "span"))

        # The rocket image URL lives in the second <style> tag of the page-content div
        rocket_data["image_url"] = None
        page_content = soup.find("div", class_="page-content")
        style_tags = page_content.find_all("style") if page_content is not None else []
        if len(style_tags) > 1:
            # .string is None for an empty tag or one with several children
            second_style = style_tags[1].string or ""
            if "rocket_image" in second_style:
                match = re.search(r'background:\s*url\((.*?)\)', second_style)
                if match:
                    rocket_data["image_url"] = match.group(1).strip(' "\'')

        # Image Credit
        image_credit_div = _find_in(header_section, "div", class_="right_justified_text")
        rocket_data["image_credit"] = _text_or_none(_find_in(image_credit_div, "span"))

        # Rocket Details: each cell is "Label: value". Split on the first
        # colon only, so a value that itself contains a colon is kept.
        rocket_details = rocket_section.find("div", class_="mdl-card__supporting-text")
        if rocket_details is not None:
            rocket_details_info = rocket_details.find_all("div", class_="mdl-cell mdl-cell--6-col-desktop mdl-cell--12-col-tablet")
            for detail in rocket_details_info:
                label, separator, value = detail.text.partition(":")
                if separator:
                    key = label.strip().replace(" ", "_").lower()
                    rocket_data[key] = value.strip()

        # Rocket Details Link
        actions_div = rocket_section.find("div", class_="mdl-card__actions mdl-card--border")
        details_link_tag = _find_in(actions_div, "a", class_="mdc-button")
        details_href = details_link_tag.get("href") if details_link_tag is not None else None
        rocket_data["rocket_details_link"] = details_href or None

        # Get Rocket ID from the link (last path segment, ignoring a trailing slash)
        if rocket_data["rocket_details_link"]:
            rocket_data["rocket_id"] = rocket_data["rocket_details_link"].rstrip("/").split("/")[-1]

    # Store rocket data in the main data dictionary
    data["rocket_data"] = rocket_data

    # Extract Mission Details
    mission_data = {}
    mission_section = soup.find("section", class_="section--center card white mdl-grid mdl-grid--no-spacing mdl-shadow--6dp")
    if mission_section is not None:
        # Mission Title
        mission_data["title"] = _text_or_none(mission_section.find("h4", class_="mdl-card__title-text"))

        # The first grid holds the description, the second the detail cells
        details_grids = mission_section.find_all("div", class_="mdl-grid a")
        first_grid = details_grids[0] if details_grids else None
        mission_data["description"] = _text_or_none(_find_in(first_grid, "p"))

        # Mission Details
        if len(details_grids) > 1:  # Check if there are at least two grids
            for detail in details_grids[1].find_all("div", class_="mdl-cell"):
                text = detail.text.strip()
                # maxsplit=1 keeps values that contain a colon intact
                if "Payloads:" in text:
                    mission_data["payloads"] = text.split(":", 1)[1].strip()
                elif "Total Mass:" in text:
                    mission_data["total_mass"] = text.split(":", 1)[1].strip()
                elif "Orbit" in text:
                    mission_data["orbit"] = text

    # Store mission data in the main data dictionary
    data["mission_data"] = mission_data

    # Location Information: a page without a "Location" heading yields None
    # instead of an AttributeError that would discard the whole record.
    location_header = soup.find("h3", string="Location")
    location_section = (
        location_header.find_next("section", class_="card")
        if location_header is not None
        else None
    )
    data["location"] = _text_or_none(_find_in(location_section, "h4", class_="mdl-card__title-text"))

    return data

In [7]:
# Example usage: scrape a single launch detail page and display the returned
# dictionary. Note that this performs a live HTTP request.
scrape_launch_data("https://nextspaceflight.com/launches/details/7651")

{'mission_status': 'Success',
 'title': 'Starlink Group 8-19',
 'launch_time': 'Fri Oct 18, 2024 23:31 UTC',
 'description': None,
 'videos': {'Official Video': 'https://www.spacex.com/launches/mission/?missionId=sl-8-19',
  'NSF Video': 'https://www.youtube.com/watch?v=qRveNsk03t4'},
 'article_link': 'https://www.nasaspaceflight.com/2024/10/launch-roundup-101424/',
 'rocket_data': {'rocket_name': 'Falcon 9 Block 5',
  'image_url': 'https://storage.googleapis.com/nextspaceflight/media/rockets/Falcon_9_Block_5.webp',
  'image_credit': 'Image Credit: SpaceX',
  'status': 'Active',
  'price': '$69.75 million',
  'liftoff_thrust': '7,607 kN',
  'payload_to_leo': '22,800 kg',
  'payload_to_gto': '8,300 kg',
  'stages': '2',
  'strap-ons': '0',
  'rocket_height': '70.0 m',
  'fairing_diameter': '5.2 m',
  'fairing_height': '13.0 m',
  'rocket_details_link': '/rockets/3',
  'rocket_id': '3'},
 'mission_data': {'title': 'Starlink Group 8-19',
  'description': 'SpaceX launch of a batch of Starl

In [8]:
# Function to make a request with retry mechanism and timeout
def fetch_url(url, max_retries=3, timeout=10, retry_delay=2):
    """Download ``url`` and return the response body as text, retrying on failure.

    Args:
        url: Address to request.
        max_retries: Maximum number of attempts before giving up.
        timeout: Timeout in seconds handed to ``requests.get`` for each attempt.
        retry_delay: Seconds to sleep between failed attempts. Defaults to 2,
            the value that used to be hard-coded.

    Returns:
        The response text of the first successful attempt, or ``None`` when
        every attempt failed (or ``max_retries`` is less than 1).
    """
    # Built once: the headers are identical for every attempt.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36'
    }
    for attempt in range(1, max_retries + 1):
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
            response.raise_for_status()  # Raise an exception for HTTP errors
            return response.text
        except requests.exceptions.RequestException as e:
            print(f"Attempt {attempt}/{max_retries} failed for {url}: {e}")
            if attempt < max_retries:
                time.sleep(retry_delay)  # Wait before retrying
            else:
                print(f"Giving up on {url} after {max_retries} attempts.")
    return None

In [9]:
# Function to create an array of URLs for multiple pages
def create_url_array(base_url, pages):
    """Return the listing-page URLs for ``pages`` pages.

    The first entry is ``base_url`` itself; pages 2 through ``pages`` are
    addressed by appending ``?page=N`` to it.
    """
    later_pages = [f"{base_url}?page={page_number}" for page_number in range(2, pages + 1)]
    return [base_url, *later_pages]

In [10]:
# Function to collect detail page URLs from the base URLs
def collect_detail_urls(url_array):
    """Collect absolute detail-page URLs from a list of listing pages.

    Each listing page is downloaded with ``fetch_url``; pages that could not
    be fetched are skipped. Every element with the ``mdc-button`` class whose
    ``href`` starts with ``/`` contributes one URL, prefixed with
    ``https://nextspaceflight.com``.

    Args:
        url_array: Iterable of listing-page URLs.

    Returns:
        list[str]: Detail-page URLs in the order they were encountered.
    """
    detail_urls = []
    for url in url_array:
        page_content = fetch_url(url)
        if not page_content:
            continue
        soup = BeautifulSoup(page_content, 'html.parser')
        for link in soup.find_all(class_='mdc-button'):
            # Not every mdc-button element is an anchor; .get() avoids the
            # KeyError that link['href'] raises when the attribute is absent.
            href = link.get('href')
            if href and href.startswith('/'):
                detail_urls.append(f"https://nextspaceflight.com{href}")
    return detail_urls

In [11]:
# Function to scrape data and count successes and failures
def scrape_with_counts(detail_urls, scrape_function):
    """Run ``scrape_function`` over every URL and tally the outcomes.

    A URL counts as a success when ``scrape_function`` returns a truthy
    result; a falsy result or a raised exception counts as a failure. A
    progress line is printed for every URL.

    Returns:
        tuple: ``(results, success_count, failure_count, failed_urls)``.
    """
    collected = []
    rejected = []

    for target in detail_urls:
        try:
            payload = scrape_function(target)
            if not payload:
                rejected.append(target)
                print(f"Failed to scrape {target}. (Total Failures: {len(rejected)})")
                continue
            collected.append(payload)
            print(f"Scraped {target} (Total Successes: {len(collected)})")
        except Exception as exc:
            rejected.append(target)
            print(f"Error scraping {target}: {exc} (Total Failures: {len(rejected)})")

    # The counters always equal the lengths of the two lists.
    return collected, len(collected), len(rejected), rejected

In [12]:
# Configurations for scraping past and upcoming launches: the listing URL,
# how many listing pages to walk, and where to write the resulting JSON.
configurations = {
    "past": {"base_url": "https://nextspaceflight.com/launches/past/", "pages": 229, "output": "data_json/past_launches.json"},
    "upcoming": {"base_url": "https://nextspaceflight.com/launches/", "pages": 15, "output": "data_json/upcoming_launches.json"}
}

# Iterate over configurations to scrape both past and upcoming launches
for config_name, config in configurations.items():
    print(f"Scraping {config_name} launches...")

    # Create the output directory *before* the crawl: otherwise a missing
    # directory only surfaces as FileNotFoundError after every page has been
    # scraped, and the collected data is lost.
    output_path = Path(config["output"])
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Create URL array and collect detail URLs
    url_array = create_url_array(config["base_url"], config["pages"])
    detail_urls = collect_detail_urls(url_array)

    # Scrape launch data with counts
    launch_data, total_success, total_failures, failed_urls = scrape_with_counts(detail_urls, scrape_launch_data)

    # Save the scraped data to a JSON file
    with open(output_path, 'w', encoding='utf-8') as json_file:
        json.dump(launch_data, json_file, indent=4)

    print(f"{config_name.capitalize()} launch scraping complete.")
    print(f"Total Success: {total_success}, Total Failures: {total_failures}. Data saved to '{config['output']}'.")

print("All scraping operations are complete.")


Scraping past launches...
Scraped https://nextspaceflight.com/launches/details/7651 (Total Successes: 1)
Scraped https://nextspaceflight.com/launches/details/7654 (Total Successes: 2)
Scraped https://nextspaceflight.com/launches/details/7650 (Total Successes: 3)
Scraped https://nextspaceflight.com/launches/details/7652 (Total Successes: 4)
Scraped https://nextspaceflight.com/launches/details/7642 (Total Successes: 5)
Scraped https://nextspaceflight.com/launches/details/6818 (Total Successes: 6)
Scraped https://nextspaceflight.com/launches/details/7579 (Total Successes: 7)
Scraped https://nextspaceflight.com/launches/details/7649 (Total Successes: 8)
Scraped https://nextspaceflight.com/launches/details/2657 (Total Successes: 9)
Scraped https://nextspaceflight.com/launches/details/7595 (Total Successes: 10)
Scraped https://nextspaceflight.com/launches/details/6881 (Total Successes: 11)
Scraped https://nextspaceflight.com/launches/details/7640 (Total Successes: 12)
Scraped https://nextspa