In [1]:
import requests
from bs4 import BeautifulSoup
import re
import time
import json
import pandas as pd

In [4]:
def scrape_rocket_data(url):
    # Fetch the content of the URL
    response = requests.get(url)
    html_content = response.content

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')

    # Initialize dictionary to store the scraped data
    rocket_data = {}

    # Extract Rocket Name
    rocket_name_tag = soup.find("h3", class_="section--center mdl-grid")
    rocket_data["rocket_name"] = rocket_name_tag.text.strip() if rocket_name_tag else None

    # Rocket Details Link
    rocket_data["rocket_details_link"] = url

    # Get Rocket ID from the link
    rocket_data["rocket_id"] = url.split("/")[-1]

    # Extract Rocket Description
    description_section = soup.find("section", class_="card section--center white mdl-grid mdl-grid--no-spacing mdl-shadow--6dp")
    description_paragraphs = description_section.find_all("p")
    rocket_data["description"] = " ".join([para.text.strip() for para in description_paragraphs]) if description_paragraphs else None

    # Extract Mission Statistics
    stats_section = description_section.find("div", class_="mdl-grid a")
    if stats_section:
        stats = {}
        for stat in stats_section.find_all("div", class_="mdl-cell"):
            key_value = stat.text.split(":")
            if len(key_value) == 2:
                key = key_value[0].strip().replace(" ", "_").lower()
                value = key_value[1].strip()
                stats[key] = value
        rocket_data["mission_statistics"] = stats

    configurations = []
    config_heading = soup.find("h3", class_="section--center mdl-grid title", string="Configurations")
    
    if config_heading:
        for sibling in config_heading.find_all_next():
            if sibling.name == "h3" and sibling.string.strip() == "Launch Sites":
                break 

            if sibling.name == "section" and "card section--center mdl-grid mdl-grid--no-spacing mdl-shadow--6dp" in ' '.join(sibling.get("class", [])):
                config_data = {}

                # Extract Configuration Name
                config_name_tag = sibling.find("div", class_="mdl-card__title-text")
                if config_name_tag:
                    config_data["name"] = config_name_tag.text.strip()

                # Extract Image URL from the <style> section immediately preceding this section
                style_tag = sibling.find_previous("style")
                if style_tag:
                    # Match the relevant background image URL
                    match = re.search(r'\.rocket_image\.\w+\s*{\s*background:\s*url\((.*?)\)', style_tag.string)
                    if match:
                        config_data["image_url"] = match.group(1).strip('"')

                # Extract Detailed Information
                details = {}
                details_section = sibling.find("div", class_="mdl-card__supporting-text")
                
                if details_section:
                    # Initialize a variable to store the company name
                    company_name = None
                    
                    for detail in details_section.find_all("div", class_="mdl-cell"):
                        # Check if the detail contains a key-value pair (with ":")
                        if ":" in detail.text:
                            detail_key_value = detail.text.split(":")
                            if len(detail_key_value) == 2:
                                key = detail_key_value[0].strip().replace(" ", "_").lower()
                                value = detail_key_value[1].strip()
                                details[key] = value
                        else:
                            # If no ":" is found, assume it's the company name
                            company_name = detail.text.strip()
                
                    # Store the company name in the config_data
                    config_data["company_name"] = company_name
                
                # Store the details dictionary in the config_data
                config_data["details"] = details
                if config_data.get("name"):
                    configurations.append(config_data)

    # Add configurations to rocket data
    rocket_data["configurations"] = configurations

    # Scraping the launch sites
    launch_sites = []
    launch_sites_heading = soup.find("h3", class_="section--center mdl-grid title", string="Launch Sites")
    if launch_sites_heading:
        for sibling in launch_sites_heading.find_all_next():
            
            # Stop when reaching a section that is not a launch site
            if sibling.name == "h3":
                break 
            
            # Check if the sibling is a launch site section
            if sibling.name == "section" and "card section--center mdl-grid mdl-grid--no-spacing mdl-shadow--6dp" in ' '.join(sibling.get("class", [])):
                launch_site_data = sibling.find("h5", style=re.compile("margin"))
                launch_site_name = launch_site_data.text.strip() if launch_site_data else None
                
                # Append the launch site to the list only if it has a name
                if launch_site_name:
                    launch_sites.append(launch_site_name)

    # Add launch sites to rocket data
    rocket_data["launch_sites"] = launch_sites
    
    return rocket_data

In [5]:
# Example usage:
scrape_rocket_data("https://nextspaceflight.com/rockets/3")

{'rocket_name': 'Falcon 9',
 'rocket_details_link': 'https://nextspaceflight.com/rockets/3',
 'rocket_id': '3',
 'description': "Falcon 9 is a partially reusable two-stage-to-orbit launch vehicle designed and manufactured by SpaceX in the United States. Both the first and second stages are powered by Merlin engines, using cryogenic liquid oxygen and rocket-grade kerosene (RP-1) as propellants. Unlike most rockets in service, which are expendable launch systems, Falcon 9 is partially reusable. The first stage is capable of re-entering the atmosphere and landing vertically after separating from the second stage. First-stage landings can occur either back at the launch site or downrange on droneships. In addition to the first stage, Falcon 9's payload fairing is also reusable. The payload fairing halves utilize a parafoil to steer themselves toward a recovery vessel.",
 'mission_statistics': {'missions': '383',
  'successes': '379',
  'partial_failures': '1',
  'failures': '3',
  'success

In [6]:
# Function to make a request with retry mechanism and timeout
def fetch_url(url, max_retries=3, timeout=15):
    attempts = 0
    while attempts < max_retries:
        try:
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36'
            }
            response = requests.get(url, headers=headers, timeout=timeout)
            response.raise_for_status()  # Raise an exception for HTTP errors
            return response.text
        except requests.exceptions.RequestException as e:
            attempts += 1
            print(f"Attempt {attempts}/{max_retries} failed for {url}: {e}")
            if attempts < max_retries:
                time.sleep(2)  # Wait before retrying
            else:
                print(f"Giving up on {url} after {max_retries} attempts.")
    return None

In [7]:
# Function to create a list of URLs to scrape from
def create_url_array(base_url, pages):
    return [f"{base_url}?search="] + [f"{base_url}?page={i}&search=" for i in range(2, pages+1)]

In [8]:
# Function to collect detail page URLs
def collect_detail_urls(url_array):
    detail_urls = []
    for url in url_array:
        page_content = fetch_url(url)
        if page_content:
            soup = BeautifulSoup(page_content, 'html.parser')
            links = soup.find_all(class_='mdc-button')
            for link in links:
                if link['href'][0] == '/':
                    detail_url = f"https://nextspaceflight.com{link['href']}"
                    detail_urls.append(detail_url)
    return detail_urls

In [9]:
# Function to scrape data and count successes, failures, and collect failed URLs
def scrape_with_counts(detail_urls, scrape_function):
    data = []
    failed_urls = []  # List to hold failed URLs
    success_counter, fail_counter = 0, 0  # Initialize counters

    for url in detail_urls:
        try:
            result = scrape_function(url)
            if result:
                data.append(result)
                success_counter += 1
                print(f"Scraped {url} (Success Count: {success_counter})")
            else:
                failed_urls.append(url)
                fail_counter += 1
                print(f"Failed to scrape {url}. (Total Failures: {fail_counter})")
        except Exception as e:
            failed_urls.append(url)
            fail_counter += 1
            print(f"Error scraping {url}: {e} (Total Failures: {fail_counter})")

    return data, success_counter, fail_counter, failed_urls

In [10]:
# Configurations
rocket_config = {
    "base_url": "https://nextspaceflight.com/rockets/",
    "pages": 8,
    "output": "data_json/rockets.json"
}

# Generate URL array and collect detail URLs
url_array = create_url_array(rocket_config["base_url"], rocket_config["pages"])
detail_urls = collect_detail_urls(url_array)

# Scrape rocket data data and collect failed URLs
rocket_data, total_success, total_failures, failed_urls = scrape_with_counts(detail_urls, scrape_rocket_data)

print(f"Total Success: {total_success}, Total Failures: {total_failures}.")
print(f"Rocket scraping complete. Data saved to '{rocket_output_json_path}'.")

# Write the scraped data to a JSON file
rocket_output_json_path = rocket_config["output"]
with open(rocket_output_json_path, 'w') as json_file:
    json.dump(rocket_data, json_file, indent=4)


Scraped https://nextspaceflight.com/rockets/62 (Success Count: 1)
Scraped https://nextspaceflight.com/rockets/252 (Success Count: 2)
Scraped https://nextspaceflight.com/rockets/183 (Success Count: 3)
Scraped https://nextspaceflight.com/rockets/58 (Success Count: 4)
Scraped https://nextspaceflight.com/rockets/26 (Success Count: 5)
Scraped https://nextspaceflight.com/rockets/9 (Success Count: 6)
Scraped https://nextspaceflight.com/rockets/88 (Success Count: 7)
Scraped https://nextspaceflight.com/rockets/89 (Success Count: 8)
Scraped https://nextspaceflight.com/rockets/90 (Success Count: 9)
Scraped https://nextspaceflight.com/rockets/91 (Success Count: 10)
Scraped https://nextspaceflight.com/rockets/92 (Success Count: 11)
Scraped https://nextspaceflight.com/rockets/33 (Success Count: 12)
Scraped https://nextspaceflight.com/rockets/68 (Success Count: 13)
Scraped https://nextspaceflight.com/rockets/93 (Success Count: 14)
Scraped https://nextspaceflight.com/rockets/204 (Success Count: 15)
Sc

NameError: name 'rocket_output_json_path' is not defined

In [11]:
# Write the scraped data to a JSON file
rocket_output_json_path = rocket_config["output"]
with open(rocket_output_json_path, 'w') as json_file:
    json.dump(rocket_data, json_file, indent=4)

In [18]:
# Load the JSON data from the file
with open('data_json/rockets.json', 'r') as json_file:
    rockets_data = json.load(json_file)

# Initialize a list to hold the flattened records
flattened_records = []

# Define the keys you want to extract from the rocket-level data
rocket_keys = [
    'rocket_name', 
    'rocket_details_link', 
    'rocket_id', 
    'description', 
    'launch_sites'
]

# Iterate over each rocket in the JSON data
for rocket in rockets_data:
    # Create a base record for each rocket
    base_record = {key: rocket.get(key) for key in rocket_keys}
    
    # Add mission statistics to the base record
    mission_stats = rocket.get('mission_statistics', {})
    for key, value in mission_stats.items():
        base_record[f'mission_statistics_{key}'] = value

    # Iterate through configurations
    for config in rocket.get('configurations', []):
        # Create a new record for each configuration
        record = base_record.copy()
        
        # Add configuration-specific details
        record['configuration_name'] = config.get('name')
        record['configuration_image_url'] = config.get('image_url')
        record['configuration_company_name'] = config.get('company_name')
        
        # Add details from the configuration
        details = config.get('details', {})
        for detail_key, detail_value in details.items():
            record[f'configuration_detail_{detail_key}'] = detail_value

        # Append the new record to the list
        flattened_records.append(record)

# Create a DataFrame from the flattened records
df = pd.DataFrame(flattened_records)

# Convert launch_sites list to a string (optional)
df['launch_sites'] = df['launch_sites'].apply(lambda x: ', '.join(x) if isinstance(x, list) else x)

# Display the DataFrame
print(df[2:4].head())


  rocket_name                      rocket_details_link rocket_id  \
2    Angara 1  https://nextspaceflight.com/rockets/183       183   
3    Angara 1  https://nextspaceflight.com/rockets/183       183   

                                         description  \
2  The Angara 1 variants use a single Universal R...   
3  The Angara 1 variants use a single Universal R...   

                                        launch_sites  \
2  Site 1A, Vostochny Cosmodrome, Russia, Site 35...   
3  Site 1A, Vostochny Cosmodrome, Russia, Site 35...   

  mission_statistics_missions mission_statistics_successes  \
2                           4                            4   
3                           4                            4   

  mission_statistics_partial_failures mission_statistics_failures  \
2                                   0                           0   
3                                   0                           0   

  mission_statistics_success_streak  ... configuration_detail_

In [19]:
df.shape

(491, 24)

In [20]:
df.dtypes

rocket_name                              object
rocket_details_link                      object
rocket_id                                object
description                              object
launch_sites                             object
mission_statistics_missions              object
mission_statistics_successes             object
mission_statistics_partial_failures      object
mission_statistics_failures              object
mission_statistics_success_streak        object
mission_statistics_success_rate          object
configuration_name                       object
configuration_image_url                  object
configuration_company_name               object
configuration_detail_status              object
configuration_detail_price               object
configuration_detail_liftoff_thrust      object
configuration_detail_payload_to_leo      object
configuration_detail_stages              object
configuration_detail_strap-ons           object
configuration_detail_rocket_height      