In [1]:
!pip install beautifulsoup4 requests

import requests
from bs4 import BeautifulSoup
import unicodedata
import pandas as pd



In [2]:
def date_time(table_cells):
    """
    Return the date and time from the HTML table cell.
    """
    return [data_time.strip() for data_time in list(table_cells.strings)][0:2]


def booster_version(table_cells):
    """
    Return the booster version from the HTML table cell.
    """
    out = ''.join([
        booster_version
        for i, booster_version in enumerate(table_cells.strings)
        if i % 2 == 0
    ][0:-1])
    return out


def landing_status(table_cells):
    """
    Return the landing status from the HTML table cell.
    """
    out = [i for i in table_cells.strings][0]
    return out


def get_mass(table_cells):
    """
    Return the payload mass as text ending in 'kg' or 0 if empty.
    """
    mass = unicodedata.normalize("NFKD", table_cells.text).strip()
    if mass:
        mass.find("kg")
        new_mass = mass[0:mass.find("kg") + 2]
    else:
        new_mass = 0
    return new_mass


def extract_column_from_header(row):
    """
    Clean a header cell <th> and return the column name.
    """
    if row.br:
        row.br.extract()
    if row.a:
        row.a.extract()
    if row.sup:
        row.sup.extract()

    colunm_name = ' '.join(row.contents)

    if not(colunm_name.strip().isdigit()):
        colunm_name = colunm_name.strip()
    return colunm_name

In [3]:
static_url = "https://en.wikipedia.org/w/index.php?title=List_of_Falcon_9_and_Falcon_Heavy_launches&oldid=1027686922"

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/91.0.4472.124 Safari/537.36"
}

response = requests.get(static_url, headers=headers)
print("Status:", response.status_code)

soup = BeautifulSoup(response.text, "lxml")
print(soup.title)
print(soup.title.string)

Status: 200
<title>List of Falcon 9 and Falcon Heavy launches - Wikipedia</title>
List of Falcon 9 and Falcon Heavy launches - Wikipedia


In [4]:
# Find all tables on the page
html_tables = soup.find_all("table")
print("Number of tables:", len(html_tables))

# The lab uses the 3rd table (index 2)
first_launch_table = html_tables[2]

# Extract column names from header cells
column_names = []
for th in first_launch_table.find_all("th"):
    name = extract_column_from_header(th)
    if name is not None and len(name) > 0:
        column_names.append(name)

print("Column names:")
print(column_names)

Number of tables: 25
Column names:
['Flight No.', 'Date and time ( )', 'Launch site', 'Payload', 'Payload mass', 'Orbit', 'Customer', 'Launch outcome', '1\n', '2\n', '3\n', '4\n', '5\n', '6\n', '7\n']


In [5]:
# Create dict with the extracted column names
launch_dict = dict.fromkeys(column_names)

print("Initial keys:", list(launch_dict.keys()))

# Remove the combined date-time column by pattern (avoids KeyError)
for k in list(launch_dict.keys()):
    if k.startswith("Date and time"):
        del launch_dict[k]
        break

# Initialize fields we will fill
launch_dict['Flight No.'] = []
launch_dict['Launch site'] = []
launch_dict['Payload'] = []
launch_dict['Payload mass'] = []
launch_dict['Orbit'] = []
launch_dict['Customer'] = []
launch_dict['Launch outcome'] = []

# New columns for cleaned data
launch_dict['Version Booster'] = []
launch_dict['Booster landing'] = []
launch_dict['Date'] = []
launch_dict['Time'] = []

print("Final keys:", list(launch_dict.keys()))

Initial keys: ['Flight No.', 'Date and time ( )', 'Launch site', 'Payload', 'Payload mass', 'Orbit', 'Customer', 'Launch outcome', '1\n', '2\n', '3\n', '4\n', '5\n', '6\n', '7\n']
Final keys: ['Flight No.', 'Launch site', 'Payload', 'Payload mass', 'Orbit', 'Customer', 'Launch outcome', '1\n', '2\n', '3\n', '4\n', '5\n', '6\n', '7\n', 'Version Booster', 'Booster landing', 'Date', 'Time']


In [9]:
extracted_row = 0

for table_number, table in enumerate(soup.find_all('table', "wikitable plainrowheaders collapsible")):
    for rows in table.find_all("tr"):
        # Check if first header cell is a numeric Flight No.
        if rows.th:
            if rows.th.string:
                flight_number = rows.th.string.strip()
                flag = flight_number.isdigit()
            else:
                flag = False
        else:
            flag = False

        # All <td> cells in the row
        row = rows.find_all('td')

        # Only process rows with a numeric flight number and data cells
        if flag and row:
            extracted_row += 1

            # Flight Number
            launch_dict['Flight No.'].append(flight_number)

            # Date and Time
            datatimelist = date_time(row[0])
            date = datatimelist[0].strip(',')
            time = datatimelist[1]
            launch_dict['Date'].append(date)
            launch_dict['Time'].append(time)

            # Booster Version
            bv = booster_version(row[1])
            if (not bv) and (row[1].a is not None):
                bv = row[1].a.get_text(strip=True)
            launch_dict['Version Booster'].append(bv)

            # Launch Site (works with or without <a>)
            launch_site = row[2].get_text(strip=True)
            launch_dict['Launch site'].append(launch_site)

            # Payload
            payload = row[3].get_text(strip=True)
            launch_dict['Payload'].append(payload)

            # Payload Mass
            payload_mass = get_mass(row[4])
            launch_dict['Payload mass'].append(payload_mass)

            # Orbit
            orbit = row[5].get_text(strip=True)
            launch_dict['Orbit'].append(orbit)

            # Customer (handle missing <a> to avoid AttributeError)
            if row[6].a is not None:
                customer = row[6].a.get_text(strip=True)
            else:
                customer = row[6].get_text(strip=True)
            launch_dict['Customer'].append(customer)

            # Launch outcome
            launch_outcome = list(row[7].strings)[0]
            launch_dict['Launch outcome'].append(launch_outcome)

            # Booster Landing
            booster_landing = landing_status(row[8])
            launch_dict['Booster landing'].append(booster_landing)

print("Total extracted rows:", extracted_row)

Total extracted rows: 121


In [3]:
import pandas as pd

s = pd.Series([], dtype=float)  # or dtype=object, int, str, etc.

In [10]:
df = pd.DataFrame({key: pd.Series(value) for key, value in launch_dict.items()})

print(df.head())
print(df.shape)

df.to_csv("spacex_web_scraped.csv", index=False)

  Flight No. Launch site                               Payload Payload mass  \
0          1       CCAFS  Dragon Spacecraft Qualification Unit            0   
1          2       CCAFS                                Dragon            0   
2          3       CCAFS                                Dragon       525 kg   
3          4       CCAFS                          SpaceX CRS-1     4,700 kg   
4          5       CCAFS                          SpaceX CRS-2     4,877 kg   

  Orbit Customer Launch outcome  1\n  2\n  3\n  4\n  5\n  6\n  7\n  \
0   LEO   SpaceX      Success\n  NaN  NaN  NaN  NaN  NaN  NaN  NaN   
1   LEO     NASA        Success  NaN  NaN  NaN  NaN  NaN  NaN  NaN   
2   LEO     NASA        Success  NaN  NaN  NaN  NaN  NaN  NaN  NaN   
3   LEO     NASA      Success\n  NaN  NaN  NaN  NaN  NaN  NaN  NaN   
4   LEO     NASA      Success\n  NaN  NaN  NaN  NaN  NaN  NaN  NaN   

    Version Booster Booster landing             Date   Time  
0  F9 v1.07B0003.18         Failure      4

  """Entry point for launching an IPython kernel.


In [8]:
print(column_names)
print(list(launch_dict.keys()))

['Flight No.', 'Date and time ( )', 'Launch site', 'Payload', 'Payload mass', 'Orbit', 'Customer', 'Launch outcome', '1\n', '2\n', '3\n', '4\n', '5\n', '6\n', '7\n']
['Flight No.', 'Launch site', 'Payload', 'Payload mass', 'Orbit', 'Customer', 'Launch outcome', '1\n', '2\n', '3\n', '4\n', '5\n', '6\n', '7\n', 'Version Booster', 'Booster landing', 'Date', 'Time']
