In [1]:
import sys
import requests
from bs4 import BeautifulSoup
import re
import unicodedata
import pandas as pd

# Helper functions
def date_time(table_cell):
    """
    This function returns the date and time from the HTML table cell.
    Input: the element of a table data cell.
    """
    return [data_time.strip() for data_time in table_cell.stripped_strings][:2]

def booster_version(table_cell):
    """
    Extracts the booster version from a table cell element.
    """
    return table_cell.a.string

def get_mass(table_cell):
    """
    Extracts the payload mass from a table cell element.
    """
    mass = unicodedata.normalize("NFKD", table_cell.text).strip()
    if mass:
        return mass.split(' ')[0]  # Assuming mass is in format '123 kg'
    return None

def landing_status(table_cell):
    """
    Extracts the landing status from a table cell element.
    """
    return table_cell.text.strip()

def extract_column_from_header(row):
    """
    Extracts and returns the column name from the HTML table header cell.
    Input: the element of a table header cell.
    """
    if row.br:
        row.br.extract()
    if row.a:
        row.a.extract()
    if row.sup:
        row.sup.extract()
        
    column_name = ' '.join(row.contents)
    
    # Filter out digits and empty names
    if not column_name.strip().isdigit():
        column_name = column_name.strip()
        return column_name


In [2]:
# URL of the Falcon9 Launch Wiki page
static_url = "https://en.wikipedia.org/w/index.php?title=List_of_Falcon_9_and_Falcon_Heavy_launches&oldid=1027686922"

# Perform an HTTP GET request
response = requests.get(static_url)

# Check if the request was successful
if response.status_code == 200:
    print("Successfully fetched the Falcon9 Launch Wiki page")
    html_content = response.text  # Get the HTML content of the page

    # Create a BeautifulSoup object from the HTML content
    soup = BeautifulSoup(html_content, 'html.parser')
else:
    print(f"Failed to fetch the page. Status code: {response.status_code}")
    exit()


Successfully fetched the Falcon9 Launch Wiki page


In [3]:
# Find all tables on the wiki page
html_tables = soup.find_all('table')

# Target table contains the actual launch records
first_launch_table = html_tables[2]

# Extract column names
column_names = []

# Apply find_all() function with `th` element on first_launch_table
# Iterate each th element and apply the provided extract_column_from_header() to get a column name
# Append the Non-empty column name (`if name is not None and len(name) > 0`) into a list called column_names
th_elements = first_launch_table.find_all('th')

for th in th_elements:
    name = extract_column_from_header(th)
    if name is not None and len(name) > 0:
        column_names.append(name)

print(column_names)


['Flight No.', 'Date and time ( )', 'Launch site', 'Payload', 'Payload mass', 'Orbit', 'Customer', 'Launch outcome']


In [9]:
# Initialize the dictionary with keys from column names
launch_dict = {key: [] for key in column_names}

# Remove an irrelevant column
if 'Date and time ( )' in launch_dict:
    del launch_dict['Date and time ( )']

# Add new columns with empty lists
new_columns = ['Version Booster', 'Booster landing', 'Date', 'Time']
for col in new_columns:
    launch_dict[col] = []

# Initialize other columns with empty lists if they don't exist
required_columns = ['Flight No.', 'Launch site', 'Payload', 'Payload mass', 'Orbit', 'Customer', 'Launch outcome']
for col in required_columns:
    if col not in launch_dict:
        launch_dict[col] = []

print(launch_dict)  # This will show the initialized dictionary structure


{'Flight No.': [], 'Launch site': [], 'Payload': [], 'Payload mass': [], 'Orbit': [], 'Customer': [], 'Launch outcome': [], 'Version Booster': [], 'Booster landing': [], 'Date': [], 'Time': []}


In [10]:
extracted_row = 0
# Extract each table 
for table_number, table in enumerate(soup.find_all('table', "wikitable plainrowheaders collapsible")):
    # Get table rows
    for rows in table.find_all("tr"):
        # Check to see if first table heading is a number corresponding to launch number
        if rows.th:
            if rows.th.string:
                flight_number = rows.th.string.strip()
                flag = flight_number.isdigit()
            else:
                flag = False
        else:
            flag = False

        # Get table elements
        row = rows.find_all('td')

        # If it is a number, save cells in a dictionary
        if flag and len(row) >= 9:
            extracted_row += 1
            
            # Flight Number value
            launch_dict['Flight No.'].append(flight_number)
            print(flight_number)

            datatimelist = date_time(row[0])
            
            # Date value
            date = datatimelist[0].strip(',')
            launch_dict['Date'].append(date)
            print(date)
            
            # Time value
            time = datatimelist[1]
            launch_dict['Time'].append(time)
            print(time)
              
            # Booster version
            bv = booster_version(row[1])
            if not bv and row[1].a:
                bv = row[1].a.string
            launch_dict['Version Booster'].append(bv)
            print(bv)

            # Launch Site
            launch_site = row[2].a.string if row[2].a else None
            launch_dict['Launch site'].append(launch_site)
            print(launch_site)

            # Payload
            payload = row[3].a.string if row[3].a else None
            launch_dict['Payload'].append(payload)
            print(payload)

            # Payload Mass
            payload_mass = get_mass(row[4])
            launch_dict['Payload mass'].append(payload_mass)
            print(payload_mass)

            # Orbit
            orbit = row[5].a.string if row[5].a else None
            launch_dict['Orbit'].append(orbit)
            print(orbit)

            # Customer
            customer = row[6].a.string if row[6].a else None
            launch_dict['Customer'].append(customer)
            print(customer)

            # Launch outcome
            try:
                launch_outcome = list(row[7].stripped_strings)[0] if row[7] else None
            except IndexError:
                launch_outcome = None
            launch_dict['Launch outcome'].append(launch_outcome)
            print(launch_outcome)

            # Booster landing
            try:
                booster_landing = landing_status(row[8]) if row[8] else None
            except IndexError:
                booster_landing = None
            launch_dict['Booster landing'].append(booster_landing)
            print(booster_landing)


1
4 June 2010
18:45
F9 v1.0
CCAFS
Dragon Spacecraft Qualification Unit
None
LEO
SpaceX
Success
Failure[9][10](parachute)
2
8 December 2010
15:43
F9 v1.0
CCAFS
Dragon
None
LEO
NASA
Success
Failure[9][14](parachute)
3
22 May 2012
07:44
F9 v1.0
CCAFS
Dragon
525
LEO
NASA
Success
No attempt
4
8 October 2012
00:35
F9 v1.0
CCAFS
SpaceX CRS-1
4,700
LEO
NASA
Success
No attempt
5
1 March 2013
15:10
F9 v1.0
CCAFS
SpaceX CRS-2
4,877
LEO
NASA
Success
No attempt
6
29 September 2013
16:00
F9 v1.1
VAFB
CASSIOPE
500
Polar orbit
MDA
Success
Uncontrolled(ocean)[d]
7
3 December 2013
22:41
F9 v1.1
CCAFS
SES-8
3,170
GTO
SES
Success
No attempt[38]
8
6 January 2014
22:06
F9 v1.1
CCAFS
Thaicom 6
3,325
GTO
Thaicom
Success
No attempt[43]
9
18 April 2014
19:25
F9 v1.1
Cape Canaveral
SpaceX CRS-3
2,296
LEO
NASA
Success
Controlled(ocean) [d][46]
10
14 July 2014
15:15
F9 v1.1
Cape Canaveral
Orbcomm-OG2
1,316
LEO
Orbcomm
Success
Controlled(ocean)[d][46]
11
5 August 2014
08:00
F9 v1.1
Cape Canaveral
AsiaSat 8
4,535
GT

In [11]:
# Ensure all lists in launch_dict are of the same length
max_length = max(len(lst) for lst in launch_dict.values())

for key, value in launch_dict.items():
    while len(value) < max_length:
        value.append(None)

# Convert the launch_dict to a Pandas DataFrame
df = pd.DataFrame(launch_dict)

# Display the DataFrame
print(df.head())

  Flight No. Launch site                               Payload Payload mass  \
0          1       CCAFS  Dragon Spacecraft Qualification Unit         None   
1          2       CCAFS                                Dragon         None   
2          3       CCAFS                                Dragon          525   
3          4       CCAFS                          SpaceX CRS-1        4,700   
4          5       CCAFS                          SpaceX CRS-2        4,877   

  Orbit Customer Launch outcome Version Booster            Booster landing  \
0   LEO   SpaceX        Success         F9 v1.0  Failure[9][10](parachute)   
1   LEO     NASA        Success         F9 v1.0  Failure[9][14](parachute)   
2   LEO     NASA        Success         F9 v1.0                 No attempt   
3   LEO     NASA        Success         F9 v1.0                 No attempt   
4   LEO     NASA        Success         F9 v1.0                 No attempt   

              Date   Time  
0      4 June 2010  18:45  
