In [1]:
!pip3 install beautifulsoup4
!pip3 install requests



In [2]:
import sys

import requests
from bs4 import BeautifulSoup
import re
import unicodedata
import pandas as pd

In [3]:
def date_time(table_cells):
    """
    This function returns the data and time from the HTML  table cell
    Input: the  element of a table data cell extracts extra row
    """
    return [data_time.strip() for data_time in list(table_cells.strings)][0:2]

def booster_version(table_cells):
    """
    This function returns the booster version from the HTML  table cell 
    Input: the  element of a table data cell extracts extra row
    """
    out=''.join([booster_version for i,booster_version in enumerate( table_cells.strings) if i%2==0][0:-1])
    return out

def landing_status(table_cells):
    """
    This function returns the landing status from the HTML table cell 
    Input: the  element of a table data cell extracts extra row
    """
    out=[i for i in table_cells.strings][0]
    return out


def get_mass(table_cells):
    mass=unicodedata.normalize("NFKD", table_cells.text).strip()
    if mass:
        mass.find("kg")
        new_mass=mass[0:mass.find("kg")+2]
    else:
        new_mass=0
    return new_mass


def extract_column_from_header(row):
    """
    This function returns the landing status from the HTML table cell 
    Input: the  element of a table data cell extracts extra row
    """
    if (row.br):
        row.br.extract()
    if row.a:
        row.a.extract()
    if row.sup:
        row.sup.extract()
        
    colunm_name = ' '.join(row.contents)
    
    # Filter the digit and empty names
    if not(colunm_name.strip().isdigit()):
        colunm_name = colunm_name.strip()
        return colunm_name    

In [4]:
static_url = "https://en.wikipedia.org/w/index.php?title=List_of_Falcon_9_and_Falcon_Heavy_launches&oldid=1027686922"

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/91.0.4472.124 Safari/537.36"
}

In [5]:
import requests

# URL provided in the lab
static_url = "https://en.wikipedia.org/wiki/List_of_Falcon_9_and_Falcon_Heavy_launches"

# Some sites block plain requests, so send a browser-like header
headers = {"User-Agent": "Mozilla/5.0"}

# Send GET request
response = requests.get(static_url, headers=headers)

# Check response status
print("Status code:", response.status_code)

Status code: 200


In [6]:
from bs4 import BeautifulSoup

# Create a BeautifulSoup object from the response text
soup = BeautifulSoup(response.text, "html.parser")

In [7]:
# Print page title to verify soup object is correct
print("Page Title:", soup.title.string)

Page Title: List of Falcon 9 and Falcon Heavy launches - Wikipedia


In [8]:
# Find all HTML tables on the page
html_tables = soup.find_all("table")

print("Number of tables found:", len(html_tables))

Number of tables found: 15


In [9]:
# The third table contains the Falcon9 launch records
first_launch_table = html_tables[2]

# Print to inspect the table structure
print(first_launch_table)

<table class="wikitable sticky-header" id="F9-xxx" style="width: 100%;">
<tbody><tr>
<th scope="col">Date and time (<a href="/wiki/Coordinated_Universal_Time" title="Coordinated Universal Time">UTC</a>)
</th>
<th scope="col"><a href="/wiki/List_of_Falcon_9_first-stage_boosters" title="List of Falcon 9 first-stage boosters">Version,<br/>booster</a><sup class="reference" id="cite_ref-booster_32-2"><a href="#cite_note-booster-32"><span class="cite-bracket">[</span>j<span class="cite-bracket">]</span></a></sup>
</th>
<th scope="col">Launch site
</th>
<th scope="col">Payload<sup class="reference" id="cite_ref-Dragon_33-2"><a href="#cite_note-Dragon-33"><span class="cite-bracket">[</span>k<span class="cite-bracket">]</span></a></sup>
</th>
<th scope="col">Orbit
</th>
<th scope="col">Customer
</th></tr>
<tr>
<td rowspan="2">November 26, 2025<br/>18:18
</td>
<td><a href="/wiki/Falcon_9_Block_5" title="Falcon 9 Block 5">F9<span class="nowrap"> </span>B5</a><br/><a href="/wiki/List_of_Falcon_9_f

In [10]:
def extract_column_from_header(th):
    """
    Extracts the text of a <th> element, removing footnote markers like [a], [b].
    """
    if th is None:
        return None

    # Remove references [1], [2], etc.
    text = th.get_text().strip()
    text = text.split('[')[0].strip()

    return text

In [11]:
column_names = []

# Find all <th> elements inside the table header
ths = first_launch_table.find_all("th")

for th in ths:
    name = extract_column_from_header(th)
    if name is not None and len(name) > 0:
        column_names.append(name)

In [12]:
print("Extracted column names:")
print(column_names)

Extracted column names:
['Date and time (UTC)', 'Version,booster', 'Launch site', 'Payload', 'Orbit', 'Customer']


In [14]:
# ------------------ TASK 3: Parse launch tables and create DataFrame ------------------
import re
import pandas as pd
from bs4 import BeautifulSoup

# --- helper cleaners ---
def clean_text(tag_or_str):
    if tag_or_str is None:
        return ""
    if hasattr(tag_or_str, "stripped_strings"):
        txt = " ".join(s for s in tag_or_str.stripped_strings)
    else:
        txt = str(tag_or_str)
    txt = re.sub(r'\[.*?\]', '', txt)          # drop refs like [1], [a]
    txt = txt.replace('\n', ' ').strip()
    txt = re.sub(r'\s+', ' ', txt)
    return txt.strip()

def date_time(cell):
    txt = clean_text(cell)
    if txt == "":
        return ["", ""]
    if ',' in txt:
        parts = [p.strip() for p in txt.split(',', 1)]
        return [parts[0], parts[1] if len(parts)>1 else ""]
    if '\n' in txt:
        parts = [p.strip() for p in txt.split('\n', 1)]
        return [parts[0], parts[1] if len(parts)>1 else ""]
    m = re.search(r'(\d{1,2}:\d{2})$', txt)
    if m:
        return [txt[:m.start()].strip(' ,'), m.group(1)]
    return [txt, ""]

def landing_status(cell):
    txt = clean_text(cell).lower()
    if txt == "":
        return ""
    if 'no attempt' in txt or txt.strip() == 'no':
        return 'No attempt'
    if 'success' in txt or 'landed' in txt or 'recovered' in txt:
        return 'Success'
    if 'fail' in txt or 'lost' in txt:
        return 'Failure'
    return clean_text(cell)

# --- canonical keys we will produce ---
keys = [
    'Flight No.', 'Date and time (UTC)', 'Version Booster', 'Launch site',
    'Payload', 'Payload mass', 'Orbit', 'Customer', 'Launch outcome', 'Booster landing',
    'Date', 'Time'
]

# initialize launch_dict (lists)
launch_dict = {k: [] for k in keys}

# Confirm 'soup' exists (from TASK1). If not present, raise informative error.
if 'soup' not in globals():
    raise NameError("soup not found. Run TASK 1 (HTTP GET + BeautifulSoup) before running Task 3.")

# locate tables (lab-targets use class 'wikitable plainrowheaders collapsible')
tables = soup.find_all('table', class_="wikitable plainrowheaders collapsible")
if not tables:
    # fallback to wikitable with plainrowheaders
    tables = [t for t in soup.find_all('table') if 'wikitable' in (t.get('class') or []) and 'plainrowheaders' in (t.get('class') or [])]

extracted_rows = 0
for table in tables:
    for tr in table.find_all('tr'):
        # flight number is often in <th> at row start
        if tr.th:
            thtxt = clean_text(tr.th)
            m = re.match(r'^\d+', thtxt)
            if m:
                flight_no = m.group(0)
                extracted_rows += 1
            else:
                continue
        else:
            continue

        tds = tr.find_all('td')
        # pad to expected minimum length
        while len(tds) < 9:
            tds.append(None)

        # Flight No.
        launch_dict['Flight No.'].append(flight_no)

        # Date and Time (raw + split)
        dt_raw = clean_text(tds[0])
        launch_dict['Date and time (UTC)'].append(dt_raw)
        d, t = date_time(tds[0])
        launch_dict['Date'].append(d)
        launch_dict['Time'].append(t)

        # Version Booster (prefer anchor text)
        vb = ""
        if tds[1] is not None:
            a = tds[1].find('a')
            vb = clean_text(a) if a else clean_text(tds[1])
        launch_dict['Version Booster'].append(vb)

        # Launch site
        ls = ""
        if tds[2] is not None:
            a = tds[2].find('a')
            ls = clean_text(a) if a else clean_text(tds[2])
        launch_dict['Launch site'].append(ls)

        # Payload
        pl = ""
        if tds[3] is not None:
            a = tds[3].find('a')
            pl = clean_text(a) if a else clean_text(tds[3])
        launch_dict['Payload'].append(pl)

        # Payload mass
        pm = clean_text(tds[4]) if tds[4] is not None else ""
        launch_dict['Payload mass'].append(pm)

        # Orbit
        ob = ""
        if tds[5] is not None:
            a = tds[5].find('a')
            ob = clean_text(a) if a else clean_text(tds[5])
        launch_dict['Orbit'].append(ob)

        # Customer
        cust = ""
        if tds[6] is not None:
            a = tds[6].find('a')
            cust = clean_text(a) if a else clean_text(tds[6])
        launch_dict['Customer'].append(cust)

        # Launch outcome
        lo = clean_text(tds[7]) if tds[7] is not None else ""
        launch_dict['Launch outcome'].append(lo)

        # Booster landing
        bl = landing_status(tds[8]) if tds[8] is not None else ""
        launch_dict['Booster landing'].append(bl)

# Ensure equal-length lists
max_len = max(len(v) for v in launch_dict.values())
for k in launch_dict:
    if len(launch_dict[k]) < max_len:
        launch_dict[k].extend([''] * (max_len - len(launch_dict[k])))

print(f"Parsed rows: {extracted_rows}; columns: {len(launch_dict)}")

# Create DataFrame safely (avoid DeprecationWarning for empty Series)
df = pd.DataFrame({k: pd.Series(v, dtype=object if len(v)==0 else None) for k, v in launch_dict.items()})

print("DataFrame shape:", df.shape)
display(df.head())

# Save CSV
df.to_csv("spacex_web_scraped.csv", index=False)
print("Saved spacex_web_scraped.csv")
# -------------------------------------------------------------------------------------

Parsed rows: 283; columns: 12
DataFrame shape: (283, 12)


Unnamed: 0,Flight No.,Date and time (UTC),Version Booster,Launch site,Payload,Payload mass,Orbit,Customer,Launch outcome,Booster landing,Date,Time
0,286,"January 3, 2024 03:44",F9 B5,Vandenberg,Starlink,"~16,800 kg (37,000 lb)",LEO,SpaceX,Success,Success,January 3,2024 03:44
1,287,"January 3, 2024 23:04",F9 B5,Cape Canaveral,Ovzon-3,"1,800 kg (4,000 lb)",GTO,Ovzon,Success,Success,January 3,2024 23:04
2,288,"January 7, 2024 22:35",F9 B5,Cape Canaveral,Starlink,"~17,100 kg (37,700 lb)",LEO,SpaceX,Success,Success,January 7,2024 22:35
3,289,"January 14, 2024 08:59",F9 B5,Vandenberg,Starlink,"~16,700 kg (36,800 lb)",LEO,SpaceX,Success,Success,January 14,2024 08:59
4,290,"January 15, 2024 01:52",F9 B5,Cape Canaveral,Starlink,"~17,100 kg (37,700 lb)",LEO,SpaceX,Success,Success,January 15,2024 01:52


Saved spacex_web_scraped.csv
