In [None]:
# GScraper Code - Spacecrafts - DO NOT TOUCH

import requests
from bs4 import BeautifulSoup
import time
import random
import csv
from datetime import datetime
from typing import Optional
from fuzzywuzzy import fuzz, process
from operator import itemgetter
import logging

# Route log records of level DEBUG and above to the file 'debug.log'.
logging.basicConfig(level=logging.DEBUG, filename='debug.log')

# One requests.Session shared by the download helpers in this cell, so
# connections and headers are reused across requests.
session = requests.Session()

def download_table() -> BeautifulSoup:
    """
    Download the launch data page (lau2023.htm) and return it parsed.

    The request goes through the module-level ``session`` and the response
    body is parsed with the lxml parser.

    Returns:
        The page as a BeautifulSoup object.

    Raises:
        requests.RequestException: If the request times out, cannot be
            completed, or the server answers with an HTTP error status.
    """
    url = "https://space.skyrocket.de/doc_chr/lau2023.htm"

    # Without an explicit timeout requests waits indefinitely on a stalled
    # connection.
    response = session.get(url, timeout=30)

    # Fail loudly rather than parse an error page: such a page has no launch
    # table, so the caller would silently end up with an empty result.
    response.raise_for_status()

    return BeautifulSoup(response.content, "lxml")

def load_table(start_date: Optional[datetime] = None, end_date: Optional[datetime] = None) -> list:
    """
    Load the launch table and return one dictionary per payload.

    Rows are skipped when they have fewer than six cells, when their first
    cell contains an element with id "Planned", or when the date cell does
    not parse as ``%d.%m.%Y``.

    Args:
        start_date: If given, launches before this date are dropped.
        end_date: If given, launches after this date are dropped.
            Each bound is applied independently of the other.

    Returns:
        A list of dicts with the keys 'id', 'date', 'payload_name',
        'payload_url', 'vehicle', 'site' and 'remark'. 'payload_url' is None
        for payloads that appear as plain text or as anchors without href.
    """
    soup = download_table()
    table = soup.find("table")
    data = []

    if not table:
        return data

    for row in table.find_all("tr"):
        cells = row.find_all("td")

        # Header rows have no <td>; shorter rows cannot supply all six fields.
        if len(cells) < 6:
            continue

        if cells[0].select("#Planned"):
            continue

        id_, date, payloads, vehicle, site, remark = cells[:6]

        # The date is a property of the row, so it is parsed and filtered
        # once here rather than once per payload. A successful parse already
        # rules out placeholder dates containing "X", "Q" or "?".
        date_text = date.text.strip()
        try:
            launch_date = datetime.strptime(date_text, '%d.%m.%Y')
        except ValueError:
            continue

        if start_date is not None and launch_date < start_date:
            continue
        if end_date is not None and launch_date > end_date:
            continue

        # Drop the compact rendering and flatten the detailed one so the
        # payloads become direct children of the cell.
        for span in payloads.find_all("span", class_="compact"):
            span.decompose()
        for span in payloads.find_all("span", class_=["detailed", "indent"]):
            span.unwrap()

        row_fields = {
            'id': id_.text.strip(),
            'date': date_text,
            'vehicle': vehicle.text.strip(),
            'site': site.text.strip(),
            'remark': remark.text.strip(),
        }

        for payload in payloads.contents:
            if payload.name == 'a':
                # .get() keeps an anchor without href from raising KeyError.
                payload_url = payload.get('href')
                payload_name = payload.text
            elif payload.name is None and len(payload.text) > 2:
                # Bare text node; fragments of two characters or fewer are
                # ignored.
                payload_url = None
                payload_name = payload.text
            else:
                # <br> separators and any other tag carry no payload.
                continue

            data.append({
                'id': row_fields['id'],
                'date': row_fields['date'],
                'payload_name': payload_name.strip(),
                'payload_url': payload_url,
                'vehicle': row_fields['vehicle'],
                'site': row_fields['site'],
                'remark': row_fields['remark'],
            })

    return data

def download_details(url: str) -> BeautifulSoup:
    """
    Download a payload detail page and return it parsed.

    Args:
        url: Link as found in the launch table; it is appended to
            "https://space.skyrocket.de/doc_chr/".

    Returns:
        The page as a BeautifulSoup object (lxml parser).

    Raises:
        requests.RequestException: If the request times out or cannot be
            completed.
    """
    full_url = f"https://space.skyrocket.de/doc_chr/{url}"

    # Use the shared session; the timeout keeps one stalled connection from
    # hanging the whole scrape. The HTTP status is intentionally not checked:
    # load_details treats a page without the expected table as "no details".
    response = session.get(full_url, timeout=30)

    return BeautifulSoup(response.content, "lxml")

def load_details(url, payload_name, date):
    """
    Scrape the detail page at ``url`` into a dictionary.

    Args:
        url: Link of the payload detail page, passed to download_details.
        payload_name: Unused; kept so existing callers keep working.
        date: Unused; kept so existing callers keep working.

    Returns:
        An empty dict when the page has no element with id 'satlist'.
        Otherwise a dict with one text entry per detail field (the string
        "None" when the field is absent from the page) plus 'Cospar', a list
        of ``[satellite name, date text, COSPAR cell]`` entries. The COSPAR
        cell is stored as the parsed element, not as text.
    """
    soup = download_details(url)

    table = soup.find(id='satlist')
    if table is None:
        print(f"No table found for {url}")
        return {}

    cospars = []
    for row in table.select("tr"):
        cells = row.select("td")

        # Header rows have no <td>; rows with fewer than three cells cannot
        # supply the name, COSPAR and date columns.
        if len(cells) < 3:
            continue

        sat_name = cells[0].get_text(strip=True)
        sat_date = cells[2].get_text(strip=True)
        cospars.append([sat_name, sat_date, cells[1]])

    # Output key -> CSS selector of the element holding that value.
    field_selectors = (
        ("Country", '#sdnat'),
        ("Market Segment", '#sdtyp'),
        ("Operator", '#sdope'),
        ("Prime Manufacturer", '#sdcon'),
        ("Equipment", '#sdequ'),
        ("Configuration", '#sdcnf'),
        ("Propulsion", '#sdpro'),
        ("Power", '#sdpow'),
        ("Design Life", '#sdlif'),
        ("Mass", '#sdmas'),
        ("Orbit Type", '#sdorb'),
    )

    details = {}
    for label, selector in field_selectors:
        # Look each element up once instead of once to test and once to read.
        node = soup.select_one(selector)
        details[label] = node.get_text(strip=True) if node is not None else "None"

    details["Cospar"] = cospars

    return details

def match_cospar(payload_name: str, payload_date: str, cospar_list: list) -> Optional[str]:
    """
    Match a payload name and date to a COSPAR number.

    Each candidate is scored with ``fuzz.token_set_ratio`` on name and date.
    A candidate scoring 100 on both is returned immediately. Otherwise the
    candidate with the highest combined score among those scoring at least
    90 on both wins; on a tie the earliest one in ``cospar_list`` is chosen.

    Args:
        payload_name: Payload name from the launch table.
        payload_date: Launch date text from the launch table.
        cospar_list: Iterable of ``(name, date, cospar)`` entries where
            ``cospar`` exposes ``get_text(strip=True)``.

    Returns:
        The matched COSPAR text, or None if no candidate qualifies.
    """
    candidates = []
    for name, date, cospar in cospar_list:
        match_name = fuzz.token_set_ratio(payload_name, name)
        match_date = fuzz.token_set_ratio(payload_date, date)

        if match_name == 100 and match_date == 100:
            print("100% match")
            return cospar.get_text(strip=True)
        if match_name >= 90 and match_date >= 90:
            print(f">90% match: {payload_name} match = {match_name}, {date} match = {match_date}")
            candidates.append((match_name + match_date, cospar.get_text(strip=True)))
        else:
            print(f"<90% match: {payload_name} match = {match_name}, {date} match = {match_date}")

    if not candidates:
        print("No match for COSPAR")
        return None

    # max() returns the first of several equal maxima, which is the same
    # pick a stable descending sort would put in front.
    return max(candidates, key=itemgetter(0))[1]

def create_csv(filename: str, launches: list, details: dict) -> None:
    """
    Write the scraped spacecraft data to a CSV file.

    One row is written per launch whose 'payload_url' is a key of
    ``details``; other launches are skipped. Detail columns and the COSPAR
    number are filled only for launches dated now or earlier; for future
    launches they stay at the placeholder "None" (COSPAR stays empty).

    Args:
        filename: Path of the CSV file to create or overwrite.
        launches: Dicts with the keys 'date' (``%d.%m.%Y``), 'payload_name',
            'payload_url', 'vehicle', 'site' and 'remark'.
        details: Mapping from payload URL to the detail dict of that payload.

    Raises:
        ValueError: If a launch date does not match ``%d.%m.%Y``.
    """
    header = ['Date', 'Spacecraft Name', 'URL', 'Vehicle Name', 'Launch Site', 'Remark', 'Country',
              'Market Segment', 'Operator', 'Prime Manufacturer', 'Equipment', 'Configuration', 'Propulsion',
              'Power', 'Design Life', 'Mass', 'Orbit', 'COSPAR', 'Date Created']

    # Detail dict keys, in the order of the header columns 'Country'..'Orbit'.
    detail_keys = ['Country', 'Market Segment', 'Operator', 'Prime Manufacturer', 'Equipment',
                   'Configuration', 'Propulsion', 'Power', 'Design Life', 'Mass', 'Orbit Type']

    with open(filename, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(header)

        for launch in launches:
            try:
                launch_details = details[launch['payload_url']]
            except KeyError:
                continue

            launch_date = datetime.strptime(launch['date'], '%d.%m.%Y')
            cospar = None
            detail_values = ['None'] * len(detail_keys)

            # Compare datetime objects: comparing the MM/DD/YYYY strings
            # orders by month first and misjudges dates across months/years.
            if launch_date <= datetime.now():
                try:
                    cospar_candidates = launch_details['Cospar']
                except KeyError:
                    print("ERROR: could not match COSPAR for:", launch['payload_name'])
                else:
                    cospar = match_cospar(launch['payload_name'], launch['date'], cospar_candidates)
                    print("COSPAR match:", cospar)

                detail_values = [launch_details.get(key, 'None') for key in detail_keys]

            writer.writerow([
                launch_date.strftime('%m/%d/%Y'),
                launch['payload_name'],
                launch['payload_url'],
                launch['vehicle'],
                launch['site'],
                launch['remark'],
                *detail_values,
                cospar,
                datetime.now().strftime('%m/%d/%Y %H:%M:%S'),  # Date Created
            ])

def main() -> None:
    """
    Scrape the launch table and payload detail pages, then write 'G_SC.csv'.

    Launches are limited to the hard-coded window from 02.01.2023 to
    03.01.2023 (day.month.year). Each distinct payload URL is downloaded
    once, with a random pause of 1-10 seconds between downloads.
    """
    start_date = datetime.strptime('02.01.2023', '%d.%m.%Y')
    end_date = datetime.strptime('03.01.2023', '%d.%m.%Y')
    launches = load_table(start_date=start_date, end_date=end_date)

    # Every URL starts with empty details, so create_csv still writes a row
    # for payloads whose details are never downloaded (e.g. URL is None).
    details = {launch['payload_url']: {} for launch in launches}

    # First launch seen for each URL; it supplies the name and date that are
    # handed to load_details.
    first_launch_by_url = {}
    for launch in launches:
        first_launch_by_url.setdefault(launch['payload_url'], launch)

    for url, launch in first_launch_by_url.items():
        # Text-only payloads have no link; requesting ".../doc_chr/None"
        # would only waste a request and a sleep.
        if not url:
            continue

        print(f'Loading {url}')
        details[url] = load_details(url, launch['payload_name'], launch['date'])
        time.sleep(random.randint(1, 10))

    create_csv('G_SC.csv', launches, details)


# Run the spacecraft scraper only when this code is executed as the main
# module, not when it is imported.
if __name__ == '__main__':
    main()


In [None]:
# GScraper Code - Launch Events - DO NOT TOUCH

import requests
from bs4 import BeautifulSoup
import time
import random
import csv
from datetime import datetime
from typing import Optional
from fuzzywuzzy import fuzz, process
from operator import itemgetter
import logging

# Route log records of level DEBUG and above to the file 'debug.log'.
logging.basicConfig(level=logging.DEBUG, filename='debug.log')

# requests.Session used by download_table, so connections and headers are
# reused across requests.
session = requests.Session()

def download_table() -> BeautifulSoup:
    """
    Download the launch data page (lau2023.htm) and return it parsed.

    The request goes through the module-level ``session`` and the response
    body is parsed with the lxml parser.

    Returns:
        The page as a BeautifulSoup object.

    Raises:
        requests.RequestException: If the request times out, cannot be
            completed, or the server answers with an HTTP error status.
    """
    url = "https://space.skyrocket.de/doc_chr/lau2023.htm"

    # Without an explicit timeout requests waits indefinitely on a stalled
    # connection.
    response = session.get(url, timeout=30)

    # Fail loudly rather than parse an error page: such a page has no launch
    # table, so the caller would silently end up with an empty result.
    response.raise_for_status()

    return BeautifulSoup(response.content, "lxml")

def load_table(start_date: Optional[datetime] = None, end_date: Optional[datetime] = None) -> list:
    """
    Load the launch table and return one dictionary per launch event.

    Rows are skipped when they have fewer than five cells, when their first
    cell contains an element with id "Planned", or when the date cell does
    not parse as ``%d.%m.%Y``.

    Args:
        start_date: If given, launches before this date are dropped.
        end_date: If given, launches after this date are dropped.
            Each bound is applied independently of the other.

    Returns:
        A list of dicts with the keys 'date', 'vehicle' and 'site'.
    """
    soup = download_table()
    table = soup.find("table")
    data = []

    if not table:
        return data

    for row in table.find_all("tr"):
        cells = row.find_all("td")

        # Header rows have no <td>; the site is read from the fifth cell, so
        # shorter rows cannot be used.
        if len(cells) < 5:
            continue

        if cells[0].select("#Planned"):
            continue

        date_text = cells[1].text.strip()

        # A successful parse already rules out placeholder dates containing
        # "X", "Q" or "?", so no separate check for them is needed.
        try:
            launch_date = datetime.strptime(date_text, '%d.%m.%Y')
        except ValueError:
            continue

        if start_date is not None and launch_date < start_date:
            continue
        if end_date is not None and launch_date > end_date:
            continue

        data.append({
            'date': date_text,
            'vehicle': cells[3].text.strip(),
            'site': cells[4].text.strip(),
        })

    return data


def create_csv(filename: str, launches: list, details: dict) -> None:
    """
    Write the launch events to a CSV file.

    The file gets the header 'Date', 'Launch Vehicle', 'Site' followed by one
    row per launch, with the date converted from ``%d.%m.%Y`` to
    ``%m/%d/%Y``.

    Args:
        filename: Path of the CSV file to create or overwrite.
        launches: Dicts with the keys 'date', 'vehicle' and 'site'.
        details: Not used by this function.
    """
    def to_row(entry):
        # Re-express the day.month.year date as month/day/year.
        parsed = datetime.strptime(entry['date'], '%d.%m.%Y')
        return [parsed.strftime('%m/%d/%Y'), entry['vehicle'], entry['site']]

    with open(filename, 'w', newline='') as out:
        sheet = csv.writer(out)
        sheet.writerow(['Date', 'Launch Vehicle', 'Site'])
        sheet.writerows(to_row(entry) for entry in launches)

def main() -> None:
    """
    Scrape the launch events and write them to 'G_LE.csv'.

    Launches are limited to the hard-coded window from 02.01.2023 to
    07.03.2023 (day.month.year).
    """
    date_format = '%d.%m.%Y'
    window_start = datetime.strptime('02.01.2023', date_format)
    window_end = datetime.strptime('07.03.2023', date_format)

    events = load_table(start_date=window_start, end_date=window_end)

    # No per-payload details exist for launch events; pass an empty mapping.
    create_csv('G_LE.csv', events, {})


# Run the launch-event scraper only when this code is executed as the main
# module, not when it is imported.
if __name__ == '__main__':
    main()
