In [1]:
import pandas as pd

In [None]:
launches_file = "../data/raw_datasets/space_launches_advanced_report.xlsx"

In [3]:
types = {
'Name':'string',
'Date (Central Time)':'string',
'Status':'string',
'Provider':'string',
'Rocket':'string', 'Mission':'string',
'Launch Pad':'string'
}

In [4]:
launches_df = pd.read_excel(launches_file,sheet_name="Space Launches",dtype=types)

In [5]:
launches_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7199 entries, 0 to 7198
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Name                 7199 non-null   string
 1   Date (Central Time)  7199 non-null   string
 2   Status               7199 non-null   string
 3   Provider             7199 non-null   string
 4   Rocket               7199 non-null   string
 5   Mission              7199 non-null   string
 6   Launch Pad           7199 non-null   string
dtypes: string(7)
memory usage: 393.8 KB


In [6]:
launches_df.columns = ['name', 'date', 'status', 'provider', 'rocket', 'mission', 'launch_site']

In [7]:
launches_df.head()

Unnamed: 0,name,date,status,provider,rocket,mission,launch_site
0,Sputnik 8K74PS | Sputnik 1,1957-10-04 09:28 AM,Launch Successful,Soviet Space Program,Sputnik 8K74PS,Sputnik 1,"1/5 | Baikonur Cosmodrome, Republic of Kazakhstan"
1,Sputnik 8K74PS | Sputnik 2,1957-11-02 02:30 PM,Launch Successful,Soviet Space Program,Sputnik 8K74PS,Sputnik 2,"1/5 | Baikonur Cosmodrome, Republic of Kazakhstan"
2,Vanguard | Vanguard,1957-12-06 04:44 AM,Launch Failure,US Navy,Vanguard,Vanguard,"Launch Complex 18A | Cape Canaveral, FL, USA"
3,Juno-I | Explorer 1,1958-01-31 03:47 PM,Launch Successful,Army Ballistic Missile Agency,Juno-I,Explorer 1,"Launch Complex 26A | Cape Canaveral, FL, USA"
4,Vanguard | Vanguard,1958-02-04 07:33 PM,Launch Failure,US Navy,Vanguard,Vanguard,"Launch Complex 18A | Cape Canaveral, FL, USA"


In [8]:
launches_df['date'] = pd.to_datetime(launches_df['date'])

In [9]:
launches_df['date']=launches_df['date'].dt.date

In [10]:
launches_df["status"] = launches_df["status"].str.replace("Launch ", "", regex=False)

In [11]:
launches_df["status"] = launches_df["status"].str.replace("Launch was a Partial Failure", "Failure", regex=False)

In [12]:
launches_df["status"] = launches_df["status"].str.replace("Successful", "Success", regex=False)

In [13]:
launches_df["status"] = launches_df["status"].str.strip()

In [14]:
def split_launch_pad(value):
    if pd.isna(value):
        return [None, None, None]
    
    if "|" in value:
        launch_pad, place = value.split("|", 1)
        launch_pad = launch_pad.strip()
        place = place.strip()
        place_parts = place.split(",")
        if len(place_parts) == 1:
            return [launch_pad,place_parts[0].strip(), None]
        if len(place_parts) == 2:
            return [launch_pad,place_parts[0].strip(), place_parts[1].strip()]
        if len(place_parts) == 3:
            return [launch_pad,place_parts[0].strip(),place_parts[2].strip()]
    else:
        return [value.strip(), None, None]


In [15]:
launches_df[["launch_pad", "space_port", "country"]] = launches_df["launch_site"].apply(split_launch_pad).apply(pd.Series)

In [16]:
def extract_mission_from_name(name):
    if pd.isna(name):
        return None
    parts = name.split("|")
    return parts[1].strip() if len(parts) > 1 else None

In [17]:
launches_df["mission"] = launches_df.apply(
    lambda row: extract_mission_from_name(row["name"]) if pd.isna(row["mission"]) or row["mission"] in ["", "—"] else row["mission"],
    axis=1
)

In [18]:
# Drop the Name column
launches_df.drop(columns=["name"], inplace=True)

In [19]:
distinct_countries = launches_df['country'].unique()

In [20]:
distinct_countries

array(['Republic of Kazakhstan', 'USA', None, 'Russian Federation',
       'French Algeria', 'Japan', 'Kenya', 'French Guiana',
       "People's Republic of China", 'India', 'State of Israel',
       'Federative Republic of Brazil', 'Marshall Islands',
       'Islamic Republic of Iran', 'South Korea', nan, 'New Zealand',
       'South Australia'], dtype=object)

In [21]:
def standardize_country(name):
    if pd.isna(name):
        return None
    
    name = name.strip()

    manual_map = {
        "Republic of Kazakhstan": "Kazakhstan",
        "Russian Federation": "Russia",
        "People's Republic of China": "China",
        "Federative Republic of Brazil": "Brazil",
        "Islamic Republic of Iran": "Iran",
        "French Algeria": "Algeria",
        "State of Israel": "Israel",
        "South Australia": "Australia"
    }
    if name in manual_map:
        return manual_map[name]
    return name

In [22]:
launches_df["country"] = launches_df["country"].apply(standardize_country)

In [23]:
launches_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7199 entries, 0 to 7198
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   date         7199 non-null   object
 1   status       7199 non-null   string
 2   provider     7199 non-null   string
 3   rocket       7199 non-null   string
 4   mission      7199 non-null   object
 5   launch_site  7199 non-null   string
 6   launch_pad   7167 non-null   object
 7   space_port   7167 non-null   object
 8   country      6961 non-null   object
dtypes: object(5), string(4)
memory usage: 506.3+ KB


In [None]:
launches_df.to_excel("../data/extracted_data/launches.xlsx", index=False) 