In [1]:
import pandas as pd

In [None]:
# Raw rockets CSV; the path is relative to the notebook's working directory.
rockets_file = "../data/raw_datasets/all-rockets-from-1957.csv"

In [3]:
# Subset of CSV columns to load (handed to `usecols` when reading the file).
columns_to_read = [
    'Name', 'Cmp', 'Status',
    'Liftoff Thrust', 'Payload to LEO',
    'Stages', 'Rocket Height', 'Price',
]

In [4]:
# Intended dtype of each column once its values have been cleaned.
# NOTE(review): no cell shown in this notebook passes this mapping to pandas;
# the CSV is loaded with dtype='string' and converted column by column.
types = {
    'Name': 'string',
    'Cmp': 'string',
    'Status': 'string',
    'Liftoff Thrust': 'float64',
    'Payload to LEO': 'float64',
    # Nullable integer: the column has missing values, which plain 'int64'
    # cannot represent.
    'Stages': 'Int64',
    'Rocket Height': 'float64',
    'Price': 'float64',
}

In [5]:
# Load only the selected columns and keep every value as text: some numeric
# fields carry unit or currency text (e.g. "39.0 m", "$45.0 million") and are
# converted in later cells.
rockets_df = pd.read_csv(
    rockets_file,
    usecols=columns_to_read,
    dtype='string',
)

In [6]:
# Peek at the first rows to see the raw text formats before cleaning.
rockets_df.head()

Unnamed: 0,Name,Cmp,Status,Liftoff Thrust,Payload to LEO,Stages,Rocket Height,Price
0,Tsyklon-3,Yuzhmash,Retired,3032,4.1,3,39.0 m,
1,Tsyklon-4M,Yuzhnoye,Planned,3130,5.0,2,38.7 m,$45.0 million
2,Unha-2,KCST,Retired,1192,0.2,3,28.0 m,
3,Unha-3,KCST,Active,1192,0.2,3,32.0 m,
4,Vanguard,US Navy,Retired,135,0.01,3,23.0 m,


In [7]:
# Inspect per-column non-null counts and dtypes of the freshly loaded frame.
rockets_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 409 entries, 0 to 408
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Name            409 non-null    string
 1   Cmp             409 non-null    string
 2   Status          409 non-null    string
 3   Liftoff Thrust  291 non-null    string
 4   Payload to LEO  408 non-null    string
 5   Stages          405 non-null    string
 6   Rocket Height   326 non-null    string
 7   Price           98 non-null     string
dtypes: string(8)
memory usage: 25.7 KB


In [8]:
def parse_float(value):
    """Convert a raw cell such as "3,032", "39.0 m" or "$45.0 million" to a float.

    Thousands separators, the "$" sign, the word "million" and the metre
    marker "m" are stripped before conversion. The magnitude word is dropped,
    not applied, so "$45.0 million" becomes 45.0.

    Parameters
    ----------
    value : str, number or missing
        Raw cell value. Values that are already numeric are returned as
        floats, so running the cleaning step on converted columns is safe.

    Returns
    -------
    float or None
        The parsed number, or None when `value` is missing.

    Raises
    ------
    ValueError
        If text that `float` cannot parse remains after the known markers
        have been removed.
    """
    if pd.isna(value):
        return None
    if not isinstance(value, str):
        # Already numeric (e.g. the conversion cell was executed twice).
        return float(value)
    cleaned = (
        value.replace(",", "")
             .replace("$", "")
             .replace("million", "")  # must be removed before the bare "m"
             .replace("m", "")
             .strip()
    )
    return float(cleaned)

In [9]:
# Columns whose raw text values are run through `parse_float` to become floats.
float_columns = [
    'Liftoff Thrust',
    'Rocket Height',
    'Price',
    'Payload to LEO',
]

In [10]:
# Parse every cell of the measurement columns element-wise, then write the
# converted values back over the original text columns.
parsed_floats = rockets_df[float_columns].map(parse_float)
rockets_df[float_columns] = parsed_floats

In [11]:
# Convert the stage counts to nullable integers; entries that cannot be read
# as numbers are coerced to missing instead of raising.
stage_counts = pd.to_numeric(rockets_df['Stages'], errors='coerce')
rockets_df['Stages'] = stage_counts.astype('Int64')

In [13]:
# Raw CSV header -> snake_case column name used from here on.
column_renames = {
    'Name': 'name',
    'Cmp': 'company',
    'Status': 'status',
    'Liftoff Thrust': 'liftoff_thrust',
    'Payload to LEO': 'payload_leo',
    'Stages': 'stages',
    'Rocket Height': 'height',
    'Price': 'price',
}

In [14]:
# Switch to the snake_case column names defined in `column_renames`.
rockets_df = rockets_df.rename(columns=column_renames)

In [15]:
# Re-check dtypes and non-null counts after cleaning and renaming.
rockets_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 409 entries, 0 to 408
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   name            409 non-null    string 
 1   company         409 non-null    string 
 2   status          409 non-null    string 
 3   liftoff_thrust  291 non-null    float64
 4   payload_leo     408 non-null    float64
 5   stages          409 non-null    Int64  
 6   height          326 non-null    float64
 7   price           98 non-null     float64
dtypes: Int64(1), float64(4), string(3)
memory usage: 26.1 KB


In [16]:
# Blank out zero values so they are exported as empty cells.
# NOTE(review): presumably 0 stands for "unknown" in the source data - confirm.
# Reassign instead of using inplace=True so the cell reads as an explicit
# transformation of the frame.
rockets_df = rockets_df.replace(0, None)

In [None]:
# Export the cleaned table to an Excel workbook without the positional index.
output_file = "../data/extracted_data/rockets.xlsx"
rockets_df.to_excel(output_file, index=False)