In [1]:
import pandas as pd

In [2]:
# Relative path to the UCS Satellite Database workbook used as the raw input.
satellites_file = (
    "./raw_datasets/UCS-Satellite-Database 5-1-2023.xlsx"
)

In [3]:
# Workbook columns to load, in the order they are listed here.
columns_to_read = [
    'Current Official Name of Satellite',
    'Operator/Owner',
    'Country of Operator/Owner',
    'Contractor',
    'Country of Contractor',
    'Purpose',
    'Users',
    'Class of Orbit',
    'Period (minutes)',
    'Launch Mass (kg.)',
    'Date of Launch',
    'Expected Lifetime (yrs.)',
    'Launch Site',
    'Launch Vehicle',
]

# dtype requested for each loaded column: the three numeric measurements are
# read as float64, everything else (including the launch date) as string.
types = {
    column: (
        'float64'
        if column in {'Period (minutes)', 'Launch Mass (kg.)', 'Expected Lifetime (yrs.)'}
        else 'string'
    )
    for column in columns_to_read
}

In [4]:
# Load only the selected columns from the workbook, requesting the declared dtype for each.
satellites_df = pd.read_excel(
    satellites_file,
    usecols=columns_to_read,
    dtype=types,
)

In [5]:
# Replace the workbook's verbose headers with short snake_case column names.
satellites_df = satellites_df.rename(
    {
        'Current Official Name of Satellite': 'name',
        'Operator/Owner': 'operator',
        'Country of Operator/Owner': 'country_of_operator',
        'Contractor': 'contractor',
        'Country of Contractor': 'country_of_contractor',
        'Purpose': 'purpose',
        'Users': 'users',
        'Class of Orbit': 'class_of_orbit',
        'Period (minutes)': 'orbit_period',
        'Launch Mass (kg.)': 'launch_mass',
        'Date of Launch': 'date_of_launch',
        'Expected Lifetime (yrs.)': 'expected_lifetime',
        'Launch Site': 'launch_site',
        'Launch Vehicle': 'launch_vehicle',
    },
    axis='columns',
)


In [6]:
# The launch dates were read as strings; parse them with per-element format
# inference, then keep only the calendar date (dropping the time component).
launch_timestamps = pd.to_datetime(satellites_df['date_of_launch'], format='mixed')
satellites_df['date_of_launch'] = launch_timestamps.dt.date

In [7]:
# Sanity check: print each column's name, non-null count and dtype.
satellites_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7560 entries, 0 to 7559
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   name                   7560 non-null   string 
 1   country_of_operator    7560 non-null   string 
 2   operator               7560 non-null   string 
 3   users                  7560 non-null   string 
 4   purpose                7560 non-null   string 
 5   class_of_orbit         7560 non-null   string 
 6   orbit_period           7504 non-null   float64
 7   launch_mass            7315 non-null   float64
 8   date_of_launch         7559 non-null   object 
 9   expected_lifetime      5450 non-null   float64
 10  contractor             7560 non-null   string 
 11  country_of_contractor  7560 non-null   string 
 12  launch_site            7560 non-null   string 
 13  launch_vehicle         7560 non-null   string 
dtypes: float64(3), object(1), string(10)
memory usage: 827.0

In [None]:
# Replace missing orbit_period values with the median period of the
# satellite's orbit class.
median_period_by_orbit = (
    satellites_df
    .groupby('class_of_orbit')['orbit_period']
    .transform('median')
)
satellites_df['orbit_period'] = satellites_df['orbit_period'].fillna(median_period_by_orbit)

In [None]:
# Split the '/'-separated operator countries into a list of parts per row.
operator_countries = satellites_df['country_of_operator']
satellites_df['country_of_operator'] = operator_countries.str.split(pat='/')

In [None]:
# Split the '/'-separated contractor countries into a list of parts per row.
contractor_countries = satellites_df['country_of_contractor']
satellites_df['country_of_contractor'] = contractor_countries.str.split(pat='/')

In [None]:
# Export the processed frame to an Excel workbook without writing the index.
# NOTE(review): "sattelites" looks like a misspelling of "satellites"; the file
# name is kept as-is in case other code reads this path — confirm before renaming.
satellites_df.to_excel(
    "./extracted_data/sattelites.xlsx",
    index=False,
)