In [7]:
# Installer les bibliothèques (si ce n'est pas déjà fait)
!pip install pandas numpy

# Importer les bibliothèques
import pandas as pd
import numpy as np

# Load the launch dataset from the course's cloud object storage.
df = pd.read_csv(
    "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DS0321EN-SkillsNetwork/datasets/dataset_part_1.csv"
)

# Show the first 10 rows.
print(df.head(n=10))

# Percentage of missing values in each column:
# null count per column, divided by the number of rows, scaled to percent.
missing_percentage = df.isnull().sum().div(len(df)).mul(100)
print("Pourcentage de valeurs manquantes par colonne:\n", missing_percentage)

# Column dtypes, used to tell numeric columns from categorical ones.
print("\nTypes de données des colonnes:\n", df.dtypes)

# TASK 1: number of launches per launch site.
launch_counts = df["LaunchSite"].value_counts()
print("\nNombre de lancements par site:\n", launch_counts)

# TASK 2: occurrences of each orbit; the GTO entry is removed when present
# (errors="ignore" keeps this from raising if there is no GTO row).
orbit_counts = df["Orbit"].value_counts().drop("GTO", errors="ignore")
print("\nNombre et occurrence de chaque orbite:\n", orbit_counts)

# TASK 3: occurrences of each mission outcome label.
landing_outcomes = df["Outcome"].value_counts()
print("\nOccurrences des résultats de mission:\n", landing_outcomes)

# Identify the "bad outcomes" (landings that did not succeed).
# The labels are selected by content rather than by their position in
# `landing_outcomes`: positions follow the value_counts() ordering, so they
# shift whenever the counts change, and fixed positions raise IndexError when
# fewer than eight distinct outcomes exist.
# NOTE(review): assumes every successful landing is labelled "True <pad>"
# (e.g. "True ASDS") and every other label ("False ...", "None ...") is a
# failure -- confirm against the outcome counts printed by TASK 3.
bad_outcomes = {
    outcome
    for outcome in landing_outcomes.index
    if not str(outcome).startswith("True")
}
print("\nBad outcomes:", bad_outcomes)

# TASK 4: build the classification column (1 = success, 0 = failure).
# A row is a failure exactly when its Outcome label is in `bad_outcomes`.
landing_class = [int(outcome not in bad_outcomes) for outcome in df["Outcome"]]
df["Class"] = landing_class

# Show the first 8 rows of the Class column.
print("\nColonne Class:\n", df[["Class"]].head(8))

# The mean of a 0/1 column is the fraction of rows labelled 1 (success).
success_rate = df["Class"].mean()
print("\nTaux de succès des lancements:", success_rate)

# Export the labelled dataset for the next step.
df.to_csv("dataset_part_2.csv", index=False)


   FlightNumber        Date BoosterVersion  PayloadMass Orbit    LaunchSite  \
0             1  2010-06-04       Falcon 9  6104.959412   LEO  CCAFS SLC 40   
1             2  2012-05-22       Falcon 9   525.000000   LEO  CCAFS SLC 40   
2             3  2013-03-01       Falcon 9   677.000000   ISS  CCAFS SLC 40   
3             4  2013-09-29       Falcon 9   500.000000    PO   VAFB SLC 4E   
4             5  2013-12-03       Falcon 9  3170.000000   GTO  CCAFS SLC 40   
5             6  2014-01-06       Falcon 9  3325.000000   GTO  CCAFS SLC 40   
6             7  2014-04-18       Falcon 9  2296.000000   ISS  CCAFS SLC 40   
7             8  2014-07-14       Falcon 9  1316.000000   LEO  CCAFS SLC 40   
8             9  2014-08-05       Falcon 9  4535.000000   GTO  CCAFS SLC 40   
9            10  2014-09-07       Falcon 9  4428.000000   GTO  CCAFS SLC 40   

       Outcome  Flights  GridFins  Reused   Legs LandingPad  Block  \
0    None None        1     False   False  False        NaN 

In [None]:
# -----------------------------------------------
# EXO 3 – EDA et Préparation pour Analyse Prédictive
# -----------------------------------------------

# Installer pandas et numpy dans Jupyter
!pip install pandas numpy --quiet

# Importer les bibliothèques (pas besoin d'installer sqlite3)
import pandas as pd
import numpy as np
import sqlite3

# Load the launch dataset from the course's cloud object storage.
df = pd.read_csv(
    filepath_or_buffer="https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DS0321EN-SkillsNetwork/datasets/dataset_part_1.csv"
)

# Preview the first 10 rows.
print("First 10 rows of the dataset:")
print(df.head(n=10))

# Percentage of missing values in each column:
# null count per column, divided by the number of rows, scaled to percent.
missing_percentage = df.isnull().sum().div(len(df)).mul(100)
print("\nPercentage of missing values per column:\n", missing_percentage)

# Column dtypes, used to tell numeric columns from categorical ones.
print("\nColumn data types:\n", df.dtypes)

# Number of launches per launch site.
launch_counts = df["LaunchSite"].value_counts()
print("\nNumber of launches per site:\n", launch_counts)

# Occurrences of each orbit; the GTO entry is removed when present
# (errors="ignore" keeps this from raising if there is no GTO row).
orbit_counts = df["Orbit"].value_counts()
orbit_counts = orbit_counts.drop(labels="GTO", errors="ignore")
print("\nNumber and occurrence of each orbit:\n", orbit_counts)

# Occurrences of each mission outcome label.
landing_outcomes = df["Outcome"].value_counts()
print("\nOccurrences of mission outcomes:\n", landing_outcomes)

# Identify the "bad outcomes" (landings that did not succeed).
# The labels are selected by content rather than by their position in
# `landing_outcomes`: positions follow the value_counts() ordering, so they
# shift whenever the counts change, and fixed positions raise IndexError when
# fewer than eight distinct outcomes exist.
# NOTE(review): assumes every successful landing is labelled "True <pad>"
# (e.g. "True ASDS") and every other label ("False ...", "None ...") is a
# failure -- confirm against the mission outcome counts printed earlier.
bad_outcomes = {
    outcome
    for outcome in landing_outcomes.index
    if not str(outcome).startswith("True")
}
print("\nBad outcomes:", bad_outcomes)

# Build the Class column (1 = success, 0 = failure).
# A row is a failure exactly when its Outcome label is in `bad_outcomes`.
landing_class = [int(outcome not in bad_outcomes) for outcome in df["Outcome"]]
df["Class"] = landing_class
print("\nClass column preview:")
print(df[["Class"]].head(8))

# The mean of a 0/1 column is the fraction of rows labelled 1 (success).
success_rate = df["Class"].mean()
print("\nLaunch success rate:", success_rate)

# Export the labelled dataset for the next step.
df.to_csv("dataset_part_2.csv", index=False)

# EDA with SQL: copy the DataFrame into an in-memory SQLite database.
# The connection is left open so `conn` remains usable after this point.
conn = sqlite3.connect(":memory:")
df.to_sql(name="spacex_launches", con=conn, if_exists="replace", index=False)

# Example SQL queries. Each one is the SQL counterpart of a pandas result
# computed earlier; adjacent string literals are joined into a single query.

# Launches per site, most frequent first.
launch_site_sql = pd.read_sql(
    sql=(
        "SELECT LaunchSite, COUNT(*) AS LaunchCount "
        "FROM spacex_launches "
        "GROUP BY LaunchSite "
        "ORDER BY LaunchCount DESC;"
    ),
    con=conn,
)
print("\nSQL - Number of launches per site:")
print(launch_site_sql)

# Occurrences of each orbit, excluding GTO.
orbit_sql = pd.read_sql(
    sql=(
        "SELECT Orbit, COUNT(*) AS Count "
        "FROM spacex_launches "
        "WHERE Orbit != 'GTO' "
        "GROUP BY Orbit "
        "ORDER BY Count DESC;"
    ),
    con=conn,
)
print("\nSQL - Number and occurrence of each orbit:")
print(orbit_sql)

# Occurrences of each mission outcome label.
outcome_sql = pd.read_sql(
    sql=(
        "SELECT Outcome, COUNT(*) AS Count "
        "FROM spacex_launches "
        "GROUP BY Outcome "
        "ORDER BY Count DESC;"
    ),
    con=conn,
)
print("\nSQL - Occurrences of mission outcomes:")
print(outcome_sql)

# Average of the 0/1 Class column, i.e. the success rate.
success_rate_sql = pd.read_sql(
    sql=(
        "SELECT AVG(Class) AS SuccessRate "
        "FROM spacex_launches;"
    ),
    con=conn,
)
print("\nSQL - Launch success rate:")
print(success_rate_sql)
