In [12]:
import pandas as pd
from sqlalchemy import create_engine, text

# Connect to the database
engine = create_engine('mysql+pymysql://root:@localhost/space_launch_db')

# Query to get all table names in the current database
query_tables = """
SELECT TABLE_NAME 
FROM information_schema.TABLES 
WHERE TABLE_SCHEMA = DATABASE()
"""

# Open a connection and execute the query to get table names
with engine.connect() as connection:
    result_tables = connection.execute(text(query_tables)).fetchall()

# Extract table names from the result
table_names = [row[0] for row in result_tables]

# Dictionary to hold the DataFrames for each table
dfs = {}

# Loop through each table, load it into a DataFrame, and store it in the dictionary
for table in table_names:
    query = f"SELECT * FROM {table}"
    dfs[table] = pd.read_sql(query, engine)

# Now all tables are loaded into DataFrames and stored in the `dfs` dictionary
# Example: To access a specific table's DataFrame, you can use dfs['table_name']
# Print all table names and their corresponding DataFrame shapes
for table, df in dfs.items():
    print(f"Table: {table}, Shape: {df.shape}")


Table: dim_countries, Shape: (249, 2)
Table: dim_locations, Shape: (137, 5)
Table: dim_organisations, Shape: (56, 3)
Table: fact_conflicts, Shape: (2686, 6)
Table: fact_launches, Shape: (4324, 11)


In [13]:
fact_launches = dfs['fact_launches']
fact_conflicts = dfs['fact_conflicts']
dim_countries = dfs['dim_countries']
dim_locations = dfs['dim_locations']
dim_organisations = dfs['dim_organisations']
fact_launches = fact_launches.merge(dim_locations, on='location_id', how='left')
fact_launches = fact_launches.merge(dim_organisations, on='organisation_id', how='left')
fact_launches = fact_launches.merge(dim_countries, on='country_id', how='left')
fact_launches.drop(columns= ['organisation_id', 'location_id'], inplace = True)

In [14]:
print(fact_launches.head())

   id        date                                        detail rocket_status  \
0   0  2020-08-07  Falcon 9 Block 5 | Starlink V1 L9 & BlackSky  StatusActive   
1   1  2020-08-06           Long March 2D | Gaofen-9 04 & Q-SAT  StatusActive   
2   2  2020-08-04            Starship Prototype | 150 Meter Hop  StatusActive   
3   3  2020-07-30  Proton-M/Briz-M | Ekspress-80 & Ekspress-103  StatusActive   
4   4  2020-07-30                    Atlas V 541 | Perseverance  StatusActive   

    price mission_status  wind_speed  humidity  temperature  \
0   50.00        Success     5.33923   75.4536      5.10021   
1   29.75        Success     8.35946   51.0117      5.46586   
2     NaN        Success    11.74800   56.7506     20.54830   
3   65.00        Success    13.00410   69.0498     -1.34319   
4  145.00        Success     5.58771   60.5856     18.02130   

                                            location  latitude  longitude  \
0         LC-39A, Kennedy Space Center, Florida, USA   30

In [15]:
df = fact_launches

In [16]:
df['mission_status'].unique()

array(['Success', 'Failure', 'Prelaunch Failure', 'Partial Failure'],
      dtype=object)

In [17]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report


# Preprocessing
# Dropping columns that are not needed (id, detail, date, etc.)
df.drop(columns=['id', 'detail', 'date'], inplace=True)

# Handling missing values
df['price'].fillna(df['price'].mean(), inplace=True)

# Convert target variable to binary (Success = 1, Failure = 0)
df['mission_status'] = df['mission_status'].apply(lambda x: 1 if x == 'Success' else 0)

# Encoding categorical columns
label_encoders = {}
for col in ['rocket_status', 'location', 'organisation', 'country']:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Split the data into features and target
X = df.drop(columns='mission_status')
y = df['mission_status']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model Training
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Model Evaluation
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['price'].fillna(df['price'].mean(), inplace=True)


Accuracy: 0.9040
Classification Report:
              precision    recall  f1-score   support

           0       0.62      0.11      0.19        87
           1       0.91      0.99      0.95       778

    accuracy                           0.90       865
   macro avg       0.77      0.55      0.57       865
weighted avg       0.88      0.90      0.87       865



In [19]:
import joblib

model_path = '../models/success_prediction_model.pkl'
joblib.dump(model, model_path)


['../models/success_prediction_model.pkl']

In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Handling missing values in the price (target variable)
df.dropna(subset=['price'], inplace=True)

# Cap the price at the 99th percentile to remove extreme outliers
price_cap = df['price'].quantile(0.99)
df['price'] = np.where(df['price'] > price_cap, price_cap, df['price'])

# Apply log transformation to the price column to stabilize variance
df['price'] = np.log1p(df['price'])

# Encode categorical columns
label_encoders = {}
for col in ['rocket_status', 'location', 'organisation', 'country']:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Split the data into features and target
X = df.drop(columns='price')
y = df['price']

# Normalize/scale the numerical features
scaler = StandardScaler()
X[['wind_speed', 'humidity', 'temperature', 'latitude', 'longitude']] = scaler.fit_transform(
    X[['wind_speed', 'humidity', 'temperature', 'latitude', 'longitude']])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Hyperparameter tuning using GridSearchCV
param_grid = {
    'n_estimators': [100, 200, 500],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10]
}

# Model Training with cross-validation
model = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best model from grid search
best_model = grid_search.best_estimator_

# Predictions
y_pred = best_model.predict(X_test)

# Reverse log transformation for price
y_test_exp = np.expm1(y_test)
y_pred_exp = np.expm1(y_pred)

# Model Evaluation
mse = mean_squared_error(y_test_exp, y_pred_exp)
r2 = r2_score(y_test_exp, y_pred_exp)

print(f"Mean Squared Error: {mse:.4f}")
print(f"R-squared: {r2:.4f}")

# Feature Importance
print("Feature Importances:", best_model.feature_importances_)


Mean Squared Error: 415.9626
R-squared: 0.9058
Feature Importances: [0.22607448 0.03389455 0.03106727 0.03007478 0.25637455 0.12067895
 0.11527556 0.00877004 0.15371925 0.01851118 0.00555938]


In [22]:
import os

model_dir = '../models'
os.makedirs(model_dir, exist_ok=True)  # Create the directory if it doesn't exist
model_path = os.path.join(model_dir, 'rocket_launch_cost_model.pkl')

# Save the trained model
joblib.dump(best_model, model_path)

print(f"Model saved to {model_path}")

Model saved to ../models/rocket_launch_cost_model.pkl
