<a href="https://colab.research.google.com/github/abdo-ghg/Kepler-Exoplanet-/blob/main/Kepler_Exoplanet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Kepler Exoplanet Classification and Modeling Notebook


## 🪐 Notebook Description

This Jupyter Notebook is part of the ExoML Platform, an AI-powered system designed to detect and classify potential exoplanets from NASA’s Kepler, K2, and TESS missions.
The notebook focuses on data preprocessing, exploratory data analysis, feature engineering, and model training to distinguish between confirmed exoplanets and false positives.

It allows for:

- Visual exploration of astronomical datasets
- Model selection and performance evaluation
- Integration with a Flask backend to visualize accuracy, AUC, recall, and precision on the website
- Top-5 feature selection for each dataset to enable real-time prediction through the web app

## 📦 Imports & Configurations
Import libraries and set global options.

In [None]:
import os, json, uuid, datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (precision_score, recall_score, f1_score, accuracy_score, roc_auc_score, confusion_matrix, roc_curve)
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import joblib
import datetime, uuid, os, json
import warnings
warnings.filterwarnings('ignore')
# --- Global configuration -------------------------------------------------
# Reproducibility seed shared by every stochastic step in the notebook.
RANDOM_STATE = 42

# Locations are relative to the notebook's working directory (adjust if needed).
NOTEBOOK_DIR = os.getcwd()
BASE_MODEL_DIR = os.path.join('..', 'models', 'Kepler')
PLOTS_DIR = os.path.join('..', 'static', 'plots', 'Kepler')
RESULTS_DIR = os.path.join('..', 'results')
# Resolves to ../../Data Sources/Kepler.csv as seen from the notebook directory.
DATA_PATH = os.path.join('..', '..', 'Data Sources', 'Kepler.csv')

# Create every output directory up front so later cells can write without checks.
for _output_dir in (BASE_MODEL_DIR, PLOTS_DIR, RESULTS_DIR):
    os.makedirs(_output_dir, exist_ok=True)

# Display / plotting defaults.
pd.set_option('display.max_columns', 200)
sns.set(style='whitegrid')

## 📂 Data Loading


In [None]:
def load_raw_dataset(csv_path):
    """Read a CSV file into a DataFrame and report its dimensions.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file, passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed table, unmodified.
    """
    frame = pd.read_csv(csv_path)
    n_rows, n_columns = frame.shape
    print(f"✅ Loaded dataset: {n_rows} rows, {n_columns} columns")
    return frame

# Google Colab stores uploaded files under /content. Use that copy when it is
# present; otherwise keep the project-relative DATA_PATH from the configuration
# cell so the notebook also runs outside Colab.
COLAB_DATA_PATH = '/content/Kepler.csv'
if os.path.exists(COLAB_DATA_PATH):
    DATA_PATH = COLAB_DATA_PATH

df = load_raw_dataset(DATA_PATH)


✅ Loaded dataset: 9564 rows, 141 columns


## ⚙️ Feature Engineering
Select the relevant KOI columns, rename them to descriptive names, and derive the binary `Target` label.

In [None]:
# Raw KOI columns kept for modelling, grouped by what they describe.
selected_columns = [
    # Physical parameters
    "koi_prad",
    "koi_prad_err1",
    "koi_prad_err2",
    "koi_ror",
    "koi_depth",
    "koi_srho",
    # Orbital parameters
    "koi_period",
    "koi_sma",
    "koi_eccen",
    "koi_incl",
    "koi_duration",
    "koi_ingress",
    "koi_dor",
    # Thermal / habitability
    "koi_teq",
    "koi_insol",
    # Stellar properties
    "koi_steff",
    "koi_slogg",
    "koi_smet",
    "koi_srad",
    "koi_smass",
    "koi_sage",
    # Detection / validation
    "koi_disposition",
    "koi_pdisposition",
    "koi_score",
    "koi_model_snr",
    "koi_num_transits",
    # Coordinates & brightness
    "ra",
    "dec",
    "koi_kepmag",
]

# Restrict the frame to the chosen columns (raises KeyError if one is missing).
df = df.loc[:, selected_columns]
df.shape

(9564, 29)

In [None]:
# Raw KOI column name -> descriptive, unit-suffixed name.
rename_dict = dict(
    # Physical
    koi_prad="planet_radius_earth",
    koi_prad_err1="planet_radius_err_upper",
    koi_prad_err2="planet_radius_err_lower",
    koi_ror="radius_ratio_Rp_Rstar",
    koi_depth="transit_depth_ppm",
    koi_srho="stellar_density_gcm3",
    # Orbital
    koi_period="orbital_period_days",
    koi_sma="semi_major_axis_AU",
    koi_eccen="eccentricity",
    koi_incl="inclination_deg",
    koi_duration="transit_duration_hrs",
    koi_ingress="ingress_duration_hrs",
    koi_dor="scaled_distance_a_Rstar",
    # Thermal
    koi_teq="equilibrium_temp_K",
    koi_insol="insolation_flux_Earth",
    # Stellar
    koi_steff="stellar_temp_K",
    koi_slogg="stellar_logg",
    koi_smet="stellar_metallicity_FeH",
    koi_srad="stellar_radius_solar",
    koi_smass="stellar_mass_solar",
    koi_sage="stellar_age_Gyr",
    # Detection / validation
    koi_disposition="final_disposition",
    koi_pdisposition="kepler_disposition",
    koi_score="disposition_score",
    koi_model_snr="signal_to_noise",
    koi_num_transits="num_transits",
    # Coordinates & brightness
    ra="RA_deg",
    dec="Dec_deg",
    koi_kepmag="kepler_mag",
)

# The label is derived from koi_disposition, so make sure it is selected.
if 'koi_disposition' not in selected_columns:
    selected_columns.append('koi_disposition')

# Keep only the selected columns that are actually present, then rename them.
cols_to_select = [name for name in selected_columns if name in df.columns]
df = df[cols_to_select].rename(columns=rename_dict)

if 'final_disposition' not in df.columns:
    # Without the disposition column neither the filter nor the label can be built.
    print('Warning: final_disposition column not found after selection/renaming. Cannot filter by disposition or create Target.')
else:
    known_dispositions = ['CONFIRMED', 'CANDIDATE', 'FALSE POSITIVE']
    df = df.loc[df['final_disposition'].isin(known_dispositions)].copy()
    # Target is 1 for CONFIRMED and CANDIDATE rows, 0 for FALSE POSITIVE rows.
    df['Target'] = df['final_disposition'].ne('FALSE POSITIVE').astype(int)

print('Shape after selecting columns and attempting to filter dispositions:', df.shape)

Shape after selecting columns and attempting to filter dispositions: (9564, 30)


## 🧹 Data Cleaning
Handle missing values, duplicates, data types.

In [None]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

np.int64(0)

In [None]:
# Missing-value count per column, before any imputation.
df.isnull().sum()

Unnamed: 0,0
planet_radius_earth,363
planet_radius_err_upper,363
planet_radius_err_lower,363
radius_ratio_Rp_Rstar,363
transit_depth_ppm,363
stellar_density_gcm3,321
orbital_period_days,0
semi_major_axis_AU,363
eccentricity,363
inclination_deg,364


In [None]:
from sklearn.impute import SimpleImputer # Import SimpleImputer

# The label must never be an input to the imputer: using it to predict missing
# feature values leaks the answer into the features, and fit_transform would
# also turn the integer label into floats.
LABEL_COL = 'Target'

all_numeric_cols = df.select_dtypes(include=np.number).columns.tolist()
all_categorical_cols = df.select_dtypes(include='object').columns.tolist()

# Columns with no observed value at all carry nothing to impute from, so they
# are kept out of the iterative imputer.
numeric_cols_with_all_missing = [col for col in all_numeric_cols if df[col].isnull().all()]
print(f"Numeric columns with all missing values (will be skipped by IterativeImputer): {numeric_cols_with_all_missing}")

imputable_numeric_cols = [
    col for col in all_numeric_cols
    if col not in numeric_cols_with_all_missing and col != LABEL_COL
]
print(f"Numeric columns to impute iteratively: {imputable_numeric_cols}")

# NOTE(review): the imputer is fitted on the full dataset, i.e. before the
# train/validation/test split, so held-out rows influence the imputed values.
if len(imputable_numeric_cols) > 0:
    imputer = IterativeImputer(random_state=RANDOM_STATE)
    df_imputed_values = imputer.fit_transform(df[imputable_numeric_cols])
    # Rebuild a frame on the original index so the assignment aligns row by row.
    df[imputable_numeric_cols] = pd.DataFrame(
        df_imputed_values, columns=imputable_numeric_cols, index=df.index
    )
    print("Iterative imputation applied to numeric columns that are not all missing.")
    print("\nMissing values after numeric imputation (imputable numeric columns):")
    print(df[imputable_numeric_cols].isnull().sum().sum()) # Should be 0 if imputation was successful for these
else:
    print("No numeric columns found that require iterative imputation.")


print(f"\nCategorical columns identified: {all_categorical_cols}")

if len(all_categorical_cols) > 0:
    # Missing categories become the explicit level 'missing'.
    cat_imputer = SimpleImputer(strategy='constant', fill_value='missing')
    df[all_categorical_cols] = cat_imputer.fit_transform(df[all_categorical_cols])
    print("Simple imputation applied to categorical columns.")
    print("\nMissing values after categorical imputation:")
    print(df[all_categorical_cols].isnull().sum().sum()) # Should be 0 after imputation

else:
    print("No categorical columns found to impute.")

Numeric columns with all missing values (will be skipped by IterativeImputer): ['ingress_duration_hrs', 'stellar_age_Gyr']
Numeric columns to impute iteratively: ['planet_radius_earth', 'planet_radius_err_upper', 'planet_radius_err_lower', 'radius_ratio_Rp_Rstar', 'transit_depth_ppm', 'stellar_density_gcm3', 'orbital_period_days', 'semi_major_axis_AU', 'eccentricity', 'inclination_deg', 'transit_duration_hrs', 'scaled_distance_a_Rstar', 'equilibrium_temp_K', 'insolation_flux_Earth', 'stellar_temp_K', 'stellar_logg', 'stellar_metallicity_FeH', 'stellar_radius_solar', 'stellar_mass_solar', 'disposition_score', 'signal_to_noise', 'num_transits', 'RA_deg', 'Dec_deg', 'kepler_mag', 'Target']
Iterative imputation applied to numeric columns that are not all missing.

Missing values after numeric imputation (imputable numeric columns):
0

Categorical columns identified: ['final_disposition', 'kepler_disposition']
Simple imputation applied to categorical columns.

Missing values after categoric

In [None]:
# Split the column names by dtype family for the steps that follow.
numeric_cols = list(df.select_dtypes(include=np.number).columns)
categorical_cols = list(df.select_dtypes(include='object').columns)

print("Numeric columns:", numeric_cols)
print("\nCategorical columns:", categorical_cols)

Numeric columns: ['planet_radius_earth', 'planet_radius_err_upper', 'planet_radius_err_lower', 'radius_ratio_Rp_Rstar', 'transit_depth_ppm', 'stellar_density_gcm3', 'semi_major_axis_AU', 'inclination_deg', 'transit_duration_hrs', 'scaled_distance_a_Rstar', 'equilibrium_temp_K', 'insolation_flux_Earth', 'stellar_temp_K', 'stellar_logg', 'stellar_metallicity_FeH', 'stellar_radius_solar', 'stellar_mass_solar', 'disposition_score', 'signal_to_noise', 'num_transits', 'RA_deg', 'Dec_deg', 'kepler_mag', 'Target']

Categorical columns: ['kepler_disposition']


In [None]:
# (rows, columns) after imputation.
df.shape

(9564, 30)

In [None]:
# Distinct values per column; a count of 1 marks a constant column.
df.nunique()

Unnamed: 0,0
planet_radius_earth,3351
planet_radius_err_upper,2150
planet_radius_err_lower,1601
radius_ratio_Rp_Rstar,8865
transit_depth_ppm,7310
stellar_density_gcm3,9323
orbital_period_days,9564
semi_major_axis_AU,4159
eccentricity,1
inclination_deg,2624


In [None]:
# Columns removed before modelling; names absent from the frame are skipped.
# NOTE(review): kepler_disposition and disposition_score are kept as features
# even though they look closely tied to the label — confirm they do not leak it.
columns_to_drop = [
    'final_disposition',
    'stellar_age_Gyr',
    'ingress_duration_hrs',
    'eccentricity',
    'orbital_period_days',
]
existing_columns_to_drop = [name for name in columns_to_drop if name in df.columns]
df = df.drop(columns=existing_columns_to_drop)
print("Shape after dropping columns:", df.shape)

Shape after dropping columns: (9564, 25)


## 🔍 Exploratory Data Analysis (EDA)
Summarize the dataset and visualize distributions.

In [None]:
# Column dtypes and non-null counts of the cleaned frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9564 entries, 0 to 9563
Data columns (total 25 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   planet_radius_earth      9564 non-null   float64
 1   planet_radius_err_upper  9564 non-null   float64
 2   planet_radius_err_lower  9564 non-null   float64
 3   radius_ratio_Rp_Rstar    9564 non-null   float64
 4   transit_depth_ppm        9564 non-null   float64
 5   stellar_density_gcm3     9564 non-null   float64
 6   semi_major_axis_AU       9564 non-null   float64
 7   inclination_deg          9564 non-null   float64
 8   transit_duration_hrs     9564 non-null   float64
 9   scaled_distance_a_Rstar  9564 non-null   float64
 10  equilibrium_temp_K       9564 non-null   float64
 11  insolation_flux_Earth    9564 non-null   float64
 12  stellar_temp_K           9564 non-null   float64
 13  stellar_logg             9564 non-null   float64
 14  stellar_metallicity_FeH 

In [None]:
# Summary statistics, one row per numeric column.
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
planet_radius_earth,9564.0,103.675,3018.70964,-56.554867,1.43,2.49,21.635,200346.0
planet_radius_err_upper,9564.0,17.438449,383.759461,-443.297312,0.24,0.54,3.280411,21640.0
planet_radius_err_lower,9564.0,-33.496626,1171.380913,-77200.0,-2.47,-0.32,-0.14,94.70556
radius_ratio_Rp_Rstar,9564.0,0.284767,3.244525,-2.176886,0.012524,0.021889,0.142788,99.87065
transit_depth_ppm,9564.0,23904.880635,80797.970599,-50269.413204,162.375,441.45,1928.075,1541400.0
stellar_density_gcm3,9564.0,9.252888,52.907912,-6.574746,0.244685,1.03347,3.27876,980.8542
semi_major_axis_AU,9564.0,0.224623,0.556101,-0.1057,0.0392,0.09,0.226575,44.9892
inclination_deg,9564.0,82.397168,14.971559,2.29,83.034719,88.33,89.74,93.3268
transit_duration_hrs,9564.0,5.621606,6.471554,0.052,2.43775,3.7926,6.2765,138.54
scaled_distance_a_Rstar,9564.0,78.073535,829.407958,-99.834648,5.64,16.565,50.9,79614.0


In [None]:
# Quick textual overview: column names, the 20 largest missing-value counts,
# and the class balance of the label.
print('Columns:', list(df.columns))

print('\nMissing values per column:')
print(df.isna().sum().sort_values(ascending=False).head(20))

print('\nTarget distribution:')
print(df['Target'].value_counts(normalize=True))

Columns: ['planet_radius_earth', 'planet_radius_err_upper', 'planet_radius_err_lower', 'radius_ratio_Rp_Rstar', 'transit_depth_ppm', 'stellar_density_gcm3', 'semi_major_axis_AU', 'inclination_deg', 'transit_duration_hrs', 'scaled_distance_a_Rstar', 'equilibrium_temp_K', 'insolation_flux_Earth', 'stellar_temp_K', 'stellar_logg', 'stellar_metallicity_FeH', 'stellar_radius_solar', 'stellar_mass_solar', 'kepler_disposition', 'disposition_score', 'signal_to_noise', 'num_transits', 'RA_deg', 'Dec_deg', 'kepler_mag', 'Target']

Missing values per column:
planet_radius_earth        0
planet_radius_err_upper    0
planet_radius_err_lower    0
radius_ratio_Rp_Rstar      0
transit_depth_ppm          0
stellar_density_gcm3       0
semi_major_axis_AU         0
inclination_deg            0
transit_duration_hrs       0
scaled_distance_a_Rstar    0
equilibrium_temp_K         0
insolation_flux_Earth      0
stellar_temp_K             0
stellar_logg               0
stellar_metallicity_FeH    0
stellar_rad

## 📈 Data Visualization
Deeper insights with plots.

In [None]:
numeric_df = df.select_dtypes(include=np.number)
correlation_matrix = numeric_df.corr()

# Collect every unordered column pair whose absolute correlation is at least
# 0.5. Only the upper triangle is visited, which excludes self-correlation.
# NaN correlations (e.g. from a constant column) fail the comparison and are skipped.
high_corr_pairs = []
corr_columns = correlation_matrix.columns
for i in range(len(corr_columns)):
    for j in range(i + 1, len(corr_columns)):
        pair_corr = correlation_matrix.iloc[i, j]
        if abs(pair_corr) >= 0.5:
            high_corr_pairs.append((corr_columns[i], corr_columns[j], pair_corr))

print(f"Found {len(high_corr_pairs)} pairs with |correlation| >= 0.5")

# Subplot grid: ceil(n_pairs / n_cols) rows.
n_pairs = len(high_corr_pairs)
n_cols = 3  # You can adjust the number of columns as needed
n_rows = (n_pairs + n_cols - 1) // n_cols

if n_pairs == 0:
    # A zero-row grid would give the figure a height of 0, which matplotlib rejects.
    print("No column pairs above the correlation threshold; nothing to plot.")
else:
    plt.figure(figsize=(n_cols * 6, n_rows * 5)) # Adjust figure size dynamically

    for i, (col1, col2, corr_value) in enumerate(high_corr_pairs):
        plt.subplot(n_rows, n_cols, i + 1)
        sns.scatterplot(data=df, x=col1, y=col2, hue='Target', alpha=0.6)
        plt.title(f'{col1} vs {col2} (Corr: {corr_value:.2f})')
        plt.xlabel(col1)
        plt.ylabel(col2)
        # Keep the legend on the last subplot only, to avoid clutter.
        if i < n_pairs - 1:
            plt.legend([],[], frameon=False)

    plt.tight_layout()
    plt.show()

In [None]:
# Annotated correlation heatmap over all numeric columns.
numeric_df = df.select_dtypes(include=np.number)

heatmap_fig, heatmap_ax = plt.subplots(figsize=(20, 10))
sns.heatmap(
    numeric_df.corr(),
    annot=True,
    cmap='coolwarm',
    linewidths=0.5,
    ax=heatmap_ax,
)
plt.show()

The disposition score (`koi_score`) is the most important feature.

In [None]:
# Bar chart of how many rows fall in each Target class.
count_fig, count_ax = plt.subplots(figsize=(8, 5))
sns.countplot(x=df['Target'], ax=count_ax)
count_ax.set_title("Target Distribution (Exoplanet vs Non-Exoplanet)")
plt.show()

In [None]:

# Histogram + KDE for every numeric column with more than 4 distinct values
# (near-constant and binary columns such as Target are skipped).
cols_for_hist = [
    col for col in df.select_dtypes(include=np.number).columns
    if df[col].nunique() > 4
]

n_cols = 4
n_rows = (len(cols_for_hist) + n_cols - 1) // n_cols

if not cols_for_hist:
    # A zero-row grid would give the figure a height of 0, which matplotlib rejects.
    print("No numeric columns with more than 4 distinct values; nothing to plot.")
else:
    plt.figure(figsize=(20, n_rows * 5))

    for i, col in enumerate(cols_for_hist):
        plt.subplot(n_rows, n_cols, i + 1)
        sns.histplot(data=df, x=col, kde=True)
        plt.title(f'Distribution of {col}')
        plt.xlabel(col)
        plt.ylabel('Frequency')

    plt.tight_layout()
    plt.show()

In [None]:
def feature_engineering(df):
    """Return a cleaned copy of ``df``.

    Placeholder: the renaming, filtering, and imputing logic has not been
    moved into this function yet, so it currently returns an unmodified copy.
    The copy guarantees that callers can mutate the result without touching
    the frame they passed in.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df``.
    """
    # TODO: apply the renaming, filtering, and imputing logic here.
    cleaned_df = df.copy()
    return cleaned_df

#### Outliers

In [None]:
# One box plot per numeric column, laid out on a 4-wide grid, to eyeball outliers.
numeric_cols = df.select_dtypes(include=np.number).columns

n_cols = 4  # panels per row
n_rows = (len(numeric_cols) + n_cols - 1) // n_cols  # ceil(len / n_cols)

box_fig = plt.figure(figsize=(20, n_rows * 5))  # height scales with the row count

for i, col in enumerate(numeric_cols):
    panel = box_fig.add_subplot(n_rows, n_cols, i + 1)
    sns.boxplot(y=df[col], ax=panel)
    panel.set_title(col)
    panel.set_xlabel('')

plt.tight_layout()
plt.show()

In [None]:
def remove_outliers(df, col):
    """Keep only the rows whose ``col`` value lies within the 1.5 * IQR fences.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; it is not modified.
    col : str
        Name of the numeric column to filter on.

    Returns
    -------
    pandas.DataFrame
        Rows of ``df`` with ``Q1 - 1.5*IQR <= df[col] <= Q3 + 1.5*IQR``
        (both bounds inclusive). Rows where ``col`` is NaN are dropped
        because they fail the comparison.
    """
    values = df[col]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    spread = third_quartile - first_quartile
    low_fence = first_quartile - 1.5 * spread
    high_fence = third_quartile + 1.5 * spread
    within_fences = values.between(low_fence, high_fence)
    return df[within_fences]

# Filter feature columns only. The label must be excluded: when one class makes
# up more than 75% of the rows, Q1 == Q3 and the IQR is 0, so the fences
# collapse onto the majority value and every minority-class row is removed.
# Filters are applied one after another, so each column's quartiles are
# computed on the rows that survived the previous columns.
outlier_feature_cols = [col for col in numeric_cols if col != 'Target']

for col in outlier_feature_cols:
    df = remove_outliers(df, col)

#### Split


In [None]:
# 70 / 15 / 15 train / validation / test split.
X = df.drop('Target', axis=1)
y = df['Target']

# Stratify both splits so each subset keeps the class proportions of the full
# dataset, and reuse the notebook-wide seed instead of a literal.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=RANDOM_STATE, stratify=y
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=RANDOM_STATE, stratify=y_temp
)

print("Data splitting complete:")
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_val shape: {X_val.shape}")
print(f"y_val shape: {y_val.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")

In [None]:
# Column groups, taken from the training split only.
numeric_features = X_train.select_dtypes(include=np.number).columns
categorical_features = X_train.select_dtypes(include='object').columns

# Standardise numeric columns; one-hot encode categorical ones. Categories not
# seen during fit are encoded as all zeros instead of raising.
numeric_step = ('num', StandardScaler(), numeric_features)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

print("Preprocessor defined.")

In [None]:
# Fit on the training split only, then reuse the fitted transform for the
# validation and test splits.
X_train_processed = preprocessor.fit_transform(X_train)
X_val_processed = preprocessor.transform(X_val)
X_test_processed = preprocessor.transform(X_test)

for split_label, processed_matrix in (
    ("training", X_train_processed),
    ("validation", X_val_processed),
    ("test", X_test_processed),
):
    print(f"Shape of processed {split_label} data:", processed_matrix.shape)

## 🤖 Modeling
Train and evaluate models.

In [None]:
def train_models(X_train, y_train, preprocessor, save_dir=BASE_MODEL_DIR):
    """Fit one preprocessing + classifier pipeline per model and persist them.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Raw (un-preprocessed) training features.
    y_train : pandas.Series
        Training labels; cast to ``int`` before fitting.
    preprocessor : sklearn transformer
        Template preprocessing step. Each pipeline receives its own unfitted
        clone, so the object passed in is neither refit nor shared.
    save_dir : str
        Directory that receives ``<name>_pipeline.pkl`` for every model.
        Created if it does not exist.

    Returns
    -------
    dict
        Maps 'RandomForest', 'XGBoost' and 'LogisticRegression' to the fitted
        ``Pipeline`` objects.
    """
    # Local import: keeps this cell self-contained without touching the import cell.
    from sklearn.base import clone

    # Ensure target variable is integer type for classifiers that expect it.
    y_train_int = y_train.astype(int)

    classifiers = {
        'RandomForest': RandomForestClassifier(random_state=RANDOM_STATE),
        'XGBoost': XGBClassifier(objective='binary:logistic',
                                 random_state=RANDOM_STATE,
                                 use_label_encoder=False,
                                 eval_metric='logloss'),
        'LogisticRegression': LogisticRegression(max_iter=2000, random_state=RANDOM_STATE),
    }

    models = {}
    for name, classifier in classifiers.items():
        # clone() gives every pipeline an independent preprocessor; sharing one
        # instance would let a refit of any pipeline silently change the others.
        pipeline = Pipeline([
            ('preprocessor', clone(preprocessor)),
            ('classifier', classifier)
        ])
        pipeline.fit(X_train, y_train_int)
        models[name] = pipeline

    # Save the models (only after all of them trained successfully).
    os.makedirs(save_dir, exist_ok=True)
    for name, model in models.items():
        joblib.dump(model, os.path.join(save_dir, f"{name}_pipeline.pkl"))

    return models

In [None]:
# Train on the raw training split; each pipeline applies the preprocessing itself.
models = train_models(X_train, y_train, preprocessor, save_dir=BASE_MODEL_DIR)
print("✅ Trained models:", list(models.keys()))

# Evaluation


In [None]:
# NOTE(review): this rebinds RESULTS_DIR and PLOTS_DIR, which the configuration
# cell set to '../results' and '../static/plots/Kepler'. From here on, results
# and plots are written under the current working directory instead — confirm
# which location the consumers of these files expect.
BASE_DIR = os.getcwd()
RESULTS_DIR = os.path.join(BASE_DIR, "results")
PLOTS_DIR = os.path.join(BASE_DIR, "plots")
os.makedirs(RESULTS_DIR, exist_ok=True)
os.makedirs(PLOTS_DIR, exist_ok=True)

# === Evaluation Function ===
def evaluate_and_save(models, X_test, y_test, feature_names, dataset_name='Kepler',
                      plots_dir=PLOTS_DIR, results_dir=RESULTS_DIR):
    """Score every model on the test split, save plots, and write a metrics JSON.

    Parameters
    ----------
    models : dict
        Maps a model name to a fitted estimator exposing ``predict`` (and
        optionally ``predict_proba``).
    X_test, y_test
        Held-out features and binary labels.
    feature_names
        Currently unused; kept so existing callers keep working.
    dataset_name : str
        Used in plot titles and in the output file name.
    plots_dir, results_dir : str
        Output directories; created if missing.

    Returns
    -------
    dict
        ``{"meta": {...}, "results": {model_name: metrics}}`` — the same
        structure that is written to ``<results_dir>/<dataset_name>_metrics.json``.
        ``auc`` is ``None`` when probabilities are unavailable or the AUC is
        undefined for ``y_test``.
    """
    os.makedirs(plots_dir, exist_ok=True)
    os.makedirs(results_dir, exist_ok=True)

    results = {}
    for name, model in models.items():
        safe_name = name.replace(" ", "_")

        y_pred = model.predict(X_test)

        # Positive-class probabilities, when the model can provide them.
        y_proba = None
        if hasattr(model, "predict_proba"):
            try:
                y_proba = model.predict_proba(X_test)[:, 1]
            except Exception as exc:
                # Best effort: keep evaluating, but say why AUC/ROC are missing.
                print(f"[{name}] predict_proba failed ({exc}); skipping AUC and ROC curve.")

        # roc_auc_score raises ValueError when y_test contains a single class.
        auc = None
        if y_proba is not None:
            try:
                auc = float(roc_auc_score(y_test, y_proba))
            except ValueError as exc:
                print(f"[{name}] AUC is undefined ({exc}); skipping AUC and ROC curve.")

        metrics = {
            "accuracy": float(accuracy_score(y_test, y_pred)),
            "precision": float(precision_score(y_test, y_pred, zero_division=0)),
            "recall": float(recall_score(y_test, y_pred, zero_division=0)),
            "f1": float(f1_score(y_test, y_pred, zero_division=0)),
            "auc": auc
        }
        results[name] = metrics

        # Confusion Matrix
        cm = confusion_matrix(y_test, y_pred)
        plt.figure(figsize=(5,4))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False,
                    xticklabels=['Pred 0','Pred 1'], yticklabels=['True 0','True 1'])
        plt.title(f"{dataset_name} - {name} Confusion Matrix")
        plt.savefig(os.path.join(plots_dir, f"{safe_name}_confusion_matrix.png"), bbox_inches='tight')
        plt.close()

        # ROC Curve (only when an AUC could be computed)
        if auc is not None:
            fpr, tpr, _ = roc_curve(y_test, y_proba)
            plt.figure(figsize=(6,4))
            plt.plot(fpr, tpr, lw=2, label=f"AUC = {auc:.3f}")
            plt.plot([0,1],[0,1],'--')
            plt.xlabel('FPR'); plt.ylabel('TPR'); plt.title(f"{dataset_name} - {name} ROC")
            plt.legend(loc='lower right')
            plt.savefig(os.path.join(plots_dir, f"{safe_name}_roc.png"), bbox_inches='tight')
            plt.close()

    # datetime.utcnow() is deprecated; take an aware UTC time and drop the
    # tzinfo so the serialized form stays "<naive ISO timestamp>Z".
    utc_now = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)

    # Save results
    out = {
        "meta": {
            "run_id": str(uuid.uuid4()),
            "timestamp": utc_now.isoformat() + "Z",
            "dataset": dataset_name
        },
        "results": results
    }

    with open(os.path.join(results_dir, f"{dataset_name}_metrics.json"), 'w') as f:
        json.dump(out, f, indent=4)

    return out

# === Get feature names ===
try:
    # Names of the columns produced by the fitted preprocessor, i.e. the
    # columns the classifier inside each pipeline actually sees.
    processed_feature_names = preprocessor.get_feature_names_out()
except Exception as exc:
    # Fallback if get_feature_names_out is unavailable or the preprocessor is
    # not fitted. Catching Exception (not a bare except) keeps
    # KeyboardInterrupt / SystemExit working.
    print("Warning: Could not get feature names from preprocessor. Using X_test columns as a fallback.")
    print(f"  Reason: {exc}")
    processed_feature_names = X_test.columns.tolist()


# === Run Evaluation ===
# Check if models are trained before evaluating
if 'models' in globals():
    # The pipelines preprocess internally, so pass the raw X_test.
    eval_output = evaluate_and_save(models, X_test, y_test, processed_feature_names, dataset_name='Kepler')
    print("✅ Evaluation saved to:", RESULTS_DIR)
    print(json.dumps(eval_output, indent=2))
else:
    print("Error: Models are not trained yet. Please run the model training cell first.")

# Comparison



In [None]:
def save_comparison_csv(results_json_path=os.path.join(RESULTS_DIR,'Kepler_metrics.json'),
                        out_csv=os.path.join(RESULTS_DIR,'Kepler_models_comparison.csv')):
    """Flatten the metrics JSON into a one-row-per-model table and save it as CSV.

    Parameters
    ----------
    results_json_path : str
        JSON file whose ``"results"`` entry maps model names to metric dicts.
    out_csv : str
        Destination CSV path (written without the index column).

    Returns
    -------
    pandas.DataFrame
        One row per model: a ``model`` column followed by its metrics.
    """
    with open(results_json_path, 'r') as handle:
        payload = json.load(handle)

    records = [
        {"model": model_name, **model_metrics}
        for model_name, model_metrics in payload['results'].items()
    ]

    df_comp = pd.DataFrame(records)
    df_comp.to_csv(out_csv, index=False)
    return df_comp

# Build the comparison table from the metrics JSON using the default paths,
# then display it as the cell output.
df_comp = save_comparison_csv()
print("Saved comparison CSV:", os.path.join(RESULTS_DIR,'Kepler_models_comparison.csv'))
df_comp


## 📊 Feature Importance
Extract, rank, and plot the most important features of each trained model.

In [None]:
def _unwrap_final_estimator(model):
    """Return the last step of a Pipeline-like object, or ``model`` itself."""
    steps = getattr(model, "steps", None)
    if steps:
        return steps[-1][1]
    return model


def _collect_importances(estimator):
    """Return ``feature_importances_`` of an estimator, averaging over the members of an ensemble wrapper; ``None`` if unavailable."""
    if hasattr(estimator, "feature_importances_"):
        return np.asarray(estimator.feature_importances_)

    # Voting-style wrappers: average whatever their members expose. An explicit
    # None check is used because `or` on an array-valued estimators_ is ambiguous.
    members = getattr(estimator, "estimators_", None)
    if members is None:
        members = list(getattr(estimator, "named_estimators_", {}).values())

    member_importances = []
    for member in members:
        member = _unwrap_final_estimator(member)
        if hasattr(member, "feature_importances_"):
            member_importances.append(np.asarray(member.feature_importances_))

    if member_importances:
        return np.mean(member_importances, axis=0)
    return None


def extract_and_save_feature_importances(models: dict,
                                         feature_names: list, # processed feature names
                                         top_k=5,
                                         plots_dir=PLOTS_DIR,
                                         out_json=os.path.join(RESULTS_DIR, 'model_feature_importances.json'),
                                         out_csv=os.path.join(RESULTS_DIR, 'model_feature_importances.csv')):
    """Extract, plot, and save the feature importances of every model.

    Parameters
    ----------
    models : dict
        Maps a model name to a fitted estimator. Pipelines are supported: the
        importances are read from their final step.
    feature_names : sequence of str
        Names of the features as seen by the classifier (after preprocessing);
        must have the same length as the importance array.
    top_k : int
        Number of top features kept in the JSON and shown in the plot.
    plots_dir : str
        Directory for ``<model>_feature_importances.png``.
    out_json, out_csv : str
        Output paths: the top-k importances per model (JSON) and the full
        importance table (CSV).

    Returns
    -------
    (dict, pandas.DataFrame)
        ``{model_name: {feature: importance} | None}`` and the long-format
        table with columns ``model``, ``feature``, ``importance``.
    """
    # Make sure every output location exists before writing.
    os.makedirs(plots_dir, exist_ok=True)
    for out_path in (out_json, out_csv):
        os.makedirs(os.path.dirname(out_path) or '.', exist_ok=True)

    all_results = {}
    rows = []

    for name, model in models.items():
        # A Pipeline does not expose feature_importances_ itself; look at the
        # estimator in its final step.
        imp_arr = _collect_importances(_unwrap_final_estimator(model))

        # no importances found
        if imp_arr is None:
            all_results[name] = None
            rows.append({"model": name, "feature": None, "importance": None})
            print(f"[{name}] No feature_importances_ available.")
            continue

        # check alignment
        if len(imp_arr) != len(feature_names):
            print(f"⚠️ Warning: Feature importance array length ({len(imp_arr)}) ≠ feature names length ({len(feature_names)}) for model {name}. Skipping.")
            all_results[name] = None
            continue

        # Series of importances, most important first
        s = pd.Series(imp_arr, index=feature_names).sort_values(ascending=False)
        top_series = s.head(top_k)
        # Plain str/float so json.dump also accepts float32 importances.
        all_results[name] = {str(feat): float(val) for feat, val in top_series.items()}

        # rows for csv
        for feat, val in s.items():
            rows.append({"model": name, "feature": feat, "importance": float(val)})

        # plot
        safe_name = name.replace(" ", "_")
        plt.figure(figsize=(6, max(3, len(top_series)*0.6)))
        sns.barplot(x=top_series.values, y=top_series.index)
        plt.title(f"{name} - Top {top_k} Feature Importances")
        plt.xlabel("Importance"); plt.ylabel("Features")
        plt.tight_layout()
        plt.savefig(os.path.join(plots_dir, f"{safe_name}_feature_importances.png"), bbox_inches='tight')
        plt.close()

        # print summary
        print(f"\nTop {top_k} features for {name}:")
        print(top_series.to_string())

    # save JSON + CSV
    with open(out_json, 'w') as f:
        json.dump(all_results, f, indent=2)

    df_rows = pd.DataFrame(rows)
    df_rows.to_csv(out_csv, index=False)

    return all_results, df_rows


# === Run Feature Importance Extraction ===
# Number of top-ranked features to keep per model.
top_k = 5

try:
    model_feature_importances, df_model_imps = extract_and_save_feature_importances(models, processed_feature_names, top_k=top_k)
    print("\n✅ Saved feature importance results in:", RESULTS_DIR)
except NameError:
    # NOTE(review): a NameError is also raised when `models` is undefined, yet
    # the message below only mentions processed_feature_names.
    print("Error: 'processed_feature_names' is not defined. Run the previous evaluation cell first.")
except Exception as e:
    # Any other failure is reported instead of stopping the notebook.
    print(f"An error occurred during feature importance extraction: {e}")

In [None]:
def _top_importance_frame(names, importances):
    """Pair feature names with importances, sorted from most to least important."""
    return pd.DataFrame({
        'Feature': names,
        'Importance': importances
    }).sort_values(by='Importance', ascending=False)


def _show_top_importances(importance_df, label, top_n=10):
    """Print, display, and bar-plot the ``top_n`` rows of an importance table."""
    print(f"Top {top_n} Feature Importances ({label}):")
    display(importance_df.head(top_n))

    plt.figure(figsize=(12, 8))
    sns.barplot(x='Importance', y='Feature', data=importance_df.head(top_n))
    plt.title(f'Top {top_n} Feature Importances ({label})')
    plt.xlabel('Importance')
    plt.ylabel('Feature')
    plt.show()


# Load the trained models.
# joblib.load unpickles arbitrary objects: only load files written by this notebook.
loaded_models = {
    'RandomForest': joblib.load(os.path.join(BASE_MODEL_DIR, 'RandomForest_pipeline.pkl')),
    'XGBoost': joblib.load(os.path.join(BASE_MODEL_DIR, 'XGBoost_pipeline.pkl'))
}

# Feature names as produced by the preprocessor stored inside the pipeline.
try:
    feature_names = loaded_models['RandomForest'].named_steps['preprocessor'].get_feature_names_out()
except Exception as exc:
    # Fallback if get_feature_names_out is not available or the pipeline is
    # structured differently. Catching Exception (not a bare except) keeps
    # KeyboardInterrupt / SystemExit working.
    print("Warning: Could not get feature names from preprocessor. Using X_train columns as a fallback.")
    print(f"  Reason: {exc}")
    feature_names = X_train.columns.tolist()

# Importances of each classifier, or None when the classifier has none.
rf_feature_importances = getattr(
    loaded_models['RandomForest'].named_steps['classifier'], 'feature_importances_', None)
xgb_feature_importances = getattr(
    loaded_models['XGBoost'].named_steps['classifier'], 'feature_importances_', None)

# --- Random Forest Feature Importance ---
if rf_feature_importances is not None:
    rf_feature_importance_df = _top_importance_frame(feature_names, rf_feature_importances)
    _show_top_importances(rf_feature_importance_df, 'Random Forest')
else:
    print("RandomForest model not available or does not have feature_importances_.")

# --- XGBoost Feature Importance ---
if xgb_feature_importances is not None:
    xgb_feature_importance_df = _top_importance_frame(feature_names, xgb_feature_importances)
    print()  # blank line separating this report from the previous one
    _show_top_importances(xgb_feature_importance_df, 'XGBoost')
else:
    print("XGBoost model not available or does not have feature_importances_.")

# --- Ensemble Feature Importance (Averaged) ---
if rf_feature_importances is not None and xgb_feature_importances is not None:
    # Simple mean of the two importance vectors.
    ensemble_feature_importances = (rf_feature_importances + xgb_feature_importances) / 2
    ensemble_feature_importance_df = _top_importance_frame(feature_names, ensemble_feature_importances)
    print()
    _show_top_importances(ensemble_feature_importance_df, 'Ensemble - Averaged')
else:
    print("Could not calculate Ensemble Feature Importance. Both RandomForest and XGBoost models with feature_importances_ are required.")

In [None]:
import shap
import matplotlib.pyplot as plt

# SHAP summary plot for the Random Forest classifier.
# Use one of the loaded models instead of 'best_model'
# For example, using the RandomForest model:
# The guard only checks that the names exist; it does not verify their contents.
if 'loaded_models' in globals() and 'RandomForest' in loaded_models and 'X_test_processed' in globals() and 'feature_names' in globals():
    try:
        # The explainer is built on the bare classifier (not the pipeline), so it
        # is given the already-preprocessed test matrix as data.
        explainer = shap.Explainer(loaded_models['RandomForest'].named_steps['classifier'], X_test_processed)
        shap_values = explainer(X_test_processed)
        # NOTE(review): for a binary classifier the explanation may carry one
        # slice per class; confirm that summary_plot receives the intended one.

        # Determine a relevant figure size based on the number of features
        num_features = len(feature_names)
        fig_height = max(90, int(num_features * 1.5)) # Adjust height based on number of features
        # NOTE(review): this requests a figure of at least 50 x 90 inches — confirm that is intended.
        plt.figure(figsize=(50, fig_height)) # Set figure size

        shap.summary_plot(shap_values, feature_names=feature_names, show=False) # show=False to prevent immediate display
        plt.title("SHAP Summary Plot (Random Forest)")
        plt.tight_layout() # Adjust layout to prevent labels overlapping
        plt.show() # Display the plot

    except Exception as e:
        # Report the failure instead of stopping the notebook.
        print(f"An error occurred during SHAP plot generation: {e}")
        print("Please ensure loaded_models, X_test_processed, and feature_names are defined and contain valid data.")

else:
    print("Error: Required variables (loaded_models, X_test_processed, or feature_names) are not defined. Please run previous cells.")

# Prediction


In [None]:
def predict_from_input(model_name, input_features, model_dir=BASE_MODEL_DIR, feature_file=os.path.join(BASE_MODEL_DIR,'features.json')):
    """Predict the class of a single observation with a saved model.

    Parameters
    ----------
    model_name : str
        Base name of the saved model, e.g. 'RandomForest' or 'XGBoost'.
    input_features : list or array
        Feature values in the same order as the names stored in ``feature_file``.
    model_dir : str
        Directory holding the saved artifacts.
    feature_file : str
        JSON file with the ordered feature names.
        (Assumed to be a flat list of names — TODO confirm against the writer.)

    Artifacts are resolved in this order:

    1. ``scaler.pkl`` + ``<model_name>.pkl`` — a standalone scaler and model.
    2. ``<model_name>_pipeline.pkl`` — a pipeline with preprocessing included,
       which is the file name ``train_models`` writes. It is used when the
       files from (1) are not both present.

    Returns
    -------
    dict
        ``{"prediction": int, "probability": float | None, "used_features": list}``.

    Raises
    ------
    FileNotFoundError
        If ``feature_file`` or the model artifacts are missing.
    ValueError
        If ``input_features`` does not have one value per stored feature name.
    """
    # load features list & scaler & model
    if not os.path.exists(feature_file):
        raise FileNotFoundError(f"{feature_file} not found. Make sure you saved features.json earlier.")

    with open(feature_file, 'r') as f:
        features = json.load(f)

    if len(input_features) != len(features):
        raise ValueError(f"Expected {len(features)} features in the input, got {len(input_features)}")

    # joblib.load unpickles arbitrary objects: only point model_dir at trusted files.
    scaler_path = os.path.join(model_dir,'scaler.pkl')
    legacy_model_path = os.path.join(model_dir,f"{model_name}.pkl")

    if os.path.exists(scaler_path) and os.path.exists(legacy_model_path):
        scaler = joblib.load(scaler_path)
        model = joblib.load(legacy_model_path)
        X = np.array(input_features).reshape(1, -1)
        X_ready = scaler.transform(X)
    else:
        # The pipeline preprocesses internally and selects columns by name, so
        # it needs a one-row DataFrame labelled with the stored feature names.
        model = joblib.load(os.path.join(model_dir, f"{model_name}_pipeline.pkl"))
        X_ready = pd.DataFrame([list(input_features)], columns=features)

    pred = int(model.predict(X_ready)[0])
    prob = float(model.predict_proba(X_ready)[0][1]) if hasattr(model, "predict_proba") else None

    return {"prediction": pred, "probability": prob, "used_features": features}


## ✅ Summary & Next Steps
Key findings, insights, and recommendations.

- Main insights:
- Model performance:
- Next steps: