<a href="https://colab.research.google.com/github/abhi6174/EXO-E/blob/main/Exoplanet_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
# ==========================================================
# AUTOMATIC EXOPLANET DETECTION PIPELINE
# - Automatic Data Cleaning
# - Automatic Feature Selection
# - Exoplanet Prediction
# ==========================================================

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import warnings
warnings.filterwarnings('ignore')

class AutoExoplanetDetector:
    def __init__(self):
        self.df = None
        self.df_cleaned = None
        self.important_features = []
        self.model = None
        self.imputer = None
        self.scaler = None
        self.label_encoder = None

    def auto_clean_data(self, df):
        """Automatically clean the dataset"""
        print("🧹 AUTOMATIC DATA CLEANING...")
        df_cleaned = df.copy()

        # Step 1: Check target column
        if 'koi_disposition' not in df_cleaned.columns:
            raise ValueError("❌ Target column 'koi_disposition' not found!")

        # Step 2: Remove CANDIDATE rows
        initial_rows = len(df_cleaned)
        df_cleaned = df_cleaned[df_cleaned['koi_disposition'] != 'CANDIDATE']
        removed_candidates = initial_rows - len(df_cleaned)
        print(f"✅ Removed {removed_candidates} 'CANDIDATE' rows")

        # Step 3: Select numeric features
        numeric_cols = df_cleaned.select_dtypes(include=['number']).columns.tolist()
        if 'koi_disposition' in numeric_cols:
            numeric_cols.remove('koi_disposition')
        print(f"✅ Found {len(numeric_cols)} numeric features")

        # Step 4: Remove high missing value columns (>50%)
        missing_percent = (df_cleaned[numeric_cols].isnull().sum() / len(df_cleaned)) * 100
        cols_to_keep = missing_percent[missing_percent <= 50].index.tolist()
        removed_cols = len(numeric_cols) - len(cols_to_keep)
        print(f"✅ Removed {removed_cols} columns with >50% missing values")

        # Step 5: Remove rows with all features missing
        initial_rows = len(df_cleaned)
        df_cleaned = df_cleaned.dropna(how='all', subset=cols_to_keep)
        removed_rows = initial_rows - len(df_cleaned)
        print(f"✅ Removed {removed_rows} rows with all features missing")

        # Step 6: Impute missing values
        self.imputer = SimpleImputer(strategy='median')
        df_cleaned[cols_to_keep] = self.imputer.fit_transform(df_cleaned[cols_to_keep])
        print("✅ Imputed missing values with median")

        # Step 7: Remove constant columns
        constant_cols = [col for col in cols_to_keep if df_cleaned[col].nunique() <= 1]
        cols_to_keep = [col for col in cols_to_keep if col not in constant_cols]
        print(f"✅ Removed {len(constant_cols)} constant columns")

        # Keep only cleaned features + target
        final_cols = cols_to_keep + ['koi_disposition']
        self.df_cleaned = df_cleaned[final_cols]

        print(f"🎯 Final cleaned dataset: {self.df_cleaned.shape}")
        return cols_to_keep

    def auto_select_features(self, numeric_cols):
        """Automatically select important features using ML"""
        print("\n🤖 AUTOMATIC FEATURE SELECTION...")

        X = self.df_cleaned[numeric_cols]
        y = self.df_cleaned['koi_disposition']

        # Encode target
        self.label_encoder = LabelEncoder()
        y_encoded = self.label_encoder.fit_transform(y)

        # Use Random Forest for feature selection
        rf_selector = RandomForestClassifier(n_estimators=100, random_state=42)
        rf_selector.fit(X, y_encoded)

        # Get feature importance
        importances = rf_selector.feature_importances_
        feature_importance_df = pd.DataFrame({
            'Feature': numeric_cols,
            'Importance': importances
        }).sort_values('Importance', ascending=False)

        # Automatically select top features (importance > 0.01)
        self.important_features = feature_importance_df[feature_importance_df['Importance'] > 0.01]['Feature'].tolist()

        # Ensure minimum 8 features
        if len(self.important_features) < 8:
            self.important_features = feature_importance_df.head(10)['Feature'].tolist()

        print(f"✅ Automatically selected {len(self.important_features)} important features:")
        for i, feature in enumerate(self.important_features, 1):
            importance = feature_importance_df[feature_importance_df['Feature'] == feature]['Importance'].values[0]
            print(f"   {i:2d}. {feature} (importance: {importance:.4f})")

        return self.important_features

    def train_model(self):
        """Train the prediction model"""
        print("\n🎯 TRAINING PREDICTION MODEL...")

        X = self.df_cleaned[self.important_features]
        y = self.df_cleaned['koi_disposition']
        y_encoded = self.label_encoder.transform(y)

        # Scale features
        self.scaler = StandardScaler()
        X_scaled = self.scaler.fit_transform(X)

        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            X_scaled, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
        )

        # Train model
        self.model = RandomForestClassifier(
            n_estimators=150,
            max_depth=10,
            min_samples_split=8,
            min_samples_leaf=4,
            random_state=42
        )

        self.model.fit(X_train, y_train)

        # Evaluate model
        y_pred = self.model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)

        print(f"📊 MODEL PERFORMANCE:")
        print(f"   Accuracy: {accuracy:.4f}")
        print(f"   Features used: {len(self.important_features)}")

        print("\n📈 Classification Report:")
        print(classification_report(y_test, y_pred, target_names=self.label_encoder.classes_))

        return accuracy

    def predict_new_data(self, new_data):
        """Predict exoplanet status for new data"""
        print("\n🔮 MAKING PREDICTIONS...")

        # Clean new data
        new_df = new_data.copy()

        # Check available features
        available_features = [f for f in self.important_features if f in new_df.columns]
        missing_features = [f for f in self.important_features if f not in new_df.columns]

        if missing_features:
            print(f"⚠️  Missing {len(missing_features)} features, using {len(available_features)} available features")

        # Handle missing features by creating NaN columns
        for feature in self.important_features:
            if feature not in new_df.columns:
                new_df[feature] = np.nan

        # Impute missing values
        new_df[self.important_features] = self.imputer.transform(new_df[self.important_features])

        # Scale features
        X_new = self.scaler.transform(new_df[self.important_features])

        # Make predictions
        predictions = self.model.predict(X_new)
        probabilities = self.model.predict_proba(X_new)

        # Decode predictions
        predicted_labels = self.label_encoder.inverse_transform(predictions)

        # Create results
        results_df = new_df[self.important_features].copy()
        results_df['PREDICTED_EXOPLANET_STATUS'] = predicted_labels
        results_df['CONFIDENCE'] = np.max(probabilities, axis=1)

        # Add probability for each class
        for i, class_name in enumerate(self.label_encoder.classes_):
            results_df[f'PROB_{class_name}'] = probabilities[:, i]

        print(f"🎯 PREDICTION RESULTS:")
        print(f"Total predictions: {len(results_df)}")

        # Count predictions
        prediction_counts = results_df['PREDICTED_EXOPLANET_STATUS'].value_counts()
        for status, count in prediction_counts.items():
            percentage = (count / len(results_df)) * 100
            print(f"   {status}: {count} ({percentage:.1f}%)")

        print("\n📋 Sample predictions:")
        display_cols = ['PREDICTED_EXOPLANET_STATUS', 'CONFIDENCE'] + [f'PROB_{cls}' for cls in self.label_encoder.classes_]
        print(results_df[display_cols].head())

        return results_df

# ==========================================================
# MAIN EXECUTION
# ==========================================================

def main():
    detector = AutoExoplanetDetector()

    print("🚀 AUTOMATIC EXOPLANET DETECTION PIPELINE")
    print("="*50)

    # Upload training data
    from google.colab import files
    import io

    print("📤 Upload TRAINING dataset (CSV file):")
    uploaded_train = files.upload()
    train_filename = list(uploaded_train.keys())[0]
    df_train = pd.read_csv(io.BytesIO(uploaded_train[train_filename]))
    print(f"✅ Training data loaded: {df_train.shape}")

    # Step 1: Automatic cleaning
    numeric_cols = detector.auto_clean_data(df_train)

    # Step 2: Automatic feature selection
    important_features = detector.auto_select_features(numeric_cols)

    # Step 3: Train model
    accuracy = detector.train_model()

    # Step 4: Predict on new data
    print("\n📤 Upload NEW dataset for prediction (CSV file):")
    uploaded_new = files.upload()
    new_filename = list(uploaded_new.keys())[0]
    df_new = pd.read_csv(io.BytesIO(uploaded_new[new_filename]))
    print(f"✅ New data loaded: {df_new.shape}")

    # Make predictions
    predictions = detector.predict_new_data(df_new)

    # Save results
    predictions.to_csv('exoplanet_predictions.csv', index=False)
    print("\n💾 Predictions saved to 'exoplanet_predictions.csv'")

    print("\n✅ PIPELINE COMPLETED!")
    print(f"🎯 Model trained with {accuracy:.4f} accuracy")
    print(f"🔧 Using {len(important_features)} automatically selected features")

# Run the pipeline
if __name__ == "__main__":
    main()

🚀 AUTOMATIC EXOPLANET DETECTION PIPELINE
📤 Upload TRAINING dataset (CSV file):


Saving cumulative.csv to cumulative (1).csv
✅ Training data loaded: (9564, 50)
🧹 AUTOMATIC DATA CLEANING...
✅ Removed 2248 'CANDIDATE' rows
✅ Found 45 numeric features
✅ Removed 2 columns with >50% missing values
✅ Removed 0 rows with all features missing
✅ Imputed missing values with median
✅ Removed 0 constant columns
🎯 Final cleaned dataset: (7316, 44)

🤖 AUTOMATIC FEATURE SELECTION...
✅ Automatically selected 16 important features:
    1. koi_score (importance: 0.3779)
    2. rowid (importance: 0.1070)
    3. koi_steff_err1 (importance: 0.0693)
    4. koi_fpflag_co (importance: 0.0577)
    5. koi_prad (importance: 0.0519)
    6. koi_steff_err2 (importance: 0.0394)
    7. koi_fpflag_ss (importance: 0.0373)
    8. koi_fpflag_nt (importance: 0.0305)
    9. koi_prad_err2 (importance: 0.0210)
   10. koi_prad_err1 (importance: 0.0207)
   11. koi_fpflag_ec (importance: 0.0173)
   12. koi_slogg_err2 (importance: 0.0172)
   13. koi_srad_err1 (importance: 0.0160)
   14. koi_duration_err2 (im

KeyboardInterrupt: 