In [1]:
# Required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time
import os # For saving plots

# Core ML imports
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier # Added Random Forest
from imblearn.over_sampling import SMOTE # Added SMOTE
# Metrics import
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, balanced_accuracy_score

# Plotting library
import plotly.express as px
import plotly.graph_objects as go

# --- Configuration ---
# Locations of the two dataset CSV files; the main block passes both to
# load_and_merge_data(), which reads and concatenates them.
TRAIN_CSV_PATH = 'exoTrain.csv'
TEST_CSV_PATH = 'exoTest.csv'
# Seed shared by the train/test split, PCA, SMOTE and the classifiers so that
# repeated runs use the same random state everywhere.
RANDOM_STATE = 42
# Fraction of the merged dataset that get_splits() holds out as the test set.
TEST_SET_SIZE = 0.25
# Number of principal components kept by the "Col Scale + PCA" strategy.
PCA_COMPONENTS = 64 # Adjusted number of components
# Output directory for the results CSV and the HTML comparison plots.
PLOT_DIR = 'plots'
os.makedirs(PLOT_DIR, exist_ok=True) # No error if the directory already exists

# --- Data Loading and Merging ---
def load_and_merge_data(train_path, test_path):
    """Load the training and testing CSV files and stack them into one frame.

    Args:
        train_path: Path to the training CSV file.
        test_path: Path to the testing CSV file.

    Returns:
        A DataFrame holding the rows of the training file followed by the
        rows of the testing file, re-indexed from 0. ``None`` is returned
        (after printing a message) when a file is missing or any other error
        occurs while reading or merging, so callers must check for ``None``.
    """
    try:
        print(f"Loading training data from: {train_path}")
        df_train = pd.read_csv(train_path)
        print(f"Loading testing data from: {test_path}")
        df_test = pd.read_csv(test_path)
        print("Data loaded successfully.")
        print("Merging training and testing data...")
        df_combined = pd.concat([df_train, df_test], ignore_index=True)
        print(f"Combined dataset shape: {df_combined.shape}")
        return df_combined
    except FileNotFoundError as e:
        # Report which path is missing; fall back to the exception text when
        # the exception does not carry a filename.
        missing = e.filename if e.filename is not None else e
        print(f"Error: File not found: {missing}")
        return None
    except Exception as e:
        # Deliberate best-effort boundary: the caller aborts on None.
        print(f"An error occurred during data loading or merging: {e}")
        return None

# --- Base Splitting Function ---
def get_splits(df_combined, test_size=0.25, random_state=42):
    """Split the merged frame into stratified train/test feature and label arrays.

    Args:
        df_combined: DataFrame with a 'LABEL' column; every other column is
            treated as a feature.
        test_size: Value forwarded to ``train_test_split`` as ``test_size``.
        random_state: Seed forwarded to ``train_test_split``.

    Returns:
        Tuple ``(X_train_raw, X_test_raw, y_train, y_test)`` where the feature
        entries are the raw (unscaled) values and the labels are integers
        obtained by subtracting 1 from the 'LABEL' column.
    """
    print("\nSeparating features/labels and splitting data...")
    features = df_combined.drop('LABEL', axis=1)
    # Shift the labels down by one (1,2 -> 0,1) and work with a plain int array.
    labels = (df_combined['LABEL'] - 1).astype(int).values

    print(f"Combined dataset feature shape: {features.shape}")
    overall_share = pd.Series(labels).value_counts(normalize=True)
    print(f"Combined labels distribution:\n{overall_share}")

    print(f"Performing stratified train-test split (test_size={test_size})...")
    # Stratify on the labels so both partitions keep the same class ratio.
    partitions = train_test_split(
        features.values,
        labels,
        test_size=test_size,
        random_state=random_state,
        stratify=labels,
    )
    X_train_raw, X_test_raw, y_train, y_test = partitions

    print(f"Shape of new training features: {X_train_raw.shape}")
    print(f"Shape of new testing features: {X_test_raw.shape}")
    train_share = pd.Series(y_train).value_counts(normalize=True)
    print(f"Class distribution in new training set:\n{train_share}")
    test_share = pd.Series(y_test).value_counts(normalize=True)
    print(f"Class distribution in new test set:\n{test_share}")
    return X_train_raw, X_test_raw, y_train, y_test

# --- Preprocessing Strategy Functions ---

def preprocess_row_scale(X_train_raw, X_test_raw):
    """Standardize every row of the train and test matrices on its own.

    Each row is shifted by its own mean and divided by its own standard
    deviation, so no statistics are shared between rows or between the two
    sets. Rows with zero standard deviation are divided by 1 instead.

    Args:
        X_train_raw: 2-D array of training features.
        X_test_raw: 2-D array of test features.

    Returns:
        Tuple ``(X_train_scaled, X_test_scaled)``; the inputs are not modified.
    """
    def _standardize_rows(matrix):
        # keepdims=True keeps a trailing axis so the stats broadcast per row.
        row_mean = np.mean(matrix, axis=1, keepdims=True)
        row_std = np.std(matrix, axis=1, keepdims=True)
        # Constant rows would divide by zero; use 1 so they come out as zeros.
        row_std[row_std == 0] = 1.0
        return (matrix - row_mean) / row_std

    print("Applying row-wise standard scaling...")
    return _standardize_rows(X_train_raw), _standardize_rows(X_test_raw)

def preprocess_col_scale_pca(X_train_raw, X_test_raw, n_components=64, random_state=42):
    """Column-standardize the features, then project them with PCA.

    Both the scaler and the PCA are fitted on the training data only and then
    applied to the test data.

    Args:
        X_train_raw: Training feature matrix.
        X_test_raw: Test feature matrix.
        n_components: Number of principal components passed to ``PCA``.
        random_state: Seed passed to ``PCA``.

    Returns:
        Tuple ``(X_train_processed, X_test_processed)`` of PCA-projected data.
    """
    print("Applying column-wise standard scaling and PCA...")

    # Step 1: per-column standardization, statistics learned from train only.
    column_scaler = StandardScaler()
    train_std = column_scaler.fit_transform(X_train_raw)
    test_std = column_scaler.transform(X_test_raw)
    print(f"Shape after scaling: Train={train_std.shape}, Test={test_std.shape}")

    # Step 2: dimensionality reduction, again fitted on train only.
    reducer = PCA(n_components=n_components, random_state=random_state)
    X_train_processed = reducer.fit_transform(train_std)
    X_test_processed = reducer.transform(test_std)
    print(f"Shape after PCA: Train={X_train_processed.shape}, Test={X_test_processed.shape}")

    retained = np.sum(reducer.explained_variance_ratio_)
    print(f"PCA Explained Variance Ratio ({n_components} components): {retained:.4f}")
    return X_train_processed, X_test_processed

def preprocess_col_scale_smote(X_train_raw, X_test_raw, y_train, random_state=42):
    """Column-standardize the features and oversample the training set with SMOTE.

    The scaler is fitted on the training data and applied to both sets. SMOTE
    is applied to the training data only; the test set is scaled but never
    resampled.

    Args:
        X_train_raw: Training feature matrix.
        X_test_raw: Test feature matrix.
        y_train: Training labels matching ``X_train_raw``.
        random_state: Seed passed to ``SMOTE``.

    Returns:
        Tuple ``(X_train_processed, X_test_scaled, y_train_resampled)``: the
        resampled training features, the scaled test features, and the
        resampled training labels that must be used to fit a model.
    """
    print("Applying column-wise standard scaling and SMOTE...")

    # Per-column standardization with statistics learned from train only.
    column_scaler = StandardScaler()
    train_std = column_scaler.fit_transform(X_train_raw)
    X_test_scaled = column_scaler.transform(X_test_raw)
    print(f"Shape after scaling: Train={train_std.shape}, Test={X_test_scaled.shape}")

    # Oversampling is restricted to the training partition.
    oversampler = SMOTE(sampling_strategy='auto', random_state=random_state, k_neighbors=5)
    print("Applying SMOTE to training data...")
    resample_started = time.time()
    X_train_processed, y_train_resampled = oversampler.fit_resample(train_std, y_train)
    elapsed = time.time() - resample_started
    print(f"SMOTE resampling completed in {elapsed:.2f} seconds.")
    print(f"Training data shape after SMOTE: {X_train_processed.shape}")

    resampled_share = pd.Series(y_train_resampled).value_counts(normalize=True)
    print(f"Training labels distribution after SMOTE:\n{resampled_share}")

    # The caller keeps using its original test labels; only y_train changes.
    return X_train_processed, X_test_scaled, y_train_resampled

# --- Model Training Functions ---
# Added use_class_weight parameter

def train_logistic_regression(X_train, y_train, use_class_weight=False):
    """Fit a saga-solver LogisticRegression and report how long fitting took.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.
        use_class_weight: When true, the model is built with
            ``class_weight='balanced'``.

    Returns:
        The fitted ``LogisticRegression`` instance.
    """
    banner = "\n--- Training Logistic Regression ---"
    weighting = {}
    if use_class_weight:
        banner = "\n--- Training Logistic Regression (Class Weights) ---"
        weighting = {'class_weight': 'balanced'}
    print(banner)

    classifier = LogisticRegression(
        solver='saga',
        max_iter=1000,
        tol=1e-3,
        random_state=RANDOM_STATE,
        n_jobs=-1,
        verbose=0,
        **weighting,
    )
    fit_started = time.time()
    classifier.fit(X_train, y_train)
    elapsed = time.time() - fit_started
    print(f"Training completed in {elapsed:.2f} seconds.")
    return classifier

def train_svc(X_train, y_train, use_class_weight=False):
    """Fit an SVC (with ``probability=True``) and report how long fitting took.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.
        use_class_weight: When true, the model is built with
            ``class_weight='balanced'``.

    Returns:
        The fitted ``SVC`` instance.
    """
    svc_kwargs = dict(probability=True, random_state=RANDOM_STATE, cache_size=500)
    banner = "\n--- Training SVC ---"
    if use_class_weight:
        svc_kwargs['class_weight'] = 'balanced'
        banner = "\n--- Training SVC (Class Weights) ---"
    print(banner)

    classifier = SVC(**svc_kwargs)
    fit_started = time.time()
    classifier.fit(X_train, y_train)
    elapsed = time.time() - fit_started
    print(f"Training completed in {elapsed:.2f} seconds.")
    return classifier

def train_decision_tree(X_train, y_train, use_class_weight=False):
    """Fit a DecisionTreeClassifier and report how long fitting took.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.
        use_class_weight: When true, the model is built with
            ``class_weight='balanced'``.

    Returns:
        The fitted ``DecisionTreeClassifier`` instance.
    """
    if use_class_weight:
        print("\n--- Training Decision Tree (Class Weights) ---")
        classifier = DecisionTreeClassifier(random_state=RANDOM_STATE, class_weight='balanced')
    else:
        print("\n--- Training Decision Tree ---")
        classifier = DecisionTreeClassifier(random_state=RANDOM_STATE)

    fit_started = time.time()
    classifier.fit(X_train, y_train)
    elapsed = time.time() - fit_started
    print(f"Training completed in {elapsed:.2f} seconds.")
    return classifier

def train_random_forest(X_train, y_train, use_class_weight=False):
    """Fit a 100-tree RandomForestClassifier and report how long fitting took.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.
        use_class_weight: When true, the model is built with
            ``class_weight='balanced_subsample'``.

    Returns:
        The fitted ``RandomForestClassifier`` instance.
    """
    forest_kwargs = dict(n_estimators=100, random_state=RANDOM_STATE, n_jobs=-1)
    banner = "\n--- Training Random Forest ---"
    if use_class_weight:
        # 'balanced_subsample' rather than 'balanced': often preferred for RF.
        forest_kwargs['class_weight'] = 'balanced_subsample'
        banner = "\n--- Training Random Forest (Class Weights) ---"
    print(banner)

    classifier = RandomForestClassifier(**forest_kwargs)
    fit_started = time.time()
    classifier.fit(X_train, y_train)
    elapsed = time.time() - fit_started
    print(f"Training completed in {elapsed:.2f} seconds.")
    return classifier

# --- Model Evaluation Function (Returns Metrics) ---
def evaluate_model(model, X_test, y_test, model_name="Model", strategy_name="Strategy"):
    """Score a fitted model on the test set and collect its headline metrics.

    Prints the balanced accuracy and a full classification report, and
    returns the balanced accuracy plus precision/recall/F1 of class 1.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Test feature matrix, preprocessed the same way as the
            training data the model was fitted on.
        y_test: True test labels (0/1).
        model_name: Label stored under 'Model' in the result.
        strategy_name: Label stored under 'Strategy' in the result.

    Returns:
        Dict with keys 'Model', 'Strategy', 'Balanced Accuracy',
        'Precision (Class 1)', 'Recall (Class 1)' and 'F1-Score (Class 1)'.
        Any metric not computed before an error occurred stays ``NaN``; the
        error itself is printed, not raised.
    """
    print(f"\n--- Evaluating {model_name} ({strategy_name}) ---")

    positive_name = "YES exoplanet (1)"
    # Keyword arguments shared by the dict-form and the text-form report.
    report_kwargs = dict(
        labels=[0, 1],
        target_names=["NO exoplanet (0)", positive_name],
        zero_division=0,
    )

    # Metrics default to NaN so a failed evaluation still yields a full row.
    results = {
        'Model': model_name,
        'Strategy': strategy_name,
        'Balanced Accuracy': np.nan,
        'Precision (Class 1)': np.nan,
        'Recall (Class 1)': np.nan,
        'F1-Score (Class 1)': np.nan,
    }

    try:
        print("Predicting on test data...")
        predict_started = time.time()
        predictions = model.predict(X_test)
        elapsed = time.time() - predict_started
        print(f"Prediction completed in {elapsed:.2f} seconds.")
        predictions = predictions.astype(int)

        balanced = balanced_accuracy_score(y_test, predictions)
        results['Balanced Accuracy'] = balanced
        print(f"Balanced Accuracy: {balanced:.4f}")

        # Dict form feeds the returned metrics; text form is for the log.
        report = classification_report(y_test, predictions, output_dict=True, **report_kwargs)
        print("\nClassification Report:")
        print(classification_report(y_test, predictions, **report_kwargs))

        positive_scores = report.get(positive_name)
        if positive_scores is None:
            print("Warning: Class 1 not found in classification report results.")
        else:
            results['Precision (Class 1)'] = positive_scores['precision']
            results['Recall (Class 1)'] = positive_scores['recall']
            results['F1-Score (Class 1)'] = positive_scores['f1-score']

        # Confusion-matrix plotting is intentionally left out here: showing a
        # figure may block execution depending on the environment.

    except Exception as e:
        # Best effort: report the problem and return whatever was computed.
        print(f"An error occurred during evaluation: {e}")

    return results


# --- Main Execution Block ---
if __name__ == "__main__":
    print("Starting Exoplanet Detection Strategy Comparison Script...")

    # Step 1: read both CSV files and stack them into one frame.
    df_combined = load_and_merge_data(TRAIN_CSV_PATH, TEST_CSV_PATH)

    if df_combined is None:
        print("\nScript aborted due to data loading or merging errors.")
    else:
        # Step 2: one stratified split of the raw features, shared by every strategy.
        X_train_raw, X_test_raw, y_train_base, y_test = get_splits(
            df_combined, test_size=TEST_SET_SIZE, random_state=RANDOM_STATE
        )

        # Each strategy names its preprocessing function, that function's extra
        # keyword arguments, whether the models get class weights, and the
        # training labels to fit on (None: the preprocessing step supplies them).
        strategies = {
            "Row Scale": {
                'preprocess': preprocess_row_scale,
                'args': {},
                'use_class_weight': False,
                'y_train_source': y_train_base,
            },
            "Row Scale + Class Weight": {
                'preprocess': preprocess_row_scale,
                'args': {},
                'use_class_weight': True,
                'y_train_source': y_train_base,
            },
            "Col Scale + PCA": {
                'preprocess': preprocess_col_scale_pca,
                'args': {'n_components': PCA_COMPONENTS, 'random_state': RANDOM_STATE},
                'use_class_weight': False,
                'y_train_source': y_train_base,
            },
            "Col Scale + SMOTE": {
                'preprocess': preprocess_col_scale_smote,
                'args': {'random_state': RANDOM_STATE},
                'use_class_weight': False,
                'y_train_source': None,
            },
        }

        # Display name -> training function; all share the same signature.
        models_to_train = {
            "Logistic Regression": train_logistic_regression,
            "SVC": train_svc,
            "Decision Tree": train_decision_tree,
            "Random Forest": train_random_forest,
        }

        # One metrics dict per (strategy, model) pair, in execution order.
        all_results = []

        # Step 3: preprocess once per strategy, then fit and score every model.
        for strategy_name, config in strategies.items():
            print(f"\n===== Processing Strategy: {strategy_name} =====")

            preprocess_func = config['preprocess']
            preprocess_args = config['args']
            y_train_current = config['y_train_source']

            if strategy_name != "Col Scale + SMOTE":
                # These preprocessors only transform the feature matrices.
                X_train_processed, X_test_processed = preprocess_func(
                    X_train_raw, X_test_raw, **preprocess_args
                )
            else:
                # SMOTE also needs the labels and returns resampled ones.
                X_train_processed, X_test_processed, y_train_current = preprocess_func(
                    X_train_raw, X_test_raw, y_train_base, **preprocess_args
                )

            for model_name, train_func in models_to_train.items():
                model = train_func(
                    X_train_processed, y_train_current,
                    use_class_weight=config['use_class_weight']
                )
                eval_results = evaluate_model(
                    model, X_test_processed, y_test, model_name, strategy_name
                )
                all_results.append(eval_results)

        # Step 4: tabulate, print and persist the collected metrics.
        results_df = pd.DataFrame(all_results)
        print("\n===== Overall Results Summary =====")
        print(results_df)

        results_csv_path = os.path.join(PLOT_DIR, 'model_comparison_results.csv')
        results_df.to_csv(results_csv_path, index=False)
        print(f"\nResults saved to {results_csv_path}")

        # Step 5: one grouped bar chart per metric, written as standalone HTML.
        print("\nGenerating interactive comparison plots...")

        metrics_to_plot = ['Balanced Accuracy', 'F1-Score (Class 1)', 'Recall (Class 1)', 'Precision (Class 1)']

        for metric in metrics_to_plot:
            fig = px.bar(
                results_df,
                x='Model',
                y=metric,
                color='Strategy',
                barmode='group',
                title=f'{metric} Comparison by Model and Strategy',
                labels={'Model': 'Classifier Model', metric: metric, 'Strategy': 'Preprocessing Strategy'},
                template='plotly_white',
            )
            fig.update_layout(title_x=0.5)  # centre the title
            plot_filename = os.path.join(PLOT_DIR, f'{metric.replace(" ", "_").lower()}_comparison.html')
            fig.write_html(plot_filename)
            print(f"Saved plot: {plot_filename}")

        print("\nScript finished.")



Starting Exoplanet Detection Strategy Comparison Script...
Loading training data from: exoTrain.csv
Loading testing data from: exoTest.csv
Data loaded successfully.
Merging training and testing data...
Combined dataset shape: (5657, 3198)

Separating features/labels and splitting data...
Combined dataset feature shape: (5657, 3197)
Combined labels distribution:
0    0.992576
1    0.007424
Name: proportion, dtype: float64
Performing stratified train-test split (test_size=0.25)...
Shape of new training features: (4242, 3197)
Shape of new testing features: (1415, 3197)
Class distribution in new training set:
0    0.992692
1    0.007308
Name: proportion, dtype: float64
Class distribution in new test set:
0    0.992226
1    0.007774
Name: proportion, dtype: float64

===== Processing Strategy: Row Scale =====
Applying row-wise standard scaling...

--- Training Logistic Regression ---
Training completed in 22.09 seconds.

--- Evaluating Logistic Regression (Row Scale) ---
Predicting on test d