In [6]:
# Variable Importance & Ensemble vs Average Comparison
#
# This cell provides two workflows:
# 1) Feature importance on bioclim predictors with iterative elimination
# 2) Ensemble projected change vs average of individual models
#
# Assumptions:
# - You already have a modeling dataset X (bioclim features) and y (labels/probabilities)
# - You have per-model projections (rasters/arrays) and an ensemble projection of the same shape
#
# Fill the hooks marked with TODO to connect to your data.

from __future__ import annotations
import os
import json
from typing import List, Dict, Tuple
import numpy as np
import pandas as pd

# Modeling
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.inspection import permutation_importance
from sklearn.ensemble import RandomForestClassifier

# MaxEnt integration (if available)
try:
    import elapid as ela
    MAXENT_AVAILABLE = True
except ImportError:
    MAXENT_AVAILABLE = False
    print("elapid not available - will use RandomForest for feature importance")

# Plotting
import matplotlib.pyplot as plt

# -----------------------------
# Config hooks (edit as needed)
# -----------------------------
# Paths to hook into your existing data pipeline
DATA_DIR = "/scratch/gito_aciar/sdm-toolbox/out/leptocybe-invasa/output/exp_ensemble_mean_random_south-east-asia_False_False/"
TRAIN_DATA_FILE = "ensemble_mean_model-train_input-data_leptocybe-invasa_random_south-east-asia_Set1_1.csv"

# Load training data from your existing pipeline
# Expected: CSV with bioclim features, 'class' column (0/1), 'SampleWeight', and 'geometry'
# TODO: adjust the path to match your actual training data file

def load_training_data(csv_path: str = None) -> Tuple[pd.DataFrame, pd.Series, pd.Series]:
    """
    Load training data from CSV file created by your pipeline.
    
    Returns:
        X: DataFrame with only predictor columns (bioclim features)
        y: Series with binary target (0/1)
        sample_weight: Series with sample weights
    """
    if csv_path is None:
        # Default path - your specific training data file
        csv_path = os.path.join(DATA_DIR, TRAIN_DATA_FILE)
    
    # Load the training table
    train = pd.read_csv(csv_path)
    
    # Split features/labels/weights; drop non-feature columns
    # x_train contains only predictor columns used by the model
    x_train = train.drop(columns=['class', 'SampleWeight', 'geometry'])
    # y_train is the binary target: 1 for presence, 0 for background
    y_train = train['class']
    # sample_weight per observation passed to the learner to correct bias/imbalance
    sample_weight_train = train['SampleWeight']
    
    print(f"Loaded training data: {x_train.shape[0]} samples, {x_train.shape[1]} features")
    print(f"Feature columns: {list(x_train.columns)}")
    print(f"Class distribution: {y_train.value_counts().to_dict()}")
    
    return x_train, y_train, sample_weight_train

# Placeholder to load projections from several individual models and the ensemble
# Should return arrays of same shape, e.g., (H, W) or (N,) flattened
# TODO: replace with your actual loader

def load_projections() -> Tuple[List[np.ndarray], np.ndarray]:
    # Example:
    # individual = [np.load("model1_proj.npy"), np.load("model2_proj.npy"), ...]
    # ensemble = np.load("ensemble_proj.npy")
    # return individual, ensemble
    raise NotImplementedError("Implement load_projections() to return (individual_list, ensemble)")

# -----------------------------
# 1) Feature importance + elimination
# -----------------------------

def compute_feature_importance_simple(
    X: pd.DataFrame,
    y: pd.Series,
    sample_weight: pd.Series = None,
    use_maxent: bool = False,
) -> pd.DataFrame:
    """
    Simple feature importance without cross-validation (faster, more reliable).
    """
    print("Using simple feature importance (no CV)...")
    
    if use_maxent and MAXENT_AVAILABLE:
        model = ela.MaxentModel(transform='logistic', beta_multiplier=1.5)
        model_name = "MaxEnt"
    else:
        model = RandomForestClassifier(
            n_estimators=100,  # Reduced for speed
            max_depth=None,
            min_samples_split=2,
            min_samples_leaf=1,
            max_features="sqrt",
            n_jobs=-1,
            random_state=42,
        )
        model_name = "RandomForest"
    
    # Fit on full dataset
    try:
        if sample_weight is not None:
            model.fit(X, y, sample_weight=sample_weight)
        else:
            model.fit(X, y)
        print(f"{model_name} model fitted successfully")
    except Exception as e:
        print(f"Error fitting {model_name}: {e}")
        raise
    
    # Get feature importances
    if hasattr(model, 'feature_importances_'):
        importances = model.feature_importances_
        imp_df = pd.DataFrame({
            'importance': importances,
            'feature': X.columns
        }).set_index('feature').sort_values('importance', ascending=False)
        print(f"Using {model_name} feature_importances_")
    else:
        # For MaxEnt without feature_importances_, use permutation importance on full dataset
        print("MaxEnt doesn't have feature_importances_, using permutation importance...")
        try:
            pi = permutation_importance(
                model, X, y, 
                n_repeats=5,  # Reduced for speed
                scoring="roc_auc", 
                n_jobs=-1, 
                random_state=42,
                sample_weight=sample_weight
            )
            imp_df = pd.DataFrame({
                'importance': pi.importances_mean,
                'feature': X.columns
            }).set_index('feature').sort_values('importance', ascending=False)
        except Exception as e:
            print(f"Permutation importance failed: {e}")
            # Last resort: equal importance
            imp_df = pd.DataFrame({
                'importance': [1.0] * len(X.columns),
                'feature': X.columns
            }).set_index('feature')
            print("Using equal importance as fallback")
    
    return imp_df


def compute_feature_importance(
    X: pd.DataFrame,
    y: pd.Series,
    sample_weight: pd.Series = None,
    n_splits: int = 5,
    random_state: int = 42,
    use_maxent: bool = False,
) -> pd.DataFrame:
    """
    Compute feature importance using cross-validation.
    
    Args:
        X: Feature matrix
        y: Target labels
        sample_weight: Sample weights (optional)
        n_splits: Number of CV folds
        random_state: Random seed
        use_maxent: Whether to use MaxEnt instead of RandomForest
    """
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    
    if use_maxent and MAXENT_AVAILABLE:
        # Use MaxEnt model (same as your pipeline)
        model_class = lambda: ela.MaxentModel(transform='logistic', beta_multiplier=1.5)
        model_name = "MaxEnt"
    else:
        # Use RandomForest as fallback
        model_class = lambda: RandomForestClassifier(
            n_estimators=500,
            max_depth=None,
            min_samples_split=2,
            min_samples_leaf=1,
            max_features="sqrt",
            n_jobs=-1,
            random_state=random_state,
        )
        model_name = "RandomForest"

    perm_imps: List[pd.Series] = []
    aucs: List[float] = []

    for fold, (train_idx, test_idx) in enumerate(skf.split(X, y)):
        print(f"Fold {fold+1}: train={len(train_idx)}, test={len(test_idx)}")
        
        # Check if test set is empty
        if len(test_idx) == 0:
            print(f"Warning: Empty test set in fold {fold+1}, skipping...")
            continue
            
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
        
        # Check class distribution in test set
        test_class_dist = y_test.value_counts()
        print(f"  Test set class distribution: {test_class_dist.to_dict()}")
        
        # Skip if test set has only one class (can't compute AUC)
        if len(test_class_dist) < 2:
            print(f"  Warning: Test set has only one class in fold {fold+1}, skipping...")
            continue
        
        # Handle sample weights
        if sample_weight is not None:
            sample_weight_train = sample_weight.iloc[train_idx]
            sample_weight_test = sample_weight.iloc[test_idx]
        else:
            sample_weight_train = None
            sample_weight_test = None
        
        # Fit model
        model = model_class()
        try:
            if sample_weight_train is not None:
                model.fit(X_train, y_train, sample_weight=sample_weight_train)
            else:
                model.fit(X_train, y_train)
        except Exception as e:
            print(f"  Error fitting model in fold {fold+1}: {e}")
            continue
        
        # Get predictions
        try:
            if hasattr(model, 'predict_proba'):
                y_prob = model.predict_proba(X_test)[:, 1]
            else:
                # For MaxEnt, use predict method
                y_prob = model.predict(X_test)
            
            auc = roc_auc_score(y_test, y_prob)
            aucs.append(auc)
            print(f"  AUC: {auc:.3f}")
        except Exception as e:
            print(f"  Error computing AUC in fold {fold+1}: {e}")
            continue

        # Permutation importance on the held-out fold
        try:
            pi = permutation_importance(
                model, X_test, y_test, 
                n_repeats=10, 
                scoring="roc_auc", 
                n_jobs=-1, 
                random_state=random_state,
                sample_weight=sample_weight_test
            )
            perm_imps.append(pd.Series(pi.importances_mean, index=X.columns))
            print(f"  Permutation importance computed successfully")
        except Exception as e:
            print(f"  Error computing permutation importance in fold {fold+1}: {e}")
            # Fallback: use feature importances from the model if available
            if hasattr(model, 'feature_importances_'):
                perm_imps.append(pd.Series(model.feature_importances_, index=X.columns))
                print(f"  Using model feature_importances_ as fallback")
            else:
                print(f"  No fallback available, skipping fold {fold+1}")
                continue

    # Check if we have any valid results
    if len(perm_imps) == 0:
        raise ValueError("No valid folds were processed. Check your data and class distribution.")
    
    if len(aucs) == 0:
        print("Warning: No valid AUC scores computed.")
        auc_mean, auc_std = 0.0, 0.0
    else:
        auc_mean, auc_std = np.mean(aucs), np.std(aucs)
    
    imp_df = pd.concat(perm_imps, axis=1)
    imp_df.columns = [f"fold_{i+1}" for i in range(imp_df.shape[1])]
    imp_df["mean_importance"] = imp_df.mean(axis=1)
    imp_df.sort_values("mean_importance", ascending=False, inplace=True)

    print(f"{model_name} CV AUC mean: {auc_mean:.3f} Â± {auc_std:.3f} (from {len(aucs)} valid folds)")
    return imp_df


def iterative_elimination(
    X: pd.DataFrame,
    y: pd.Series,
    sample_weight: pd.Series = None,
    target_num_features: int = 5,
    n_splits: int = 5,
    random_state: int = 42,
    use_maxent: bool = False,
    use_simple: bool = True,
) -> Dict[str, object]:
    """
    Iteratively remove least important features until target number is reached.
    
    Args:
        X: Feature matrix
        y: Target labels
        sample_weight: Sample weights (optional)
        target_num_features: Target number of features to keep
        n_splits: Number of CV folds
        random_state: Random seed
        use_maxent: Whether to use MaxEnt instead of RandomForest
        use_simple: Whether to use simple (no CV) feature importance
    """
    remaining_features = list(X.columns)
    history: List[Dict[str, object]] = []

    print(f"Starting with {len(remaining_features)} features, targeting {target_num_features}")

    while len(remaining_features) > target_num_features:
        X_sub = X[remaining_features]
        
        if use_simple:
            imp_df = compute_feature_importance_simple(
                X_sub, y, sample_weight=sample_weight, use_maxent=use_maxent
            )
            least_important = imp_df.index[-1]
            importance_snapshot = imp_df["importance"].to_dict()
        else:
            imp_df = compute_feature_importance(
                X_sub, y, sample_weight=sample_weight, 
                n_splits=n_splits, random_state=random_state, use_maxent=use_maxent
            )
            least_important = imp_df.index[-1]
            importance_snapshot = imp_df["mean_importance"].to_dict()

        remaining_features.remove(least_important)

        history.append({
            "removed": least_important,
            "num_features": len(remaining_features),
            "importance_snapshot": importance_snapshot,
        })
        print(f"Removed {least_important}. Remaining: {len(remaining_features)}")

    if use_simple:
        final_importance = compute_feature_importance_simple(
            X[remaining_features], y, sample_weight=sample_weight, use_maxent=use_maxent
        )
    else:
        final_importance = compute_feature_importance(
            X[remaining_features], y, sample_weight=sample_weight,
            n_splits=n_splits, random_state=random_state, use_maxent=use_maxent
        )

    return {
        "selected_features": remaining_features,
        "final_importance": final_importance,
        "elimination_history": history,
    }

# -----------------------------
# 2) Ensemble vs average comparison
# -----------------------------

def compare_ensemble_vs_average(individual: List[np.ndarray], ensemble: np.ndarray) -> Dict[str, object]:
    # Align shapes: flatten
    ind_stack = np.vstack([arr.ravel() for arr in individual])  # shape: (M, N)
    ensemble_flat = ensemble.ravel()  # shape: (N,)

    avg_flat = ind_stack.mean(axis=0)
    diff = ensemble_flat - avg_flat

    metrics = {
        "mean_abs_diff": float(np.mean(np.abs(diff))),
        "rmse": float(np.sqrt(np.mean(diff**2))),
        "pearson_r": float(np.corrcoef(ensemble_flat, avg_flat)[0, 1]),
        "bias": float(np.mean(diff)),
    }

    return {
        "metrics": metrics,
        "diff_flat": diff,  # can be reshaped back by caller if needed
    }

# -----------------------------
# Example usage (enable by setting to True)
# -----------------------------
if True:
    # 1) Feature importance + elimination
    X, y, sample_weight = load_training_data()
    
    # Option 1: Use simple feature importance (recommended - faster and more reliable)
    if MAXENT_AVAILABLE:
        print("Using MaxEnt with simple feature importance...")
        imp = compute_feature_importance_simple(X, y, sample_weight=sample_weight, use_maxent=True)
        result = iterative_elimination(X, y, sample_weight=sample_weight, target_num_features=5, use_maxent=True, use_simple=True)
    else:
        print("Using RandomForest with simple feature importance...")
        imp = compute_feature_importance_simple(X, y, sample_weight=sample_weight, use_maxent=False)
        result = iterative_elimination(X, y, sample_weight=sample_weight, target_num_features=5, use_maxent=False, use_simple=True)
    
    print("\nTop 12 most important features:")
    print(imp.head(12))
    print(f"\nFinal selected features: {result['selected_features']}")

    # 2) Ensemble vs average comparison
    individual, ensemble = load_projections()
    cmp = compare_ensemble_vs_average(individual, ensemble)
    print("\nEnsemble vs Average comparison:")
    print(json.dumps(cmp["metrics"], indent=2))



Loaded training data: 9627 samples, 20 features
Feature columns: ['Unnamed: 0', 'ensemble_mean_bioclim_01', 'ensemble_mean_bioclim_02', 'ensemble_mean_bioclim_03', 'ensemble_mean_bioclim_04', 'ensemble_mean_bioclim_05', 'ensemble_mean_bioclim_06', 'ensemble_mean_bioclim_07', 'ensemble_mean_bioclim_08', 'ensemble_mean_bioclim_09', 'ensemble_mean_bioclim_10', 'ensemble_mean_bioclim_11', 'ensemble_mean_bioclim_12', 'ensemble_mean_bioclim_13', 'ensemble_mean_bioclim_14', 'ensemble_mean_bioclim_15', 'ensemble_mean_bioclim_16', 'ensemble_mean_bioclim_17', 'ensemble_mean_bioclim_18', 'ensemble_mean_bioclim_19']
Class distribution: {0: 9149, 1: 478}
Using MaxEnt with simple feature importance...
Using simple feature importance (no CV)...
MaxEnt model fitted successfully
MaxEnt doesn't have feature_importances_, using permutation importance...
Permutation importance failed: index 0 is out of bounds for axis 0 with size 0
Using equal importance as fallback
Starting with 20 features, targeting 5


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x.drop(["geometry"], axis=1, errors="ignore", inplace=True)


MaxEnt model fitted successfully
MaxEnt doesn't have feature_importances_, using permutation importance...
Permutation importance failed: index 0 is out of bounds for axis 0 with size 0
Using equal importance as fallback
Removed ensemble_mean_bioclim_18. Remaining: 18
Using simple feature importance (no CV)...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x.drop(["geometry"], axis=1, errors="ignore", inplace=True)


MaxEnt model fitted successfully
MaxEnt doesn't have feature_importances_, using permutation importance...
Permutation importance failed: index 0 is out of bounds for axis 0 with size 0
Using equal importance as fallback
Removed ensemble_mean_bioclim_17. Remaining: 17
Using simple feature importance (no CV)...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x.drop(["geometry"], axis=1, errors="ignore", inplace=True)


MaxEnt model fitted successfully
MaxEnt doesn't have feature_importances_, using permutation importance...
Permutation importance failed: index 0 is out of bounds for axis 0 with size 0
Using equal importance as fallback
Removed ensemble_mean_bioclim_16. Remaining: 16
Using simple feature importance (no CV)...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x.drop(["geometry"], axis=1, errors="ignore", inplace=True)


MaxEnt model fitted successfully
MaxEnt doesn't have feature_importances_, using permutation importance...
Permutation importance failed: index 0 is out of bounds for axis 0 with size 0
Using equal importance as fallback
Removed ensemble_mean_bioclim_15. Remaining: 15
Using simple feature importance (no CV)...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x.drop(["geometry"], axis=1, errors="ignore", inplace=True)


MaxEnt model fitted successfully
MaxEnt doesn't have feature_importances_, using permutation importance...
Permutation importance failed: index 0 is out of bounds for axis 0 with size 0
Using equal importance as fallback
Removed ensemble_mean_bioclim_14. Remaining: 14
Using simple feature importance (no CV)...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x.drop(["geometry"], axis=1, errors="ignore", inplace=True)


MaxEnt model fitted successfully
MaxEnt doesn't have feature_importances_, using permutation importance...
Permutation importance failed: index 0 is out of bounds for axis 0 with size 0
Using equal importance as fallback
Removed ensemble_mean_bioclim_13. Remaining: 13
Using simple feature importance (no CV)...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x.drop(["geometry"], axis=1, errors="ignore", inplace=True)


MaxEnt model fitted successfully
MaxEnt doesn't have feature_importances_, using permutation importance...
Permutation importance failed: index 0 is out of bounds for axis 0 with size 0
Using equal importance as fallback
Removed ensemble_mean_bioclim_12. Remaining: 12
Using simple feature importance (no CV)...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x.drop(["geometry"], axis=1, errors="ignore", inplace=True)


MaxEnt model fitted successfully
MaxEnt doesn't have feature_importances_, using permutation importance...
Permutation importance failed: index 0 is out of bounds for axis 0 with size 0
Using equal importance as fallback
Removed ensemble_mean_bioclim_11. Remaining: 11
Using simple feature importance (no CV)...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x.drop(["geometry"], axis=1, errors="ignore", inplace=True)


MaxEnt model fitted successfully
MaxEnt doesn't have feature_importances_, using permutation importance...
Permutation importance failed: index 0 is out of bounds for axis 0 with size 0
Using equal importance as fallback
Removed ensemble_mean_bioclim_10. Remaining: 10
Using simple feature importance (no CV)...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x.drop(["geometry"], axis=1, errors="ignore", inplace=True)


MaxEnt model fitted successfully
MaxEnt doesn't have feature_importances_, using permutation importance...
Permutation importance failed: index 0 is out of bounds for axis 0 with size 0
Using equal importance as fallback
Removed ensemble_mean_bioclim_09. Remaining: 9
Using simple feature importance (no CV)...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x.drop(["geometry"], axis=1, errors="ignore", inplace=True)


MaxEnt model fitted successfully
MaxEnt doesn't have feature_importances_, using permutation importance...
Permutation importance failed: index 0 is out of bounds for axis 0 with size 0
Using equal importance as fallback
Removed ensemble_mean_bioclim_08. Remaining: 8
Using simple feature importance (no CV)...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x.drop(["geometry"], axis=1, errors="ignore", inplace=True)


MaxEnt model fitted successfully
MaxEnt doesn't have feature_importances_, using permutation importance...
Permutation importance failed: index 0 is out of bounds for axis 0 with size 0
Using equal importance as fallback
Removed ensemble_mean_bioclim_07. Remaining: 7
Using simple feature importance (no CV)...
MaxEnt model fitted successfully
MaxEnt doesn't have feature_importances_, using permutation importance...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x.drop(["geometry"], axis=1, errors="ignore", inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x.drop(["geometry"], axis=1, errors="ignore", inplace=True)


Permutation importance failed: index 0 is out of bounds for axis 0 with size 0
Using equal importance as fallback
Removed ensemble_mean_bioclim_06. Remaining: 6
Using simple feature importance (no CV)...
MaxEnt model fitted successfully
MaxEnt doesn't have feature_importances_, using permutation importance...
Permutation importance failed: index 0 is out of bounds for axis 0 with size 0
Using equal importance as fallback
Removed ensemble_mean_bioclim_05. Remaining: 5
Using simple feature importance (no CV)...
MaxEnt model fitted successfully
MaxEnt doesn't have feature_importances_, using permutation importance...
Permutation importance failed: index 0 is out of bounds for axis 0 with size 0
Using equal importance as fallback

Top 12 most important features:
                          importance
feature                             
Unnamed: 0                       1.0
ensemble_mean_bioclim_01         1.0
ensemble_mean_bioclim_02         1.0
ensemble_mean_bioclim_03         1.0
ensemble_

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x.drop(["geometry"], axis=1, errors="ignore", inplace=True)


NotImplementedError: Implement load_projections() to return (individual_list, ensemble)