In [1]:
import pandas as pd
import numpy as np

from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

In [2]:
def feature_selection_with_gradient_boosting(data: pd.DataFrame,
                                             target: str,
                                             drop_threshold: float = 0.95,
                                             max_iter_mice: int = 12,
                                             n_splits: int = 5,
                                             random_state: int = 42,
                                             gb_params: dict = None):
    """
    Perform feature selection based on Gradient Boosting feature importances.
    
    Process:
        1. Drop rows with missing ratio > drop_threshold.
        2. MICE imputation for missing values.
        3. Train a Gradient Boosting model using all features to obtain feature importances.
        4. Sort features by importance in descending order.
        5. For top X% of features (X from 10% to 50% with step=5%), re-train the model
           and evaluate performance via cross-validation.
        6. Print results for each subset of features.

    :param data: Input dataframe (includes features + target column).
    :param target: The name of the target column.
    :param drop_threshold: If row's missing ratio > drop_threshold, drop it. Default=0.95.
    :param max_iter_mice: Number of MICE iteration, default=12.
    :param n_splits: Number of folds for cross-validation, default=5.
    :param random_state: Seed for reproducibility, default=42.
    :param gb_params: A dict of hyperparameters for GradientBoostingClassifier, 
                      e.g., {"n_estimators":300, "learning_rate":0.05, "max_depth":4}.
                      If None, we use a default config.
    :return: None (prints out the performance results of different feature subsets).
    """

    # 1. Drop rows with too many missing values
    row_missing_ratio = data.isnull().sum(axis=1) / data.shape[1]
    data_cleaned = data.loc[row_missing_ratio <= drop_threshold].copy()
    
    # 2. Separate target and features
    y = data_cleaned[target]
    X = data_cleaned.drop(columns=[target])
    feature_names = X.columns

    # 3. MICE imputation
    imputer = IterativeImputer(max_iter=max_iter_mice, random_state=random_state)
    X_imputed = imputer.fit_transform(X)

    # 4. Train a Gradient Boosting model to get feature importances
    if gb_params is None:
        gb_params = {
            "n_estimators": 100,
            "learning_rate": 0.1,
            "max_depth": 3,
            "random_state": random_state
        }
    gb_clf = GradientBoostingClassifier(**gb_params)
    gb_clf.fit(X_imputed, y)

    # Get feature importances and sort
    importances = gb_clf.feature_importances_
    feat_imp_df = pd.DataFrame({
        "Feature": feature_names,
        "Importance": importances
    }).sort_values(by="Importance", ascending=False).reset_index(drop=True)

    # For convenience, get sorted feature names
    sorted_features = feat_imp_df["Feature"].tolist()

    # 5. For coverage from 10% to 50% in increments of 5%, select top features and re-train
    n_total_features = len(feature_names)
    coverage_list = range(10, 81, 5)  # 10%, 15%, 20%, ..., 50%

    results = []
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    
    for coverage in coverage_list:
        # Compute how many features to keep
        n_keep = max(1, int(np.ceil(n_total_features * (coverage / 100.0))))
        
        # Select top n_keep features
        selected_feats = sorted_features[:n_keep]
        
        # Build a subset of X_imputed with only those features
        selected_indices = [feature_names.get_loc(feat) for feat in selected_feats]
        X_sub = X_imputed[:, selected_indices]

        # Re-train Gradient Boosting on these top features using cross-validation
        model = GradientBoostingClassifier(**gb_params)
        scores = cross_val_score(model, X_sub, y, cv=cv, scoring="accuracy", n_jobs=-1)
        
        mean_acc = scores.mean()
        std_acc = scores.std()

        results.append({
            "Coverage(%)": coverage,
            "Num_Features": n_keep,
            "Mean_Accuracy": mean_acc,
            "Std_Accuracy": std_acc
        })

    # 6. Print feature importance + results
    print("\n=== Feature Importances (All Features) ===")
    print(feat_imp_df.to_string(index=False))

    print("\n=== Feature Selection Results (Top X% of Features) ===")
    results_df = pd.DataFrame(results)
    print(results_df.to_string(index=False))

In [3]:
def main():
    """
    Main function to demonstrate feature_selection_with_gradient_boosting usage.
    """

    # Example: read data (please modify the path as necessary)
    data = pd.read_csv(r"../data/final/nicu_120.csv")

    # Suppose your target column name is 'is_infected'
    target_column = "is_infected"

    # Example gradient boosting hyperparameters from prior knowledge or tuning
    gb_params = {
        "n_estimators": 150,
        "learning_rate": 0.05,
        "max_depth": 4,
        "subsample": 0.8, 
        "random_state": 42
    }

    # Call the feature selection function
    feature_selection_with_gradient_boosting(
        data=data,
        target=target_column,
        drop_threshold=0.95,
        max_iter_mice=12,
        n_splits=5,
        random_state=42,
        gb_params=gb_params
    )

In [4]:
if __name__ == "__main__":
    main()


=== Feature Importances (All Features) ===
                      Feature  Importance
          Lymphocytes_min_120    0.131130
                 SaO2_min_120    0.109776
          Neutrophils_min_120    0.063108
      Temp/Iso/Warmer_min_120    0.062591
                SaO2_mean_120    0.058145
   BP Cuff [Systolic]_min_120    0.043591
          Lymphocytes_max_120    0.038641
        Temp Skin [C]_min_120    0.034125
            Resp Rate_max_120    0.033755
         Lymphocytes_mean_120    0.032397
        Temp Skin [C]_max_120    0.027527
       Temp Skin [C]_mean_120    0.026703
       BP Cuff [Mean]_min_120    0.025174
           Glucometer_min_120    0.024658
           Glucometer_max_120    0.016230
                 SaO2_max_120    0.015273
    Temp Axilary [F]_mean_120    0.015119
          Heart Rate_mean_120    0.013139
      Temp/Iso/Warmer_max_120    0.012327
                   SUBJECT_ID    0.012019
     Temp Axilary [F]_max_120    0.011335
      Red Blood Cells_min_120   