In [1]:
import numpy as np
import pandas as pd
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
import math

In [2]:
def _n_features_for_coverage(n_total_features: int, coverage_pct: int) -> int:
    """Return ceil(n_total_features * coverage_pct / 100), never less than 1.

    Pure integer arithmetic is used on purpose: the floating-point form
    ``ceil(n * (pct / 100.0))`` can overshoot by one feature
    (e.g. ``100 * 0.55 == 55.00000000000001`` which ceils to 56).
    """
    return max(1, (n_total_features * coverage_pct + 99) // 100)


def feature_selection_with_gradient_boosting_modified(
    data: pd.DataFrame,
    target: str,
    drop_threshold: float = 0.95,
    max_iter_mice: int = 12,
    n_splits: int = 5,
    random_state: int = 42,
    gb_params: dict = None
):
    """
    Rank features by Gradient Boosting importance, then cross-validate a
    Gradient Boosting classifier on the top 10%, 15%, ..., 80% of that ranking.

    Steps:
      1. Drop rows whose fraction of missing cells exceeds ``drop_threshold``
         and rows whose target value is missing.
      2. Drop feature columns that have no observed value at all.
      3. Impute the remaining gaps with ``IterativeImputer`` (MICE).
      4. Fit one Gradient Boosting model on all features to get importances.
      5. For each coverage level, run stratified k-fold CV (accuracy) on the
         top-ranked features.
      6. Print the importance table, the per-coverage results and a summary:
           Fitting <n_splits> folds for each of 1 candidates, totalling <n_splits> fits
           Best Parameters: <gb_params>
           Best CV Score (accuracy): <score>
         The "1 candidates" wording of the summary is kept as-is for output
         compatibility, although several coverage levels are evaluated.

    Caveat: the imputer and the importance ranking are both fitted on the
    full dataset before cross-validation, so the held-out folds have already
    influenced the inputs of each CV run; the reported accuracies should be
    read as optimistic.

    Parameters
    ----------
    data : pd.DataFrame
        Feature columns plus the target column. Every feature column is passed
        to the imputer unchanged (presumably they must all be numeric).
    target : str
        Name of the target column in ``data``.
    drop_threshold : float
        Rows with a missing-value ratio (computed over all columns, target
        included) strictly greater than this are discarded.
    max_iter_mice : int
        ``max_iter`` passed to ``IterativeImputer``.
    n_splits : int
        Number of stratified CV folds.
    random_state : int
        Seed for the imputer, the CV shuffling and the default classifier.
    gb_params : dict, optional
        Keyword arguments for ``GradientBoostingClassifier``. When omitted, a
        default configuration seeded with ``random_state`` is used.

    Returns
    -------
    None
        All results are printed.

    Raises
    ------
    KeyError
        If ``target`` is not a column of ``data``.
    ValueError
        If no rows or no usable feature columns remain after cleaning.
    """
    if target not in data.columns:
        raise KeyError(f"Target column {target!r} not found in data.")

    # 1. Drop rows with too many missing values, then rows without a label
    #    (a classifier cannot be fitted on a missing target).
    row_missing_ratio = data.isnull().sum(axis=1) / data.shape[1]
    data_cleaned = data.loc[row_missing_ratio <= drop_threshold]
    data_cleaned = data_cleaned.loc[data_cleaned[target].notna()].copy()
    if len(data_cleaned) == 0:
        raise ValueError(
            "No rows left after dropping sparse rows and rows with a missing target."
        )

    # 2. Separate target and features
    y = data_cleaned[target]
    X = data_cleaned.drop(columns=[target])
    # The imputer discards columns with no observed value, which would leave
    # the imputed array narrower than the list of feature names. Removing such
    # columns first keeps names, importances and array columns aligned.
    X = X.dropna(axis=1, how="all")
    if X.shape[1] == 0:
        raise ValueError("No feature columns with observed values are available.")
    feature_names = X.columns

    # 3. MICE imputation
    imputer = IterativeImputer(max_iter=max_iter_mice, random_state=random_state)
    X_imputed = imputer.fit_transform(X)

    # 4. Default or custom gb_params
    if gb_params is None:
        gb_params = {
            "n_estimators": 100,
            "learning_rate": 0.1,
            "max_depth": 3,
            "random_state": random_state
        }

    # Train a Gradient Boosting model to get feature importances
    gb_clf = GradientBoostingClassifier(**gb_params)
    gb_clf.fit(X_imputed, y)

    importances = gb_clf.feature_importances_
    ranked = pd.DataFrame({
        "Feature": feature_names,
        "Importance": importances
    }).sort_values(by="Importance", ascending=False)
    # Before the index is reset it still holds each feature's column position
    # in X_imputed, so the ranking can be applied positionally (this also
    # works when two columns share a name).
    sorted_indices = ranked.index.to_numpy()
    feat_imp_df = ranked.reset_index(drop=True)

    # 5. Coverage from 10% to 80% in increments of 5%
    n_total_features = len(feature_names)
    coverage_list = range(10, 81, 5)

    results = []
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # With few features several coverage levels select the same top-k set;
    # the folds are identical each time, so the CV result is reused.
    scores_by_n_keep = {}

    for coverage in coverage_list:
        # how many features to keep
        n_keep = _n_features_for_coverage(n_total_features, coverage)

        if n_keep not in scores_by_n_keep:
            # subset columns in X_imputed
            X_sub = X_imputed[:, sorted_indices[:n_keep]]

            # re-train Gradient Boosting on these top features using cross-validation
            model = GradientBoostingClassifier(**gb_params)
            scores = cross_val_score(model, X_sub, y, cv=cv, scoring="accuracy", n_jobs=-1)
            scores_by_n_keep[n_keep] = (scores.mean(), scores.std())

        mean_acc, std_acc = scores_by_n_keep[n_keep]

        results.append({
            "Coverage(%)": coverage,
            "Num_Features": n_keep,
            "Mean_Accuracy": mean_acc,
            "Std_Accuracy": std_acc
        })

    # 6. Print feature importance + coverage results
    print("\n=== Feature Importances (All Features) ===")
    print(feat_imp_df.to_string(index=False))

    print("\n=== Feature Selection Results (Top X% of Features) ===")
    results_df = pd.DataFrame(results)
    print(results_df.to_string(index=False))

    # Identify coverage with highest mean accuracy (first one wins on ties)
    best_row = results_df.loc[results_df["Mean_Accuracy"].idxmax()]
    best_coverage = best_row["Coverage(%)"]
    best_cv_score = best_row["Mean_Accuracy"]

    # Print summary in the style of scikit-learn single-candidate CV
    print(f"\nFitting {n_splits} folds for each of 1 candidates, totalling {n_splits} fits")
    print(f"Best Parameters: {gb_params}")
    print(f"Best CV Score (accuracy): {best_cv_score:.10f}")
    print(f"(This corresponds to retaining top {int(best_coverage)}% of features.)")


In [3]:
if __name__ == "__main__":
    # Example run: rank the features of the NICU dataset against the
    # "is_infected" target column, using a custom set of Gradient Boosting
    # hyperparameters and 10-fold cross-validation.
    custom_params = dict(
        n_estimators=150,
        learning_rate=0.05,
        max_depth=4,
        subsample=0.8,
        random_state=42,
    )
    df = pd.read_csv(r"../data/final/nicu_120.csv")

    # Keyword arguments are collected first so the call itself stays short.
    run_options = dict(
        data=df,
        target="is_infected",
        drop_threshold=0.95,
        max_iter_mice=12,
        n_splits=10,
        random_state=42,
        gb_params=custom_params,
    )
    feature_selection_with_gradient_boosting_modified(**run_options)


=== Feature Importances (All Features) ===
                      Feature  Importance
          Lymphocytes_min_120    0.131130
                 SaO2_min_120    0.109776
          Neutrophils_min_120    0.063108
      Temp/Iso/Warmer_min_120    0.062591
                SaO2_mean_120    0.058145
   BP Cuff [Systolic]_min_120    0.043591
          Lymphocytes_max_120    0.038641
        Temp Skin [C]_min_120    0.034125
            Resp Rate_max_120    0.033755
         Lymphocytes_mean_120    0.032397
        Temp Skin [C]_max_120    0.027527
       Temp Skin [C]_mean_120    0.026703
       BP Cuff [Mean]_min_120    0.025174
           Glucometer_min_120    0.024658
           Glucometer_max_120    0.016230
                 SaO2_max_120    0.015273
    Temp Axilary [F]_mean_120    0.015119
          Heart Rate_mean_120    0.013139
      Temp/Iso/Warmer_max_120    0.012327
                   SUBJECT_ID    0.012019
     Temp Axilary [F]_max_120    0.011335
      Red Blood Cells_min_120   