In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score

In [2]:
def feature_selection_with_catboost_modified(
    data: pd.DataFrame,
    target: str,
    best_params: dict,
    drop_threshold: float = 0.75,
    max_iter_mice: int = 12,
    n_splits: int = 5,
    random_state: int = 42
) -> pd.DataFrame:
    """
    Re-train CatBoost with only the top X% features by importance, where X goes from 10% to 80%
    in steps of 5%. Evaluate each sub-model via cross-validation (accuracy).
    Finally, print a summary in the style:
      Fitting 5 folds for each of 1 candidates, totalling 5 fits
      Best Parameters: {...}
      Best CV Score (accuracy): ...

    Parameters
    ----------
    data : pd.DataFrame
        Input table containing the feature columns and the ``target`` column.
    target : str
        Name of the label column in ``data``.
    best_params : dict
        Keyword arguments forwarded to ``CatBoostClassifier``. Keys given here take
        precedence over the defaults this function sets (``verbose=0`` and
        ``random_state=random_state``).
    drop_threshold : float, default 0.75
        Rows whose fraction of missing cells (computed over all columns of ``data``,
        target included) exceeds this value are discarded.
    max_iter_mice : int, default 12
        ``max_iter`` passed to ``IterativeImputer``.
    n_splits : int, default 5
        Number of folds of the shuffled ``StratifiedKFold`` used for evaluation.
    random_state : int, default 42
        Seed shared by the imputer, the CatBoost models and the fold splitter.

    Returns
    -------
    pd.DataFrame
        One row per coverage level with the columns ``Coverage(%)``, ``Num_Features``,
        ``Mean_Accuracy`` and ``Std_Accuracy`` (the same table that is printed).

    Notes
    -----
    The imputer and the importance ranking are both fitted on all retained rows before
    cross-validation starts, so the held-out folds have already influenced the
    imputation and the feature ranking; the reported accuracies may therefore be
    optimistic.
    """
    # Step 1. Clean data by dropping rows with many missing values
    row_missing_ratio = data.isnull().sum(axis=1) / data.shape[1]
    data_cleaned = data.loc[row_missing_ratio <= drop_threshold].copy()

    # Step 2. Separate features and target
    y = data_cleaned[target]
    X = data_cleaned.drop(columns=[target])
    # IterativeImputer discards columns with no observed value by default, which would
    # leave the imputed matrix narrower than the list of feature names. Drop such
    # columns up front so names and matrix columns stay aligned.
    X = X.dropna(axis=1, how="all")
    feature_names = X.columns

    # Step 3. Impute missing values using MICE
    imputer = IterativeImputer(max_iter=max_iter_mice, random_state=random_state)
    X_imputed = imputer.fit_transform(X)

    # Merge defaults with the caller's parameters in a single dict so that a
    # `verbose` or `random_state` entry in best_params overrides the default
    # instead of raising "got multiple values for keyword argument".
    model_params = {"verbose": 0, "random_state": random_state, **best_params}

    # Step 4. Train a CatBoost model (with best_params) on entire data to get feature importances
    base_model = CatBoostClassifier(**model_params)
    base_model.fit(X_imputed, y)
    importances = base_model.get_feature_importance()

    # Sort features by importance (descending). The default RangeIndex is kept on
    # purpose: after sorting it holds each feature's column position in X_imputed.
    feat_imp_df = pd.DataFrame({
        "Feature": feature_names,
        "Importance": importances
    }).sort_values(by="Importance", ascending=False)

    # Column positions ordered from most to least important. Using positions rather
    # than name lookups keeps this correct even when column names are not unique.
    sorted_positions = feat_imp_df.index.to_numpy()

    # Step 5. For coverage from 10% to 80% in increments of 5%, select top features and re-train
    n_total_features = len(feature_names)
    coverage_values = range(10, 81, 5)  # 10%, 15%, ..., 80%

    results = []
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    for coverage in coverage_values:
        # Compute how many features to keep (always at least one)
        n_keep = max(1, int(np.ceil(n_total_features * (coverage / 100.0))))

        # Select the columns of the top n_keep features
        X_sub = X_imputed[:, sorted_positions[:n_keep]]

        # Evaluate CatBoost on these top features via cross-validation
        model = CatBoostClassifier(**model_params)
        scores = cross_val_score(model, X_sub, y, cv=cv, scoring="accuracy", n_jobs=-1)

        results.append({
            "Coverage(%)": coverage,
            "Num_Features": n_keep,
            "Mean_Accuracy": scores.mean(),
            "Std_Accuracy": scores.std()
        })

    # Step 6. Convert results to DataFrame and display
    results_df = pd.DataFrame(results)
    print("\n=== Feature Selection Results (Top X% of Features) ===")
    print(results_df.to_string(index=False))

    # Step 7. Identify coverage with the highest mean accuracy
    best_row = results_df.loc[results_df["Mean_Accuracy"].idxmax()]
    best_coverage = best_row["Coverage(%)"]
    best_cv_score = best_row["Mean_Accuracy"]

    # Step 8. Print summary in the requested style
    # We treat "1 candidate" as a placeholder to mimic the scikit-learn grid search style:
    print(f"\nFitting {n_splits} folds for each of 1 candidates, totalling {n_splits * 1} fits")
    print(f"Best Parameters: {best_params}")
    print(f"Best CV Score (accuracy): {best_cv_score:.10f}")
    print(f"(This corresponds to retaining top {int(best_coverage)}% of features.)")

    # Hand the table back so callers can plot or post-process it.
    return results_df

In [3]:
def run_catboost_feature_selection_30():
    """
    Run the CatBoost top-X% feature-selection sweep on the 30-minute NICU dataset.

    Reads ``../data/final/nicu_30.csv``, uses ``is_infected`` as the label column and
    evaluates a fixed set of CatBoost hyper-parameters with 10-fold cross-validation
    by delegating to ``feature_selection_with_catboost_modified``. Returns nothing.
    """
    # Fixed CatBoost hyper-parameters used for every model in the sweep.
    tuned_catboost_params = dict(
        border_count=64,
        depth=4,
        iterations=300,
        l2_leaf_reg=3,
        learning_rate=0.05,
    )

    # 30-minute data; adjust the path if the CSV lives elsewhere.
    nicu_30min = pd.read_csv("../data/final/nicu_30.csv")

    feature_selection_with_catboost_modified(
        data=nicu_30min,
        target="is_infected",  # label column
        best_params=tuned_catboost_params,
        drop_threshold=0.75,  # rows with more than 75% missing cells are dropped
        max_iter_mice=12,
        n_splits=10,
    )

# Entry-point guard: launch the 30-minute feature-selection experiment only when this
# code runs as the main module, not when it is imported from elsewhere.
if __name__ == "__main__":
    run_catboost_feature_selection_30()



=== Feature Selection Results (Top X% of Features) ===
 Coverage(%)  Num_Features  Mean_Accuracy  Std_Accuracy
          10             9       0.775442      0.014879
          15            13       0.774731      0.017783
          20            17       0.774734      0.016165
          25            22       0.781477      0.015315
          30            26       0.780059      0.020058
          35            30       0.784671      0.019281
          40            34       0.781830      0.018053
          45            39       0.788925      0.014838
          50            43       0.789634      0.017687
          55            47       0.787860      0.020015
          60            51       0.783251      0.019055
          65            56       0.784306      0.022298
          70            60       0.788213      0.017788
          75            64       0.786089      0.017148
          80            68       0.783959      0.020596

Fitting 10 folds for each of 1 candidates, tota