In [1]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

X, y = load_breast_cancer(return_X_y=True, as_frame=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score
import numpy as np
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100, random_state=0)
rf.fit(X_train, y_train)

sfm_rf = SelectFromModel(rf, prefit=True, max_features=5, threshold=-np.inf)
feat_rf = X_train.columns[sfm_rf.get_support()]

model = LogisticRegression(max_iter=5000).fit(X_train[feat_rf], y_train)
print("RF features:", list(feat_rf))
print("Accuracy (RF):", accuracy_score(y_test, model.predict(X_test[feat_rf])))

RF features: ['mean concavity', 'mean concave points', 'worst radius', 'worst perimeter', 'worst concave points']
Accuracy (RF): 0.9532163742690059


In [3]:
import xgboost as xgb

xgb_clf = xgb.XGBClassifier(
    n_estimators=100, eval_metric='logloss', random_state=0
)
xgb_clf.fit(X_train, y_train)

sfm_xgb = SelectFromModel(xgb_clf, prefit=True, max_features=5, threshold=-np.inf)
feat_xgb = X_train.columns[sfm_xgb.get_support()]

model = LogisticRegression(max_iter=5000).fit(X_train[feat_xgb], y_train)
print("XGB features:", list(feat_xgb))
print("Accuracy (XGB):", accuracy_score(y_test, model.predict(X_test[feat_xgb])))

XGB features: ['mean concave points', 'worst radius', 'worst perimeter', 'worst area', 'worst concave points']
Accuracy (XGB): 0.9590643274853801


In [4]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.utils import resample

# Parameters for stability selection
n_bootstraps    = 100
sample_fraction = 0.75
C               = 1.0

# Increase iterations and loosen tolerance
max_iter = 20000
tol      = 1e-3

n_features       = X_train.shape[1]
selection_counts = np.zeros(n_features, dtype=int)

for i in range(n_bootstraps):
    # a) bootstrap sample
    idx  = resample(
        np.arange(X_train.shape[0]),
        replace=True,
        n_samples=int(sample_fraction * X_train.shape[0]),
        random_state=i
    )
    X_bs = X_train.values[idx]
    y_bs = y_train.values[idx]

    # b) fit L1‐penalized logistic with more iterations and looser tol
    lr = LogisticRegression(
        penalty='l1',
        solver='saga',
        C=C,
        max_iter=max_iter,
        tol=tol,
        random_state=0
    )
    lr.fit(X_bs, y_bs)

    # c) tally nonzero coefficients
    nonzero = np.abs(lr.coef_)[0] > 1e-8
    selection_counts += nonzero.astype(int)

# Compute frequencies and pick top 5 stable features
selection_freq = selection_counts / n_bootstraps
top5_idx       = np.argsort(-selection_freq)[:5]
feat_rl        = X_train.columns[top5_idx]

# Retrain on the stable features
final_model = LogisticRegression(max_iter=5000).fit(X_train[feat_rl], y_train)
accuracy    = accuracy_score(y_test, final_model.predict(X_test[feat_rl]))

print("Stability features:", list(feat_rl))
print("Accuracy (Stability):", accuracy)

Stability features: ['mean radius', 'mean texture', 'mean perimeter', 'mean area', 'mean concavity']
Accuracy (Stability): 0.935672514619883


In [5]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(max_depth=3, random_state=0)
dt.fit(X_train, y_train)

import numpy as np
idx_dt = np.argsort(dt.feature_importances_)[-5:]
feat_dt = X_train.columns[idx_dt]

model = LogisticRegression(max_iter=5000).fit(X_train[feat_dt], y_train)
print("DT features:", list(feat_dt))
print("Accuracy (DT):", accuracy_score(y_test, model.predict(X_test[feat_dt])))


DT features: ['worst perimeter', 'worst area', 'worst radius', 'worst texture', 'mean concave points']
Accuracy (DT): 0.9707602339181286
