### Backward selection

In [1]:
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, roc_auc_score,
    classification_report, roc_curve
)

In [2]:
# Load data
X, y = load_breast_cancer(return_X_y=True)
X = pd.DataFrame(X)

In [3]:
# Split dataset into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [4]:
# Create logistic regression model
model = LogisticRegression(max_iter=5000)

In [5]:
# Backward Selection: start with all features and remove the least significant one by one
sfs_backward = SequentialFeatureSelector(
    model, n_features_to_select=5, direction='backward'
)
sfs_backward.fit(X_train, y_train)

selected_features = X.columns[sfs_backward.get_support()]
print("Backward Selection Chosen Features:", list(selected_features))

Backward Selection Chosen Features: [0, 2, 3, 21, 23]


### Automated Feature‐Count Tuning

In [7]:
best_k, best_acc = None, 0
for k in range(1, 5):
    sfs = SequentialFeatureSelector(
        model, n_features_to_select=k, direction='backward'
    ).fit(X_train, y_train)
    feats = X_train.columns[sfs.get_support()]
    model.fit(X_train[feats], y_train)
    acc = accuracy_score(y_test, model.predict(X_test[feats]))
    if acc > best_acc:
        best_k, best_acc = k, acc

print(f"Best feature count: {best_k} → Accuracy: {best_acc:.4f}")

Best feature count: 4 → Accuracy: 0.9720


### Stratified K-Fold Integration

In [8]:
from sklearn.model_selection import StratifiedKFold, cross_val_score

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
sfs = SequentialFeatureSelector(
    model,
    n_features_to_select=5,
    direction='backward',
    scoring='accuracy',
    cv=kf
).fit(X, y)

feats = X.columns[sfs.get_support()]
scores = cross_val_score(model, X[feats], y, cv=kf)
print("Selected Features:", list(feats))
print(f"CV Accuracy: {scores.mean():.4f} ± {scores.std():.4f}")

Selected Features: [0, 21, 22, 25, 26]
CV Accuracy: 0.9561 ± 0.0192


### Custom Scoring Function

In [9]:
sfs_bal = SequentialFeatureSelector(
    model,
    n_features_to_select=5,
    direction='backward',
    scoring='balanced_accuracy'
)
sfs_bal.fit(X_train, y_train)
feats_bal = X_train.columns[sfs_bal.get_support()]
model.fit(X_train[feats_bal], y_train)
print("Balanced Accuracy Features:", list(feats_bal))

from sklearn.metrics import balanced_accuracy_score
bal_acc = balanced_accuracy_score(y_test, model.predict(X_test[feats_bal]))
print(f"Balanced Accuracy on Test Set: {bal_acc:.4f}")

Balanced Accuracy Features: [0, 2, 3, 21, 23]
Balanced Accuracy on Test Set: 0.9666


### New Dataset

In [10]:
from sklearn.datasets import load_wine

X_wine, y_wine = load_wine(return_X_y=True, as_frame=True)
X_tr, X_ts, y_tr, y_ts = train_test_split(X_wine, y_wine, random_state=42)

sfs_wine = SequentialFeatureSelector(
    model, n_features_to_select=5, direction='backward'
).fit(X_tr, y_tr)

feats = X_tr.columns[sfs_wine.get_support()]
model.fit(X_tr[feats], y_tr)
acc = accuracy_score(y_ts, model.predict(X_ts[feats]))

print(f"Backward Features (Wine): {list(feats)}")
print(f"Wine Dataset Accuracy: {acc:.4f}")

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Backward Features (Wine): ['alcohol', 'malic_acid', 'flavanoids', 'od280/od315_of_diluted_wines', 'proline']
Wine Dataset Accuracy: 0.9778
