### Forward_Selection

In [1]:
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, roc_auc_score,
    classification_report, roc_curve
)

In [2]:
X, y = load_breast_cancer(return_X_y=True)
X = pd.DataFrame(X)

In [3]:
# Split dataset into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [10]:
model = LogisticRegression(max_iter=5000)
# model = LogisticRegression(max_iter=3999)-> [no change in accuracy when changed to 3999]

In [14]:
# Forward Selection: add features one by one that improve model performance
sfs_forward = SequentialFeatureSelector(
    # model, n_features_to_select=5, direction='forward'
    model, n_features_to_select=7, direction='forward'
)
sfs_forward.fit(X_train, y_train)

selected_features = X.columns[sfs_forward.get_support()]
print("Forward Selection Chosen Features:", list(selected_features))

Forward Selection Chosen Features: [0, 2, 4, 9, 21, 22, 25]


In [16]:
# Train model with selected features and evaluate
model.fit(X_train[selected_features], y_train)
y_pred = model.predict(X_test[selected_features])
acc = accuracy_score(y_test, y_pred)

print(f"Accuracy with Forward Selection (7 features): {acc:.4f}")

Accuracy with Forward Selection (7 features): 0.9720


#### Analysis

In [17]:
best_k, best_acc = None, 0
for k in range(1, 5):
    sfs = SequentialFeatureSelector(
        model, n_features_to_select=k, direction='forward'
    ).fit(X_train, y_train)
    feats = X_train.columns[sfs.get_support()]
    model.fit(X_train[feats], y_train)
    acc = accuracy_score(y_test, model.predict(X_test[feats]))
    if acc > best_acc:
        best_acc, best_k = acc, k

print(f"Optimal k: {best_k} features → Accuracy: {best_acc:.4f}")

Optimal k: 2 features → Accuracy: 0.9720


### Stratified K-Fold Integration

In [18]:
from sklearn.model_selection import StratifiedKFold, cross_val_score

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
sfs = SequentialFeatureSelector(
    model,
    n_features_to_select=5,
    direction='forward',
    cv=kf,
    scoring='accuracy'
).fit(X, y)

feats = X.columns[sfs.get_support()]
scores = cross_val_score(model.fit(X[feats], y), X[feats], y, cv=kf)
print(f"Selected: {list(feats)}")
print(f"CV Accuracy: {scores.mean():.4f} ± {scores.std():.4f}")

Selected: [3, 4, 20, 21, 26]
CV Accuracy: 0.9543 ± 0.0195


### Custom Scoring Function

In [19]:
# F1-score based forward selection
sfs_f1 = SequentialFeatureSelector(
    model, n_features_to_select=5,
    direction='forward',
    scoring='f1'
).fit(X_train, y_train)

feats_f1 = X_train.columns[sfs_f1.get_support()]
model.fit(X_train[feats_f1], y_train)
print("F1-based features:", list(feats_f1))
print("Test F1:", precision_score(y_test, model.predict(X_test[feats_f1]), average='binary'))

F1-based features: [4, 5, 20, 21, 26]
Test F1: 0.967032967032967


In [21]:
from sklearn.metrics import balanced_accuracy_score

sfs_bal = SequentialFeatureSelector(
    model, n_features_to_select=5,
    direction='forward',
    scoring='balanced_accuracy'
).fit(X_train, y_train)

feats_bal = X_train.columns[sfs_bal.get_support()]
model.fit(X_train[feats_bal], y_train)

print("Balanced Accuracy-based features:", list(feats_bal))
print("Test Balanced Accuracy:", balanced_accuracy_score(
    y_test, model.predict(X_test[feats_bal])
))


Balanced Accuracy-based features: [0, 21, 22, 23, 25]
Test Balanced Accuracy: 0.9480857261756138


### New Dataset

In [24]:
from sklearn.datasets import load_iris

X2, y2 = load_iris(return_X_y=True, as_frame=True)
model = LogisticRegression(max_iter=5000)
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, random_state=42)

sfs_iris = SequentialFeatureSelector(
    model, n_features_to_select=2, direction='forward'
).fit(X2_train, y2_train)

feats_iris = X2.columns[sfs_iris.get_support()]
model.fit(X2_train[feats_iris], y2_train)
acc2 = accuracy_score(y2_test, model.predict(X2_test[feats_iris]))
print("Iris selected features:", list(feats_iris))
print(f"Iris accuracy: {acc2:.4f}")

Iris selected features: ['petal length (cm)', 'petal width (cm)']
Iris accuracy: 1.0000
