In [1]:
# Forward Selection
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, roc_auc_score,
    classification_report, roc_curve
)

In [2]:
# Load data
X, y = load_breast_cancer(return_X_y=True)
X = pd.DataFrame(X)

In [8]:
# Split dataset into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=30)

In [9]:
# Create logistic regression model
model = LogisticRegression(max_iter=5000)

In [10]:
# Forward Selection: add features one by one that improve model performance
sfs_forward = SequentialFeatureSelector(
    model, n_features_to_select=5, direction='forward'
)
sfs_forward.fit(X_train, y_train)

selected_features = X.columns[sfs_forward.get_support()]
print("Forward Selection Chosen Features:", list(selected_features))

Forward Selection Chosen Features: [0, 1, 2, 21, 22]


In [11]:
# Train model with selected features and evaluate
model.fit(X_train[selected_features], y_train)
y_pred = model.predict(X_test[selected_features])
acc = accuracy_score(y_test, y_pred)

print(f"Accuracy with Forward Selection (5 features): {acc:.4f}")


Accuracy with Forward Selection (5 features): 0.9371


## Analysis:
- Forward selection generally starts from an empty set and adds features that best improve
- model performance incrementally. This can help identify a small subset of highly predictive features.
- You may observe slightly improved or comparable accuracy to using all features with fewer variables,indicating a good feature subset.

**bold text**## 1. Automated Feature‐Count Tuning
#### Adapt your script to automatically select the optimal n_features_to_select from a range (e.g., 1–30) by choosing the value that maximizes test accuracy.

- Loop over possible feature counts.

- Record test accuracy for each.

- Print the best feature count and its accuracy.



In [12]:
best_k, best_acc = None, 0
for k in range(1, 5):
    sfs = SequentialFeatureSelector(
        model, n_features_to_select=k, direction='forward'
    ).fit(X_train, y_train)
    feats = X_train.columns[sfs.get_support()]
    model.fit(X_train[feats], y_train)
    acc = accuracy_score(y_test, model.predict(X_test[feats]))
    if acc > best_acc:
        best_acc, best_k = acc, k

print(f"Optimal k: {best_k} features → Accuracy: {best_acc:.4f}")


Optimal k: 2 features → Accuracy: 0.9441


## 2. Stratified K-Fold Integration
#### Replace the single train/test split with Stratified K-Fold during feature selection:

- Use StratifiedKFold(n_splits=5) within SequentialFeatureSelector.

- Report mean and standard deviation of accuracy across folds for your final feature set.

In [13]:
from sklearn.model_selection import StratifiedKFold, cross_val_score

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
sfs = SequentialFeatureSelector(
    model,
    n_features_to_select=5,
    direction='forward',
    cv=kf,
    scoring='accuracy'
).fit(X, y)

feats = X.columns[sfs.get_support()]
scores = cross_val_score(model.fit(X[feats], y), X[feats], y, cv=kf)
print(f"Selected: {list(feats)}")
print(f"CV Accuracy: {scores.mean():.4f} ± {scores.std():.4f}")


Selected: [3, 4, 20, 21, 26]
CV Accuracy: 0.9543 ± 0.0195


## 3. Custom Scoring Function
#### Experiment with a different scoring metric in forward selection (e.g., F1-score or balanced accuracy) to handle class imbalance:

- Pass scoring='f1' or scoring='balanced_accuracy' to SequentialFeatureSelector.

- Compare the selected feature lists and test performance under each metric.

In [14]:
# F1-score based forward selection
sfs_f1 = SequentialFeatureSelector(
    model, n_features_to_select=5,
    direction='forward',
    scoring='f1'
).fit(X_train, y_train)

feats_f1 = X_train.columns[sfs_f1.get_support()]
model.fit(X_train[feats_f1], y_train)
print("F1-based features:", list(feats_f1))
print("Test F1:", precision_score(y_test, model.predict(X_test[feats_f1]), average='binary'))


F1-based features: [0, 1, 2, 21, 22]
Test F1: 0.9361702127659575


## 4. Application to a New Dataset
#### Apply your forward‐selection pipeline unchanged to a different classification dataset (e.g., the Iris or Wine dataset):

- Load a new dataset from sklearn.datasets.

- Compare which features are selected and the resulting model accuracy.

- Each exercise reuses your original code structure and deepens your grasp of forward wrapper selection through tuning, validation, stability, and computational considerations.

In [18]:
from sklearn.datasets import load_iris

X2, y2 = load_iris(return_X_y=True, as_frame=True)
model = LogisticRegression(max_iter=5000)
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, random_state=20)

sfs_iris = SequentialFeatureSelector(
    model, n_features_to_select=3, direction='forward'
).fit(X2_train, y2_train)

feats_iris = X2.columns[sfs_iris.get_support()]
model.fit(X2_train[feats_iris], y2_train)
acc2 = accuracy_score(y2_test, model.predict(X2_test[feats_iris]))
print("Iris selected features:", list(feats_iris))
print(f"Iris accuracy: {acc2:.4f}")


Iris selected features: ['sepal length (cm)', 'sepal width (cm)', 'petal width (cm)']
Iris accuracy: 0.9211
