In [None]:
# Forward Selection
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


In [None]:
# Load the breast-cancer dataset as pandas objects.
# as_frame=True returns X as a DataFrame whose columns carry the dataset's real
# feature names and y as a Series, so the features reported by the selector
# further down are readable names rather than anonymous integer positions
# (which is what wrapping the bare ndarray in pd.DataFrame produced).
X, y = load_breast_cancer(return_X_y=True, as_frame=True)

In [None]:
# Hold out a test set; the fixed seed keeps the partition reproducible.
# No test_size is passed, so scikit-learn's default split proportion applies.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

In [None]:
# Base estimator used by both the feature selector and the final evaluation.
# The iteration cap is set explicitly to 5000 -- presumably so the solver can
# converge on these unscaled features (TODO confirm).
model = LogisticRegression(
    max_iter=5000,
)

In [None]:
# Forward selection: begin with no features and add them one at a time until
# five have been chosen.
sfs_forward = SequentialFeatureSelector(
    estimator=model,
    direction='forward',
    n_features_to_select=5,
)
sfs_forward.fit(X_train, y_train)

# get_support() gives a boolean mask over the input columns; index the column
# labels with it to recover which columns were kept.
support_mask = sfs_forward.get_support()
selected_features = X_train.columns[support_mask]
print("Forward Selection Chosen Features:", list(selected_features))

In [None]:
# Refit the base model on the training rows restricted to the selected columns,
# then score it on the held-out test rows.
model.fit(X_train[selected_features], y_train)
y_pred = model.predict(X_test[selected_features])
acc = accuracy_score(y_test, y_pred)

# Report the actual number of selected features instead of a hard-coded "5", so
# the message stays truthful if n_features_to_select is changed in the selector.
n_selected = len(selected_features)
print(f"Accuracy with Forward Selection ({n_selected} features): {acc:.4f}")


# Analysis:
## Forward selection starts from an empty feature set and, at each step, adds the single feature
## that most improves model performance. This can help identify a small subset of highly predictive features.
## With far fewer variables, you may observe accuracy that is comparable to, or slightly better than,
## a model trained on all features, which indicates a good feature subset. To confirm this, compare
## against a baseline model fitted on the full feature set using the same train/test split.