<a href="https://colab.research.google.com/github/appliedcode/mthree-c422/blob/mthree-c422-Likhitha/Backward_Selection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer, load_wine
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import (
    accuracy_score, balanced_accuracy_score, classification_report
)
import warnings

# Suppress future warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# Load breast cancer dataset
X, y = load_breast_cancer(return_X_y=True)
X = pd.DataFrame(X)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Logistic regression model
model = LogisticRegression(max_iter=5000, solver='liblinear')

# 1. Backward selection (fixed 5 features)
sfs_backward = SequentialFeatureSelector(
    model, n_features_to_select=5, direction='backward'
)
sfs_backward.fit(X_train, y_train)
selected_features = X.columns[sfs_backward.get_support()]
model.fit(X_train[selected_features], y_train)
y_pred = model.predict(X_test[selected_features])
acc = accuracy_score(y_test, y_pred)
print("1️⃣ Backward Selection (5 features):")
print("Selected Features:", list(selected_features))
print(f"Test Accuracy: {acc:.4f}\n")

# 2. Auto-tune best number of features (FIXED loop condition)
best_k, best_acc = None, 0
for k in range(1, X.shape[1]):  # Must be < total features
    sfs = SequentialFeatureSelector(
        model, n_features_to_select=k, direction='backward'
    ).fit(X_train, y_train)
    feats = X_train.columns[sfs.get_support()]
    model.fit(X_train[feats], y_train)
    acc = accuracy_score(y_test, model.predict(X_test[feats]))
    if acc > best_acc:
        best_k, best_acc = k, acc
print("2️⃣ Auto-Tuned Feature Count:")
print(f"Best feature count: {best_k} → Accuracy: {best_acc:.4f}\n")

# 3. Stratified K-Fold with backward selection
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
sfs_cv = SequentialFeatureSelector(
    model, n_features_to_select=5, direction='backward',
    scoring='accuracy', cv=kf
).fit(X, y)
feats_cv = X.columns[sfs_cv.get_support()]
cv_scores = cross_val_score(model, X[feats_cv], y, cv=kf)
print("3️⃣ Stratified K-Fold Selection:")
print("Selected Features:", list(feats_cv))
print(f"CV Accuracy: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}\n")

# 4. Custom scoring: balanced accuracy
sfs_bal = SequentialFeatureSelector(
    model, n_features_to_select=5, direction='backward',
    scoring='balanced_accuracy'
).fit(X_train, y_train)
feats_bal = X_train.columns[sfs_bal.get_support()]
model.fit(X_train[feats_bal], y_train)
y_bal_pred = model.predict(X_test[feats_bal])
bal_acc = balanced_accuracy_score(y_test, y_bal_pred)
print("4️⃣ Balanced Accuracy Scoring:")
print("Selected Features:", list(feats_bal))
print(f"Balanced Accuracy on Test Set: {bal_acc:.4f}\n")

# 5. Apply to Wine dataset
def apply_to_wine():
    print("5️⃣ Wine Dataset Evaluation:")
    X_wine, y_wine = load_wine(return_X_y=True, as_frame=True)
    X_tr, X_ts, y_tr, y_ts = train_test_split(X_wine, y_wine, random_state=42)
    sfs_wine = SequentialFeatureSelector(
        model, n_features_to_select=5, direction='backward'
    ).fit(X_tr, y_tr)
    feats = X_tr.columns[sfs_wine.get_support()]
    model.fit(X_tr[feats], y_tr)
    y_wine_pred = model.predict(X_ts[feats])
    acc = accuracy_score(y_ts, y_wine_pred)
    print("Selected Features:", list(feats))
    print(f"Wine Dataset Accuracy: {acc:.4f}")

apply_to_wine()


1️⃣ Backward Selection (5 features):
Selected Features: [12, 21, 22, 23, 26]
Test Accuracy: 0.9650

2️⃣ Auto-Tuned Feature Count:
Best feature count: 4 → Accuracy: 0.9720

3️⃣ Stratified K-Fold Selection:
Selected Features: [0, 3, 11, 22, 26]
CV Accuracy: 0.9508 ± 0.0162

4️⃣ Balanced Accuracy Scoring:
Selected Features: [0, 20, 21, 22, 23]
Balanced Accuracy on Test Set: 0.9481

5️⃣ Wine Dataset Evaluation:
Selected Features: ['alcalinity_of_ash', 'flavanoids', 'color_intensity', 'hue', 'proline']
Wine Dataset Accuracy: 0.9333
