<a href="https://colab.research.google.com/github/appliedcode/mthree-c422/blob/mthree-c422-Likhitha/Exhaustive_Selection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
from sklearn.datasets import load_breast_cancer, load_iris
from sklearn.linear_model import LogisticRegression
from mlxtend.feature_selection import ExhaustiveFeatureSelector
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score
import warnings

# Suppress warnings for cleaner output
warnings.filterwarnings("ignore")

In [3]:
def run_exhaustive_selector(X, y, model, min_k, max_k, scoring='accuracy', cv=5):
    efs = ExhaustiveFeatureSelector(
        model,
        min_features=min_k,
        max_features=max_k,
        scoring=scoring,
        cv=cv,
        print_progress=True,
        n_jobs=-1
    )
    efs.fit(X, y)
    best_idx = list(efs.best_idx_)
    best_score = efs.best_score_
    best_features = X.columns[best_idx]
    return best_features, best_score

In [4]:
def auto_feature_count_selection():
    print("\n--- Automated Feature Count Tuning (Breast Cancer Dataset) ---")
    X, y = load_breast_cancer(return_X_y=True, as_frame=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)
    model = LogisticRegression(max_iter=5000)

    best_features, best_score = None, 0
    best_k = 0
    for k in range(1, 4):  # You can increase this range for more thorough search
        print(f"\nEvaluating feature subsets of size {k}")
        feats, score = run_exhaustive_selector(X_train, y_train, model, k, k)
        if score > best_score:
            best_features, best_score, best_k = feats, score, k

    print(f"\nBest Feature Count: {best_k}")
    print(f"Best Features: {list(best_features)}")

    model.fit(X_train[best_features], y_train)
    y_pred = model.predict(X_test[best_features])
    acc = accuracy_score(y_test, y_pred)
    print(f"Test Accuracy: {acc:.4f}")

In [5]:
def stratified_cv_selection():
    print("\n--- Stratified K-Fold Integration (Breast Cancer Dataset) ---")
    X, y = load_breast_cancer(return_X_y=True, as_frame=True)
    model = LogisticRegression(max_iter=5000)
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    best_features, best_score = run_exhaustive_selector(X, y, model, 1, 2, cv=kf)
    print("Selected Features (Stratified CV):", list(best_features))
    print(f"Mean CV Accuracy: {best_score:.4f}")

In [6]:
def custom_scoring_selection(scoring_metric='balanced_accuracy'):
    print(f"\n--- Custom Scoring ({scoring_metric}) ---")
    X, y = load_breast_cancer(return_X_y=True, as_frame=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)
    model = LogisticRegression(max_iter=5000)

    best_features, _ = run_exhaustive_selector(X_train, y_train, model, 1, 2, scoring=scoring_metric)
    print("Selected Features:", list(best_features))

    model.fit(X_train[best_features], y_train)
    y_pred = model.predict(X_test[best_features])
    acc = accuracy_score(y_test, y_pred)
    print(f"Test Accuracy: {acc:.4f}")


In [7]:
def apply_to_iris():
    print("\n--- Applying Exhaustive Selection to Iris Dataset ---")
    X, y = load_iris(return_X_y=True, as_frame=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

    # Use default multi-class handling (no need to override)
    model = LogisticRegression(max_iter=5000)

    best_features, _ = run_exhaustive_selector(X_train, y_train, model, 1, 2)
    print("Selected Features (Iris):", list(best_features))

    model.fit(X_train[best_features], y_train)
    y_pred = model.predict(X_test[best_features])
    acc = accuracy_score(y_test, y_pred)
    print(f"Iris Test Accuracy: {acc:.4f}")

In [8]:
if __name__ == "__main__":
    auto_feature_count_selection()
    stratified_cv_selection()
    custom_scoring_selection(scoring_metric='f1_weighted')
    custom_scoring_selection(scoring_metric='balanced_accuracy')
    apply_to_iris()


--- Automated Feature Count Tuning (Breast Cancer Dataset) ---

Evaluating feature subsets of size 1


Features: 30/30


Evaluating feature subsets of size 2


Features: 435/435


Evaluating feature subsets of size 3


Features: 4060/4060


Best Feature Count: 3
Best Features: ['mean radius', 'worst texture', 'worst perimeter']
Test Accuracy: 0.9580

--- Stratified K-Fold Integration (Breast Cancer Dataset) ---


Features: 465/465

Selected Features (Stratified CV): ['worst texture', 'worst perimeter']
Mean CV Accuracy: 0.9456

--- Custom Scoring (f1_weighted) ---


Features: 465/465

Selected Features: ['worst texture', 'worst perimeter']
Test Accuracy: 0.9580

--- Custom Scoring (balanced_accuracy) ---


Features: 465/465

Selected Features: ['worst texture', 'worst perimeter']
Test Accuracy: 0.9580

--- Applying Exhaustive Selection to Iris Dataset ---


Features: 10/10

Selected Features (Iris): ['sepal length (cm)', 'petal length (cm)']
Iris Test Accuracy: 0.9211
