## Feature Selection - Wrapper Methode

### 1) Forward Feature Selection

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_breast_cancer

In [5]:
data = load_breast_cancer()

In [6]:
X = data.data
y = data.target

In [11]:
# 569 sample, 30 feature
X.shape

(569, 30)

In [9]:
# Initialize an empty list to store selected feature indices
selected_features = []

In [10]:
# Define the machine learning model (in this case, a Random Forest Classifier)
model = RandomForestClassifier()

In [12]:
# Define the number of features you want to select
num_features_to_select = 4

In [18]:
while len(selected_features) < num_features_to_select:
    best_score = -1
    best_feature = None

    for feature_idx in range(X.shape[1]):
        if feature_idx in selected_features:
            continue

        # Try adding the feature to the selected set
        candidate_features = selected_features + [feature_idx]

        # Evaluate the model's performance using cross-validation
        scores = cross_val_score(model, X[:, candidate_features], y, cv=5, scoring='accuracy')
        mean_score = np.mean(scores)

        # Keep track of the best-performing feature
        if mean_score > best_score:
            best_score = mean_score
            best_feature = feature_idx

    if best_feature is not None:
        selected_features.append(best_feature)
        print(f"Selected Feature {len(selected_features)}: {best_feature}, Mean Accuracy: {best_score:.4f}")

print("Selected feature indices:", selected_features)

Selected Feature 1: 22, Mean Accuracy: 0.8822
Selected Feature 2: 24, Mean Accuracy: 0.9542
Selected Feature 3: 1, Mean Accuracy: 0.9578
Selected Feature 4: 26, Mean Accuracy: 0.9666
Selected feature indices: [22, 24, 1, 26]


### Backward Elimination

Start with all available features.In each iteration, remove the least important feature and evaluate the model.

In [25]:
model = RandomForestClassifier()

In [23]:
# Initialize a list with all feature indices
all_features = list(range(X.shape[1]))

In [24]:
# Define the minimum number of features you want to retain
min_features_to_retain = 6

In [27]:
while len(all_features) > min_features_to_retain:
    worst_score = 1.0  # Initialize with a high value
    worst_feature = None

    for feature_idx in all_features:
        # Create a list of features without the current one
        candidate_features = [f for f in all_features if f != feature_idx]

        # Evaluate the model's performance using cross-validation
        scores = cross_val_score(model, X[:, candidate_features], y, cv=5, scoring='accuracy')
        mean_score = np.mean(scores)

        # Keep track of the worst-performing feature
        if mean_score < worst_score:
            worst_score = mean_score
            worst_feature = feature_idx

    if worst_feature is not None:
        all_features.remove(worst_feature)
        print(f"Removed Feature: {worst_feature}, Mean Accuracy: {worst_score:.4f}")

print("Remaining feature indices:", all_features) 
        
        

Removed Feature: 7, Mean Accuracy: 0.9543
Removed Feature: 21, Mean Accuracy: 0.9543
Removed Feature: 1, Mean Accuracy: 0.9473
Removed Feature: 28, Mean Accuracy: 0.9420
Removed Feature: 19, Mean Accuracy: 0.9455
Removed Feature: 13, Mean Accuracy: 0.9438
Removed Feature: 11, Mean Accuracy: 0.9368
Removed Feature: 5, Mean Accuracy: 0.9420
Removed Feature: 24, Mean Accuracy: 0.9438
Removed Feature: 20, Mean Accuracy: 0.9385
Removed Feature: 29, Mean Accuracy: 0.9368
Removed Feature: 6, Mean Accuracy: 0.9368


KeyboardInterrupt: 