# LAB 6 : Feature selection methods
All the different approaches to feature selection can be grouped into mainly two families of methods. There are unsupervised and supervised methods. The latter can be further divided into the wrapper and filter mothods. Let’s discuss them one by one.

In this lab, your task is to carefully review all instructions and fill the empty code cells with the necessary code to ensure everything functions correctly.


## Import necessary libraries

Requirement:
- pip install mlxtend
- pip install geneticalgorithm
- pip install sklearn-genetic-opt

In [7]:
!pip install sklearn-genetic-opt

Collecting sklearn-genetic-opt
  Using cached sklearn_genetic_opt-0.10.1-py3-none-any.whl (33 kB)
Collecting deap>=1.3.3 (from sklearn-genetic-opt)
  Using cached deap-1.4.1.tar.gz (1.1 MB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'error'


  ERROR: Error [WinError 225] Impossible de terminer l’opération, car le fichier contient un virus ou un logiciel potentiellement indésirable while executing command python setup.py egg_info
ERROR: Could not install packages due to an OSError: [WinError 225] Impossible de terminer l’opération, car le fichier contient un virus ou un logiciel potentiellement indésirable



In [15]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import VarianceThreshold, SelectKBest, chi2, mutual_info_classif, f_classif
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score
from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn_genetic import GAFeatureSelectionCV

## Load the Iris dataset

In [16]:
iris = load_iris()
X = pd.DataFrame(iris.data, columns=iris.feature_names)
y = pd.Series(iris.target, name='target')

## Split the dataset into training and testing sets

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Function to evaluate the performance of a model

In [18]:
def evaluate_model(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, X_test)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy

## Unsupervised feature selection methods


Feature selection using variance threshold

In [19]:
def variance_threshold_selector(X, threshold=0.0):
    selector = VarianceThreshold(threshold)
    selector.fit(X)
    return X[X.columns[selector.get_support(indices=True)]]

In [20]:
X_var = variance_threshold_selector(X, threshold=0.1)

Feature selection using correlation

In [21]:
def correlation_selector(X, threshold=0.8):
#     calculate corr matrix 
    corr_matrix = X.corr().abs()
#     create a mask to select the upper triangle of the corr matrix 
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
#     find the columns to drop based on the corr threshold 
    to_drop = [column for column in upper.columns if any(upper[column] > threshold)]

    return X.drop(to_drop, axis=1)

In [68]:
X_corr = correlation_selector(X, threshold=0.8)
X_corr

Unnamed: 0,sepal length (cm),sepal width (cm)
0,5.1,3.5
1,4.9,3.0
2,4.7,3.2
3,4.6,3.1
4,5.0,3.6
...,...,...
145,6.7,3.0
146,6.3,2.5
147,6.5,3.0
148,6.2,3.4


# supervised feature selection methods

Feature selection using mutual information

In [53]:
# Feature selection using mutual information

def mutual_info_selector(X, y, top_k=4):
    selector = SelectKBest(mutual_info_classif, k=top_k)
    selector.fit(X, y)
    return X[X.columns[selector.get_support(indices=True)]]


In [52]:
# Feature selection using mutual information
X_mi = mutual_info_selector(X,y, top_k=2)
X_mi

Unnamed: 0,petal length (cm),petal width (cm)
0,1.4,0.2
1,1.4,0.2
2,1.3,0.2
3,1.5,0.2
4,1.4,0.2
...,...,...
145,5.2,2.3
146,5.0,1.9
147,5.2,2.0
148,5.4,2.3


Feature selection using filter methods: information gain

In [54]:
def information_gain_selector(X, y, top_k=4): 
    selector = SelectKBest(f_classif, k=top_k)
    selector.fit(X, y)
    return X[X.columns[selector.get_support(indices=True)]]

In [56]:
# Feature selection using information gain
X_ig = information_gain_selector(X, y, top_k=2)
X_ig

Unnamed: 0,petal length (cm),petal width (cm)
0,1.4,0.2
1,1.4,0.2
2,1.3,0.2
3,1.5,0.2
4,1.4,0.2
...,...,...
145,5.2,2.3
146,5.0,1.9
147,5.2,2.0
148,5.4,2.3


Feature selection using filter methods: chi-squared test

In [27]:
def chi_squared_selector(X, y, top_k=4):
    selector = SelectKBest(chi2, k=top_k)
    selector.fit(X, y)
    return X[X.columns[selector.get_support(indices=True)]]

In [57]:
# Feature selection using chi-squared test
X_chi2 = chi_squared_selector(X, y, top_k=2)
X_chi2

Unnamed: 0,petal length (cm),petal width (cm)
0,1.4,0.2
1,1.4,0.2
2,1.3,0.2
3,1.5,0.2
4,1.4,0.2
...,...,...
145,5.2,2.3
146,5.0,1.9
147,5.2,2.0
148,5.4,2.3


In [29]:
# Create a random forest classifier (you can replace this with any other classifier of your choice)
model = RandomForestClassifier(n_estimators=100, random_state=42)

Feature selection using wrapper methods: forward selection

In [59]:
# Feature selection using wrapper methods: forward selection
def forward_selection(X, y, model):
    sfs = SequentialFeatureSelector(model, k_features='best', forward=True, scoring='accuracy', cv=5)
    sfs.fit(X, y)

    return X[X.columns[list(sfs.k_feature_idx_)]]

In [60]:
# Feature selection using forward selection
X_forward = forward_selection(X, y, model)
X_forward

Unnamed: 0,petal length (cm),petal width (cm)
0,1.4,0.2
1,1.4,0.2
2,1.3,0.2
3,1.5,0.2
4,1.4,0.2
...,...,...
145,5.2,2.3
146,5.0,1.9
147,5.2,2.0
148,5.4,2.3


Feature selection using wrapper methods: backward elimination

In [66]:
def backward_elimination(X, y, model):
    sfs = SequentialFeatureSelector(model, k_features='best', forward=False, scoring='accuracy', cv=5)
    sfs.fit(X, y)
    return X[X.columns[list(sfs.k_feature_idx_)]]

In [69]:
# Feature selection using backward elimination
X_backward = backward_elimination(X, y,model)
X_backward

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


Feature selection using wrapper methods: genetic algorithm

In [63]:
def genetic_algorithm_selector(X,y,model):

    selector = GAFeatureSelectionCV(model,
                                         cv=5,
                                         scoring='neg_root_mean_squared_error',
                                         population_size=10,
                                         generations=5,
                                         tournament_size=5,
                                         elitism=True,
                                         crossover_probability=0.9,
                                         mutation_probability=0.1,
                                         criteria='max',
                                         algorithm='eaMuPlusLambda',
                                         n_jobs=1,
                                         verbose=True,
                                         keep_top_k=4)

    selector.fit(X,y)
    return X[X.columns[selector.support_]]


In [65]:
X_genetic = genetic_algorithm_selector(X,y,model)
X_genetic

gen	nevals	fitness  	fitness_std	fitness_max	fitness_min
0  	10    	-0.330761	0.276047   	-0.161184  	-1.03342   
1  	20    	-0.163855	0.00267041 	-0.161184  	-0.166525  
2  	20    	-0.161718	0.00160225 	-0.161184  	-0.166525  
3  	20    	-0.161184	2.77556e-17	-0.161184  	-0.161184  
4  	20    	-0.161184	2.77556e-17	-0.161184  	-0.161184  
5  	20    	-0.161184	2.77556e-17	-0.161184  	-0.161184  


Unnamed: 0,petal length (cm),petal width (cm)
0,1.4,0.2
1,1.4,0.2
2,1.3,0.2
3,1.5,0.2
4,1.4,0.2
...,...,...
145,5.2,2.3
146,5.0,1.9
147,5.2,2.0
148,5.4,2.3


## Evaluate the performance of the model after feature selection

In [72]:
selected_features = [X_var,X_corr,X_mi,X_ig,X_chi2,X_forward,X_backward,X_genetic]
selected_feature_names = ['Variance Threshold', 'Correlation', 'Mutual Information', 'Information Gain',
                           'Chi-squared Test', 'Forward Selection', 'Backward Elimination', 'Genetic Algorithm']

results = []

for name, X_selected in zip(selected_feature_names, selected_features):
    X_test_selected = X_test[X_test.X_selected.columns]  # Apply the same feature selection to the test set
    accuracy = evaluate_model(model, X_selected, X_test_selected, y, y_test)
    results.append({'Method': name, 'Accuracy': accuracy})


AttributeError: 'DataFrame' object has no attribute 'X_selected'

In [None]:
# Display the results
results_df = pd.DataFrame(results)
print(results_df)

In [None]:
# Plot the results
plt.figure(figsize=(10, 6))
sns.barplot(x='Accuracy', y='Method', data=results_df.sort_values(by='Accuracy', ascending=False))
plt.title('Accuracy after Feature Selection')
plt.show()