# LAB 6 : Feature selection methods
Feature selection approaches can be grouped into two main families: unsupervised and supervised methods. Supervised methods can be further divided into wrapper and filter methods. Let’s discuss them one by one.

In this lab, your task is to carefully review all instructions and fill the empty code cells with the necessary code to ensure everything functions correctly.


## Import necessary libraries

Requirement:
- pip install mlxtend
- pip install geneticalgorithm
- pip install sklearn-genetic-opt

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import VarianceThreshold, SelectKBest, chi2, mutual_info_classif, f_classif
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score
from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn_genetic import GAFeatureSelectionCV

## Load the Iris dataset

In [None]:
# Load the Iris bunch, then wrap the labels and the feature matrix in pandas
# containers so later cells can select features by column name.
iris = load_iris()
y = pd.Series(data=iris.target, name='target')
X = pd.DataFrame(data=iris.data, columns=iris.feature_names)

## Split the dataset into training and testing sets

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Function to evaluate the performance of a model

In [None]:
def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Fit *model* on the training data and return its accuracy on the test data.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train: Training features.
        X_test: Test features (same columns as ``X_train``).
        y_train: Training labels.
        y_test: True labels for ``X_test``.

    Returns:
        The value returned by ``accuracy_score(y_test, y_pred)``.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # accuracy_score expects the ground truth first, predictions second.
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy

## Unsupervised feature selection methods


Feature selection using variance threshold

In [None]:
def variance_threshold_selector(X, threshold=0.0):
    """Keep only the columns of *X* retained by a ``VarianceThreshold`` filter.

    This is an unsupervised filter: the labels are never consulted.

    Args:
        X: Feature DataFrame.
        threshold: Variance cut-off passed to ``VarianceThreshold``.

    Returns:
        A DataFrame containing the columns of ``X`` that the fitted selector
        supports, in their original order.
    """
    selector = VarianceThreshold(threshold=threshold)
    selector.fit(X)
    # get_support(indices=True) yields positional indices of the kept columns.
    return X[X.columns[selector.get_support(indices=True)]]

In [None]:
# Select on the training split only so held-out rows do not influence which features are kept.
X_var = variance_threshold_selector(X_train, threshold=0.1)

Feature selection using correlation

In [None]:
def correlation_selector(X, threshold=0.8):
    """Drop columns that are highly correlated with an earlier column.

    The absolute pairwise correlation matrix of *X* is computed and only its
    strict upper triangle is inspected, so each pair is considered once and a
    column is never compared with itself. A column is dropped when its absolute
    correlation with any column to its left exceeds ``threshold``.

    Args:
        X: Feature DataFrame with numeric columns.
        threshold: Absolute-correlation level above which a column is dropped.

    Returns:
        A new DataFrame without the dropped columns; ``X`` itself is not modified.
    """
    corr_matrix = X.corr().abs()
    # Mask out the diagonal and lower triangle (they become NaN, which never
    # compares greater than the threshold).
    upper_mask = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
    upper = corr_matrix.where(upper_mask)
    to_drop = [column for column in upper.columns if (upper[column] > threshold).any()]
    return X.drop(to_drop, axis=1)

In [None]:
# Select on the training split only so held-out rows do not influence which features are kept.
X_corr = correlation_selector(X_train, threshold=0.8)

## Supervised feature selection methods

Feature selection using mutual information

In [None]:
# Feature selection using mutual information
def mutual_info_selector(X, y, top_k=4):
    """Keep the ``top_k`` columns of *X* ranked by mutual information with *y*.

    Args:
        X: Feature DataFrame.
        y: Class labels aligned with the rows of ``X``.
        top_k: Number of features to keep (passed to ``SelectKBest`` as ``k``).

    Returns:
        A DataFrame with the selected columns of ``X`` in their original order.
    """
    selector = SelectKBest(mutual_info_classif, k=top_k)
    selector.fit(X, y)
    # get_support(indices=True) yields positional indices of the kept columns.
    return X[X.columns[selector.get_support(indices=True)]]

In [None]:
# Feature selection using mutual information (training split only, to avoid leaking test data)
X_mi = mutual_info_selector(X_train, y_train, top_k=2)

Feature selection using filter methods: information gain

In [None]:

def information_gain_selector(X, y, top_k=4):
    """Keep the ``top_k`` columns of *X* ranked by a univariate filter score.

    NOTE(review): the scorer used here is ``f_classif`` (ANOVA F-value), the
    one scorer imported by this notebook that no other selector uses. Strictly
    speaking, information gain is the mutual information between a feature and
    the target; swap in ``mutual_info_classif`` if that definition is required
    (the result would then match ``mutual_info_selector``) — confirm intent.

    Args:
        X: Feature DataFrame.
        y: Class labels aligned with the rows of ``X``.
        top_k: Number of features to keep (passed to ``SelectKBest`` as ``k``).

    Returns:
        A DataFrame with the selected columns of ``X`` in their original order.
    """
    selector = SelectKBest(f_classif, k=top_k)
    selector.fit(X, y)
    # get_support(indices=True) yields positional indices of the kept columns.
    return X[X.columns[selector.get_support(indices=True)]]

In [None]:
# Feature selection using information gain (training split only, to avoid leaking test data)
X_ig = information_gain_selector(X_train, y_train, top_k=2)

Feature selection using filter methods: chi-squared test

In [None]:
def chi_squared_selector(X, y, top_k=4):
    """Keep the ``top_k`` columns of *X* ranked by the chi-squared statistic.

    Args:
        X: Feature DataFrame. The chi-squared scorer expects non-negative
            feature values.
        y: Class labels aligned with the rows of ``X``.
        top_k: Number of features to keep (passed to ``SelectKBest`` as ``k``).

    Returns:
        A DataFrame with the selected columns of ``X`` in their original order.
    """
    selector = SelectKBest(chi2, k=top_k)
    selector.fit(X, y)
    # get_support(indices=True) yields positional indices of the kept columns.
    return X[X.columns[selector.get_support(indices=True)]]

In [None]:
# Feature selection using chi-squared test (training split only, to avoid leaking test data)
X_chi2 = chi_squared_selector(X_train, y_train, top_k=2)

In [None]:
# Random forest with 100 trees and a fixed seed for reproducible results;
# swap in any other classifier if preferred.
model = RandomForestClassifier(random_state=42, n_estimators=100)

Feature selection using wrapper methods: forward selection

In [None]:
# Feature selection using wrapper methods: forward selection
def forward_selection(X, y, model):
    """Select features with sequential forward selection.

    Starting from an empty set, features are added one at a time; with
    ``k_features='best'`` the subset with the best 5-fold cross-validated
    accuracy is retained.

    Args:
        X: Feature DataFrame.
        y: Class labels aligned with the rows of ``X``.
        model: Estimator evaluated on each candidate feature subset.

    Returns:
        A DataFrame with the selected columns of ``X``.
    """
    sfs = SequentialFeatureSelector(model, k_features='best', forward=True, scoring='accuracy', cv=5)
    sfs.fit(X, y)
    # k_feature_idx_ is a tuple of positional column indices.
    return X[X.columns[list(sfs.k_feature_idx_)]]

In [None]:
# Feature selection using forward selection (training split only, to avoid leaking test data)
X_forward = forward_selection(X_train, y_train, model)

Feature selection using wrapper methods: backward elimination

In [None]:
def backward_elimination(X, y, model):
    """Select features with sequential backward elimination.

    Starting from the full feature set, features are removed one at a time;
    with ``k_features='best'`` the subset with the best 5-fold cross-validated
    accuracy is retained.

    Args:
        X: Feature DataFrame.
        y: Class labels aligned with the rows of ``X``.
        model: Estimator evaluated on each candidate feature subset.

    Returns:
        A DataFrame with the selected columns of ``X``.
    """
    # forward=False turns the sequential selector into backward elimination.
    sfs = SequentialFeatureSelector(model, k_features='best', forward=False, scoring='accuracy', cv=5)
    sfs.fit(X, y)
    # k_feature_idx_ is a tuple of positional column indices.
    return X[X.columns[list(sfs.k_feature_idx_)]]

In [None]:
# Feature selection using backward elimination (training split only, to avoid leaking test data)
X_backward = backward_elimination(X_train, y_train, model)

Feature selection using wrapper methods: genetic algorithm

In [None]:
def genetic_algorithm_selector(X, y, model):
    """Select features with a genetic-algorithm search (``GAFeatureSelectionCV``).

    Args:
        X: Feature DataFrame.
        y: Labels aligned with the rows of ``X``.
        model: Estimator evaluated by cross-validation on each candidate subset.

    Returns:
        A DataFrame with the columns of ``X`` flagged in the fitted selector's
        ``support_`` mask.
    """
    # NOTE(review): 'neg_root_mean_squared_error' is a regression metric, while
    # the other wrapper selectors in this notebook score with 'accuracy';
    # consider switching if a classification metric is intended.
    selector = GAFeatureSelectionCV(model,
                                    cv=5,
                                    scoring='neg_root_mean_squared_error',
                                    population_size=10,
                                    generations=5,
                                    tournament_size=5,
                                    elitism=True,
                                    crossover_probability=0.9,
                                    mutation_probability=0.1,
                                    criteria='max',
                                    algorithm='eaMuPlusLambda',
                                    n_jobs=1,
                                    verbose=True,
                                    keep_top_k=4)

    selector.fit(X, y)
    # support_ is a boolean mask over the columns of X.
    return X[X.columns[selector.support_]]

In [None]:
# Run the genetic search on the training split only, to avoid leaking test data.
X_genetic = genetic_algorithm_selector(X_train, y_train, model)

## Evaluate the performance of the model after feature selection

In [None]:
# Feature subsets produced by each method, in the same order as the names below.
selected_features = [X_var, X_corr, X_mi, X_ig, X_chi2, X_forward, X_backward, X_genetic]
selected_feature_names = ['Variance Threshold', 'Correlation', 'Mutual Information', 'Information Gain',
                           'Chi-squared Test', 'Forward Selection', 'Backward Elimination', 'Genetic Algorithm']

results = []

for name, X_selected in zip(selected_feature_names, selected_features):
    X_test_selected = X_test[X_selected.columns]  # Apply the same feature selection to the test set
    # Train on the reduced training frame, score on the matching test columns.
    accuracy = evaluate_model(model, X_selected, X_test_selected, y_train, y_test)
    results.append({'Method': name, 'Accuracy': accuracy})


In [None]:
# Tabulate the per-method accuracies (one row per method) and print the table.
results_df = pd.DataFrame(data=results)
print(results_df)

In [None]:
# Horizontal bar chart of accuracy per method, best-performing method first.
plt.figure(figsize=(10, 6))
sns.barplot(
    data=results_df.sort_values(by='Accuracy', ascending=False),
    x='Accuracy',
    y='Method',
)
plt.title('Accuracy after Feature Selection')
plt.show()