# Feature Selection

- Sequential forward selection starts by evaluating each feature individually and choosing the one that results in the best score.
- "Best" depends on the chosen criterion, e.g. `scoring='accuracy'`.
- Next, every combination of the already selected feature (from step 1) with one additional feature is evaluated, and the best-scoring pair is kept.
- This continues, adding one feature at a time, until the predefined number of features has been selected.

In [7]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score as acc
from mlxtend.feature_selection import SequentialFeatureSelector as sfs

# Importing the dataset - https://archive.ics.uci.edu/ml/datasets/wine
# The path is relative to the notebook's working directory.
dataset = pd.read_csv('Data/Dimensionality/Wine.csv')

# Columns 0-12 form the feature matrix and column 13 is the target
# (assumes the class label is stored in the 14th CSV column -- TODO confirm).
X = dataset.iloc[:, 0:13].values
y = dataset.iloc[:, 13].values

# Cast both arrays to float so downstream estimators get a homogeneous numeric dtype.
X = X.astype(float)
y = y.astype(float)

# Splitting the dataset: 25% is held out for testing, and the fixed random_state
# keeps the split identical across runs.
# train_test_split is already imported at the top of this cell, so it is not re-imported here.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

In [8]:
# Build Random Forest classifier for feature selection.
# random_state is fixed so the cross-validated scores -- and therefore the
# selected feature subset -- are reproducible after a kernel restart.
clf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)

# Forward Feature Selection
# forward=True with floating=False is plain sequential forward selection (SFS):
# one feature is added per round and previously chosen features are never dropped.
sfs_c = sfs(clf,
            k_features=6,        # stop once 6 features have been selected
            forward=True,
            floating=False,
            verbose=2,           # log the score after every round
            scoring='accuracy',  # criterion used to rank candidate subsets
            cv=4)                # 4-fold cross-validation for each candidate

# Perform SFS on the training split only, so the test split stays unseen.
sfs_c = sfs_c.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  13 out of  13 | elapsed:   18.6s finished

[2020-04-30 12:22:52] Features: 1/6 -- score: 0.7745098039215685[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:   16.9s finished

[2020-04-30 12:23:09] Features: 2/6 -- score: 0.9625668449197862[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  11 out of  11 | elapsed:   15.2s finished

[2020-04-30 12:23:24] Features: 3/6 -- score: 0.9848484848484849[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 

In [9]:
# Column indices of the features kept by the fitted selector, as a plain list
# so it can be used directly for NumPy column indexing.
feature_select = [col_idx for col_idx in sfs_c.k_feature_idx_]
print(feature_select)

[2, 3, 4, 6, 8, 9]


In [11]:
# Restrict both splits to the selected columns (use X_train / X_test directly
# to train on every feature instead).
X_train_selected = X_train[:, feature_select]
X_test_selected = X_test[:, feature_select]

# Build and fit the full model on the selected features only.
clf = RandomForestClassifier(
    n_estimators=1000,
    max_depth=4,
    random_state=42,
)
clf.fit(X_train_selected, y_train)

# Accuracy on the data the model was fitted on.
y_train_pred = clf.predict(X_train_selected)
train_accuracy = acc(y_train, y_train_pred)
print('Training accuracy(feature_select): %.3f' % train_accuracy)

# Accuracy on the held-out split.
y_test_pred = clf.predict(X_test_selected)
test_accuracy = acc(y_test, y_test_pred)
print('Testing accuracy(feature_select): %.3f' % test_accuracy)

Training accuracy(feature_select): 0.992
Testing accuracy(feature_select): 0.867
