# Feature Selection

- Sequential forward selection starts by evaluating each individual feature and choosing the one that results in the best-performing model
- "Best" depends on the chosen criterion, e.g. `scoring='accuracy'`
- Next, all possible combinations of the first feature (from step 1) and a second feature are evaluated, and the best-performing pair is selected
- This goes on until the predefined number of features is selected

In [6]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score as acc
from mlxtend.feature_selection import SequentialFeatureSelector as sfs

# Load the UCI Wine dataset - https://archive.ics.uci.edu/ml/datasets/wine
dataset = pd.read_csv('Data/Dimensionality/Wine.csv')

# Columns 0-12 are used as the feature matrix and column 13 as the target
# (the slice end 13 is exclusive).
X = dataset.iloc[:, 0:13].values
y = dataset.iloc[:, 13].values

# Cast both arrays to float before handing them to scikit-learn.
X = X.astype(float)
y = y.astype(float)

# Hold out 25% of the rows for testing; the fixed random_state keeps the split
# repeatable. train_test_split is already imported at the top of this cell, so
# it is not imported a second time here.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

In [7]:
# Random forest used as the scoring estimator during feature selection.
# random_state seeds the forest so that repeated runs of this cell evaluate
# candidate feature subsets with the same models and pick the same features.
clf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)

# Sequential Forward Selection (SFS): grow the subset one feature at a time
# until 6 features are chosen, ranking candidates by 4-fold CV accuracy.
# floating=False means this is plain SFS, not the floating variant (SFFS).
sfs_c = sfs(clf,
           k_features=6,
           forward=True,
           floating=False,
           verbose=2,
           scoring='accuracy',
           cv=4)

# Run the selection on the training split only, so the test split stays unseen
sfs_c = sfs_c.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    4.7s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  13 out of  13 | elapsed:   22.0s finished

[2020-07-28 14:02:59] Features: 1/6 -- score: 0.7671568627450982[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:   17.5s finished

[2020-07-28 14:03:17] Features: 2/6 -- score: 0.9549910873440286[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  11 out of  11 | elapsed:   15.8s finished

[2020-07-28 14:03:33] Features: 3/6 -- score: 0.9924242424242424[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 

In [8]:
# Column indices picked by the fitted selector, unpacked into a plain list
feature_select = [*sfs_c.k_feature_idx_]
print(feature_select)

[0, 1, 4, 5, 6, 9]


In [9]:
# Train the final random forest on the selected columns only.
# (To train on every feature instead, use X_train / X_test unsliced.)
X_train_sel = X_train[:, feature_select]
X_test_sel = X_test[:, feature_select]

clf = RandomForestClassifier(max_depth=4, n_estimators=1000, random_state=42)
clf.fit(X_train_sel, y_train)

# Accuracy on the data the model was fitted on
y_train_pred = clf.predict(X_train_sel)
train_accuracy = acc(y_train, y_train_pred)
print('Training accuracy(feature_select): %.3f' % train_accuracy)

# Accuracy on the held-out split
y_test_pred = clf.predict(X_test_sel)
test_accuracy = acc(y_test, y_test_pred)
print('Testing accuracy(feature_select): %.3f' % test_accuracy)

Training accuracy(feature_select): 1.000
Testing accuracy(feature_select): 0.956


## Taking a different dataset

In [16]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score as acc
from mlxtend.feature_selection import SequentialFeatureSelector as sfs

# Read the data
df = pd.read_csv("Data/Classification/pima-data.csv")

# Correlation matrix of the raw columns. It is stored in a variable because a
# bare `df.corr()` in the middle of a cell is computed and then discarded;
# display `corr_matrix` in its own cell to inspect it.
corr_matrix = df.corr()

# Drop the 'skin' column (flagged as the correlated feature) without mutating
# the frame in place.
df = df.drop(columns=['skin'])

# Data molding: encode the boolean 'diabetes' column as 1 / 0
diabetes_map = {True : 1, False : 0}
df['diabetes'] = df['diabetes'].map(diabetes_map)

# Columns 0-7 form the feature matrix (the slice end 8 is exclusive) and
# column 8 is the target.
# NOTE(review): presumably column 8 is 'diabetes' once 'skin' is removed -
# confirm against the CSV header.
X = df.iloc[:, 0:8].values
y = df.iloc[:, 8].values

# Hold out 25% of the rows for testing; the fixed random_state keeps the split
# repeatable. train_test_split is already imported at the top of this cell, so
# the duplicate imports are not repeated here.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

In [20]:
# Gaussian Naive Bayes is the estimator scored during feature selection
from sklearn.naive_bayes import GaussianNB
nb_model = GaussianNB()

# Settings for plain sequential forward selection (floating variant disabled):
# keep adding features until 6 are chosen, scoring by 4-fold CV accuracy.
selector_options = dict(
    k_features=6,
    forward=True,
    floating=False,
    verbose=3,
    scoring='accuracy',
    cv=4,
)
sfs_c = sfs(nb_model, **selector_options)

# Run the selection on the training split
sfs_c = sfs_c.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    0.0s finished

[2020-07-28 14:10:06] Features: 1/6 -- score: 0.7465277777777778[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.0s finished

[2020-07-28 14:10:06] Features: 2/6 -- score: 0.7569444444444444[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parall

In [18]:
# Column indices picked by the fitted selector, unpacked into a plain list
feature_select = [*sfs_c.k_feature_idx_]
print(feature_select)

[0, 1, 2, 5, 6, 7]


In [19]:
# Train the final Gaussian Naive Bayes model on the selected columns only.
# (To train on every feature instead, use X_train / X_test unsliced.)
X_train_sel = X_train[:, feature_select]
X_test_sel = X_test[:, feature_select]

nb_model = GaussianNB()
nb_model.fit(X_train_sel, y_train)

# Accuracy on the data the model was fitted on
y_train_pred = nb_model.predict(X_train_sel)
train_accuracy = acc(y_train, y_train_pred)
print('Training accuracy(feature_select): %.3f' % train_accuracy)

# Accuracy on the held-out split
y_test_pred = nb_model.predict(X_test_sel)
test_accuracy = acc(y_test, y_test_pred)
print('Testing accuracy(feature_select): %.3f' % test_accuracy)

Training accuracy(feature_select): 0.766
Testing accuracy(feature_select): 0.766
