### WRAPPER METHODS FOR FEATURE SELECTION

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

In [3]:
# Data
from sklearn.datasets import load_wine
data = load_wine()

In [4]:
data.keys()

dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names'])

In [5]:
print(data.DESCR)

.. _wine_dataset:

Wine recognition dataset
------------------------

**Data Set Characteristics:**

    :Number of Instances: 178 (50 in each of three classes)
    :Number of Attributes: 13 numeric, predictive attributes and the class
    :Attribute Information:
 		- Alcohol
 		- Malic acid
 		- Ash
		- Alcalinity of ash  
 		- Magnesium
		- Total phenols
 		- Flavanoids
 		- Nonflavanoid phenols
 		- Proanthocyanins
		- Color intensity
 		- Hue
 		- OD280/OD315 of diluted wines
 		- Proline

    - class:
            - class_0
            - class_1
            - class_2
		
    :Summary Statistics:
    
                                   Min   Max   Mean     SD
    Alcohol:                      11.0  14.8    13.0   0.8
    Malic Acid:                   0.74  5.80    2.34  1.12
    Ash:                          1.36  3.23    2.36  0.27
    Alcalinity of Ash:            10.6  30.0    19.5   3.3
    Magnesium:                    70.0 162.0    99.7  14.3
    Total Phenols:                0

In [6]:
# Predictors: wrap the raw feature matrix in a DataFrame labelled with the dataset's feature names
X = pd.DataFrame(data=data.data, columns=data.feature_names)

X.head()


Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0


In [7]:
# Target labels: print the number of samples per class, then the first 20 labels
y = data.target
print(pd.Series(y).value_counts())
print(y[:20])

1    71
0    59
2    48
dtype: int64
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [8]:
# Missing values
X.isnull().sum()

alcohol                         0
malic_acid                      0
ash                             0
alcalinity_of_ash               0
magnesium                       0
total_phenols                   0
flavanoids                      0
nonflavanoid_phenols            0
proanthocyanins                 0
color_intensity                 0
hue                             0
od280/od315_of_diluted_wines    0
proline                         0
dtype: int64

In [9]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178 entries, 0 to 177
Data columns (total 13 columns):
alcohol                         178 non-null float64
malic_acid                      178 non-null float64
ash                             178 non-null float64
alcalinity_of_ash               178 non-null float64
magnesium                       178 non-null float64
total_phenols                   178 non-null float64
flavanoids                      178 non-null float64
nonflavanoid_phenols            178 non-null float64
proanthocyanins                 178 non-null float64
color_intensity                 178 non-null float64
hue                             178 non-null float64
od280/od315_of_diluted_wines    178 non-null float64
proline                         178 non-null float64
dtypes: float64(13)
memory usage: 18.2 KB


In [10]:
# Hold out 20% of the rows as a test set; random_state pins the shuffle so the split is reproducible.
# NOTE(review): the split is not stratified (no stratify=y) — with only 178 samples the
# class proportions in the test set may drift from the full data; confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)
(X_train.shape, X_test.shape)

((142, 13), (36, 13))

### Sequential Forward Selection (SFS)

In [11]:
# Forward selection: features are added one step at a time, and every candidate
# subset is scored by the 4-fold cross-validated accuracy of a random forest.
sfs = SFS(
    estimator=RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1),
    forward=True,
    k_features=(4, 8),  # keep the best-scoring subset with 4 to 8 features
    scoring='accuracy',
    cv=4,
    verbose=2,
    n_jobs=-1,
)
sfs.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  13 out of  13 | elapsed:   25.4s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  13 out of  13 | elapsed:   25.4s finished

[2019-10-19 16:54:20] Features: 1/8 -- score: 0.7744132788250435[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:    7.4s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:    7.4s finished

[2019-10-19 16:54:28] Features: 2/8 -- score: 0.965233931410402[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  11 out of  11 | elapsed:    6.7s finished

[2019-10-19 16:54:34] Features: 3/8 -- score: 0.9861003861003861[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    6.5s finished

[2019-10-19 16:54:41] Features: 4/8 -- score: 0.9793543

SequentialFeatureSelector(clone_estimator=True, cv=4,
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=0, verbose=0, warm_start=False),
             floating=False, forward=True, k_features=(4, 8), n_jobs=-1,
             pre_dispatch='2*n_jobs', scoring='accuracy', verbose=2)

In [12]:
# Selected features
sfs.k_feature_names_

('alcohol',
 'magnesium',
 'flavanoids',
 'color_intensity',
 'od280/od315_of_diluted_wines')

In [13]:
# cross validated score
sfs.k_score_

0.9932432432432432

In [14]:
# Training Results
pd.DataFrame(sfs.get_metric_dict()).T

Unnamed: 0,feature_idx,cv_scores,avg_score,feature_names,ci_bound,std_dev,std_err
1,"(6,)","[0.7567567567567568, 0.8333333333333334, 0.742...",0.774413,"(flavanoids,)",0.0559516,0.0349047,0.0201522
2,"(6, 9)","[0.918918918918919, 1.0, 0.9714285714285714, 0...",0.965234,"(flavanoids, color_intensity)",0.0468773,0.0292438,0.0168839
3,"(4, 6, 9)","[0.972972972972973, 1.0, 0.9714285714285714, 1.0]",0.9861,"(magnesium, flavanoids, color_intensity)",0.022298,0.0139103,0.00803114
4,"(0, 4, 6, 9)","[0.972972972972973, 0.9444444444444444, 1.0, 1.0]",0.979354,"(alcohol, magnesium, flavanoids, color_intensity)",0.0368329,0.0229778,0.0132662
5,"(0, 4, 6, 9, 11)","[0.972972972972973, 1.0, 1.0, 1.0]",0.993243,"(alcohol, magnesium, flavanoids, color_intensi...",0.0187598,0.011703,0.00675676
6,"(0, 2, 4, 6, 9, 11)","[0.972972972972973, 0.9722222222222222, 1.0, 1.0]",0.986299,"(alcohol, ash, magnesium, flavanoids, color_in...",0.0219669,0.0137038,0.00791188
7,"(0, 2, 3, 4, 6, 9, 11)","[0.972972972972973, 1.0, 1.0, 1.0]",0.993243,"(alcohol, ash, alcalinity_of_ash, magnesium, f...",0.0187598,0.011703,0.00675676
8,"(0, 2, 3, 4, 6, 9, 10, 11)","[0.9459459459459459, 1.0, 1.0, 1.0]",0.986486,"(alcohol, ash, alcalinity_of_ash, magnesium, f...",0.0375195,0.0234061,0.0135135


In [15]:
# Performance evaluation with the SFS-selected features:
# reduce both splits to the selected columns ...
X_train_sfs, X_test_sfs = sfs.transform(X_train), sfs.transform(X_test)

# ... then refit a fresh random forest on the reduced training set
Rf_model = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1)
Rf_model.fit(X_train_sfs, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [16]:
# Evaluating the performance on the test set (model refit on the SFS-selected features)
pred_test = Rf_model.predict(X_test_sfs)

# Report test accuracy rounded to 2 decimals
print(f'Accuracy: {np.round(accuracy_score(y_test, pred_test),2)}')

Accuracy: 0.94


In [17]:
print(classification_report(y_test, pred_test))

              precision    recall  f1-score   support

           0       0.93      1.00      0.97        14
           1       1.00      0.88      0.93        16
           2       0.86      1.00      0.92         6

   micro avg       0.94      0.94      0.94        36
   macro avg       0.93      0.96      0.94        36
weighted avg       0.95      0.94      0.94        36



### Sequential Backward Selection (SBS)

In [18]:
# Backward selection: same selector class with forward=False, so the search starts
# from the full feature set and removes features step by step; every candidate
# subset is scored by the 4-fold cross-validated accuracy of a random forest.
sbs = SFS(
    estimator=RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1),
    forward=False,
    k_features=(1, 8),  # keep the best-scoring subset with 1 to 8 features
    scoring='accuracy',
    cv=4,
    verbose=2,
    n_jobs=-1,
)
sbs.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  13 out of  13 | elapsed:    8.5s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  13 out of  13 | elapsed:    8.5s finished

[2019-10-19 16:55:15] Features: 12/1 -- score: 0.9862987987987988[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:    7.2s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:    7.2s finished

[2019-10-19 16:55:22] Features: 11/1 -- score: 0.9862987987987988[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  11 out of  11 | elapsed:    6.7s finished

[2019-10-19 16:55:29] Features: 10/1 -- score: 0.9862987987987988[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    6.6s finished

[2019-10-19 16:55:36] Features: 9/1 -- score: 0.993

SequentialFeatureSelector(clone_estimator=True, cv=4,
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=0, verbose=0, warm_start=False),
             floating=False, forward=False, k_features=(1, 8), n_jobs=-1,
             pre_dispatch='2*n_jobs', scoring='accuracy', verbose=2)

In [19]:
# features selected
sbs.k_feature_names_

('alcohol',
 'malic_acid',
 'magnesium',
 'total_phenols',
 'flavanoids',
 'color_intensity')

In [20]:
# cross validated score
sbs.k_score_

0.9932432432432432

Forward and backward selection reach the same cross-validated accuracy (0.9932), but the selected feature subsets differ slightly.

In [21]:
# Performance evaluation with the SBS-selected features:
# reduce both splits to the selected columns ...
X_train_sbs, X_test_sbs = sbs.transform(X_train), sbs.transform(X_test)

# ... then refit a fresh random forest on the reduced training set
Rf_model = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1)
Rf_model.fit(X_train_sbs, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [22]:
# Evaluating the performance on the test set (model refit on the SBS-selected features)
pred_test = Rf_model.predict(X_test_sbs)

# Report test accuracy rounded to 2 decimals
print(f'Accuracy: {np.round(accuracy_score(y_test, pred_test),2)}')

Accuracy: 0.92


In [23]:
print(classification_report(y_test, pred_test))

              precision    recall  f1-score   support

           0       0.88      1.00      0.93        14
           1       1.00      0.81      0.90        16
           2       0.86      1.00      0.92         6

   micro avg       0.92      0.92      0.92        36
   macro avg       0.91      0.94      0.92        36
weighted avg       0.93      0.92      0.92        36



### Exhaustive Feature Selection(EFS)

Exhaustive feature selection evaluates every possible feature subset within the configured size range and keeps the best-scoring one. It is computationally very expensive, but it is guaranteed to find the best subset with respect to the chosen scoring criterion.

In [24]:
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS

In [25]:
# Exhaustive search: score every feature subset whose size lies between
# min_features and max_features and keep the best one.
efs = EFS(
    estimator=RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1),
    min_features=3,  # minimum number of features in a feature subset
    max_features=5,  # maximum number of features in a feature subset
    scoring='accuracy',
    cv=None,  # no cross-validation (to save time): each subset is scored on the training data itself
    n_jobs=-1,
)
efs.fit(X_train, y_train)

Features: 2288/2288

ExhaustiveFeatureSelector(clone_estimator=True, cv=None,
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=0, verbose=0, warm_start=False),
             max_features=5, min_features=3, n_jobs=-1,
             pre_dispatch='2*n_jobs', print_progress=True,
             scoring='accuracy')

In [26]:
# best score
efs.best_score_

1.0

In [27]:
# best subset of features
efs.best_feature_names_

('alcohol', 'malic_acid', 'ash')

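Here, just by selecting 3 features, we receive an accuracy score of 100%. But, to save time, we have not performed cross-validation (`cv=None`), so this score is the accuracy on the training data itself rather than a cross-validated estimate. A random forest can likely fit the training data perfectly with many different feature subsets, and the selected subset is simply the first three columns of the data, so the result may not be very good with the test data.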

In [28]:
# Final modeling: reduce both splits to the columns chosen by the exhaustive search
X_train_efs, X_test_efs = efs.transform(X_train), efs.transform(X_test)

In [29]:
# Refit a fresh random forest on the EFS-selected training features
Rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=0,
    n_jobs=-1,
)
Rf_model.fit(X_train_efs, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [30]:
# Evaluating the performance on the test set (model refit on the EFS-selected features)
pred_test = Rf_model.predict(X_test_efs)

# Report test accuracy (printed unrounded in this cell)
print(f'Accuracy: {accuracy_score(y_test,pred_test)}')

Accuracy: 0.8888888888888888


In [31]:
print(classification_report(y_test, pred_test))

              precision    recall  f1-score   support

           0       1.00      0.93      0.96        14
           1       0.93      0.88      0.90        16
           2       0.62      0.83      0.71         6

   micro avg       0.89      0.89      0.89        36
   macro avg       0.85      0.88      0.86        36
weighted avg       0.91      0.89      0.89        36



In [32]:
confusion_matrix(y_test, pred_test)

array([[13,  0,  1],
       [ 0, 14,  2],
       [ 0,  1,  5]], dtype=int64)