In [5]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import roc_auc_score
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

In [3]:
!pip install mlxtend



In [9]:
from sklearn.datasets import load_wine
from sklearn.preprocessing import StandardScaler

In [10]:
# Load the wine dataset bundle via scikit-learn's load_wine().
data = load_wine()

In [11]:
# List the fields available on the loaded dataset object.
data.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names'])

In [7]:
# Print the dataset's built-in description text.
print(data.DESCR)

.. _wine_dataset:

Wine recognition dataset
------------------------

**Data Set Characteristics:**

:Number of Instances: 178
:Number of Attributes: 13 numeric, predictive attributes and the class
:Attribute Information:
    - Alcohol
    - Malic acid
    - Ash
    - Alcalinity of ash
    - Magnesium
    - Total phenols
    - Flavanoids
    - Nonflavanoid phenols
    - Proanthocyanins
    - Color intensity
    - Hue
    - OD280/OD315 of diluted wines
    - Proline
    - class:
        - class_0
        - class_1
        - class_2

:Summary Statistics:

                                Min   Max   Mean     SD
Alcohol:                      11.0  14.8    13.0   0.8
Malic Acid:                   0.74  5.80    2.34  1.12
Ash:                          1.36  3.23    2.36  0.27
Alcalinity of Ash:            10.6  30.0    19.5   3.3
Magnesium:                    70.0 162.0    99.7  14.3
Total Phenols:                0.98  3.88    2.29  0.63
Flavanoids:                   0.34  5.08    2.03  1.00

In [12]:
# Feature matrix as a DataFrame and the class labels, unpacked in one step.
X, y = pd.DataFrame(data.data), data.target

In [13]:
# Label the columns with the dataset's feature names, then preview the first rows.
X.columns = data.feature_names
X.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0


In [14]:
# Count missing values per column (isna is the pandas alias of isnull).
X.isna().sum()

alcohol                         0
malic_acid                      0
ash                             0
alcalinity_of_ash               0
magnesium                       0
total_phenols                   0
flavanoids                      0
nonflavanoid_phenols            0
proanthocyanins                 0
color_intensity                 0
hue                             0
od280/od315_of_diluted_wines    0
proline                         0
dtype: int64

In [15]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)
# Show the resulting (rows, columns) of both partitions.
(X_train.shape, X_test.shape)

((142, 13), (36, 13))

# Step Forward Feature Selection (SFS)
Here we use `SequentialFeatureSelector()` with a Random Forest classifier as the estimator; the classifier is configured with the number of estimators, `random_state`, and the number of jobs.

`k_features` is the number of features to select.

Because this is the forward-step method, `forward` is set to `True`.

`verbose` controls how much progress logging is printed; here we use 2.

`cv` is the number of cross-validation folds; here we choose 4.

`n_jobs` is the number of CPU cores to use; -1 means use all the cores available on this system.

In [12]:
# Forward sequential selection: add one feature per step until 7 are chosen,
# scoring each candidate subset by accuracy with cv=4.
sfs = SFS(
    RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1),
    k_features=7,
    forward=True,
    floating=False,
    verbose=2,
    scoring='accuracy',
    cv=4,
    n_jobs=-1,
)
# fit() returns the selector itself; re-binding keeps the cell free of repr output.
sfs = sfs.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  13 | elapsed:    9.2s remaining:   14.7s
[Parallel(n_jobs=-1)]: Done  13 out of  13 | elapsed:   11.4s finished

[2025-03-30 02:19:29] Features: 1/7 -- score: 0.7674603174603174[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of  12 | elapsed:    2.3s remaining:    4.7s
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:    3.9s finished

[2025-03-30 02:19:33] Features: 2/7 -- score: 0.9577380952380952[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of  11 | elapsed:    2.6s remaining:   12.1s
[Parallel(n_jobs=-1)]: Done   8 out of  11 | elapsed:    2.7s remaining:    0.9s
[Parallel(n_jobs=-1)]: Done  11 out of  11 | elapsed:    4.1s finished

[2025-03-30 02:19:38] Features: 3/7 -- score: 0.9859126984126985[Parallel(n_jobs=-1)]: Using backend Lok

In [13]:
# Names of the features in the selected subset.
sfs.k_feature_names_

('alcohol',
 'magnesium',
 'total_phenols',
 'flavanoids',
 'proanthocyanins',
 'color_intensity',
 'hue')

In [14]:
# Column indices of the selected features.
sfs.k_feature_idx_

(0, 4, 5, 6, 8, 9, 10)

In [15]:
# Average cross-validation accuracy of the selected subset.
sfs.k_score_

np.float64(0.9861111111111112)

In [16]:
# One row per subset size: chosen indices/names, per-fold CV scores and their spread.
pd.DataFrame.from_dict(sfs.get_metric_dict()).T

Unnamed: 0,feature_idx,cv_scores,avg_score,feature_names,ci_bound,std_dev,std_err
1,"(6,)","[0.7222222222222222, 0.8333333333333334, 0.742...",0.76746,"(flavanoids,)",0.06709,0.041853,0.024164
2,"(6, 9)","[0.9166666666666666, 1.0, 0.9714285714285714, ...",0.957738,"(flavanoids, color_intensity)",0.049936,0.031152,0.017986
3,"(4, 6, 9)","[0.9722222222222222, 1.0, 0.9714285714285714, ...",0.985913,"(magnesium, flavanoids, color_intensity)",0.022586,0.01409,0.008135
4,"(4, 5, 6, 9)","[0.9722222222222222, 1.0, 0.9714285714285714, ...",0.97877,"(magnesium, total_phenols, flavanoids, color_i...",0.019655,0.012262,0.007079
5,"(4, 5, 6, 8, 9)","[1.0, 1.0, 0.9714285714285714, 0.9714285714285...",0.985714,"(magnesium, total_phenols, flavanoids, proanth...",0.0229,0.014286,0.008248
6,"(0, 4, 5, 6, 8, 9)","[1.0, 1.0, 0.9714285714285714, 1.0]",0.992857,"(alcohol, magnesium, total_phenols, flavanoids...",0.019832,0.012372,0.007143
7,"(0, 4, 5, 6, 8, 9, 10)","[0.9444444444444444, 1.0, 1.0, 1.0]",0.986111,"(alcohol, magnesium, total_phenols, flavanoids...",0.038562,0.024056,0.013889


In [17]:
# Forward sequential selection over a range of sizes: k_features=(1, 8) grows
# the subset up to 8 features and keeps the size with the best CV accuracy.
sfs = SFS(
    RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1),
    k_features=(1, 8),
    forward=True,
    floating=False,
    verbose=2,
    scoring='accuracy',
    cv=4,
    n_jobs=-1,
)
# fit() returns the selector itself; re-binding keeps the cell free of repr output.
sfs = sfs.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  13 | elapsed:    2.6s remaining:    4.3s
[Parallel(n_jobs=-1)]: Done  13 out of  13 | elapsed:    4.8s finished

[2025-03-30 02:19:58] Features: 1/8 -- score: 0.7674603174603174[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of  12 | elapsed:    2.3s remaining:    4.8s
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:    3.9s finished

[2025-03-30 02:20:02] Features: 2/8 -- score: 0.9577380952380952[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of  11 | elapsed:    3.7s remaining:   16.9s
[Parallel(n_jobs=-1)]: Done   8 out of  11 | elapsed:    3.8s remaining:    1.4s
[Parallel(n_jobs=-1)]: Done  11 out of  11 | elapsed:    5.6s finished

[2025-03-30 02:20:08] Features: 3/8 -- score: 0.9859126984126985[Parallel(n_jobs=-1)]: Using backend Lok

In [18]:
# Best cross-validation accuracy found while searching subset sizes 1 to 8
# (k_features=(1, 8)); in this run the winning subset has 6 features.

sfs.k_score_

np.float64(0.9928571428571429)

In [19]:
# Names of the features in the best-scoring subset chosen by the selector.

sfs.k_feature_names_

('alcohol',
 'magnesium',
 'total_phenols',
 'flavanoids',
 'proanthocyanins',
 'color_intensity')

# Step Backward Selection (SBS)

Let's now work with Step Backward Selection. Have a look at the following script.

The only change compared to Step Forward Selection is that `forward` is set to `False`.

In [20]:
# Backward sequential selection: start from all features and drop one per
# step (forward=False), keeping the best-scoring size within k_features=(1, 8).
# The fitted selector is still bound to the name `sfs`.
sfs = SFS(
    RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1),
    k_features=(1, 8),
    forward=False,
    floating=False,
    verbose=2,
    scoring='accuracy',
    cv=4,
    n_jobs=-1,
)
# fit() returns the selector itself; re-binding keeps the cell free of repr output.
sfs = sfs.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  13 | elapsed:    2.5s remaining:    4.1s
[Parallel(n_jobs=-1)]: Done  13 out of  13 | elapsed:    4.5s finished

[2025-03-30 02:20:31] Features: 12/1 -- score: 0.9861111111111112[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of  12 | elapsed:    2.8s remaining:    5.8s
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:    4.8s finished

[2025-03-30 02:20:36] Features: 11/1 -- score: 0.9861111111111112[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of  11 | elapsed:    2.8s remaining:   12.9s
[Parallel(n_jobs=-1)]: Done   8 out of  11 | elapsed:    2.9s remaining:    1.0s
[Parallel(n_jobs=-1)]: Done  11 out of  11 | elapsed:    4.4s finished

[2025-03-30 02:20:41] Features: 10/1 -- score: 0.9791666666666666[Parallel(n_jobs=-1)]: Using backend 

In [21]:
# Alias the backward selector (fitted above under the name `sfs`) as `sbs`,
# then show its cross-validation accuracy.
sbs = sfs
sbs.k_score_

np.float64(0.9859126984126985)

In [22]:
# Names of the features kept by backward selection.

sbs.k_feature_names_

('alcohol',
 'malic_acid',
 'ash',
 'alcalinity_of_ash',
 'magnesium',
 'flavanoids',
 'nonflavanoid_phenols',
 'color_intensity')

# Exhaustive Feature Selection (EFS)

Let's go ahead and learn about the Exhaustive Feature Selection(EFS).

In [2]:
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS

In [None]:
# Exhaustive search: score every feature subset of size 4 (min_features) or
# 5 (max_features).
# Use cv=4, matching the sequential selectors earlier in the notebook. With
# cv=None mlxtend skips cross-validation and scores each subset on the same
# data it was fitted on, so the accuracies cannot rank subsets meaningfully.
# Cross-validating multiplies the number of model fits, so expect a long run.

efs = EFS(RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1),
          min_features=4,
          max_features=5,
          scoring='accuracy',
          cv=4,
          n_jobs=-1
         ).fit(X_train, y_train)

In [29]:
# Exhaustive feature selection over subset sizes 4 and 5 of the 13 features evaluates 2002 subsets:
# C(13, 4) + C(13, 5) = 715 + 1287

715 + 1287

2002

In [None]:
# Best accuracy found by the exhaustive search.

efs.best_score_

In [None]:
# Names of the features in the subset that achieved the best score.

efs.best_feature_names_

In [None]:
# Plotting helper for selector metric dictionaries (fixes the `rom` typo that
# made this cell a SyntaxError).
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs

In [None]:
# Plot the selector's scores with a standard-deviation band.
# NOTE(review): plot_sfs is written for sequential selectors; for EFS the metric
# dict is presumably keyed by subset index rather than by feature count, so the
# x-axis may not mean "number of features" -- confirm before relying on the title.

plot_sfs(efs.get_metric_dict(), kind='std_dev')
plt.title('Performance of the EFS algorithm with changing number of features')
plt.show()