# Feature selection techniques

* SelectKBest - univariate statistical method
* Recursive feature elimination
* PCA
* Tree-based selection

In [4]:
import pandas as pd
import numpy as np

# Column labels for the frame; the split cell below treats the first eight
# as predictors and the last one ('class') as the target.
names = [
    'preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age',
    'class',
]
filename = 'https://gist.githubusercontent.com/SoumenAtta/e00bd5cb6ed13a983bf48b845325c837/raw/d5e3560b8bfe97ba126b5e94bed4487c2b1ed787/diabetes.csv'
# skiprows=1 discards the file's first line (presumably its own header row)
# so that the labels above are used as column names instead.
df = pd.read_csv(
    filename,
    skiprows=1,
    names=names,
)

In [5]:
# Work on the raw NumPy values: columns 0-7 form the feature matrix X,
# column 8 is the target vector y.
array = df.values
X, y = array[:, :8], array[:, 8]

# Univariate feature selection method

In [19]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import chi2

In [20]:
# Sanity check: (rows, feature columns) of the matrix handed to the selectors.
X.shape

(768, 8)

In [21]:
# Keep the k=4 features with the highest chi-squared score against y.
# NOTE(review): chi2 expects non-negative feature values — confirm for this data.
# For a class-label target the F-test alternative is f_classif; f_regression
# (imported above) is intended for continuous targets.
selectkbest = SelectKBest(score_func=chi2, k=4)

In [22]:
# Score every column of X against y; the fitted selector is kept as `sfit`.
sfit = selectkbest.fit(X, y)

In [23]:
# One score per feature column of X (8 values).
sfit.scores_

array([ 111.51969064, 1411.88704064,   17.60537322,   53.10803984,
       2175.56527292,  127.66934333,    5.39268155,  181.30368904])

In [24]:
# Column labels for reading the scores above. Note the list has 9 entries:
# the last one, 'class', is the target, so only the first 8 line up with the scores.
names

['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']

In [25]:
# Reduce X to the 4 columns retained by the fitted selector.
sfit.transform(X)

array([[148. ,   0. ,  33.6,  50. ],
       [ 85. ,   0. ,  26.6,  31. ],
       [183. ,   0. ,  23.3,  32. ],
       ...,
       [121. , 112. ,  26.2,  30. ],
       [126. ,   0. ,  30.1,  47. ],
       [ 93. ,   0. ,  30.4,  23. ]])

# Recursive feature elimination (RFE)

In [26]:
# Show the full 8-column feature matrix again before running RFE on it.
X

array([[  6.   , 148.   ,  72.   , ...,  33.6  ,   0.627,  50.   ],
       [  1.   ,  85.   ,  66.   , ...,  26.6  ,   0.351,  31.   ],
       [  8.   , 183.   ,  64.   , ...,  23.3  ,   0.672,  32.   ],
       ...,
       [  5.   , 121.   ,  72.   , ...,  26.2  ,   0.245,  30.   ],
       [  1.   , 126.   ,  60.   , ...,  30.1  ,   0.349,  47.   ],
       [  1.   ,  93.   ,  70.   , ...,  30.4  ,   0.315,  23.   ]])

In [32]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

from warnings import filterwarnings
# Silences every warning for the rest of the session.
# NOTE(review): this also hides solver convergence and deprecation warnings from
# the cells below; consider restricting the filter to a specific category.
filterwarnings('ignore')

In [33]:
# Recursive feature elimination: repeatedly fit a logistic regression and drop
# the weakest feature until 4 remain.
# max_iter is raised from the default of 100 because the default solver usually
# fails to converge on these unscaled features within 100 iterations, and the
# resulting warning is hidden by the blanket filterwarnings('ignore') above.
# Rankings taken from a non-converged model are unreliable.
rfe = RFE(LogisticRegression(max_iter=1000), n_features_to_select=4)

In [34]:
# Run the elimination on the full feature matrix; populates ranking_ and support_.
rfe.fit(X, y)

In [31]:
# Per-feature rank; the selected features have rank 1.
# NOTE(review): this cell's execution count (31) is lower than the fit cell's (34),
# so the output shown may come from an earlier fit — re-run after fitting.
rfe.ranking_

array([1, 1, 3, 4, 5, 1, 1, 2])

In [35]:
# Boolean mask over the 8 feature columns: True marks a retained feature.
rfe.support_

array([ True,  True, False, False, False,  True,  True, False])

In [36]:
# Column labels for reading the mask above; the trailing 'class' entry is the
# target and has no counterpart in the 8-element mask.
names

['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']

In [37]:
# Reduce X to the 4 columns flagged True in rfe.support_.
rfe.transform(X)

array([[  6.   , 148.   ,  33.6  ,   0.627],
       [  1.   ,  85.   ,  26.6  ,   0.351],
       [  8.   , 183.   ,  23.3  ,   0.672],
       ...,
       [  5.   , 121.   ,  26.2  ,   0.245],
       [  1.   , 126.   ,  30.1  ,   0.349],
       [  1.   ,  93.   ,  30.4  ,   0.315]])

In [38]:
# First row of the untransformed matrix, for comparison with the first row of
# the reduced output above.
X[0, :]

array([  6.   , 148.   ,  72.   ,  35.   ,   0.   ,  33.6  ,   0.627,
        50.   ])

# Principal Component Analysis (PCA)

In [39]:
from sklearn.decomposition import PCA

In [40]:
# Show the full 8-column feature matrix again before fitting PCA on it.
X

array([[  6.   , 148.   ,  72.   , ...,  33.6  ,   0.627,  50.   ],
       [  1.   ,  85.   ,  66.   , ...,  26.6  ,   0.351,  31.   ],
       [  8.   , 183.   ,  64.   , ...,  23.3  ,   0.672,  32.   ],
       ...,
       [  5.   , 121.   ,  72.   , ...,  26.2  ,   0.245,  30.   ],
       [  1.   , 126.   ,  60.   , ...,  30.1  ,   0.349,  47.   ],
       [  1.   ,  93.   ,  70.   , ...,  30.4  ,   0.315,  23.   ]])

In [41]:
# Project the 8 features onto 3 principal components.
pca = PCA(n_components=3)

In [42]:
# Fit the components on the raw feature matrix.
# NOTE(review): the features are not standardized first, so the result is driven
# by scale — in the components shown below, the first one loads ~0.99 on column
# index 4 ('test'), the column with by far the largest variance. Consider scaling.
# NOTE(review): PCA is unsupervised; y is presumably ignored here — confirm.
pca.fit(X, y)

In [43]:
# Loadings: one row per component (3), one column per original feature (8).
pca.components_

array([[-2.02176587e-03,  9.78115765e-02,  1.60930503e-02,
         6.07566861e-02,  9.93110844e-01,  1.40108085e-02,
         5.37167919e-04, -3.56474430e-03],
       [-2.26488861e-02, -9.72210040e-01, -1.41909330e-01,
         5.78614699e-02,  9.46266913e-02, -4.69729766e-02,
        -8.16804621e-04, -1.40168181e-01],
       [-2.24649003e-02,  1.43428710e-01, -9.22467192e-01,
        -3.07013055e-01,  2.09773019e-02, -1.32444542e-01,
        -6.39983017e-04, -1.25454310e-01]])

In [44]:
# Variance captured by each of the 3 components (in the units of the raw features).
pca.explained_variance_

array([13456.57298102,   932.76013231,   390.57783115])

In [45]:
# Coordinates of every row of X in the 3-component space.
pca.transform(X)

array([[-75.71465491, -35.95078264,  -7.26078895],
       [-82.3582676 ,  28.90821322,  -5.49667139],
       [-74.63064344, -67.90649647,  19.46180812],
       ...,
       [ 32.11319827,   3.3766648 ,  -1.58786446],
       [-80.21449431, -14.18601977,  12.3512639 ],
       [-81.30814972,  21.62149606,  -8.15276833]])

# Variance threshold - Feature selection

In [47]:
# Show the full 8-column feature matrix again before applying the variance filter.
X

array([[  6.   , 148.   ,  72.   , ...,  33.6  ,   0.627,  50.   ],
       [  1.   ,  85.   ,  66.   , ...,  26.6  ,   0.351,  31.   ],
       [  8.   , 183.   ,  64.   , ...,  23.3  ,   0.672,  32.   ],
       ...,
       [  5.   , 121.   ,  72.   , ...,  26.2  ,   0.245,  30.   ],
       [  1.   , 126.   ,  60.   , ...,  30.1  ,   0.349,  47.   ],
       [  1.   ,  93.   ,  70.   , ...,  30.4  ,   0.315,  23.   ]])

In [48]:
from sklearn.feature_selection import VarianceThreshold

In [49]:
# Selector that drops features whose variance falls below the threshold.
# NOTE(review): 200 is an absolute cut-off on unscaled features, so it depends on
# each column's units; the choice of value is not explained here.
vt = VarianceThreshold(threshold=200)

In [50]:
# Compute per-feature variances on X; the fitted selector is kept as `vt_fit`.
# NOTE(review): in the visible cells the selector is only inspected, never
# applied with transform().
vt_fit = vt.fit(X, y)

In [51]:
# Variance of each of the 8 feature columns; compare against threshold=200.
vt_fit.variances_

array([1.13392724e+01, 1.02091726e+03, 3.74159449e+02, 2.54141900e+02,
       1.32638869e+04, 6.20790465e+01, 1.09635697e-01, 1.38122964e+02])

In [52]:
# Column labels for reading the variances above; the trailing 'class' entry is
# the target and has no counterpart among the 8 variances.
names

['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']

# Tree-based method for feature selection

In [60]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier

In [61]:
# Two randomized tree ensembles used only for their feature importances.
# A fixed random_state makes the importances reproducible across re-runs;
# without it every execution yields different values.
rf = RandomForestClassifier(random_state=42)
et = ExtraTreesClassifier(random_state=42)

In [62]:
# Train both ensembles on the full data set (random forest first, then extra
# trees) and keep the fitted estimators for inspection.
rf_fit, et_fit = rf.fit(X, y), et.fit(X, y)

In [63]:
# Extra-trees importance of each of the 8 feature columns.
et_fit.feature_importances_

array([0.11163929, 0.23728995, 0.10266789, 0.07790683, 0.07596906,
       0.13881141, 0.11561966, 0.1400959 ])

In [58]:
# Random-forest importance of each of the 8 feature columns.
# NOTE(review): this cell's execution count (58) is lower than the fit cell's (62),
# so the output shown may come from an earlier fit — re-run after fitting.
rf_fit.feature_importances_

array([0.08735311, 0.25715049, 0.09360162, 0.06845722, 0.07201015,
       0.17150311, 0.12183458, 0.12808972])

In [59]:
# Column labels for reading the importances above; the trailing 'class' entry is
# the target and has no counterpart among the 8 importances.
names

['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']