# Techniques for feature selection

* SelectKBest - univariate - statistical method
* Recursive feature elimination
* PCA
* Tree-based selection

In [4]:
import pandas as pd
import numpy as np

# Column labels for the diabetes CSV: eight feature columns followed by the
# target column 'class'.
names = [
    'preg', 'plas', 'pres', 'skin', 'test',
    'mass', 'pedi', 'age', 'class',
]
filename = 'https://gist.githubusercontent.com/SoumenAtta/e00bd5cb6ed13a983bf48b845325c837/raw/d5e3560b8bfe97ba126b5e94bed4487c2b1ed787/diabetes.csv'
# header=0 treats the file's first row as its header and substitutes `names` for it.
df = pd.read_csv(filename, header=0, names=names)

In [5]:
# Materialise the frame as a single NumPy array, then split it column-wise:
# the first eight columns are the features, column 8 is the target.
array = df.to_numpy()
X, y = array[:, :8], array[:, 8]

# Univariate feature selection method

In [19]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import chi2

In [20]:
# Sanity-check the dimensions of the feature matrix before selecting features.
X.shape

(768, 8)

In [21]:
# Keep the 4 highest-scoring features, scored with the chi-squared statistic.
# NOTE(review): the alternative previously noted here, f_regression, is aimed at
# continuous targets; for a class label, f_classif looks like the matching
# F-test scorer — confirm before swapping score_func.
selectkbest = SelectKBest(score_func=chi2, k=4)

In [22]:
# Score every feature column against the target; the fitted selector is kept as `sfit`.
sfit = selectkbest.fit(X, y)

In [23]:
# One chi-squared score per feature column, in the same order as the columns of X.
sfit.scores_

array([ 111.51969064, 1411.88704064,   17.60537322,   53.10803984,
       2175.56527292,  127.66934333,    5.39268155,  181.30368904])

In [24]:
# Column labels, for mapping score positions to feature names.
# The last label, 'class', is the target and has no score.
names

['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']

In [25]:
# Reduce X to the k=4 columns the selector kept.
sfit.transform(X)

array([[148. ,   0. ,  33.6,  50. ],
       [ 85. ,   0. ,  26.6,  31. ],
       [183. ,   0. ,  23.3,  32. ],
       ...,
       [121. , 112. ,  26.2,  30. ],
       [126. ,   0. ,  30.1,  47. ],
       [ 93. ,   0. ,  30.4,  23. ]])

# Recursive feature elimination

In [26]:
# Show the full (unreduced) feature matrix again before running recursive feature elimination.
X

array([[  6.   , 148.   ,  72.   , ...,  33.6  ,   0.627,  50.   ],
       [  1.   ,  85.   ,  66.   , ...,  26.6  ,   0.351,  31.   ],
       [  8.   , 183.   ,  64.   , ...,  23.3  ,   0.672,  32.   ],
       ...,
       [  5.   , 121.   ,  72.   , ...,  26.2  ,   0.245,  30.   ],
       [  1.   , 126.   ,  60.   , ...,  30.1  ,   0.349,  47.   ],
       [  1.   ,  93.   ,  70.   , ...,  30.4  ,   0.315,  23.   ]])

In [32]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

from warnings import filterwarnings
# NOTE(review): this silences every warning category for the rest of the session,
# which would include any solver convergence warning from the LogisticRegression
# fits below — confirm that hiding those is intended.
filterwarnings('ignore')

In [33]:
# Recursive feature elimination down to 4 features, using logistic regression
# as the estimator whose coefficients rank the features.
# max_iter is raised above scikit-learn's default of 100 so the solver has room
# to converge on these unscaled features; with the blanket warning filter in
# place, an unconverged fit would otherwise go unnoticed.
rfe = RFE(LogisticRegression(max_iter=1000), n_features_to_select=4)

In [34]:
# Run the elimination: repeatedly fit the estimator and prune features until 4 remain.
rfe.fit(X, y)

In [31]:
# Rank per feature column: 1 marks a selected feature; higher ranks were eliminated earlier.
rfe.ranking_

array([1, 1, 3, 4, 5, 1, 1, 2])

In [35]:
# Boolean mask over the feature columns: True where the feature was kept.
rfe.support_

array([ True,  True, False, False, False,  True,  True, False])

In [36]:
# Column labels, for mapping the mask positions to feature names.
# The last label, 'class', is the target and is not part of the mask.
names

['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']

In [37]:
# Keep only the columns that RFE selected.
rfe.transform(X)

array([[  6.   , 148.   ,  33.6  ,   0.627],
       [  1.   ,  85.   ,  26.6  ,   0.351],
       [  8.   , 183.   ,  23.3  ,   0.672],
       ...,
       [  5.   , 121.   ,  26.2  ,   0.245],
       [  1.   , 126.   ,  30.1  ,   0.349],
       [  1.   ,  93.   ,  30.4  ,   0.315]])

In [38]:
# First row of the original feature matrix, for comparison with the first row of the reduced array.
X[0, :]

array([  6.   , 148.   ,  72.   ,  35.   ,   0.   ,  33.6  ,   0.627,
        50.   ])