# Univariate Feature Selection

In [1]:
# Feature Extraction with Univariate Statistical Tests (Chi-squared for classification)
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest, chi2

In [2]:
dataframe = pd.read_csv('pima-indians-diabetes.data.csv')
dataframe.head()

Unnamed: 0,6,148,72,35,0,33.6,0.627,50,1
0,1,85,66,29,0,26.6,0.351,31,0
1,8,183,64,0,0,23.3,0.672,32,1
2,1,89,66,23,94,28.1,0.167,21,0
3,0,137,40,35,168,43.1,2.288,33,1
4,5,116,74,0,0,25.6,0.201,30,0


In [4]:
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = pd.read_csv('pima-indians-diabetes.data.csv',names=names)
dataframe.head()

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [5]:
array = dataframe.values
array

array([[  6.   , 148.   ,  72.   , ...,   0.627,  50.   ,   1.   ],
       [  1.   ,  85.   ,  66.   , ...,   0.351,  31.   ,   0.   ],
       [  8.   , 183.   ,  64.   , ...,   0.672,  32.   ,   1.   ],
       ...,
       [  5.   , 121.   ,  72.   , ...,   0.245,  30.   ,   0.   ],
       [  1.   , 126.   ,  60.   , ...,   0.349,  47.   ,   1.   ],
       [  1.   ,  93.   ,  70.   , ...,   0.315,  23.   ,   0.   ]])

In [6]:
X = array[:, 0:8]
Y = array[:, 8]

# feature extraction
test = SelectKBest(score_func=chi2, k=4)
fit = test.fit(X, Y)

# summarize scores
np.set_printoptions(precision=3)
print(fit.scores_)
features = fit.transform(X)

[ 111.52  1411.887   17.605   53.108 2175.565  127.669    5.393  181.304]


In [7]:
names

['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']

# Recursive Feature Elimination

In [8]:
# Feature Extraction with RFE
import pandas as pd
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

In [9]:
# Load Data
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = pd.read_csv('pima-indians-diabetes.data.csv',names=names)

array = dataframe.values
X = array[:, 0:8]
Y = array[:, 8]

In [12]:
# Feature extraction 
model = LogisticRegression(max_iter=400)
rfe = RFE(model,n_features_to_select=  3)
fit = rfe.fit(X, Y)

In [13]:
# Num Features:
fit.n_features_

3

In [14]:
# Selected Features:
fit.support_

array([ True, False, False, False, False,  True,  True, False])

In [15]:
# Feature Ranking:
fit.ranking_

array([1, 2, 4, 6, 5, 1, 1, 3])

In [16]:
names

['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']

# Feature Importance using Decision Tree

In [17]:
# Feature Importance with Extra Trees Classifier
import pandas as pd
from sklearn.tree import DecisionTreeClassifier

In [18]:
# Load Data
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = pd.read_csv('pima-indians-diabetes.data.csv',names=names)

array = dataframe.values
X = array[:, 0:8]
Y = array[:, 8]

In [19]:
# feature extraction
model = DecisionTreeClassifier()

In [20]:
model.fit(X,Y)

DecisionTreeClassifier()

In [21]:
print(model.feature_importances_)

[0.061 0.319 0.101 0.025 0.038 0.208 0.124 0.124]
