# Feature Selection using concepts from the [scikit-learn feature selection guide](http://scikit-learn.org/stable/modules/feature_selection.html#removing-features-with-low-variance) and the [Machine Learning Mastery feature selection tutorial](http://machinelearningmastery.com/feature-selection-machine-learning-python/)

In [1]:
import pandas
from pandas import read_csv
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import VarianceThreshold
import numpy 

In [2]:
# Column labels applied to final.csv: six feature columns followed by the
# STYLE label column. Because explicit names are supplied, pandas reads the
# first row of the file as data rather than as a header.
names = [
    'edge_count',
    'Pentagons',
    'Triangles',
    'Squares',
    'Circle',
    'HalfCircles',
    'STYLE',
]
dataset = read_csv("final.csv", names=names)

In [3]:
# Work on the raw NumPy values of the DataFrame rather than the frame itself.
array = dataset.values

# Separate inputs from the target: columns 0-5 are the feature matrix X,
# column 6 (STYLE) is the label vector Y.
X, Y = array[:, :6], array[:, 6]


##  Recursive feature elimination

In [4]:
# Recursive feature elimination: repeatedly fit a logistic regression and
# prune features until only three remain.
model = LogisticRegression()
# n_features_to_select is passed by keyword; recent scikit-learn releases
# reject it as a positional argument.
rfe = RFE(model, n_features_to_select=3)
fit = rfe.fit(X, Y)

# The % formatting must happen inside the print(...) call. Written as
# `print("...") % value`, Python 3 evaluates `None % value` and raises
# TypeError after printing the unformatted template.
print("Num Features: %d" % fit.n_features_)
# Wrap arrays in a 1-tuple so they are always treated as a single %s argument.
print("Selected Features: %s" % (fit.support_,))
print("Feature Ranking: %s" % (fit.ranking_,))

Num Features: 3
Selected Features: [False  True False False  True  True]
Feature Ranking: [3 1 2 4 1 1]


## Principal Component Analysis

In [5]:
# Project the six features onto their first three principal components.
pca = PCA(n_components=3)
fit = pca.fit(X)

# Summarize components.
# The % formatting must happen inside the print(...) call. Written as
# `print("...") % value`, Python 3 evaluates `None % value` and raises
# TypeError after printing the unformatted template.
print("Explained Variance: %s" % (fit.explained_variance_ratio_,))
print(fit.components_)

Explained Variance: [ 0.88614694  0.10627074  0.00667554]
[[ 0.49622896  0.02169217  0.02046038  0.84761601  0.03321113  0.18251512]
 [-0.83309823 -0.01984062 -0.02191307  0.52577002 -0.02303145 -0.16765724]
 [ 0.24433406 -0.11021018 -0.12527079  0.07121203 -0.14583545 -0.9413409 ]]


## Feature Importance

In [6]:
# Estimate feature importances with an ensemble of extremely randomized trees.
# The estimator is stochastic, so a fixed random_state is supplied to make the
# importances reproducible when the notebook is re-run from a fresh kernel.
model = ExtraTreesClassifier(random_state=0)
model.fit(X, Y)
print(model.feature_importances_)

[ 0.24214763  0.12944489  0.1298218   0.14837885  0.14960963  0.2005972 ]


##  Univariate Selection

In [7]:
# Limit NumPy's printed precision to three digits. This is a global setting,
# so it also applies to arrays displayed by later cells.
numpy.set_printoptions(precision=3)

# Score each feature against the labels with the chi-squared statistic and
# keep the four highest-scoring features.
test = SelectKBest(score_func=chi2, k=4)
fit = test.fit(X, Y)
print(fit.scores_)

# Reduce X to the selected columns and preview the first six rows.
features = fit.transform(X)
print(features[:6])

[ 82632.408   2232.008   2307.303  92321.41    3328.157  23473.658]
[[  1.927e+03   5.880e+03   1.180e+02   6.020e+02]
 [  2.670e+02   8.400e+01   8.000e+00   4.500e+01]
 [  4.380e+02   2.800e+02   2.000e+01   1.100e+02]
 [  3.890e+02   3.110e+02   1.500e+01   1.130e+02]
 [  2.600e+01   3.000e+01   3.000e+00   1.900e+01]
 [  2.350e+02   6.100e+01   5.000e+00   9.000e+00]]


## Removing features with low variance

In [8]:
# Filter out low-variance features using a cutoff of 0.8 * (1 - 0.8).
# NOTE(review): this cutoff has the form p * (1 - p), the variance of a
# Bernoulli variable with p = 0.8, which is aimed at boolean features; the
# columns here appear to be counts - confirm the cutoff is meaningful.
sel = VarianceThreshold(threshold=0.8 * (1 - 0.8))
sel.fit(X).transform(X)

array([[  1.927e+03,   9.700e+01,   4.700e+01,   5.880e+03,   1.180e+02,
          6.020e+02],
       [  2.670e+02,   6.000e+00,   5.000e+00,   8.400e+01,   8.000e+00,
          4.500e+01],
       [  4.380e+02,   1.600e+01,   1.100e+01,   2.800e+02,   2.000e+01,
          1.100e+02],
       ..., 
       [  4.020e+02,   7.000e+00,   7.000e+00,   1.440e+02,   8.000e+00,
          5.800e+01],
       [  7.770e+02,   3.000e+00,   6.000e+00,   1.940e+02,   1.100e+01,
          7.800e+01],
       [  1.940e+02,   3.000e+00,   1.000e+01,   1.030e+02,   7.000e+00,
          3.700e+01]])