# Feature Selection using concepts from the [scikit-learn feature selection guide](http://scikit-learn.org/stable/modules/feature_selection.html#removing-features-with-low-variance) and the [Machine Learning Mastery feature selection tutorial](http://machinelearningmastery.com/feature-selection-machine-learning-python/)

In [1]:
import pandas
from pandas import read_csv
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import LabelEncoder
import numpy 

## Loading the data

In [2]:
# Read the feature table (one header row) from the CSV next to the notebook
# and report its (rows, columns) shape.
dataset = read_csv("./features_with_headers.csv")
print(dataset.shape)

# Column names as a plain Python list; shown as the cell's output.
features = dataset.columns.tolist()
features

(2785, 9)


['edge_count',
 'pentagons',
 'triangles',
 'squares',
 'circles',
 'halfcircles',
 'blob_count',
 'color_count',
 'class']

In [3]:
# Predictor columns: every column of `dataset` except the "class" target.
# The list is rebuilt from the DataFrame instead of calling
# `features.remove("class")`, which mutates the list in place and raises
# ValueError the second time the cell is executed.
features = [column for column in dataset.columns if column != "class"]

# Feature matrix containing only the predictor columns.
X = dataset[features]
X.head(2)

Unnamed: 0,edge_count,pentagons,triangles,squares,circles,halfcircles,blob_count,color_count
0,986,61,86,641,53,252,178,2
1,217,10,11,360,8,84,30,3


In [4]:
# Turn the values of the "class" column into integer codes.
# `le` stays bound to the fitted encoder; `Y` is the encoded target vector.
le = LabelEncoder()
Y = le.fit_transform(dataset.loc[:, "class"])
Y

array([6, 6, 2, ..., 7, 3, 2])

##  Recursive feature elimination

In [5]:
# Recursive feature elimination with a logistic-regression estimator,
# asking RFE to keep 3 features.
model = LogisticRegression()
# `n_features_to_select` is passed by keyword: recent scikit-learn releases
# reject it as a positional argument.
rfe = RFE(model, n_features_to_select=3)
fit = rfe.fit(X, Y)

# The `%` formatting must happen inside the print() call. Written as
# `print("...") % value` it only works with the Python 2 print statement;
# under Python 3 print() returns None and `None % value` raises TypeError.
print("Num Features: %d" % fit.n_features_)
print("Selected Features: %s" % fit.support_)
print("Feature Ranking: %s" % fit.ranking_)

Num Features: 3
Selected Features: [False  True False False False False  True  True]
Feature Ranking: [5 1 2 6 3 4 1 1]


## Principal Component Analysis

In [6]:
# Project the feature matrix onto its first 3 principal components.
pca = PCA(n_components=3)
fit = pca.fit(X)

# Summarize components.
# The `%` formatting is applied inside print(): `print("...") % value` only
# works with the Python 2 print statement and raises TypeError on Python 3.
print("Explained Variance: %s" % fit.explained_variance_ratio_)
print(fit.components_)

Explained Variance: [ 0.77565207  0.203751    0.01435521]
[[  5.76642214e-01   2.70005517e-02   2.92523910e-02   7.89640086e-01
    3.10978745e-02   1.80485680e-01   9.39436024e-02   8.54144313e-06]
 [  7.93543203e-01   1.09189888e-02   1.17983390e-02  -6.01644946e-01
    2.29658720e-03   5.68540300e-02   6.94007117e-02  -6.93764275e-05]
 [ -1.94123153e-01   1.37929179e-01   1.79998007e-01  -1.13321023e-01
    1.04182347e-01   7.55352332e-01   5.62707404e-01   2.74940630e-04]]


## Feature Importance

In [7]:
# Tree-ensemble feature importances. ExtraTreesClassifier is randomized, so
# the seed is fixed to make the printed importances reproducible across
# kernel restarts.
model = ExtraTreesClassifier(random_state=0)
model.fit(X, Y)

# One importance value per column of X, in column order.
print(model.feature_importances_)

[ 0.18307776  0.11050935  0.11881409  0.12935678  0.11747446  0.14978981
  0.14570256  0.0452752 ]


##  Univariate Selection

In [8]:
# Limit numpy's printed float precision to 3 digits (process-wide setting).
numpy.set_printoptions(precision=3)

# Score each column of X against Y with the chi-squared statistic and keep
# the 4 highest-scoring columns.
test = SelectKBest(k=4, score_func=chi2)
fit = test.fit(X, Y)

# Per-feature scores, in the column order of X.
print(fit.scores_)

# NOTE: this rebinds `features` from the list of column names to the
# reduced feature matrix returned by transform().
features = fit.transform(X)

# First six rows of the selected columns.
print(features[:6, :])

[  5.838e+05   2.207e+04   3.546e+04   6.217e+05   2.474e+04   1.427e+05
   9.529e+04   7.608e+01]
[[986 641 252 178]
 [217 360  84  30]
 [507  12  41  16]
 [258 102  36   3]
 [ 15   0   1   0]
 [876 356 154 115]]


## Removing features with low variance

In [9]:
# Variance-based filter with threshold 0.8 * (1 - 0.8) = 0.16.
# NOTE(review): p * (1 - p) is the variance of a Bernoulli variable; the
# columns of X look like counts rather than booleans - confirm that this
# threshold is meaningful for this data.
sel = VarianceThreshold(threshold=0.8 * (1 - 0.8))

# Fit on X, then display the filtered matrix as the cell's output.
sel.fit(X).transform(X)

array([[ 986,   61,   86, ...,  252,  178,    2],
       [ 217,   10,   11, ...,   84,   30,    3],
       [ 507,    3,    1, ...,   41,   16,    2],
       ..., 
       [ 289,    1,    0, ...,   10,    5,    1],
       [1126,   58,   68, ...,  303,  151,    2],
       [ 269,   10,    6, ...,   59,   23,    1]])