In [1]:
from sklearn import feature_selection

In [14]:
######################################################Removing features with low variance#########################################

In [15]:
# Boolean features are Bernoulli random variables, and the variance of such a variable is given by
#
#     Var[X] = p(1 - p)
#
# As an example, suppose that we have a dataset with boolean features and we want to remove every feature that
# is either one or zero (on or off) in more than 80% of the samples. We can do so by using the threshold .8 * (1 - .8).

In [16]:
# Variance cut-off equal to the variance of a boolean feature that is on (or off) in 80% of the samples.
clf = feature_selection.VarianceThreshold(threshold=0.8 * (1 - 0.8))

In [17]:
# Toy dataset: the first two columns alternate between 1 and 0, the third column is constant.
array = [
    [1, 1, 2],
    [0, 0, 2],
    [1, 1, 2],
    [0, 0, 2],
]

In [18]:
# Fit the variance filter on the toy data and display only the columns it keeps.
clf.fit_transform(X=array)

array([[1, 1],
       [0, 0],
       [1, 1],
       [0, 0]])

In [19]:
# As expected, VarianceThreshold has removed the third column: it contains a two in every sample (p = 4/4 > .8),
# so its variance is zero and falls below the threshold.

In [20]:
######################################################Univariate feature selection###############################################

In [22]:
# Univariate feature selection works by selecting the best features based on univariate statistical tests.

# - SelectKBest removes all but the k highest-scoring features.
# - SelectPercentile removes all but a user-specified highest-scoring percentage of features.
# - SelectFpr (false positive rate), SelectFdr (false discovery rate) and SelectFwe (family-wise error) apply
#   common univariate statistical tests to each feature.
# - GenericUnivariateSelect performs univariate feature selection with a configurable strategy. This makes it
#   possible to select the best univariate selection strategy with a hyper-parameter search estimator.

In [27]:
from sklearn.datasets import load_iris

In [76]:
from sklearn.feature_selection import SelectKBest,chi2,mutual_info_classif

In [31]:
from sklearn.svm import SVC

In [32]:
# Support-vector classifier created with the library's default hyper-parameters.
clf = SVC()

In [33]:
# Load the iris dataset bundle (feature matrix, target vector and metadata).
data = load_iris()

In [36]:
# Class label of every sample.
labels = data["target"]


In [39]:
# Feature matrix: one row per sample, one column per measurement.
data_features = data["data"]

In [41]:
from sklearn.model_selection import train_test_split as splitter

In [45]:
# Hold out 30% of the samples for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = splitter(
    data_features,
    labels,
    test_size=0.3,
    random_state=21,
)

In [50]:
# Train the classifier on the training split, using every available feature.
clf.fit(X=x_train, y=y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [51]:
# Baseline: score of the classifier on the held-out split when all features are used.
clf.score(X=x_test, y=y_test)

0.9333333333333333

In [79]:
# Keep the two features that score best under the chi-squared test.
# The selector is fitted on the training rows only: fitting it on the full dataset would let
# the held-out rows influence which features are kept (data leakage) and bias the score that
# is later measured on them.
# The split below uses the same test_size / random_state as the notebook's train/test splits,
# so the rows left out here are exactly the rows that end up in the test set.
selection_split = splitter(data_features, labels, test_size=0.3, random_state=21)
x_fit, y_fit = selection_split[0], selection_split[2]  # training features / training labels
selector = SelectKBest(chi2, k=2).fit(x_fit, y_fit)
# Apply the fitted selection to every row so that x_new keeps one row per sample.
x_new = selector.transform(data_features)

In [80]:
# Same 70/30 split and seed as before, this time on the reduced feature matrix.
x_train, x_test, y_train, y_test = splitter(
    x_new,
    labels,
    test_size=0.3,
    random_state=21,
)

In [81]:
# Re-train the classifier on the current training split.
clf.fit(X=x_train, y=y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [82]:
# Score of the re-trained classifier on the current held-out split.
clf.score(X=x_test, y=y_test)

0.9111111111111111

In [111]:
from sklearn.feature_selection import GenericUnivariateSelect as gus

In [120]:
# Generic selector configured to behave like a "keep the k best" selector with k = 2;
# the scoring function is left at its default.
clf_3 = gus(mode="k_best", param=2)

In [121]:
# Fit the selector on the full arrays, but preview only the first five transformed rows:
# printing one row per sample floods the notebook and hides the result.
clf_3.fit_transform(data_features, labels)[:5]

array([[1.4, 0.2],
       [1.4, 0.2],
       [1.3, 0.2],
       [1.5, 0.2],
       [1.4, 0.2]])

In [123]:
# Inspect the selector's configuration, including the scoring function it defaulted to.
clf_3.get_params(deep=True)

{'mode': 'k_best',
 'param': 2,
 'score_func': <function sklearn.feature_selection.univariate_selection.f_classif(X, y)>}

In [124]:
from sklearn.feature_selection import SelectFpr

In [133]:
# False-positive-rate selector with alpha set to 0.001.
clf_4 = SelectFpr(alpha=1e-3)

In [134]:
# Fit the selector on the full arrays, but preview only the first five transformed rows:
# printing one row per sample floods the notebook and hides the result.
# All four columns are still present, i.e. no feature was discarded at this alpha.
clf_4.fit_transform(data_features, labels)[:5]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2]])

<p>Mutual information:
Mutual information (MI) between two random variables is a non-negative value
that measures the dependency between the variables. It is equal to zero if and only if the two random variables are independent,
and higher values mean higher dependency. mutual_info_regression estimates the mutual information for a continuous target variable. The function relies on nonparametric methods based on entropy estimation
from k-nearest-neighbor distances.</p>


In [141]:
from sklearn.feature_selection import mutual_info_regression as mutual

In [143]:
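#clff=mutual()
# NOTE: mutual (mutual_info_regression) is a scoring function, not an estimator, so it cannot be called without
# arguments; it expects the feature matrix and the target, e.g. mutual(data_features, labels). The iris labels are
# discrete classes, for which mutual_info_classif (already imported in this notebook) is the matching function.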