# Chapter 10: Dimensionality Reduction Using Feature Selection

In [1]:
from IPython.core.interactiveshell import InteractiveShell
# display the value of every top-level expression in a cell, not only the last one
InteractiveShell.ast_node_interactivity = 'all'

from sklearn import datasets
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import chi2, f_classif
from sklearn.feature_selection import RFECV

import numpy as np
import pandas as pd

## 10.1 Thresholding Numeric Feature Variance
Variance Thresholding (VT) is motivated by the idea that features with low variance are likely less useful than those with high variance.

First, calculate the variance of each feature, where $x$ is the feature vector, $x_i$ is an individual value of that feature, and $\mu$ is the mean of that feature.
$$
\text{Var}(x)=\frac{1}{n}\sum_{i=1}^{n}(x_i-\mu)^2
$$
Second, drop all features whose variance is lower than the threshold.

In [2]:
# load the iris measurements and their class labels
iris = datasets.load_iris()
features, target = iris.data, iris.target
# selector that drops low-variance features, using 0.5 as the cut-off
threshold = VarianceThreshold(threshold=0.5)
# fit on the data, then keep only the high-variance columns
features_high_variance = threshold.fit(features).transform(features)
# preview the first three rows and the shape of the reduced matrix
print(features_high_variance[:3])
print(features_high_variance.shape)

[[5.1 1.4 0.2]
 [4.9 1.4 0.2]
 [4.7 1.3 0.2]]
(150, 3)


In [3]:
# view the variance of each feature (this refits the selector on the same data)
threshold.fit(features).variances_

array([0.68112222, 0.18871289, 3.09550267, 0.57713289])

In [4]:
# Caution: do not standardize features before variance thresholding.
# After standardization every feature has variance 1, so no threshold
# can tell the features apart.
features_standardized = StandardScaler().fit_transform(features)
VarianceThreshold().fit(features_standardized).variances_

array([1., 1., 1., 1.])

## 10.2 Thresholding Binary Feature Variance

In [5]:
# Share of observations equal to 0 in each binary feature:
#   feature 0 -> 0.8
#   feature 1 -> 0.2
#   feature 2 -> 0.6
features = [
    [0, 1, 0],
    [0, 1, 1],
    [0, 1, 0],
    [0, 1, 0],
    [1, 0, 1],
]
# a Bernoulli feature with class share p has variance p * (1 - p)
p = 0.75
bernoulli_variance = p * (1 - p)
threshold = VarianceThreshold(threshold=bernoulli_variance)
# fit the selector and show the surviving column(s)
threshold.fit_transform(features)
# compare the cut-off against the variance of each feature
print(bernoulli_variance)
print(threshold.variances_)

array([[0],
       [1],
       [0],
       [0],
       [1]])

0.1875
[0.16 0.16 0.24]


## 10.3 Handling Highly Correlated Features

In [6]:
# small feature matrix in which columns 0 and 1 are almost perfectly correlated
features = np.array(
            [[1, 1, 1],
            [2, 2, 0],
            [3, 3, 1],
            [4, 4, 0],
            [5, 5, 1],
            [6, 6, 0],
            [7, 7, 1],
            [8, 7, 0],
            [9, 7, 1]])
df = pd.DataFrame(features)
# show the signed correlation matrix
df.corr()
# absolute correlations, with the diagonal (self-correlation) blanked out
corr_matrix = df.corr().abs()
corr_matrix = corr_matrix.mask(np.eye(corr_matrix.shape[0], dtype=bool))
# keep only the entries greater than 0.95; everything else becomes NaN
corr_matrix = corr_matrix.where(corr_matrix > 0.95)
# select the highly correlated columns with a boolean mask over the column
# labels; .loc is used because the mask is label-aligned, so this also works
# when the columns are not a default 0..n-1 integer range
highly_correlated = df.loc[:, corr_matrix.any()]
highly_correlated

Unnamed: 0,0,1,2
0,1.0,0.976103,0.0
1,0.976103,1.0,-0.034503
2,0.0,-0.034503,1.0


Unnamed: 0,0,1
0,1,1
1,2,2
2,3,3
3,4,4
4,5,5
5,6,6
6,7,7
7,8,7
8,9,7


## 10.4 Removing Irrelevant Features for Classification

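**This part uses two statistical tests: the chi-square test for categorical features and the Analysis of Variance (ANOVA) F-value for quantitative features.**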

The chi-square ($\chi^2$) statistic examines the independence of two categorical vectors:
$$
\chi^2=\sum_{i=1}^n\frac{(O_i-E_i)^2}{E_i}
$$
$O_i$ is the number of observations in class $i$, and $E_i$ is the number of observations in class $i$ we would expect if there were no relationship between the feature and the target.

Alternatively, for quantitative features, we can use `f_classif` to calculate the ANOVA F-value statistic between each feature and the target.

In [7]:
iris = datasets.load_iris()
# truncate the measurements to integers so they can be treated as categorical data
features = iris.data.astype(int)
target = iris.target
# keep the two features with the highest chi-squared statistics
chi2_selector = SelectKBest(score_func=chi2, k=2)
features_kbest = chi2_selector.fit(features, target).transform(features)
print("Original number of features =", features.shape[1])
print("Reduced number of features =", features_kbest.shape[1])
# chi-squared statistics and p-values for every feature
print(chi2(features, target))

Original number of features = 4
Reduced number of features = 2
(array([ 10.28712871,   5.02267003, 133.06854839,  74.27906977]), array([5.83684799e-03, 8.11598175e-02, 1.27213107e-29, 7.42172639e-17]))


In [8]:
# keep the two features with the highest F-values
fvalue_selector = SelectKBest(score_func=f_classif, k=2)
features_kbest = fvalue_selector.fit(features, target).transform(features)
print("Original number of features =", features.shape[1])
print("Reduced number of features =", features_kbest.shape[1])
# F-values and p-values for every feature
print(f_classif(features, target))

Original number of features = 4
Reduced number of features = 2
(array([  81.19715 ,   33.715004, 1160.0116  ,  385.483   ], dtype=float32), array([1.7586086e-24, 8.8784123e-13, 0.0000000e+00, 0.0000000e+00],
      dtype=float32))


In [9]:
# keep the top 75% of features, ranked by F-value
fvalue_selector = SelectPercentile(score_func=f_classif, percentile=75)
features_kbest = fvalue_selector.fit(features, target).transform(features)
print("Original number of features =", features.shape[1])
print("Reduced number of features =", features_kbest.shape[1])

Original number of features = 4
Reduced number of features = 3


## 10.5 Recursively Eliminating Features
Recursive Feature Elimination using Cross Validation (RFECV) will repeatedly train a model, each time removing a feature until model performance becomes worse.

In [10]:
# synthetic regression problem: 100 features, of which only 2 are informative
features, target = datasets.make_regression(
    n_samples=10000,
    n_features=100,
    n_informative=2,
    random_state=1,
)
# ordinary least squares model used to score each feature subset
ols = linear_model.LinearRegression()
# eliminate one feature per step, scoring by negative mean squared error
rfecv = RFECV(
    estimator=ols,
    step=1,
    scoring="neg_mean_squared_error",
)
rfecv.fit(features, target)
# reduce the feature matrix to the selected features
rfecv.transform(features)
# number of best features
rfecv.n_features_

RFECV(estimator=LinearRegression(), scoring='neg_mean_squared_error')

array([[ 0.00850799,  0.7031277 ],
       [-1.07500204,  2.56148527],
       [ 1.37940721, -1.77039484],
       ...,
       [-0.80331656, -1.60648007],
       [ 0.39508844, -1.34564911],
       [-0.55383035,  0.82880112]])

2

In [11]:
# boolean mask marking which features were selected as best
rfecv.support_

array([False, False, False, False, False,  True, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False,  True, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False])

In [12]:
# rank of each feature; rank 1 marks the best (selected) features
rfecv.ranking_

array([54, 53, 79, 35, 44,  1,  3, 21, 13, 78, 14, 50, 80, 38, 45, 41, 20,
        9, 96, 43, 32, 91, 34, 85, 27, 98, 49, 15,  5, 10, 99, 28,  7, 19,
       30, 46, 60, 87, 11,  1, 12, 31, 97, 70, 33, 89, 65, 42, 76, 75, 56,
       84,  2, 16, 47, 81, 90, 23, 93, 36, 82, 37,  6, 62, 51, 39, 94, 83,
       73, 67, 92, 61, 59, 57, 24, 48, 71, 66, 25, 26, 22, 72,  8, 29, 77,
       40, 74, 88, 86, 18, 69, 52, 64,  4, 58, 95, 17, 68, 55, 63])