In [1]:
# Variance-threshold feature selection
from sklearn.datasets import load_iris
from sklearn.feature_selection import VarianceThreshold
import numpy as np

# Load the iris dataset and unpack the feature matrix and the labels
iris = load_iris()
X, y = iris.data, iris.target

# Report the number of features and the per-feature variance of the raw data
print('原数据集中的特征数：\n', X.shape[1], '\n')
print('原数据集中不同特征的方差：\n', np.var(X, axis=0), '\n')

# Use VarianceThreshold to filter out the features whose variance is below 0.6
selector = VarianceThreshold(threshold=0.6)
X_new = selector.fit(X).transform(X)

# Report how many features survive the filter
print('方差阈值法选择的特征数：\n', X_new.shape[1])

原数据集中的特征数：
 4 

原数据集中不同特征的方差：
 [0.68112222 0.18871289 3.09550267 0.57713289] 

方差阈值法选择的特征数：
 2


In [3]:
# SelectKBest
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.utils import Bunch
import pandas as pd
import numpy as np

# load_boston was deprecated in scikit-learn 1.0 and removed in 1.2, so an
# unconditional import makes this cell fail on current releases. Keep using the
# bundled loader when it exists and otherwise rebuild the same arrays from the
# original source, following the recipe in scikit-learn's deprecation notice.
# NOTE: the scikit-learn maintainers discourage this dataset because of an
# ethical problem; prefer fetch_california_housing for new work.
try:
    from sklearn.datasets import load_boston
except ImportError:
    data_url = "http://lib.stat.cmu.edu/datasets/boston"
    # Each record spans two physical lines in the raw file, hence the
    # even/odd row interleaving below. Raw string avoids an invalid "\s" escape.
    raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)
    boston = Bunch(
        data=np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]]),
        target=raw_df.values[1::2, 2],
    )
else:
    boston = load_boston()

X = boston.data
y = boston.target
print(X.shape)
print(y.shape)

(506, 13)
(506,)



    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np

        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_ho

In [4]:
# Linear regression on the unfiltered feature matrix X.
# The fixed random_state keeps the train/test partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1001)

# fit() returns the estimator itself, so construction and fitting can be chained.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Score the held-out predictions and report each metric.
r2 = r2_score(y_test, y_pred)
print('R2: ', r2)
mse = mean_squared_error(y_test, y_pred)
print('MSE: ', mse)

R2:  0.6783942923302058
MSE:  29.824006898863182


In [5]:
# Fit the selector on the training partition only: scoring features against the
# full (X, y) would let held-out targets influence which columns are kept and
# bias the evaluation. Using the same random_state as the evaluation split
# reproduces exactly the same row partition.
X_fit, _, y_fit, _ = train_test_split(X, y, random_state=1001)

selector = SelectKBest(f_regression, k=10)  # keep the 10 best-scoring features
selector.fit(X_fit, y_fit)

# Apply the learned column mask to every row so X_new keeps its (n_samples, k) shape.
X_new = selector.transform(X)
print(X_new.shape)

(506, 10)


In [6]:
# Linear regression on the reduced feature matrix X_new.
# The fixed random_state keeps the train/test partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(X_new, y, random_state=1001)

# fit() returns the estimator itself, so construction and fitting can be chained.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Score the held-out predictions and report each metric.
r2 = r2_score(y_test, y_pred)
print('R2: ', r2)
mse = mean_squared_error(y_test, y_pred)
print('MSE: ', mse)

R2:  0.682644378927699
MSE:  29.429876418646238


SelectFpr方法则要求对假设检验有一定的认识。这种方法基于FPR（False Positive Rate，假阳性率）检验：假阳性率是指在实际为负的样本中，被我们错误地预测为正的样本所占的比例，也就是假设检验中第一类错误发生的概率。

In [8]:
# SelectFpr
from sklearn.feature_selection import SelectFpr

# Split before selecting: fitting the selector on the full (X, y) would leak
# held-out targets into the choice of features and bias the scores below.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, random_state=1001)

selector = SelectFpr(f_regression, alpha=0.00001)
X_train = selector.fit_transform(X_train_raw, y_train)
X_test = selector.transform(X_test_raw)

# Keep X_new (all rows, selected columns) so the shape report is unchanged in form.
X_new = selector.transform(X)
print(X_new.shape)

# Fit and evaluate a linear model on the selected features.
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
print('R2: ', r2)
print('MSE: ', mse)

(506, 12)
R2:  0.6935405006596658
MSE:  28.419427903725335


SelectFdr则是根据错误发现率（FDR，False Discovery Rate）来筛选特征。错误发现率与假阳性率非常相似，但是却有着本质的不同。假阳性率是在所有实际为负（原假设为真）的样本中发生第一类错误的比例，而错误发现率则仅仅关注在我们拒绝原假设（预测为正）的样本中，有多大比例是犯了第一类错误的（即在"发现"的样本里，有多少是错误的"发现"）。

In [10]:
# SelectFdr
from sklearn.feature_selection import SelectFdr

# Split before selecting: fitting the selector on the full (X, y) would leak
# held-out targets into the choice of features and bias the scores below.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, random_state=1001)

selector = SelectFdr(f_regression, alpha=0.00001)
X_train = selector.fit_transform(X_train_raw, y_train)
X_test = selector.transform(X_test_raw)

# Keep X_new (all rows, selected columns) so the shape report is unchanged in form.
X_new = selector.transform(X)
print(X_new.shape)

# Fit and evaluate a linear model on the selected features.
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
print('R2: ', r2)
print('MSE: ', mse)

(506, 12)
R2:  0.6935405006596658
MSE:  28.419427903725335


在多重假设检验中，总体错误率（族系误差率，FWER，family-wise error rate）是另一个常用的错误控制指标。它与FDR的区别在于：FWER是指在所有检验中至少出现一次第一类错误的概率，而FDR关注的则是在被拒绝原假设（预测为正）的样本中，第一类错误所占的比例。

In [11]:
# SelectFwe
from sklearn.feature_selection import SelectFwe

# Split before selecting: fitting the selector on the full (X, y) would leak
# held-out targets into the choice of features and bias the scores below.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, random_state=1001)

selector = SelectFwe(f_regression, alpha=0.0000001)
X_train = selector.fit_transform(X_train_raw, y_train)
X_test = selector.transform(X_test_raw)

# Keep X_new (all rows, selected columns) so the shape report is unchanged in form.
X_new = selector.transform(X)
print(X_new.shape)

# Fit and evaluate a linear model on the selected features.
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
print('R2: ', r2)
print('MSE: ', mse)

(506, 11)
R2:  0.690619122897421
MSE:  28.69034097665137


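GenericUnivariateSelect将五种单变量特征选择策略（SelectPercentile、SelectKBest、SelectFpr、SelectFdr、SelectFwe，分别对应mode取值'percentile'、'k_best'、'fpr'、'fdr'、'fwe'）集成到了一起，我们只需将评分函数、选择模式（mode）及其对应的参数（param）以参数的形式传递进来即可。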

In [9]:
# GenericUnivariateSelect
from sklearn.feature_selection import GenericUnivariateSelect

# Split before selecting: fitting the selector on the full (X, y) would leak
# held-out targets into the choice of features and bias the scores below.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, random_state=1001)

# mode picks the selection strategy; param is that strategy's parameter.
selector = GenericUnivariateSelect(f_regression, mode='fpr', param=0.0000001)
X_train = selector.fit_transform(X_train_raw, y_train)
X_test = selector.transform(X_test_raw)

# Keep X_new (all rows, selected columns) so the shape report is unchanged in form.
X_new = selector.transform(X)
print(X_new.shape)

# Fit and evaluate a linear model on the selected features.
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
print('R2: ', r2)
print('MSE: ', mse)

(506, 12)
R2:  0.6935405006596658
MSE:  28.419427903725335
