In [None]:
import pandas as pd


def load_data(n_samples=1000, n_features=10, n_informative=8, n_classes=4, random_state=None):
    """Generate a synthetic classification dataset for training and testing.

    The data comes from ``sklearn.datasets.make_classification``. Feature
    columns are named ``f_1`` ... ``f_<n_features>`` so the names always
    match the number of generated features.

    Args:
        n_samples: Number of rows to generate.
        n_features: Total number of feature columns.
        n_informative: Number of informative features passed to
            ``make_classification``.
        n_classes: Number of target classes.
        random_state: Optional seed forwarded to ``make_classification``;
            the default ``None`` keeps the original unseeded behaviour.

    Returns:
        A ``(df_x, df_y)`` tuple: a ``pandas.DataFrame`` of features and a
        ``pandas.Series`` of class labels.
    """
    from sklearn.datasets import make_classification

    data_x, data_y = make_classification(
        n_samples=n_samples,
        n_classes=n_classes,
        n_features=n_features,
        n_informative=n_informative,
        random_state=random_state,
    )
    # Derive the column names from n_features instead of a hand-written list
    # so the two can never drift apart.
    columns = ['f_{}'.format(idx) for idx in range(1, n_features + 1)]
    df_x = pd.DataFrame(data_x, columns=columns)
    df_y = pd.Series(data_y)
    return df_x, df_y


def logistic_rfe(x_data, y_data, n_features_to_select=5):
    """Rank features with recursive feature elimination (RFE).

    Wraps a ``LogisticRegression(random_state=0)`` estimator in ``RFE``,
    fits it, and prints the boolean support mask, the ranking array, and
    then, for every rank, the names of the columns holding that rank.

    Args:
        x_data: Feature table; its ``.columns`` attribute is indexed with a
            boolean mask, so a pandas DataFrame is expected.
        y_data: Target labels used to fit the selector.
        n_features_to_select: Number of features RFE keeps (these get
            rank 1). Defaults to 5, the value originally hard-coded.

    Returns:
        None. All results are printed.
    """
    from sklearn.feature_selection import RFE
    from sklearn.linear_model import LogisticRegression

    clf = LogisticRegression(random_state=0)
    # n_features_to_select is passed by keyword: current scikit-learn
    # releases no longer accept it as a positional argument.
    selector = RFE(clf, n_features_to_select=n_features_to_select)
    selector = selector.fit(x_data, y_data)
    print(selector.support_)  # True marks a retained feature
    # The retained columns can be obtained with x_data.loc[:, selector.support_]
    print(selector.ranking_)  # retained features have rank 1; larger rank = eliminated earlier
    # The upper bound is max() + 1 so the worst-ranked features are listed
    # too; range() excludes its stop value.
    for rank in range(selector.ranking_.min(), selector.ranking_.max() + 1):
        print(rank, x_data.columns[selector.ranking_ == rank].tolist())


def cv_ref_select(x_data, y_data):
    """Select features with cross-validated recursive feature elimination (RFECV).

    An ``RFECV`` selector is built around a ``DecisionTreeClassifier``,
    removing one feature per step and scoring with accuracy over a
    10-split ``StratifiedKFold``. After fitting, the names of the selected
    columns are printed.

    Args:
        x_data: Feature table; its ``.columns`` attribute is indexed with
            the selector's support mask, so a pandas DataFrame is expected.
        y_data: Target labels used to fit the selector.

    Returns:
        The result of ``transform(x_data)`` on the fitted selector, i.e.
        ``x_data`` reduced to the selected features.
    """
    from sklearn.feature_selection import RFECV
    from sklearn.model_selection import StratifiedKFold
    from sklearn.tree import DecisionTreeClassifier

    tree = DecisionTreeClassifier()
    folds = StratifiedKFold(10)
    eliminator = RFECV(
        estimator=tree,
        step=1,
        cv=folds,
        scoring='accuracy',
        n_jobs=1,
    )
    eliminator.fit(x_data, y_data)

    # Boolean mask over the columns: True where the feature was kept.
    kept_mask = eliminator.get_support()
    print("select feature: ", x_data.columns[kept_mask])

    reduced = eliminator.transform(x_data)
    return reduced


if __name__ == '__main__':
    # Build one synthetic dataset and feed it to both selection demos.
    features, labels = load_data()
    # Demo 1: RFE on logistic regression (prints mask, ranking, per-rank columns).
    logistic_rfe(features, labels)
    # Demo 2: RFECV on a decision tree (prints selected columns; return value unused).
    cv_ref_select(features, labels)

