In [None]:
import csv
import time
import pandas as pd
import numpy as np
from scipy import interp
from math import isnan
import matplotlib.pyplot as plt
from itertools import cycle
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_curve, auc, f1_score
from sklearn.externals import joblib
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier


# 特征选择

In [None]:
def gbdt_feature_selection(fe_name, matrix_x_temp, label_y, th):
    """Select features with a gradient-boosting model and report the ranking.

    A ``GradientBoostingClassifier`` (50 trees, ``random_state=100``) is
    fitted on the full matrix, then ``SelectFromModel`` keeps the columns
    whose importance is greater than or equal to the threshold.  The
    importance ranking and the chosen feature names are printed and written
    to ``../eda/gbdt_feature_importance.txt`` and
    ``../eda/gbdt_feature_chose.txt``.

    Args:
        fe_name: Feature names, paired positionally with the model's
            ``feature_importances_`` (i.e. with the matrix columns).
        matrix_x_temp: Feature matrix used to fit the model.
        label_y: Target labels.
        th: Threshold forwarded to ``SelectFromModel``.

    Returns:
        Tuple ``(matrix_x, feature_not_used_name, n_used)``: the reduced
        feature matrix, the names in ``fe_name`` that were not selected
        (in their original order) and the number of selected features.
    """
    # Part 1: fit the model and reduce the matrix.
    clf = GradientBoostingClassifier(n_estimators=50, random_state=100)
    clf.fit(matrix_x_temp, label_y)
    # prefit=True reuses the model fitted above instead of fitting again.
    sfm = SelectFromModel(clf, prefit=True, threshold=th)
    matrix_x = sfm.transform(matrix_x_temp)

    # Part 2: count the features whose importance is not zero.
    feature_score_dict = dict(zip(fe_name, clf.feature_importances_))
    zero_count = sum(1 for score in feature_score_dict.values() if score == 0.0)
    print('number of not-zero features:' + str(len(feature_score_dict) - zero_count))

    # Part 3: rank by importance (descending), print and persist the ranking.
    feature_score_dict_sorted = sorted(feature_score_dict.items(),
                                       key=lambda d: d[1], reverse=True)
    print('feature_importance:')
    for name, score in feature_score_dict_sorted:
        print(name, score)

    # Context manager so the handle is closed even if a write fails.
    with open('../eda/gbdt_feature_importance.txt', 'w') as f:
        f.write('Rank\tFeature Name\tFeature Importance\n')
        # Ranks are zero-based in the report file.
        for rank, (name, score) in enumerate(feature_score_dict_sorted):
            f.write(str(rank) + '\t' + str(name) + '\t' + str(score) + '\n')

    # Part 4: the first `how_long` ranked names are taken as the selected
    # features.  This relies on feature names being unique, so the
    # name->score dict lines up one-to-one with the matrix columns.
    how_long = matrix_x.shape[1]
    feature_used_name = [name for name, _ in feature_score_dict_sorted[:how_long]]

    print('feature_chooesed:')
    for name in feature_used_name:
        print(name)

    with open('../eda/gbdt_feature_chose.txt', 'w') as f:
        f.write('Feature Chose Name :\n')
        for name in feature_used_name:
            f.write(str(name) + '\n')

    # Part 5: names that were not selected, in the original order.
    used_lookup = set(feature_used_name)  # O(1) membership tests
    feature_not_used_name = [name for name in fe_name if name not in used_lookup]

    return matrix_x, feature_not_used_name, len(feature_used_name)

## `gbdt_feature_selection` 函数详解

此函数 `gbdt_feature_selection`  使用梯度提升决策树（GBDT）模型进行特征选择，并返回选中的特征、未选中的特征以及其他相关信息。该函数的主要步骤包括：模型训练和特征选择、统计非零特征数量、特征重要性排序和输出、获取选中的特征以及获取未被选中的特征。

### 函数定义

```python
def gbdt_feature_selection(fe_name, matrix_x_temp, label_y, th):
    # 函数体
```

*   **输入参数：**
    *   `fe_name`: 特征名称的列表。
    *   `matrix_x_temp`: 原始特征矩阵，是训练模型的输入。
    *   `label_y`: 目标变量，是监督学习模型的训练标签。
    *   `th`: 特征选择的阈值。
*   **返回值：**
    *   `matrix_x`: 经过特征选择后的特征矩阵。
    *   `feature_not_used_name`: 未被选中的特征名称列表。
    *   `len(feature_used_name)`: 选中的特征数量。

### 第一部分：模型训练和特征选择

使用 GBDT 模型进行特征选择，并根据重要性阈值筛选特征。

1.  **创建 GBDT 分类器：**
    ```python
    clf = GradientBoostingClassifier(n_estimators=50, random_state=100)
    ```
    *   使用 `GradientBoostingClassifier` 创建一个 GBDT 分类器，设置 `n_estimators=50` 表示使用 50 个决策树，`random_state=100` 用于保证结果的可重复性。
2.  **训练 GBDT 模型：**
    ```python
    clf.fit(matrix_x_temp, label_y)
    ```
    *   使用原始特征矩阵 `matrix_x_temp` 和目标变量 `label_y` 训练 GBDT 模型。
3.  **使用 `SelectFromModel` 进行特征选择：**
    ```python
    sfm = SelectFromModel(clf, prefit=True, threshold=th)
    matrix_x = sfm.transform(matrix_x_temp)
    ```
    *   使用 `SelectFromModel` 类，并传入已经训练好的 GBDT 模型 `clf` 和指定的阈值 `th`。`prefit=True` 表示模型已经训练好，无需再次拟合。`threshold` 参数用于指定特征重要性的阈值，只有重要性大于或等于 `th` 的特征才会被保留。
    *   `sfm.transform(matrix_x_temp)` 使用训练好的模型对特征矩阵进行转换，只保留重要性大于或等于阈值的特征。
    
**目的:** 通过训练 GBDT 模型，并设置特征重要性阈值 `th`，来筛选出对模型有重要贡献的特征。

### 第二部分：统计非零特征数量

统计特征重要性不为零的特征数量。

1.  **创建特征重要性字典：**
    ```python
    feature_score_dict = {}
    for fn, s in zip(fe_name, clf.feature_importances_):
        feature_score_dict[fn] = s
    ```
    *   将特征名称 `fe_name` 和对应的重要性分数 `clf.feature_importances_` 组合成一个字典 `feature_score_dict`。
2.  **计算重要性为零的特征数量：**
    ```python
    m = 0
    for k in feature_score_dict:
        if feature_score_dict[k] == 0.0:
            m += 1
    print('number of not-zero features:' + str(len(feature_score_dict) - m))
    ```
    *   遍历特征重要性字典，计算重要性为零的特征数量 `m`，并打印非零特征的数量。

**目的:** 统计有多少特征对模型训练有贡献。

### 第三部分：特征重要性排序和输出

将特征按照重要性排序，并将结果打印并保存到文件中。

1.  **按重要性降序排序：**
    ```python
    feature_score_dict_sorted = sorted(feature_score_dict.items(),
                                     key=lambda d: d[1], reverse=True)
    ```
    *   使用 `sorted` 函数对特征重要性字典进行排序，根据重要性分数降序排列。
2.  **打印特征重要性：**
    ```python
    print('feature_importance:')
    for ii in range(len(feature_score_dict_sorted)):
        print(feature_score_dict_sorted[ii][0], feature_score_dict_sorted[ii][1])
    ```
    *   遍历排序后的特征列表，打印特征名称和对应的特征重要性。
3.  **将特征重要性保存到文件：**
    ```python
    f = open('../eda/gbdt_feature_importance.txt', 'w')
    f.write('Rank\tFeature Name\tFeature Importance\n')
    for i in range(len(feature_score_dict_sorted)):
        f.write(str(i) + '\t' + str(feature_score_dict_sorted[i][0]) + '\t' + 
                str(feature_score_dict_sorted[i][1]) + '\n')
    f.close()
    ```
    *   将特征重要性排名写入到 `../eda/gbdt_feature_importance.txt` 文件中，包括排名、特征名称和特征重要性分数。

**目的:** 输出特征的重要性排序，方便查看哪些特征对模型的贡献最大，并保存结果以便后续分析。

### 第四部分：获取选中的特征

获取经过特征选择后保留的特征。

1.  **获取转换后的特征数量：**
    ```python
    how_long = matrix_x.shape[1]
    ```
    *   获取转换后特征矩阵 `matrix_x` 的列数，也就是保留的特征数量。
2.  **获取重要性最高的前 `how_long` 个特征：**
    ```python
    feature_used_dict_temp = feature_score_dict_sorted[:how_long]
    feature_used_name = []
    for ii in range(len(feature_used_dict_temp)):
        feature_used_name.append(feature_used_dict_temp[ii][0])
    ```
    *   从排序后的特征重要性列表中，取出前 `how_long` 个特征，并将特征名称存储到 `feature_used_name` 列表中。
3.  **打印选中的特征：**
    ```python
    print('feature_chooesed:')
    for ii in range(len(feature_used_name)):
        print(feature_used_name[ii])
    ```
    *   打印选中的特征名称。
4.  **将选中的特征保存到文件：**
    ```python
    f = open('../eda/gbdt_feature_chose.txt', 'w')
    f.write('Feature Chose Name :\n')
    for i in range(len(feature_used_name)):
        f.write(str(feature_used_name[i]) + '\n')
    f.close()
    ```
    *   将选中的特征名称写入到 `../eda/gbdt_feature_chose.txt` 文件中。

**目的:**  获取经过 GBDT 模型选择后保留下来的特征，用于后续模型训练。

### 第五部分：获取未被选中的特征

获取被 GBDT 模型排除掉的特征。

1.  **获取未被选中的特征名称：**
    ```python
    feature_not_used_name = []
    for i in range(len(fe_name)):
        if fe_name[i] not in feature_used_name:
            feature_not_used_name.append(fe_name[i])
    ```
    *   遍历原始特征列表 `fe_name`，如果特征名称没有在选中的特征名称列表 `feature_used_name` 中，则将其添加到 `feature_not_used_name` 列表中。

**目的:** 方便查看哪些特征在模型中被认为不重要。

### 返回值

```python
return matrix_x, feature_not_used_name, len(feature_used_name)
```

*   函数返回经过特征选择后的特征矩阵 `matrix_x`，未被选中的特征名称列表 `feature_not_used_name`，以及选中的特征数量 `len(feature_used_name)`。

**总结:**

`gbdt_feature_selection` 函数使用 GBDT 模型进行特征选择，并返回选择后的特征矩阵、未使用的特征名称列表以及选中的特征数量。该函数的主要步骤包括训练模型、计算特征重要性、根据阈值选择特征、打印并保存特征重要性排序、保存选中的特征。

In [None]:
def lgb_feature_selection(fe_name, matrix_x_temp, label_y, th):
    """Select features with a LightGBM model and report the ranking.

    An ``LGBMClassifier`` with 50 estimators is fitted on the full matrix,
    then ``SelectFromModel`` keeps the columns whose importance is greater
    than or equal to the threshold.  The importance ranking and the chosen
    feature names are printed and written to
    ``../eda/lgb_feature_importance.txt`` and ``../eda/lgb_feature_chose.txt``.
    A '0'/'1' "chromosome" string marking the selected features is printed
    and dumped to ``../config/chromosome.pkl``.

    Args:
        fe_name: Feature names, paired positionally with the model's
            ``feature_importances_`` (i.e. with the matrix columns).
        matrix_x_temp: Feature matrix used to fit the model.
        label_y: Target labels.
        th: Threshold forwarded to ``SelectFromModel``.

    Returns:
        Tuple ``(matrix_x, feature_not_used_name, n_used)``: the reduced
        feature matrix, the names in ``fe_name`` that were not selected
        (in their original order) and the number of selected features.
    """
    # Fit the model and reduce the matrix; prefit=True reuses the fitted model.
    clf = LGBMClassifier(n_estimators=50)
    clf.fit(matrix_x_temp, label_y)
    sfm = SelectFromModel(clf, prefit=True, threshold=th)
    matrix_x = sfm.transform(matrix_x_temp)

    # Report how many features have a non-zero importance.
    feature_score_dict = dict(zip(fe_name, clf.feature_importances_))
    zero_count = sum(1 for score in feature_score_dict.values() if score == 0.0)
    print('number of not-zero features:' + str(len(feature_score_dict) - zero_count))

    # Rank by importance (descending), print and persist the ranking.
    feature_score_dict_sorted = sorted(feature_score_dict.items(),
                                       key=lambda d: d[1], reverse=True)
    print('feature_importance:')
    for name, score in feature_score_dict_sorted:
        print(name, score)
    print('\n')

    # Context manager so the handle is closed even if a write fails.
    with open('../eda/lgb_feature_importance.txt', 'w') as f:
        f.write('Rank\tFeature Name\tFeature Importance\n')
        # Ranks are zero-based in the report file.
        for rank, (name, score) in enumerate(feature_score_dict_sorted):
            f.write(str(rank) + '\t' + str(name) + '\t' + str(score) + '\n')

    # matrix_x is the reduced input matrix; its column count tells how many
    # of the top-ranked names were selected.  This relies on feature names
    # being unique, so the name->score dict lines up with the columns.
    how_long = matrix_x.shape[1]
    feature_used_name = [name for name, _ in feature_score_dict_sorted[:how_long]]
    print('feature_chooesed:')
    for name in feature_used_name:
        print(name)
    print('\n')

    with open('../eda/lgb_feature_chose.txt', 'w') as f:
        f.write('Feature Chose Name :\n')
        for name in feature_used_name:
            f.write(str(name) + '\n')

    # Names that were not selected, in the original order.
    used_lookup = set(feature_used_name)  # O(1) membership tests
    feature_not_used_name = [name for name in fe_name if name not in used_lookup]

    # Build a chromosome such as '01011100': '1' marks a selected feature.
    # NOTE(review): fe_name[:-1] leaves the last name out, so the chromosome
    # has len(fe_name) - 1 bits.  Presumably the last entry was once the label
    # column; confirm against whatever consumes chromosome.pkl.
    chromosome_temp = ''.join('1' if name in used_lookup else '0'
                              for name in fe_name[:-1])
    print('Chromosome:')
    print(chromosome_temp)
    joblib.dump(chromosome_temp, '../config/chromosome.pkl')
    print('\n')
    return matrix_x, feature_not_used_name, len(feature_used_name)



In [None]:
def xgb_feature_selection(fe_name, matrix_x_temp, label_y, th):
    """Select features with an XGBoost model and report the ranking.

    An ``XGBClassifier`` with 50 estimators is fitted on the full matrix,
    then ``SelectFromModel`` keeps the columns whose importance is greater
    than or equal to the threshold.  The importance ranking and the chosen
    feature names are printed and written to
    ``../eda/xgb_feature_importance.txt`` and ``../eda/xgb_feature_chose.txt``.
    A '0'/'1' "chromosome" string marking the selected features is printed
    and dumped to ``../config/chromosome.pkl``.

    Args:
        fe_name: Feature names, paired positionally with the model's
            ``feature_importances_`` (i.e. with the matrix columns).
        matrix_x_temp: Feature matrix used to fit the model.
        label_y: Target labels.
        th: Threshold forwarded to ``SelectFromModel``.

    Returns:
        Tuple ``(matrix_x, feature_not_used_name, n_used)``: the reduced
        feature matrix, the names in ``fe_name`` that were not selected
        (in their original order) and the number of selected features.
    """
    # Fit the model and reduce the matrix; prefit=True reuses the fitted model.
    clf = XGBClassifier(n_estimators=50)
    clf.fit(matrix_x_temp, label_y)
    sfm = SelectFromModel(clf, prefit=True, threshold=th)
    matrix_x = sfm.transform(matrix_x_temp)

    # Report how many features have a non-zero importance.
    feature_score_dict = dict(zip(fe_name, clf.feature_importances_))
    zero_count = sum(1 for score in feature_score_dict.values() if score == 0.0)
    print('number of not-zero features:' + str(len(feature_score_dict) - zero_count))

    # Rank by importance (descending), print and persist the ranking.
    feature_score_dict_sorted = sorted(feature_score_dict.items(),
                                       key=lambda d: d[1], reverse=True)
    print('xgb_feature_importance:')
    for name, score in feature_score_dict_sorted:
        print(name, score)
    print('\n')

    # Context manager so the handle is closed even if a write fails.
    with open('../eda/xgb_feature_importance.txt', 'w') as f:
        f.write('Rank\tFeature Name\tFeature Importance\n')
        # Ranks are zero-based in the report file.
        for rank, (name, score) in enumerate(feature_score_dict_sorted):
            f.write(str(rank) + '\t' + str(name) + '\t' + str(score) + '\n')

    # matrix_x is the reduced input matrix; its column count tells how many
    # of the top-ranked names were selected.  This relies on feature names
    # being unique, so the name->score dict lines up with the columns.
    how_long = matrix_x.shape[1]
    feature_used_name = [name for name, _ in feature_score_dict_sorted[:how_long]]
    print('feature_chooesed:')
    for name in feature_used_name:
        print(name)
    print('\n')

    with open('../eda/xgb_feature_chose.txt', 'w') as f:
        f.write('Feature Chose Name :\n')
        for name in feature_used_name:
            f.write(str(name) + '\n')

    # Names that were not selected, in the original order.
    used_lookup = set(feature_used_name)  # O(1) membership tests
    feature_not_used_name = [name for name in fe_name if name not in used_lookup]

    # Build a chromosome such as '01011100': '1' marks a selected feature.
    # NOTE(review): fe_name[:-1] leaves the last name out, so the chromosome
    # has len(fe_name) - 1 bits.  Presumably the last entry was once the label
    # column; confirm against whatever consumes chromosome.pkl.
    chromosome_temp = ''.join('1' if name in used_lookup else '0'
                              for name in fe_name[:-1])
    print('Chromosome:')
    print(chromosome_temp)
    joblib.dump(chromosome_temp, '../config/chromosome.pkl')
    print('\n')
    return matrix_x, feature_not_used_name, len(feature_used_name)



In [None]:
def data_test_feature_drop(data_test, feature_name_drop):
    """Drop the given columns from ``data_test`` and return the rest as an array.

    Args:
        data_test: DataFrame to reduce.  It is modified in place, as before.
        feature_name_drop: Iterable of column names to remove.

    Returns:
        The remaining data as a NumPy array (``data_test.values``).
    """
    # One drop call for all columns: either every column is removed or, if a
    # name is missing, the frame is left untouched.
    data_test.drop(list(feature_name_drop), axis=1, inplace=True)
    print("data_test_shape:")
    print(data_test.shape)
    # DataFrame.as_matrix() was removed in pandas 1.0; .values is the
    # equivalent accessor available in both old and new pandas.
    return data_test.values



In [None]:
def write_predict_results_to_csv(csv_name, uid, prob_list):
    """Write prediction results to a CSV file with an ``ID,pred`` header.

    Each row holds ``str(uid[i])`` and ``str(prob_list[i])``.  If the two
    sequences differ in length nothing is written: a message is printed and
    the target file is left untouched.

    Args:
        csv_name: Path of the CSV file to create or overwrite.
        uid: Sequence of record identifiers.
        prob_list: Sequence of predictions, parallel to ``uid``.

    Returns:
        None.
    """
    # Validate before opening, so a mismatch neither truncates an existing
    # file nor leaves an open handle behind.
    if len(uid) != len(prob_list):
        # Message text: "the numbers of IDs and predictions do not match".
        print('no和pred的个数不一致')
        return

    combined_list = [['ID', 'pred']]
    combined_list.extend([str(u), str(p)] for u, p in zip(uid, prob_list))

    # Python 3: the csv module needs a text-mode file opened with newline=''
    # (the former file(csv_name, 'wb') only exists on Python 2).
    with open(csv_name, 'w', newline='') as csv_file:
        csv.writer(csv_file).writerows(combined_list)



In [None]:

def xgb_lgb_cv_modeling():
    """Run the XGBoost-selection + LightGBM cross-validation pipeline.

    Steps:
      1. Read ``../data/train.csv`` and ``../data/pred.csv`` (index ``ID``).
      2. Fill missing values with the per-column median computed over the
         training features and the prediction rows combined.
      3. Select features with ``xgb_feature_selection`` (threshold
         ``'0.1*mean'``) and drop the rejected columns from the prediction
         data.
      4. Evaluate an ``LGBMClassifier`` with 5-fold stratified CV, plot the
         per-fold and mean ROC curves and save the figure under
         ``../result/``.
      5. Refit on all training rows and write the predicted probabilities and
         the thresholded labels to two CSV files under ``../result/``.

    Returns:
        None.
    """
    # --- Data input ---
    data_train = pd.read_csv('../data/train.csv', index_col='ID')
    data_predict = pd.read_csv('../data/pred.csv', index_col='ID')

    # --- Training-set feature engineering (dataset specific) ---
    data_train_without_label = data_train.drop('Label', axis=1)

    feature_name = list(data_train_without_label.columns.values)
    data_predict_user_id = list(data_predict.index.values)

    # --- fillna: medians over train + prediction rows, computed once ---
    data_all = pd.concat([data_train_without_label, data_predict])
    column_medians = data_all.median()
    data_train_filled = data_train_without_label.fillna(value=column_medians)

    # --- Construct train arrays ---
    # .values replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
    x_temp = data_train_filled.values  # independent variables
    # Read the label by name (it is dropped by name above) instead of
    # assuming it is the last column.
    y = data_train['Label'].values  # dependent variable

    # --- Feature selection ---
    # Original author's note: '0.1*mean' selects 10 features and
    # '0.00001*mean' selects 14 on their dataset.
    X, dropped_feature_name, len_feature_choose = xgb_feature_selection(
        feature_name, x_temp, y, '0.1*mean')

    # --- Online test dataset (B_test) ---
    # NOTE(review): this assumes the columns left in pred.csv after dropping
    # line up, in order, with the selected training columns; confirm.
    data_predict_filled = data_predict.fillna(value=column_medians)
    data_predict_filled_after_feature_selection = data_test_feature_drop(
        data_predict_filled, dropped_feature_name)

    # --- Split train/test data sets (stratified cross-validation) ---
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    n_splits = cv.get_n_splits(X, y)

    # --- Choose a classification model ---
    parameter_n_estimators = 100
    classifier = LGBMClassifier(n_estimators=parameter_n_estimators, learning_rate=0.1)

    # --- Hyperparameter notes kept from the original author (not applied) ---
    # Candidate parameters: max_depth=6, num_leaves=64, learning_rate=0.03,
    # scale_pos_weight=1, num_threads=40, objective='binary',
    # bagging_fraction=0.7, bagging_freq=1, min_sum_hessian_in_leaf=100,
    # is_unbalance='true', metric='auc'.
    # (1) num_leaves: LightGBM grows trees leaf-wise, so tree complexity is
    #     tuned with num_leaves rather than max_depth; roughly
    #     num_leaves = 2 ^ max_depth.
    # (2) Imbalanced classes: is_unbalance='true' can be set.
    # (3) Bagging: bagging_fraction + bagging_freq (must be set together),
    #     feature_fraction.
    # (4) min_data_in_leaf, min_sum_hessian_in_leaf.

    # --- Model fit, predict and ROC ---
    colors = cycle(['cyan', 'indigo', 'seagreen', 'orange', 'blue'])
    lw = 2
    f1_sum = 0.0
    mean_fpr = np.linspace(0, 1, 500)
    mean_tpr = np.zeros_like(mean_fpr)

    th = 0.5  # probability threshold used to turn scores into labels

    for i_of_roc, ((train_indice, test_indice), color) in enumerate(
            zip(cv.split(X, y), colors)):
        a_model = classifier.fit(X[train_indice], y[train_indice])
        positive_proba = a_model.predict_proba(X[test_indice])[:, 1]

        fpr, tpr, thresholds = roc_curve(y[test_indice], positive_proba)

        # np.interp replaces the deprecated scipy.interp alias.
        mean_tpr += np.interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0

        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, lw=lw, color=color,
                 label='ROC fold %d (area = %0.4f)' % (i_of_roc, roc_auc))

        # Threshold into a new array rather than overwriting the scores.
        lt = (positive_proba > th).astype('int32')
        f1_sum += f1_score(y[test_indice], lt)

    plt.plot([0, 1], [0, 1], linestyle='--', lw=lw, color='k', label='Luck')

    mean_tpr /= n_splits
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    # Average over the real fold count; the x-axis label used to show the
    # un-averaged sum under the name "mean_f1".
    mean_f1 = f1_sum / n_splits
    print('mean_auc=' + str(mean_auc))
    print('mean_f1=' + str(mean_f1))
    plt.plot(mean_fpr, mean_tpr, color='g', linestyle='--',
             label='Mean ROC (area = %0.4f)' % mean_auc, lw=lw)
    plt.xlim([-0.01, 1.01])
    plt.ylim([-0.01, 1.01])
    plt.xlabel('False Positive Rate mean_f1:' + str(mean_f1))
    plt.ylabel('True Positive Rate')

    plt.title('ROC_gbdt_' + str(len_feature_choose) + '_features_f1_' + str(mean_f1))
    plt.legend(loc="lower right")
    plt.savefig('../result/pred_ROC_XL' + '_N_' + str(parameter_n_estimators) + '_features_' + str(len_feature_choose) +
                '_proba_to_label_using_th_' + str(th) + '.png')
    # plt.show()

    # --- Final model on all training rows; predict B_test ---
    a_model = classifier.fit(X, y)
    proba_predict = a_model.predict_proba(data_predict_filled_after_feature_selection)
    positive_proba = proba_predict[:, 1]

    # --- Probability result file ---
    result_file_name = '../result/pred_result_XL_N_' + str(parameter_n_estimators) + '_features_' + str(len_feature_choose) + '_proba.csv'
    write_predict_results_to_csv(result_file_name, data_predict_user_id, positive_proba.tolist())

    # --- Label result file (probabilities thresholded at `th`) ---
    lt = (positive_proba > th).astype('int32')
    result_file_name = '../result/pred_result_XL_N_' + str(parameter_n_estimators) + '_features_' + str(len_feature_choose) + \
                       '_proba_to_label_using_th_' + str(th) + '.csv'
    write_predict_results_to_csv(result_file_name, data_predict_user_id, lt.tolist())

