## 准备工作

In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
# Load the application training set and normalise every column name to upper
# case in a single expression, so `data` is never visible with mixed-case names.
data = pd.read_csv(r'data/application_train.csv').rename(columns=str.upper)

In [3]:
# Import the GBDT path-variable helpers. gbdt_var is imported before any search
# path changes, so it resolves exactly as it did originally.
from gbdt_var import get_gbdt_path_var, get_data_gbdt, get_head_rule, get_rule_df, get_lr_model, get_lr_proba

# The remaining helper projects live outside this notebook's directory.
# Register their folders on sys.path instead of calling os.chdir(): changing the
# working directory made this cell non-idempotent (on a second run the relative
# paths were resolved from the wrong folder) and silently redirected every later
# relative file path, e.g. the 'data/...' CSV read earlier in the notebook.
# NOTE(review): assumes neither helper module needs the working directory to be
# its own folder at call time -- confirm.
import sys


def _add_import_dir(relative_dir):
    """Put `relative_dir` (relative to the working directory) first on sys.path, once."""
    absolute_dir = os.path.abspath(relative_dir)
    if absolute_dir not in sys.path:
        sys.path.insert(0, absolute_dir)


# Dataset-comparison helpers.
_add_import_dir('../common_function')
from common_function import dataframe_compare_all, dataframe_compare_sigle

# Auto-modelling helpers. The original second chdir was relative to
# ../common_function, which resolves to the same directory as the path below.
_add_import_dir('../../Auto-Modeling/auto_modeling')
import auto_modeling as am

In [4]:
# Split the data into train and test sets with a 30% test share.
# With time_test_type=False the run reported no out-of-time test set.
# NOTE(review): no seed is passed here -- confirm am.data_split is deterministic.
(X_train, Y_train,
 X_test, Y_test,
 X_time_test, Y_time_test) = am.data_split(data, test_size=0.3, time_test_type=False)

>> 数据集划分成功,无跨时间测试集
>> 训练集样本数: 43051
>> 测试集样本数: 18451
>> 训练集目标情况如下:
   训练集TARGET为 0 的数量: 39583  占比: 91.94%
   训练集TARGET为 1 的数量: 3468  占比: 8.06%


In [6]:
# Feature selection on the training split; element [0] of the result is kept as
# the list of retained column names.
# cols_number=50 is presumably the upper bound on retained columns -- TODO confirm.
# NOTE(review): the captured output lists SK_ID_CURR among the retained columns;
# it looks like a row identifier, so consider excluding it before selection.
keep_cols = am.feature_select(
    X_train,
    Y_train,
    cols_number=50,
    auto_iteration=True,
    verbose=False,
)[0]

>> 根据缺失率删除 0 个变量
>> 根据同质性删除 2 个变量
>> 根据最大计数类别删除 0 个变量
>> 根据相关性删除 0 个变量
>> 特征重要度迭代次数 |■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■   根据条件提前停止迭代
>> 根据特征重要度最终保留 37 个变量
>> 具体保留变量如下:
   ['EXT_SOURCE_3', 'EXT_SOURCE_2', 'ORGANIZATION_TYPE', 'EXT_SOURCE_1', 'AMT_CREDIT', 'AMT_ANNUITY', 'DAYS_ID_PUBLISH', 'DAYS_BIRTH', 'AMT_GOODS_PRICE', 'DAYS_REGISTRATION', 'DAYS_LAST_PHONE_CHANGE', 'SK_ID_CURR', 'DAYS_EMPLOYED', 'OWN_CAR_AGE', 'OCCUPATION_TYPE', 'REGION_POPULATION_RELATIVE', 'AMT_INCOME_TOTAL', 'YEARS_BEGINEXPLUATATION_AVG', 'AMT_REQ_CREDIT_BUREAU_YEAR', 'LANDAREA_AVG', 'TOTALAREA_MODE', 'BASEMENTAREA_MODE', 'HOUR_APPR_PROCESS_START', 'NONLIVINGAREA_AVG', 'LIVINGAREA_MODE', 'APARTMENTS_AVG', 'OBS_30_CNT_SOCIAL_CIRCLE', 'CODE_GENDER', 'NAME_EDUCATION_TYPE', 'LIVINGAPARTMENTS_AVG', 'YEARS_BUILD_AVG', 'NAME_INCOME_TYPE', 'COMMONAREA_AVG', 'DEF_60_CNT_SOCIAL_CIRCLE', 'APARTMENTS_MODE', 'FLAG_DOCUMENT_3', 'REGION_RATING_CLIENT_W_CITY']


In [6]:
# Convert the three splits using the selected columns, and collect the numeric
# and character column name lists that the helper returns alongside them.
(X_train_trans, X_test_trans, X_time_test_trans,
 num_cols, char_cols) = am.get_coltype_datalist(X_train, X_test, X_time_test, keep_cols)

# Wrap both column groups in a DataFrameMapper.
# NOTE(review): `mapper` is not referenced by any later cell shown here.
mapper = am.get_mapper(num_cols, char_cols)

In [9]:
from sklearn.ensemble import GradientBoostingClassifier

# GBDT used only as a rule generator:
#   * max_depth=4           -> no decision path is longer than 4 conditions
#   * min_samples_leaf=0.01 -> every leaf (path) covers at least 1% of the samples
#   * n_estimators=10       -> at most 10 trees * 2**4 leaves = 160 paths in total
model = GradientBoostingClassifier(
    n_estimators=10,
    max_depth=4,
    min_samples_leaf=0.01,
    random_state=1234,
)

## 衍生GBDT变量

In [10]:
# Derive the GBDT path variables from the training data: one indicator column
# per decision path, named after the path's conditions.
# NOTE(review): `model` is passed in unfitted; presumably the helper fits it on
# (X_train_trans, Y_train) -- confirm.
gbdt_path_var = get_gbdt_path_var(
    X_train_trans,
    model,
    y=Y_train,
    one_hot=False,
)
gbdt_path_var.head()

Unnamed: 0,EXT_SOURCE_2 <= 0.356838;EXT_SOURCE_3 <= 0.36813;EXT_SOURCE_2 <= 0.108422;EXT_SOURCE_3 <= -4.499736,EXT_SOURCE_2 <= 0.356838;EXT_SOURCE_3 <= 0.36813;EXT_SOURCE_2 <= 0.108422;EXT_SOURCE_3 > -4.499736,EXT_SOURCE_2 <= 0.356838;EXT_SOURCE_3 <= 0.36813;EXT_SOURCE_2 > 0.108422;EXT_SOURCE_3 <= -4.499736,EXT_SOURCE_2 <= 0.356838;EXT_SOURCE_3 <= 0.36813;EXT_SOURCE_2 > 0.108422;EXT_SOURCE_3 > -4.499736,EXT_SOURCE_2 <= 0.356838;EXT_SOURCE_3 > 0.36813;EXT_SOURCE_2 <= 0.137969;DAYS_BIRTH <= -14238.0,EXT_SOURCE_2 <= 0.356838;EXT_SOURCE_3 > 0.36813;EXT_SOURCE_2 <= 0.137969;DAYS_BIRTH > -14238.0,EXT_SOURCE_2 <= 0.356838;EXT_SOURCE_3 > 0.36813;EXT_SOURCE_2 > 0.137969;EXT_SOURCE_3 <= 0.5718,EXT_SOURCE_2 <= 0.356838;EXT_SOURCE_3 > 0.36813;EXT_SOURCE_2 > 0.137969;EXT_SOURCE_3 > 0.5718,EXT_SOURCE_2 > 0.356838;EXT_SOURCE_3 <= 0.42325;EXT_SOURCE_2 <= 0.597194;EXT_SOURCE_3 <= -4.499736,EXT_SOURCE_2 > 0.356838;EXT_SOURCE_3 <= 0.42325;EXT_SOURCE_2 <= 0.597194;EXT_SOURCE_3 > -4.499736,...,EXT_SOURCE_2 <= 0.170427;EXT_SOURCE_3 > 0.245184;DAYS_REGISTRATION <= -2977.0;AMT_ANNUITY > 24781.5,EXT_SOURCE_2 <= 0.170427;EXT_SOURCE_3 > 0.245184;DAYS_REGISTRATION > -2977.0,EXT_SOURCE_2 > 0.170427;EXT_SOURCE_3 <= 0.343342;EXT_SOURCE_3 <= 0.008772;DAYS_ID_PUBLISH <= -4176.5,EXT_SOURCE_2 > 0.170427;EXT_SOURCE_3 <= 0.343342;EXT_SOURCE_3 <= 0.008772;DAYS_ID_PUBLISH > -4176.5,EXT_SOURCE_2 > 0.170427;EXT_SOURCE_3 <= 0.343342;EXT_SOURCE_3 > 0.008772;EXT_SOURCE_3 <= 0.145992,EXT_SOURCE_2 > 0.170427;EXT_SOURCE_3 <= 0.343342;EXT_SOURCE_3 > 0.008772;EXT_SOURCE_3 > 0.145992,EXT_SOURCE_2 > 0.170427;EXT_SOURCE_3 > 0.343342;EXT_SOURCE_2 <= 0.55861;EXT_SOURCE_1 <= 0.530291,EXT_SOURCE_2 > 0.170427;EXT_SOURCE_3 > 0.343342;EXT_SOURCE_2 <= 0.55861;EXT_SOURCE_1 > 0.530291,EXT_SOURCE_2 > 0.170427;EXT_SOURCE_3 > 0.343342;EXT_SOURCE_2 > 0.55861;OCCUPATION_TYPE != Drivers,EXT_SOURCE_2 > 0.170427;EXT_SOURCE_3 > 0.343342;EXT_SOURCE_2 > 0.55861;OCCUPATION_TYPE == Drivers
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [11]:
# Replay the derived path rules (the column names of gbdt_path_var) on a data
# set. This is the same call used later for other splits; here it is applied to
# the training data so the result can be checked against gbdt_path_var.
X_train_gbdt = get_data_gbdt(
    X_train_trans,
    gbdt_path_var.columns,
)
X_train_gbdt.head()

Unnamed: 0,EXT_SOURCE_2 <= 0.356838;EXT_SOURCE_3 <= 0.36813;EXT_SOURCE_2 <= 0.108422;EXT_SOURCE_3 <= -4.499736,EXT_SOURCE_2 <= 0.356838;EXT_SOURCE_3 <= 0.36813;EXT_SOURCE_2 <= 0.108422;EXT_SOURCE_3 > -4.499736,EXT_SOURCE_2 <= 0.356838;EXT_SOURCE_3 <= 0.36813;EXT_SOURCE_2 > 0.108422;EXT_SOURCE_3 <= -4.499736,EXT_SOURCE_2 <= 0.356838;EXT_SOURCE_3 <= 0.36813;EXT_SOURCE_2 > 0.108422;EXT_SOURCE_3 > -4.499736,EXT_SOURCE_2 <= 0.356838;EXT_SOURCE_3 > 0.36813;EXT_SOURCE_2 <= 0.137969;DAYS_BIRTH <= -14238.0,EXT_SOURCE_2 <= 0.356838;EXT_SOURCE_3 > 0.36813;EXT_SOURCE_2 <= 0.137969;DAYS_BIRTH > -14238.0,EXT_SOURCE_2 <= 0.356838;EXT_SOURCE_3 > 0.36813;EXT_SOURCE_2 > 0.137969;EXT_SOURCE_3 <= 0.5718,EXT_SOURCE_2 <= 0.356838;EXT_SOURCE_3 > 0.36813;EXT_SOURCE_2 > 0.137969;EXT_SOURCE_3 > 0.5718,EXT_SOURCE_2 > 0.356838;EXT_SOURCE_3 <= 0.42325;EXT_SOURCE_2 <= 0.597194;EXT_SOURCE_3 <= -4.499736,EXT_SOURCE_2 > 0.356838;EXT_SOURCE_3 <= 0.42325;EXT_SOURCE_2 <= 0.597194;EXT_SOURCE_3 > -4.499736,...,EXT_SOURCE_2 <= 0.170427;EXT_SOURCE_3 > 0.245184;DAYS_REGISTRATION <= -2977.0;AMT_ANNUITY > 24781.5,EXT_SOURCE_2 <= 0.170427;EXT_SOURCE_3 > 0.245184;DAYS_REGISTRATION > -2977.0,EXT_SOURCE_2 > 0.170427;EXT_SOURCE_3 <= 0.343342;EXT_SOURCE_3 <= 0.008772;DAYS_ID_PUBLISH <= -4176.5,EXT_SOURCE_2 > 0.170427;EXT_SOURCE_3 <= 0.343342;EXT_SOURCE_3 <= 0.008772;DAYS_ID_PUBLISH > -4176.5,EXT_SOURCE_2 > 0.170427;EXT_SOURCE_3 <= 0.343342;EXT_SOURCE_3 > 0.008772;EXT_SOURCE_3 <= 0.145992,EXT_SOURCE_2 > 0.170427;EXT_SOURCE_3 <= 0.343342;EXT_SOURCE_3 > 0.008772;EXT_SOURCE_3 > 0.145992,EXT_SOURCE_2 > 0.170427;EXT_SOURCE_3 > 0.343342;EXT_SOURCE_2 <= 0.55861;EXT_SOURCE_1 <= 0.530291,EXT_SOURCE_2 > 0.170427;EXT_SOURCE_3 > 0.343342;EXT_SOURCE_2 <= 0.55861;EXT_SOURCE_1 > 0.530291,EXT_SOURCE_2 > 0.170427;EXT_SOURCE_3 > 0.343342;EXT_SOURCE_2 > 0.55861;OCCUPATION_TYPE != Drivers,EXT_SOURCE_2 > 0.170427;EXT_SOURCE_3 > 0.343342;EXT_SOURCE_2 > 0.55861;OCCUPATION_TYPE == Drivers
53107,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
42038,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
48694,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
39556,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
25479,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [12]:
# Check that the derived variables and the replayed variables are identical.
#
# The result is also equivalent to the leaf-index one-hot encoding below:
#     X_train_gbdt = model.apply(X_train_mapper)[:, :, 0]
#     enc = OneHotEncoder()
#     X_train_gbdt = np.array(enc.fit_transform(X_train_gbdt).toarray())
# but get_gbdt_path_var de-duplicates paths, so the column counts can differ and
# that equivalence is not tested here.
#
# The two frames are expected to match exactly. A mismatch usually means some
# variable has more decimal places than the 6-digit precision used for node
# thresholds when the tree structure is extracted, which loses path
# information. Either round the source data or raise the `precision` argument
# of get_gbdt_path_var.

# Give both frames the same index so rows can be matched on the 'index' column.
gbdt_path_var.index = X_train_gbdt.index

compare_all = dataframe_compare_all(
    gbdt_path_var.reset_index(),
    X_train_gbdt.reset_index(),
    'index',
    verbose=1,
)

第一个数据集的维度: (43051, 153)
第二个数据集的维度: (43051, 153)
两个数据集公共变量数: 152
两个数据集相同 index 样本数: 43051
所有变量均一致


## 规则提取

In [13]:
# Show the 5 rules with the highest target rate on the training set.
get_head_rule(
    X_train_gbdt,
    Y_train,
    head=5,
)

总样本目标为 1 的样本占比为: 0.0806

根据目标占比,提取前 5 个规则,具体规则如下:
>> 规则 1: 覆盖率 0.0108  目标占比 0.4176
   EXT_SOURCE_2 <= 0.170427;EXT_SOURCE_3 <= 0.245184;EXT_SOURCE_3 > -4.499736

>> 规则 2: 覆盖率 0.0157  目标占比 0.3813
   EXT_SOURCE_2 <= 0.356838;EXT_SOURCE_3 <= 0.265753;EXT_SOURCE_3 > -4.499736;EXT_SOURCE_2 <= 0.208904

>> 规则 3: 覆盖率 0.0185  目标占比 0.3538
   EXT_SOURCE_2 <= 0.247911;EXT_SOURCE_3 <= 0.264348;EXT_SOURCE_3 > -4.499736

>> 规则 4: 覆盖率 0.0167  目标占比 0.3537
   EXT_SOURCE_2 <= 0.483946;EXT_SOURCE_3 <= 0.265753;EXT_SOURCE_3 > -4.499736;EXT_SOURCE_3 <= 0.150546

>> 规则 5: 覆盖率 0.018  目标占比 0.3479
   EXT_SOURCE_3 <= 0.319379;EXT_SOURCE_2 <= 0.483946;EXT_SOURCE_3 > -4.499736;EXT_SOURCE_3 <= 0.156165



In [14]:
# Same ranking by target rate, with an added coverage threshold (cover=0.02)
# that filters which rules are eligible.
get_head_rule(
    X_train_gbdt,
    Y_train,
    head=5,
    cover=0.02,
)

总样本目标为 1 的样本占比为: 0.0806

根据目标占比,提取前 5 个规则,具体规则如下:
>> 规则 1: 覆盖率 0.0241  目标占比 0.3157
   EXT_SOURCE_2 <= 0.247911;NAME_INCOME_TYPE == Working;EXT_SOURCE_1 <= 0.387005;EXT_SOURCE_3 <= 0.260162

>> 规则 2: 覆盖率 0.0235  目标占比 0.3126
   EXT_SOURCE_2 <= 0.392158;EXT_SOURCE_3 <= 0.400542;EXT_SOURCE_2 <= 0.168078;NAME_INCOME_TYPE == Working

>> 规则 3: 覆盖率 0.027  目标占比 0.3006
   EXT_SOURCE_2 <= 0.462132;EXT_SOURCE_3 <= 0.474951;EXT_SOURCE_2 <= 0.168078;NAME_INCOME_TYPE == Working

>> 规则 4: 覆盖率 0.0285  目标占比 0.2349
   EXT_SOURCE_2 > 0.170427;EXT_SOURCE_3 <= 0.343342;EXT_SOURCE_3 > 0.008772;EXT_SOURCE_3 <= 0.145992

>> 规则 5: 覆盖率 0.0308  目标占比 0.2221
   EXT_SOURCE_2 <= 0.483946;EXT_SOURCE_3 <= 0.265753;EXT_SOURCE_3 > -4.499736;EXT_SOURCE_3 > 0.150546



In [15]:
# Collect every rule into one table with its coverage and target rate
# (columns: rule, cover, target), then preview the first rows.
rule_df = get_rule_df(
    X_train_gbdt,
    Y_train,
)
rule_df.head()

Unnamed: 0,rule,cover,target
0,EXT_SOURCE_2 <= 0.170427;EXT_SOURCE_3 <= 0.245...,0.0108,0.4176
1,EXT_SOURCE_2 <= 0.356838;EXT_SOURCE_3 <= 0.265...,0.0157,0.3813
2,EXT_SOURCE_2 <= 0.247911;EXT_SOURCE_3 <= 0.264...,0.0185,0.3538
3,EXT_SOURCE_2 <= 0.483946;EXT_SOURCE_3 <= 0.265...,0.0167,0.3537
4,EXT_SOURCE_3 <= 0.319379;EXT_SOURCE_2 <= 0.483...,0.018,0.3479


## 逻辑回归

In [16]:
# Fit the logistic regression on the GBDT path variables.
# The number of selected variables is positively related to C: a larger C
# selects more variables, a smaller C selects fewer.
intercept, coef, cols = get_lr_model(
    X_train_gbdt,
    Y_train,
    C=0.005,
)

在L1正则项系数为 0.005 下,训练出的逻辑回归模型共选择了 8 个变量
交叉验证结果为:  训练集平均AUC: 0.676  验证集平均AUC: 0.6758

逻辑回归模型如下:
1 / (1 + e^(2.1771689556886153 + 0.06399857871544218 * X[1] - 0.08091065632026458 * X[2] + 0.2747370870213347 * X[3] + 0.13878685953331038 * X[4] - 0.35530045551031225 * X[5] + 0.05277391627131956 * X[6] + 0.08015968023680686 * X[7] + 0.6171095868614703 * X[8]))

各变量对应关系如下
X[1] ==> EXT_SOURCE_2 > 0.247911;EXT_SOURCE_3 > 0.425012;EXT_SOURCE_2 <= 0.623373;CODE_GENDER != M
X[2] ==> EXT_SOURCE_2 <= 0.462132;EXT_SOURCE_3 <= 0.474951;EXT_SOURCE_2 <= 0.168078;NAME_INCOME_TYPE == Working
X[3] ==> EXT_SOURCE_2 > 0.462132;EXT_SOURCE_3 > 0.390197;CODE_GENDER != M;OCCUPATION_TYPE != Sales staff
X[4] ==> EXT_SOURCE_3 > 0.319379;EXT_SOURCE_2 > 0.247911;EXT_SOURCE_1 <= 0.590292;EXT_SOURCE_3 > 0.628833
X[5] ==> EXT_SOURCE_2 <= 0.462132;EXT_SOURCE_3 <= 0.474951;EXT_SOURCE_2 > 0.079814;EXT_SOURCE_1 <= 0.425951
X[6] ==> EXT_SOURCE_2 > 0.483946;EXT_SOURCE_3 <= 0.390197;EXT_SOURCE_3 <= 0.010348;REGION_RATING_CLIEN

In [17]:
# Compute predicted probabilities for the training set and preview the first five.
lr_proba = get_lr_proba(
    intercept,
    coef,
    cols,
    X_train_gbdt,
)
lr_proba.head(5)

53107    0.089372
42038    0.050286
48694    0.101820
39556    0.097094
25479    0.097094
dtype: float64

In [18]:
# Replay the GBDT path variables on the test set, then predict probabilities.
# Passing Y_test as well makes the helper report AUC and KS.
X_test_gbdt = get_data_gbdt(
    X_test_trans,
    gbdt_path_var.columns,
)
test_proba = get_lr_proba(
    intercept,
    coef,
    cols,
    X_test_gbdt,
    Y_test,
)

AUC: 0.6797  KS: 0.2925
