In [37]:
#!/usr/bin/env python
# ! -*- coding: utf-8 -*-

'''
@File: model_fit_v3.py
@Author: RyanZheng
@Email: ryan.zhengrp@gmail.com
@Created Time on: 2020-07-26
'''

import os
import warnings
from datetime import datetime

import joblib
from sklearn.preprocessing import LabelEncoder

from model_code import report_out_yx, tree_selection
from model_code.woe_transformer import *
from model_code.feature_binning import *
from model_code.bayes_opt_tuner import classifiers_model
from model_code.utils import *
from model_code.detector import detect

warnings.filterwarnings('ignore')

from model_code.logger_utils import Logger

log = Logger(level='info', name=__name__).logger

if __name__ == '__main__':

    # ========================= step 1: run configuration =========================
    log.info('step 1 相关配置')

    # Column roles in the input table.
    cust_id, target = 'apply_no', 'target'  # primary key / label column
    apply_time, data_type = 'apply_time', 'type'  # application time / dataset-split column
    feature_type = 'lhpdat'  # tag of the data being modelled; used in output paths and file names

    # Client and batch together identify the project and its output directory.
    client, batch = 'lhp09', 'p23'

    fillna_value = -999999  # value used to fill missing values
    is_model_data_to_woe = False  # False: raw values go into the model; True: WoE values do
    to_model_var_num = 30  # set to None for no limit

    # Thresholds and exclusions.
    feature_missing_threshould = 0.95  # features with a missing rate >= this value are dropped
    exclude_cols = [apply_time, cust_id, target, data_type, 'apply_month']

    # Data used to train the model.
    to_model_data_path = '/Users/ryanzheng/PycharmProjects/data_to_treemodel_v1/to_model_data/lhp_amount_rule.csv'
    label_encoder_dict = {}

    # ========================= nothing below normally needs editing =========================
    # Names and directories derived from the settings above.
    client_batch = client + batch
    project_name = client_batch
    project_dir = 'model_result_data/' + client + '/' + batch + '/'
    output_dir = project_dir + 'model/' + feature_type + '/'

    # Create every output directory up front so later writes cannot fail on a missing folder.
    for required_dir in (project_dir,
                         output_dir,
                         project_dir + 'data/score/',
                         project_dir + 'data/xgb_score/'):
        os.makedirs(required_dir, exist_ok=True)
    # ========================= end of configuration =========================

    

2020-10-21 18:49:41,284 - __main__[line:34] - INFO: step 1 相关配置


In [38]:
    # ========================= step 2: load the dataset =========================
    log.info('step 2 开始读取数据集')
    # Read the wide table: the key, time and target columns describe the sample,
    # every other column is treated as a feature.
    log.info('读取样本&特征数据集：{}|{}|{}为样本数据，其他为特征数据'.format(cust_id, apply_time, target))
    all_data = pd.read_csv(to_model_data_path)

    # Columns removed from the table before any feature is selected.
    drop_cols = ['applthst_loan_amount', 'tzre_report_info_report_no', 'xy_black_trade_no', 'tzre_id',
                 'xy_black_version', 'tzre_version', 'tzre_bi_phone_number']
    all_data.drop(columns=drop_cols, inplace=True)

    all_data.set_index(cust_id, inplace=True)
    # Keep the file's column order instead of going through set(): set iteration order
    # for strings changes between interpreter runs, which made the feature order (and
    # therefore everything fitted on it) non-reproducible.
    excluded = set(exclude_cols)
    selected_features = [col for col in all_data.columns.tolist() if col not in excluded]
    log.info('特征的个数：{}'.format(len(selected_features)))

    # ========================= optional: rename features via a data dictionary =========================
    # Disabled. Enable to rename columns and keep only features listed in the dictionary.
    # fea_dict_df = pd.read_excel('/home/marketingscore/ryanzheng/fit_model_project/新特征数据字典v3.xlsx')
    # fea_dict = fea_dict_df[['feature_code','feature_id']].set_index('feature_code')['feature_id'].to_dict()
    # all_data.rename(columns=fea_dict, inplace=True)
    #
    # selected_features = [col for col in all_data.columns.tolist() if col not in set(exclude_cols)]
    #
    # # use only the variables present in the data dictionary
    # fea_dict_df_list = set(fea_dict_df['feature_id'].tolist())
    # selected_features = [col for col in selected_features if col in fea_dict_df_list]
    # print(len(selected_features))
    # ========================= end of optional rename =========================

    # Drop samples whose features are all missing.
    log.info('删除特征全为空的样本量')
    print('删除特征全为空的样本之前的数据集行列：', all_data.shape)
    all_data.dropna(subset=selected_features, how='all', inplace=True)
    print('删除特征全为空的样本之后的数据集行列：', all_data.shape)

    log.info('样本数据集情况：')
    log.info(all_data[target].value_counts())
    # ========================= end of loading =========================

    log.info('EDA，整体数据探索性数据分析')
    all_data_eda = detect(all_data)
    all_data_eda.to_excel('{}{}_{}_all_data_eda.xlsx'.format(
        output_dir, project_name, feature_type))

    # ========================= step 3: split into train and test =========================
    log.info('step 3 划分训练集和测试集')
    if data_type not in all_data.columns:
        # reset_index() returns a new frame, so nothing is modified in place on a
        # slice of all_data; the key column becomes a regular column again.
        df_sample = all_data[[target, apply_time]].reset_index()

        # Random train/test split.
        df_sample = split_data_type(df_sample, key_col=cust_id, target=target, apply_time=apply_time, test_size=0.25)

        # Alternative, disabled: split by application time instead of randomly.
        # df_oot = df_sample[df_sample['apply_time']>= '2020-04-01']
        # X_train = df_sample[df_sample['apply_time']<= '2020-02-01']
        # X_test = df_sample[(df_sample['apply_time']> '2020-02-01') & (df_sample['apply_time']< '2020-04-01')]
        #
        # df_sample.loc[df_oot.index,'type'] = 'oot'
        # df_sample.loc[X_train.index,'type'] = 'train'
        # df_sample.loc[X_test.index,'type'] = 'test'

        # Written once, after the split labels are final (the earlier duplicate write
        # produced the same file and has been removed).
        df_sample.to_csv(project_dir + 'data/{}_split.csv'.format(client_batch), index=False)
        df_sample.set_index(cust_id, inplace=True)
        print(df_sample['type'].value_counts())

2020-10-21 18:49:41,571 - __main__[line:2] - INFO: step 2 开始读取数据集
2020-10-21 18:49:41,573 - __main__[line:4] - INFO: 读取样本&特征数据集：apply_no|apply_time|target为样本数据，其他为特征数据
2020-10-21 18:49:42,167 - __main__[line:14] - INFO: 特征的个数：1103
2020-10-21 18:49:42,168 - __main__[line:38] - INFO: 删除特征全为空的样本量
2020-10-21 18:49:42,272 - __main__[line:43] - INFO: 样本数据集情况：
2020-10-21 18:49:42,275 - __main__[line:44] - INFO: 0    4623
1     595
Name: target, dtype: int64
2020-10-21 18:49:42,277 - __main__[line:47] - INFO: EDA，整体数据探索性数据分析


删除特征全为空的样本之前的数据集行列： (5218, 1105)
删除特征全为空的样本之后的数据集行列： (5218, 1105)


2020-10-21 18:49:46,257 - __main__[line:53] - INFO: step 3 划分训练集和测试集


样本y值在0，1
样本情况： (5218, 3)
分布情况： target
0    4623
1     595
Name: apply_no, dtype: int64
样本drop_duplicates情况： (5218, 3)
样本y值在0，1的样本情况： (5218, 3)
              count      mean       std  min  25%  50%  75%  max
apply_month                                                     
2019/1/       515.0  0.102913  0.304140  0.0  0.0  0.0  0.0  1.0
2019/2/      1241.0  0.089444  0.285499  0.0  0.0  0.0  0.0  1.0
2019/3/      1989.0  0.120161  0.325231  0.0  0.0  0.0  0.0  1.0
2019/4/       588.0  0.141156  0.348479  0.0  0.0  0.0  0.0  1.0
2019/5/       885.0  0.123164  0.328811  0.0  0.0  0.0  0.0  1.0
        count      mean       std  min  25%  50%  75%  max
type                                                      
test   1305.0  0.114176  0.318147  0.0  0.0  0.0  0.0  1.0
train  3913.0  0.113979  0.317826  0.0  0.0  0.0  0.0  1.0
train    3913
test     1305
Name: type, dtype: int64


In [39]:
    # Attach the split label to the full table.
    # The merge is skipped when the table already carries the split column: either it
    # came with the input file (in which case df_sample was never built) or this cell
    # has already run (a second merge would create type_x / type_y columns).
    if data_type not in all_data.columns:
        all_data = pd.merge(df_sample[['type']], all_data, left_index=True, right_index=True, how='inner')

    log.info('分开训练集和测试集为两个df')
    # Explicit copy: train_data is modified below and must not alias all_data.
    train_data = all_data[all_data['type'] == 'train'].copy()
    # test_data = all_data[all_data['type'] == 'test']

    log.info('EDA，训练集探索性数据分析')
    detect(train_data).to_excel('{}{}_{}_train_data_eda.xlsx'.format(
        output_dir, project_name, feature_type))
    # detect(test_data).to_excel('{}{}_{}_test_data_eda.xlsx'.format(
    #     output_dir, project_name, feature_type))

    # ========================= step 4: initial screening =========================
    log.info('step 4 变量初筛')
    # Missing rates are computed on the training rows only.
    print('删除缺失率前变量数量：', len(selected_features))
    selected_features = filter_miss(train_data[selected_features], miss_threshold=feature_missing_threshould)
    print('删除缺失率后变量数量：', len(selected_features))
    train_data = train_data[selected_features + [target]].copy()
    # test_data = test_data[selected_features + [target]]
    # ========================= end of initial screening =========================

    # ========================= step 5: data processing =========================
    log.info('step 5 数据woe处理')

    # Split the surviving features by dtype; only the continuous and categorical
    # groups are used below.
    continuous_cols, category_cols, date_cols = select_features_dtypes(train_data[selected_features])

    # Fill missing continuous values with the configured sentinel, on both the
    # training frame and the full table.
    train_data.loc[:, continuous_cols] = train_data.loc[:, continuous_cols].fillna(fillna_value)
    # test_data.loc[:, continuous_cols] = test_data.loc[:, continuous_cols].fillna(fillna_value)
    all_data.loc[:, continuous_cols] = all_data.loc[:, continuous_cols].fillna(fillna_value)

    # ========================= optional: label encoding (disabled) =========================
    # def category_to_labelencoder(data, labelencoder=[]):
    #     label_encoder_dict = {}
    #     le = LabelEncoder()
    #     for col in labelencoder:
    #         print('{} in process!!!'.format(col))
    #         data[col] = le.fit_transform(data[col].values)
    #         number = [i for i in range(0, len(le.classes_))]
    #         key = list(le.inverse_transform(number))
    #         label_encoder_dict[col] = dict(zip(key, number))
    #     return label_encoder_dict
    #
    # def category_to_labelencoder_apply(data, labelencoder_dict={}):
    #     for col, mapping in labelencoder_dict.items():
    #         print('{} in process!!!'.format(col))
    #         data[col] = data[col].map(mapping).fillna(-1)
    #         data[col] = data[col].astype(int)
    #
    # if category_cols:
    #     train_data.loc[:, category_cols] = train_data.loc[:, category_cols].fillna('-1007')
    #     all_data.loc[:, category_cols] = all_data.loc[:, category_cols].fillna('-1007')
    #     label_encoder_dict = category_to_labelencoder(train_data, category_cols)
    #     category_to_labelencoder_apply(all_data, label_encoder_dict)
    # ========================= end of optional label encoding =========================

    # Categorical features are replaced by WoE values unless a label-encoder mapping
    # has been supplied in the configuration.
    if category_cols and not label_encoder_dict:
        log.info('step 5.1 类别变量数据处理')
        # train_data.loc[:, category_cols] = train_data.loc[:, category_cols].fillna('miss')
        # test_data.loc[:, category_cols] = test_data.loc[:, category_cols].fillna('miss')

        # The value -> WoE mapping is fitted on the training rows, saved to the output
        # directory, then applied to both the training frame and the full table.
        var_value_woe = category_2_woe(train_data, category_cols, target=target)
        category_2_woe_save(var_value_woe, '{}'.format(output_dir))
        # var_value_woe = category_2_woe_load('{}'.format(output_dir))
        train_data = WoeTransformer().transform(train_data, var_value_woe)
        # test_data = WoeTransformer().transform(test_data, var_value_woe)
        all_data = WoeTransformer().transform(all_data, var_value_woe)

    # end of categorical-variable processing

2020-10-21 18:49:46,568 - __main__[line:5] - INFO: 分开训练集和测试集为两个df
2020-10-21 18:49:46,610 - __main__[line:9] - INFO: EDA，训练集探索性数据分析
2020-10-21 18:49:50,195 - __main__[line:16] - INFO: step 4 变量初筛


删除缺失率前变量数量： 1103


2020-10-21 18:49:50,419 - __main__[line:28] - INFO: step 5 数据woe处理
2020-10-21 18:49:51,629 - __main__[line:70] - INFO: step 5.1 类别变量数据处理


删除缺失率后变量数量： 1020


In [40]:
    # Build the frame that is handed to the model: either an untouched copy of the
    # processed table or its WoE-encoded version. Only the full table is materialised
    # here; per-split frames are derived from it later.
    if not is_model_data_to_woe:
        log.info('============入模数据不需要转化为woe值===========')
        all_data_to_model = all_data.copy()
    else:
        log.info('将箱子转woe')
        log.info('============入模数据需要转化为woe值===========')
        # NOTE(review): `all_data_bin` and `fb` are not assigned anywhere in the visible
        # part of this script, so this branch fails with a NameError unless a binning
        # step defines them first - confirm before setting is_model_data_to_woe = True.
        all_data_to_model = WoeTransformer().transform(all_data_bin, fb.get_var_bin_woe())



In [41]:
    def statistics_model_result(all_data=pd.DataFrame()):
        """Print model evaluation tables and return a one-table AUC/KS summary.

        The function relies on script-level names rather than parameters:
        `feature_type` (column holding the predicted probability), `data_type`
        (split-label column), `target`, `apply_time`, `client_batch`,
        `project_name`, `selected_features` and the fitted `model`, plus the
        helpers `to_score`, `get_roc_auc_score`, `get_ks` and `psi_statis`.

        Args:
            all_data: frame containing the target, the split label, the
                application time and the `feature_type` probability column.
                It is modified in place: a 'score' column is added and the
                split label is lower-cased. The temporary 'month' and
                'client_batch' columns are added and removed again.

        Returns:
            DataFrame indexed by (project_name, client_batch, feature) with
            one '<split>_auc' / '<split>_ks' column per split that has
            values, plus 'feature_cnt' and 'n_estimators'.
        """
        # =========================== step 6: statistics ===========================
        # Normalise the split label BEFORE grouping: the table below is re-indexed on
        # lower-case labels, so grouping on un-normalised labels (e.g. 'Train') would
        # yield an all-NaN AUC/KS table.
        all_data[data_type] = all_data[data_type].map(lambda s: s.lower())
        all_data['score'] = all_data[feature_type].map(to_score)
        log.info('模型相关结果统计！！！')

        def _auc_ks(df):
            # AUC and KS of the score against the target for one group of rows.
            return pd.Series({'auc': get_roc_auc_score(df[target], df['score']),
                              'ks': get_ks(df[target], df['score'])})

        df_splitted_type_auc_ks = all_data.groupby(data_type).apply(_auc_ks)
        # Fixed row order; splits that do not exist show up as NaN rows.
        df_splitted_type_auc_ks = df_splitted_type_auc_ks.reindex(['train', 'test', 'oot', 'cv'])

        log.info('模型效果：')
        print(df_splitted_type_auc_ks)

        # Month key = first 7 characters of the application-time string.
        all_data['month'] = all_data[apply_time].map(lambda s: s[:7])
        df_monthly_auc_ks = all_data.groupby('month').apply(_auc_ks)
        del all_data['month']
        log.info('不同月份的模型效果：')
        print(df_monthly_auc_ks)

        df_desc = all_data[[feature_type, 'score']].describe()
        # Share of rows that received a (non-missing) value.
        df_desc.loc['coverage'] = df_desc.loc['count'] / all_data.shape[0]
        log.info('分数describe')
        print(df_desc)

        # psi_statis is given a temporary 'client_batch' column, removed right after.
        all_data['client_batch'] = client_batch
        # df_psi,df_psi_details = psi_statis(all_data, splitted_types=['train','test','oot'], scores=[feature_type])
        df_psi, df_psi_details = psi_statis(all_data, splitted_types=['train', 'test'], scores=[feature_type])
        del all_data['client_batch']
        log.info('模型psi：')
        print(df_psi[['train_test_psi']])
        # log.info(df_psi[['train_test_psi','train_oot_psi']])

        # Reshape the per-split table into a single wide row per project/feature.
        df_output_statis = df_splitted_type_auc_ks.reset_index()
        df_output_statis['feature'] = feature_type
        df_output_statis['project_name'] = project_name
        df_output_statis['client_batch'] = client_batch
        df_output_statis = df_output_statis.pivot_table(
            index=['project_name', 'client_batch', 'feature'],
            columns=data_type,
            values=['auc', 'ks'])
        # Flatten the (metric, split) column pairs into '<split>_<metric>' names.
        df_output_statis.columns = ['_'.join(reversed(x)) for x in df_output_statis.columns]
        df_output_statis['feature_cnt'] = len(selected_features)
        df_output_statis['n_estimators'] = model.get_params()['n_estimators']

        log.info('统计结束')
        return df_output_statis
        # =========================== end of statistics ===========================
        # ===========================统计=================================

In [42]:
    # ========================= step 6: train the model =========================
    def _print_split_shapes(X_all, X_train, X_test, X_oot):
        """Print the shape of the full, train, test and (if present) oot matrices."""
        print('整体数据集大小：', X_all.shape)
        print('训练集大小：', X_train.shape)
        print('测试集大小：', X_test.shape)
        if X_oot is None:
            print('无oot数据集')
        else:
            print('oot集大小：', X_oot.shape)

    def _tune_model(X_train, y_train, X_test, y_test, features):
        """Run the Bayesian parameter search on `features`, log the chosen parameters, return the model."""
        log.info('贝叶斯进行模型调参')
        tuned = classifiers_model(train_data=(X_train[features], y_train),
                                  test_data=(X_test[features], y_test),
                                  init_points=5, iterations=8, verbose=1)
        log.info('模型调参完成！！！')
        log.info('模型参数：{}'.format(tuned.get_xgb_params()))
        log.info('模型参数：{}'.format(tuned.get_params()))
        return tuned

    X_all, y_all, X_train, y_train, X_test, y_test, X_oot, y_oot = get_splitted_data(
        all_data_to_model, target=target, selected_features=selected_features)
    _print_split_shapes(X_all, X_train, X_test, X_oot)

    # Persist the keys of the test rows. `header` takes a list of column aliases
    # (or a bool); the bare string used before only worked because it is truthy.
    pd.Series(X_test.index).to_csv('{}{}_{}_X_test_key_{}.csv'.format(
        output_dir, project_name, feature_type, cust_id), header=[cust_id], index=False)

    log.info('step 6 开始训练模型')
    start = datetime.now()

    log.info('step 6.1 ===筛选变量===')

    # ---- feature importance on all candidate features ----
    log.info('step 6.1 ===筛选变量===10折交叉后，计算变量的平均重要性')
    # feature_imp = tree_selection.kfold_xgb_model(train_data=(del_corr_df, y_train))
    log.info('筛选前数据集大小：{}'.format(X_train.shape))
    feature_imp = tree_selection.change_col_subsample_fit_model(train_data=(X_train, y_train),
                                                                test_data=(X_test, y_test))

    log.info('将特征重要性持久化')
    feature_imp.to_csv('{}{}_{}_xgb_allfeature_mean_imp_df.csv'.format(
        output_dir, project_name, feature_type))

    # ---- correlation filter, ranked by the importance computed above ----
    log.info('根据10折拟合模型处理后的变量重要性进行变量相关性筛选')
    del_corr_df = tree_selection.drop_corr(X_train, by=feature_imp, threshold=0.9)
    # del_corr_df = tree_selection.drop_corr(del_corr_df, by=feature_imp, threshold=0.8)
    log.info('筛选后数据集大小：{}'.format(del_corr_df.shape))

    selected_features = list(del_corr_df.columns)
    log.info('最终入模变量的数量：{}'.format(len(selected_features)))
    log.info('最终入模变量：{}'.format(selected_features))

    # ---- importance again, now only on the features that survived the filter ----
    feature_imp = tree_selection.change_col_subsample_fit_model(train_data=(del_corr_df, y_train),
                                                                test_data=(X_test[del_corr_df.columns], y_test))

    log.info('将待入模特征重要性持久化')
    feature_imp.to_csv('{}{}_{}_xgb_tomodel_feature_mean_imp_df.csv'.format(
        output_dir, project_name, feature_type))

    # ---- first fit: tune on every surviving feature ----
    # `model` must stay a script-level name: statistics_model_result reads it.
    model = _tune_model(X_train, y_train, X_test, y_test, selected_features)

    # get_fscore() only lists features the booster actually split on, so this table
    # can be much shorter than selected_features.
    df_featurescore = pd.DataFrame(list(model._Booster.get_fscore().items()), columns=['特征名称', '特征权重值']
                                   ).sort_values('特征权重值', ascending=False)
    df_featurescore.to_csv('{}{}_{}_xgb_featurescore_first.csv'.format(
        output_dir, project_name, feature_type), index=False)

    end = datetime.now()
    # total_seconds() covers the whole duration; `.seconds` silently drops full days.
    log.info('模型训练完成, 使用 {} 秒'.format(int((end - start).total_seconds())))

    # Score every row and report. The summary is kept so that it is available even
    # when no refit is requested below.
    X_all[feature_type] = model.predict_proba(X_all[selected_features])[:, 1]
    all_data = pd.concat([all_data_to_model, X_all[feature_type]], axis=1)
    df_output_statis = statistics_model_result(all_data=all_data)

    # X_all.to_csv('{}{}_{}_X_all.csv'.format(output_dir, project_name, feature_type))
    # all_data.to_csv('{}{}_{}_all_data.csv'.format(output_dir, project_name, feature_type))

    # ---- optional second fit on the top-N features of the first model ----
    if to_model_var_num:
        start = datetime.now()

        print('过滤前{}个特征出来，再次训练'.format(to_model_var_num))
        selected_features = df_featurescore.iloc[:to_model_var_num]['特征名称'].tolist()
        print('过滤后的特征：', selected_features)

        X_all, y_all, X_train, y_train, X_test, y_test, X_oot, y_oot = get_splitted_data(
            all_data_to_model, target=target, selected_features=selected_features)
        _print_split_shapes(X_all, X_train, X_test, X_oot)

        # Manual parameters instead of tuning (disabled):
        # model = xgb.XGBClassifier(**ini_params)
        # model.fit(X_train, y_train)

        model = _tune_model(X_train, y_train, X_test, y_test, selected_features)

        end = datetime.now()
        log.info('模型训练完成, 使用 {} 秒'.format(int((end - start).total_seconds())))

        # Re-score with the refitted model; these results replace the first-pass ones.
        X_all[feature_type] = model.predict_proba(X_all[selected_features])[:, 1]
        all_data = pd.concat([all_data_to_model, X_all[feature_type]], axis=1)
        df_output_statis = statistics_model_result(all_data=all_data)

    # X_all.to_csv('{}{}_{}_X_all.csv'.format(output_dir, project_name, feature_type))
    # all_data.to_csv('{}{}_{}_all_data.csv'.format(output_dir, project_name, feature_type))

    # ========================= end of training =========================

2020-10-21 18:49:52,571 - __main__[line:16] - INFO: step 6 开始训练模型
2020-10-21 18:49:52,572 - __main__[line:19] - INFO: step 6.1 ===筛选变量===
2020-10-21 18:49:52,572 - __main__[line:23] - INFO: step 6.1 ===筛选变量===10折交叉后，计算变量的平均重要性
2020-10-21 18:49:52,573 - __main__[line:25] - INFO: 筛选前数据集大小：(3913, 1020)


整体数据集大小： (5218, 1020)
训练集大小： (3913, 1020)
测试集大小： (1305, 1020)
无oot数据集
subsample is 0.3 and colsample_bytree is 0.3 model result is : 
xgb train auc is ： 0.8597112299050237
xgb train ks is ： 0.5697557107953142
xgb test auc is ： 0.5456387450361115
xgb test ks is ： 0.09678711595178932
subsample is 0.4 and colsample_bytree is 0.4 model result is : 
xgb train auc is ： 0.9413315294364158
xgb train ks is ： 0.7332051980169206
xgb test auc is ： 0.553400989294257
xgb test ks is ： 0.13323541023199648
subsample is 0.5 and colsample_bytree is 0.5 model result is : 
xgb train auc is ： 0.9768599776754823
xgb train ks is ： 0.8564692598115997
xgb test auc is ： 0.5731172058242959
xgb test ks is ： 0.171413808318432
subsample is 0.6 and colsample_bytree is 0.6 model result is : 
xgb train auc is ： 0.986704236355335
xgb train ks is ： 0.8829398518510854
xgb test auc is ： 0.5266250203200111
xgb test ks is ： 0.07559044146675653
subsample is 0.7 and colsample_bytree is 0.7 model result is : 
xgb train auc is ：

2020-10-21 18:51:17,884 - __main__[line:29] - INFO: 将特征重要性持久化
2020-10-21 18:51:17,892 - __main__[line:33] - INFO: 根据10折拟合模型处理后的变量重要性进行变量相关性筛选


subsample is 1.0 and colsample_bytree is 1.0 model result is : 
xgb train auc is ： 0.9979751429558127
xgb train ks is ： 0.961937731927294
xgb test auc is ： 0.5336267155895126
xgb test ks is ： 0.10213998745965025


2020-10-21 18:51:18,474 - __main__[line:36] - INFO: 筛选后数据集大小：(3913, 730)
2020-10-21 18:51:18,475 - __main__[line:41] - INFO: 最终入模变量的数量：730
2020-10-21 18:51:18,475 - __main__[line:42] - INFO: 最终入模变量：['td_idcard_credit_overdue_num', 'td_email_lend_third_pay_num_d90', 'td_mobile_lend_big_data_finance_num_d90', 'td_company_name_over5_d7', 'td_idcard_conn_mother_mobile_over2_d90', 'td_device_conn_mobile_num_d7', 'td_email_loan_bank_finance_num_d90', 'td_mobile_loan_vertivcal_electricity_num_d7', 'td_mobile_lend_finance_lease_num_d90', 'td_idcard_loan_asset_transfer_num_d7', 'tzre_rf_m3_max_overdue_repay_amount_level', 'td_email_loan_gurantee_num_d30', 'tzre_es_m1_overdue_repay_sum', 'sd_loaninfo_pid_d360_overduetenantcount', 'td_idcard_loan_bank_person_num_d7', 'sd_loaninfo_pid_d360_loancount', 'td_idcard_lend_traffic_lease_num_d90', 'td_mobile_credit_overdue_repayment', 'xy_black_currently_performance', 'td_idcard_company_person_owe', 'td_mobile_loan_car_finance_num_d90', 'td_idcard_over_d

subsample is 0.3 and colsample_bytree is 0.3 model result is : 
xgb train auc is ： 0.7559778876039428
xgb train ks is ： 0.3820836044136839
xgb test auc is ： 0.5212315087898562
xgb test ks is ： 0.06423445809433126
subsample is 0.4 and colsample_bytree is 0.4 model result is : 
xgb train auc is ： 0.8336053837527696
xgb train ks is ： 0.5282703931107005
xgb test auc is ： 0.5080757530015559
xgb test ks is ： 0.05055618773368013
subsample is 0.5 and colsample_bytree is 0.5 model result is : 
xgb train auc is ： 0.8861239411698513
xgb train ks is ： 0.616786588733491
xgb test auc is ： 0.5217714405146188
xgb test ks is ： 0.05715728849771251
subsample is 0.6 and colsample_bytree is 0.6 model result is : 
xgb train auc is ： 0.9129838541740769
xgb train ks is ： 0.6602637811214254
xgb test auc is ： 0.5256032140451917
xgb test ks is ： 0.07240310257541627
subsample is 0.7 and colsample_bytree is 0.7 model result is : 
xgb train auc is ： 0.9409861849261648
xgb train ks is ： 0.7471910039695216
xgb test a

2020-10-21 18:52:16,953 - __main__[line:47] - INFO: 将待入模特征重要性持久化
2020-10-21 18:52:16,961 - __main__[line:51] - INFO: 贝叶斯进行模型调参


subsample is 1.0 and colsample_bytree is 1.0 model result is : 
xgb train auc is ： 0.9650978928811174
xgb train ks is ： 0.8019391029579339
xgb test auc is ： 0.5364453914214719
xgb test ks is ： 0.07742504818745499
Optimizing <model_code.bayes_opt_tuner.XGBClassifierTuner object at 0x1c20aed518>...
params_optimizer is :  ['colsample_bytree', 'gamma', 'learning_rate', 'max_depth', 'min_child_weight', 'n_estimators', 'reg_alpha', 'reg_lambda', 'subsample']
begain optimizer params!!!
|   iter    |  target   | colsam... |   gamma   | learni... | max_depth | min_ch... | n_esti... | reg_alpha | reg_la... | subsample |
-------------------------------------------------------------------------------------------------------------------------------------
test_auc is :  0.5324162234968998
train_auc is :  0.623214264927096
best model result is -487.8781482481331
best model result is : 
{'objective': 'binary:logistic', 'base_score': 0.5, 'booster': 'gbtree', 'colsample_bylevel': 1, 'colsample_bynode':

2020-10-21 18:53:49,819 - __main__[line:55] - INFO: 模型调参完成！！！
2020-10-21 18:53:49,821 - __main__[line:56] - INFO: 模型参数：{'objective': 'binary:logistic', 'base_score': 0.5, 'booster': 'gbtree', 'colsample_bylevel': 1, 'colsample_bynode': 1, 'colsample_bytree': 0.5033906284091754, 'gamma': 1.2285439493231152, 'gpu_id': -1, 'interaction_constraints': '', 'learning_rate': 0.1134984748795785, 'max_delta_step': 0, 'max_depth': 5, 'min_child_weight': 122.44451721637986, 'monotone_constraints': '()', 'n_jobs': -1, 'num_parallel_tree': 1, 'random_state': 0, 'reg_alpha': 1.3942308568375266, 'reg_lambda': 0.8840385124340608, 'scale_pos_weight': 1, 'subsample': 0.6125536413016809, 'tree_method': 'exact', 'validate_parameters': 1, 'verbosity': None, 'nthread': -1}
2020-10-21 18:53:49,823 - __main__[line:57] - INFO: 模型参数：{'objective': 'binary:logistic', 'base_score': 0.5, 'booster': 'gbtree', 'colsample_bylevel': 1, 'colsample_bynode': 1, 'colsample_bytree': 0.5033906284091754, 'gamma': 1.22854394932

test_auc is :  0.5438418754789717
train_auc is :  0.5667614316146731
best model result is 49.48699264932216
best model result is : 
{'objective': 'binary:logistic', 'base_score': 0.5, 'booster': 'gbtree', 'colsample_bylevel': 1, 'colsample_bynode': 1, 'colsample_bytree': 0.5033906284091754, 'gamma': 1.2285439493231152, 'gpu_id': -1, 'importance_type': 'gain', 'interaction_constraints': '', 'learning_rate': 0.1134984748795785, 'max_delta_step': 0, 'max_depth': 5, 'min_child_weight': 122.44451721637986, 'missing': nan, 'monotone_constraints': '()', 'n_estimators': 290, 'n_jobs': -1, 'num_parallel_tree': 1, 'random_state': 0, 'reg_alpha': 1.3942308568375266, 'reg_lambda': 0.8840385124340608, 'scale_pos_weight': 1, 'subsample': 0.6125536413016809, 'tree_method': 'exact', 'validate_parameters': 1, 'verbosity': None, 'nthread': -1}
current obj_fun result is :  49.48699264932216
| [95m 13      [0m | [95m 49.49   [0m | [95m 0.5034  [0m | [95m 1.229   [0m | [95m 0.1135  [0m | [95m 5.

2020-10-21 18:53:50,080 - __main__[line:10] - INFO: 模型效果：
2020-10-21 18:53:50,133 - __main__[line:18] - INFO: 不同月份的模型效果：
2020-10-21 18:53:50,145 - __main__[line:23] - INFO: 分数describe


            auc        ks
type                     
train  0.566469  0.117417
test   0.542756  0.125473
oot         NaN       NaN
cv          NaN       NaN
              auc        ks
month                      
2019/1/  0.541718  0.113330
2019/2/  0.568313  0.137192
2019/3/  0.557708  0.108930
2019/4/  0.504628  0.052225
2019/5/  0.587593  0.165279
               lhpdat        score
count     5218.000000  5218.000000
mean         0.111254   644.916251
std          0.013268     9.666218
min          0.082527   622.000000
25%          0.102010   638.000000
50%          0.110581   645.000000
75%          0.120255   652.000000
max          0.146373   668.000000
coverage     1.000000     1.000000


2020-10-21 18:53:50,331 - __main__[line:31] - INFO: 模型psi：
2020-10-21 18:53:50,347 - __main__[line:47] - INFO: 统计结束
2020-10-21 18:53:50,371 - __main__[line:110] - INFO: 贝叶斯进行模型调参


   train_test_psi
0        0.002771
过滤前30个特征出来，再次训练
过滤后的特征： ['sd_loaninfo_pid_d360_averageloangapdays', 'ten_fraud_risk_score', 'tzre_rf_m3_max_repay_amount_level', 'tzre_rf_m6_max_repay_amount_level', 'tzre_score_l', 'td_final_score', 'tzre_rf_m24_max_repay_amount_level']
整体数据集大小： (5218, 7)
训练集大小： (3913, 7)
测试集大小： (1305, 7)
无oot数据集
Optimizing <model_code.bayes_opt_tuner.XGBClassifierTuner object at 0x1c1e3af048>...
params_optimizer is :  ['colsample_bytree', 'gamma', 'learning_rate', 'max_depth', 'min_child_weight', 'n_estimators', 'reg_alpha', 'reg_lambda', 'subsample']
begain optimizer params!!!
|   iter    |  target   | colsam... |   gamma   | learni... | max_depth | min_ch... | n_esti... | reg_alpha | reg_la... | subsample |
-------------------------------------------------------------------------------------------------------------------------------------
test_auc is :  0.5177306611551056
train_auc is :  0.5835892159386192
best model result is -44.28637291088941
best model result

2020-10-21 18:53:53,388 - __main__[line:114] - INFO: 模型调参完成！！！
2020-10-21 18:53:53,390 - __main__[line:115] - INFO: 模型参数：{'objective': 'binary:logistic', 'base_score': 0.5, 'booster': 'gbtree', 'colsample_bylevel': 1, 'colsample_bynode': 1, 'colsample_bytree': 0.32733834826301766, 'gamma': 0.3396608391291378, 'gpu_id': -1, 'interaction_constraints': '', 'learning_rate': 0.1768470756515885, 'max_delta_step': 0, 'max_depth': 3, 'min_child_weight': 126.9111798765106, 'monotone_constraints': '()', 'n_jobs': -1, 'num_parallel_tree': 1, 'random_state': 0, 'reg_alpha': 1.0663305699460341, 'reg_lambda': 1.3837542279009467, 'scale_pos_weight': 1, 'subsample': 0.5893093786036377, 'tree_method': 'exact', 'validate_parameters': 1, 'verbosity': None, 'nthread': -1}
2020-10-21 18:53:53,391 - __main__[line:116] - INFO: 模型参数：{'objective': 'binary:logistic', 'base_score': 0.5, 'booster': 'gbtree', 'colsample_bylevel': 1, 'colsample_bynode': 1, 'colsample_bytree': 0.32733834826301766, 'gamma': 0.3396608

test_auc is :  0.5339315157567172
train_auc is :  0.5812862724910464
current obj_fun result is :  26.754010276176693
| [0m 13      [0m | [0m 26.75   [0m | [0m 0.7417  [0m | [0m 1.594   [0m | [0m 0.07886 [0m | [0m 8.0     [0m | [0m 91.25   [0m | [0m 184.0   [0m | [0m 1.155   [0m | [0m 0.4163  [0m | [0m 0.5393  [0m |
optimizer params over!!! 共耗时0.050219400723775225 分钟
the best params is : {'colsample_bytree': 0.32733834826301766, 'gamma': 0.3396608391291378, 'learning_rate': 0.1768470756515885, 'max_depth': 2.7867746706644008, 'min_child_weight': 126.9111798765106, 'n_estimators': 289.4723825376255, 'reg_alpha': 1.0663305699460341, 'reg_lambda': 1.3837542279009467, 'subsample': 0.5893093786036377}
Maximum xgb value is : 49.332840955045945
            auc        ks
type                     
train  0.556975  0.098135
test   0.538405  0.079097
oot         NaN       NaN
cv          NaN       NaN
              auc        ks
month                      
2019/1/  0.507106  

2020-10-21 18:53:53,635 - __main__[line:31] - INFO: 模型psi：
2020-10-21 18:53:53,650 - __main__[line:47] - INFO: 统计结束


   train_test_psi
0        0.004588


In [43]:
    # =========================== step 7: persist the model and its results ===========================
    # json is not imported explicitly in this file's visible import block, so bring it
    # into scope here rather than depending on a wildcard import to provide it.
    import json

    log.info('模型相关结果持久化')
    # The same one-column score table is written to three locations. project_dir and
    # output_dir already end with '/', so no extra separator is inserted.
    score_frame = all_data[feature_type].to_frame()
    score_file_name = '{}_{}_score.csv'.format(project_name, feature_type)
    score_frame.to_csv('{}data/score/{}'.format(project_dir, score_file_name))
    score_frame.to_csv('{}data/xgb_score/{}'.format(project_dir, score_file_name))
    score_frame.to_csv('{}{}'.format(output_dir, score_file_name))

    # Booster object, estimator parameters and a text dump of the trees.
    joblib.dump(model._Booster, '{}{}_{}_xgb.ml'.format(
        output_dir, project_name, feature_type))
    params_path = '{}{}_{}_xgb.params'.format(output_dir, project_name, feature_type)
    # `with` guarantees the file is flushed and closed; the handle used to be leaked.
    with open(params_path, 'w') as params_file:
        json.dump(model.get_params(), params_file)

    model._Booster.dump_model('{}{}_{}_xgb.txt'.format(output_dir, project_name, feature_type))

    # Feature scores of the final model, highest first.
    df_featurescore = pd.DataFrame(list(model._Booster.get_fscore().items()), columns=['特征名称', '特征权重值']
                                   ).sort_values('特征权重值', ascending=False)
    df_featurescore.to_csv('{}{}_{}_xgb_featurescore.csv'.format(
        output_dir, project_name, feature_type), index=False)

    # Pairwise correlations of the model input matrix.
    df_corr = X_all.corr()
    df_corr.to_csv('{}{}_{}_xgb_corr.csv'.format(
        output_dir, project_name, feature_type), index_label='feature')

    # Sample of the first 100 rows of the model features, with the key as the last column.
    df_rawdata = all_data[selected_features].reset_index()
    ordered_cols = [col for col in df_rawdata.columns if col != cust_id] + [cust_id]
    df_rawdata = df_rawdata[ordered_cols]
    df_rawdata.head(100).to_csv('{}{}_{}_xgb_rawdata.csv'.format(
        output_dir, project_name, feature_type), index=False)

    # AUC/KS summary, stored next to the model and in the shared statistics folder.
    df_output_statis.to_csv('{}{}_{}_xgb_output_statis.csv'.format(
        output_dir, project_name, feature_type))

    os.makedirs(project_dir + 'data/statis/auc_ks', exist_ok=True)
    df_output_statis.to_csv('{}data/statis/auc_ks/{}.csv'.format(
        project_dir, feature_type))

    log.info('模型相关结果持久化完成')
    # =========================== end of persistence ===========================

2020-10-21 18:53:53,661 - __main__[line:3] - INFO: 模型相关结果持久化
2020-10-21 18:53:53,703 - __main__[line:43] - INFO: 模型相关结果持久化完成
