In [None]:
#!/usr/bin/env python
# coding: utf-8

# In[37]:


# !/usr/bin/env python
# ! -*- coding: utf-8 -*-

'''
@File: model_fit_v3.py
@Author: RyanZheng
@Email: ryan.zhengrp@gmail.com
@Created Time on: 2020-07-26

y值加密混淆
ok 可以跑通
'''

import json
import logging
import math
import operator
import os
import sys
import time
import warnings
from datetime import datetime

import joblib
import numpy as np
import pandas as pd
import xgboost as xgb
from bayes_opt import BayesianOptimization
from joblib import Parallel, delayed
from lightgbm import LGBMClassifier
from sklearn.base import TransformerMixin
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

# Suppress every Python warning for the whole process (this includes
# deprecation notices raised by the libraries imported above).
warnings.filterwarnings('ignore')


# 日志输出
class Logger():
    """Configure root logging once and expose a named ``logging.Logger``.

    The configured logger is available as the ``logger`` attribute.
    """

    # Textual level name -> ``logging`` level constant.
    level_relations = dict(
        debug=logging.DEBUG,
        info=logging.INFO,
        warning=logging.WARNING,
        error=logging.ERROR,
        critical=logging.CRITICAL,
    )

    def __init__(self, level="info", name=None,
                 fmt="%(asctime)s - %(name)s[line:%(lineno)d] - %(levelname)s: %(message)s"):
        """
        :param level: key of ``level_relations``; an unknown key resolves to None
        :param name: logger name handed to ``logging.getLogger``
        :param fmt: record format handed to ``logging.basicConfig``
        """
        resolved_level = self.level_relations.get(level)
        logging.basicConfig(format=fmt, level=resolved_level)
        self.logger = logging.getLogger(name)


# Module-level logger, configured at INFO level and named after this module.
log = Logger(level='info', name=__name__).logger


def get_ks(target, y_pred):
    """Return the Kolmogorov-Smirnov statistic of a score against a 0/1 target.

    Rows are ranked by descending score; KS is the largest absolute gap
    between the cumulative share of bads (target == 1) and the cumulative
    share of goods (target == 0).

    The gap is only evaluated at the end of each run of identical scores.
    Rows that share a score cannot be separated by any cut-off, so measuring
    the gap in the middle of such a run would inflate KS and make it depend
    on the arbitrary order of the tied rows.

    :param target: binary labels (1 = bad, 0 = good)
    :param y_pred: scores / predicted probabilities, same length as target
    :return: KS value in [0, 1] (NaN when one of the classes is absent)
    """
    df = pd.DataFrame({
        'y_pred': y_pred,
        'target': target,
    })
    # A stable sort keeps the result independent of the sorting algorithm.
    df = df.sort_values(by='y_pred', ascending=False, kind='mergesort')

    bad = df['target']
    good = 1 - bad
    bad_rate = bad.cumsum() / bad.sum()
    good_rate = good.cumsum() / good.sum()
    gap = (bad_rate - good_rate).abs()

    # True on the last row of every group of tied scores (NaN never equals
    # NaN, so each missing score forms its own group).
    is_cut_point = df['y_pred'].ne(df['y_pred'].shift(-1)).values
    return gap[is_cut_point].max()


# auc
def get_roc_auc_score(target, y_pred):
    """Return the ROC AUC of ``y_pred`` against ``target``.

    :param target: label Series; must hold exactly two distinct values
    :param y_pred: scores forwarded to ``roc_auc_score``
    :raises ValueError: when ``target`` does not have exactly two classes
    """
    is_binary = target.nunique() == 2
    if not is_binary:
        raise ValueError('the target is not 2 classier target')
    return roc_auc_score(target, y_pred)


def get_splitted_data(df_selected, target, selected_features):
    """Split a labelled frame into features/labels per value of its ``type`` column.

    :param df_selected: frame holding the features, the target and a ``type`` column
    :param target: name of the label column
    :param selected_features: list of feature column names
    :return: ``(X_all, y_all, X_train, y_train, X_test, y_test, X_oot, y_oot)``;
             the two ``oot`` entries are None when no row has type ``'oot'``
    :raises KeyError: when no row has type ``'train'`` or ``'test'``
    """
    features_by_type = {'all': df_selected[selected_features]}
    labels_by_type = {'all': df_selected[target]}

    for split_name, part in df_selected.groupby('type'):
        features_by_type[split_name] = part[selected_features]
        labels_by_type[split_name] = part[target]

    if 'oot' not in features_by_type:
        features_by_type['oot'] = None
        labels_by_type['oot'] = None

    ordered = []
    for key in ('all', 'train', 'test', 'oot'):
        ordered.append(features_by_type[key])
        ordered.append(labels_by_type[key])
    return tuple(ordered)


def to_score(x):
    """Map a probability to an integer score in [0, 1200].

    The probability is clipped to [0.001, 0.999], turned into
    ``A - B * ln(p / (1 - p))``, rounded, clipped to [0, 1200] and finally
    reversed (``1200 - value``), so a larger probability yields a larger score.

    :param x: probability-like number
    :return: integer score
    """
    clipped = min(max(x, 0.001), 0.999)

    offset = 404.65547022
    factor = 72.1347520444
    raw = int(round(offset - factor * math.log(clipped / (1 - clipped))))

    bounded = min(max(raw, 0), 1200)
    return 1200 - bounded


def psi_statis(df_src, splitted_types, scores):
    """Compute binned score distributions and PSI of each split against 'train'.

    For every (client_batch, type) group and every column in ``scores`` the
    non-null values are converted with ``to_score`` and bucketed into
    left-closed bins of width 50 between 300 and 950. The share of rows per
    bin is then compared between the 'train' split and every other entry of
    ``splitted_types``.

    :param df_src: frame with 'client_batch', 'type' and the score columns
    :param splitted_types: iterable of split names; must contain 'train'
    :param scores: iterable of score column names
    :return: ``(df_psi, df_psi_detail)``; ``df_psi`` holds the summed PSI
             columns per (client_batch, feature), ``df_psi_detail`` holds the
             per-bin shares and PSI terms plus one '[sum]' row per group.
             Returns None (after printing a message) when 'train' is missing.
    """
    def bin_psi(x, y):
        # PSI term of one bin; None when either share is missing or zero,
        # because the logarithm would be undefined.

        if pd.isnull(y) or y == 0 or pd.isnull(x) or x == 0:
            return None
        else:
            return (x - y) * math.log(x / y)

    if 'train' not in splitted_types:
        print('Error: failt to get psi, for train is not in splitted_types')
        return

    # Bin edges 300, 350, ..., 950.
    bins = list(range(300, 951, 50))
    l = []
    for (client_batch, splitted_type), df_type in df_src.groupby(['client_batch', 'type']):
        for score in scores:
            df_score = df_type[df_type[score].notnull()]
            # Share of rows per score bin (NaN when the group has no usable rows).
            df = pd.cut(df_score[score].map(to_score), bins=bins, right=False).value_counts().map(
                lambda v: v / df_score.shape[0] if df_score.shape[0] > 0 else np.nan).to_frame('pct')
            df.index.name = 'bin'
            # Interval labels become plain strings so they can be used as pivot keys.
            df.index = df.index.astype(str)
            df = df.reset_index()

            df['client_batch'] = client_batch
            df['type'] = splitted_type
            df['feature'] = score

            l.append(df)

    # One row per (client_batch, feature, bin), one '<type>_pct' column per split.
    df_psi_detail = pd.concat(l, ignore_index=True).pivot_table(index=['client_batch', 'feature', 'bin'],
                                                                columns='type', values='pct')
    df_psi_detail.columns = [s + '_pct' for s in df_psi_detail.columns.format()]
    df_psi_detail = df_psi_detail.reset_index()

    # Per-bin PSI term of every non-train split against train.
    for splitted_type in filter(lambda s: s != 'train', splitted_types):
        df_psi_detail['train_{}_psi'.format(splitted_type)] = df_psi_detail.apply(
            lambda r: bin_psi(r['train_pct'], r[splitted_type + '_pct']), axis=1)

    psi_col = list(filter(lambda col: '_psi' in col, df_psi_detail.columns.format()))
    df_psi = df_psi_detail.groupby(['client_batch', 'feature']).sum()[psi_col].reset_index()

    # Totals per (client_batch, feature), appended as an extra '[sum]' bin row.
    df_psi_detail_sum = df_psi_detail.drop(labels='bin', axis=1).groupby(
        ['client_batch', 'feature']).sum().reset_index()
    df_psi_detail_sum['bin'] = '[sum]'

    df_psi_detail = pd.concat([df_psi_detail, df_psi_detail_sum], ignore_index=True).sort_values(
        ['client_batch', 'feature'])
    # Fix the output column set/order; columns absent from the data come out as NaN.
    df_psi_detail = pd.DataFrame(df_psi_detail, columns=['client_batch', 'feature', 'bin',
                                                         'train_pct', 'test_pct', 'oot_pct', 'train_test_psi',
                                                         'train_oot_psi'])

    return df_psi, df_psi_detail


def train_test_split_(df_src, target='y_target', test_size=0.3, random_state=1234):
    """Split a frame into train/test separately per target class.

    Each target class is split on its own and the pieces are concatenated
    afterwards, so the train and test parts end up with (almost) the same
    bad rate.

    :param df_src: frame containing the ``target`` column
    :param target: name of the label column
    :param test_size: forwarded to ``sklearn.model_selection.train_test_split``
    :param random_state: seed forwarded to ``train_test_split``; defaults to
                         the value that used to be hard-coded (1234)
    :return: ``(X_train, X_test, y_train, y_test)``
    """
    parts = [[], [], [], []]
    for _, group in df_src.groupby(target):
        # The group is only read: writing the target back into a groupby
        # slice is redundant (it already holds that value) and triggers
        # pandas' chained-assignment machinery.
        pieces = train_test_split(group.drop(labels=target, axis=1), group[target],
                                  test_size=test_size, random_state=random_state)
        for bucket, piece in zip(parts, pieces):
            bucket.append(piece)

    return tuple(pd.concat(bucket) for bucket in parts)


def split_data_type(df, key_col='tdid', target='target', apply_time='apply_time', test_size=0.3):
    """Deduplicate a sample, keep 0/1 targets and tag every row as train or test.

    Works on a copy of ``df``. Progress information is printed along the way.

    :param df: source frame
    :param key_col: column used to drop duplicated records
    :param target: label column; only rows whose label is 0 or 1 are kept
    :param apply_time: column whose first 7 characters are used as the month
                       (assumes string values such as 'YYYY-MM-DD' - TODO confirm)
    :param test_size: share of rows assigned to the test split
    :return: the filtered copy with two extra columns, 'apply_month' and 'type'
    """
    df_id = df.copy()
    # Report whether every label is already 0/1 (the message text is printed as-is).
    if df_id[target].isin([0, 1]).all():
        print('样本y值在0，1')
    else:
        print('\033[0;31m样本y值不在0，1之间，请检查！！！\033[0m')

    print('样本情况：', df_id.shape)
    df_id.drop_duplicates(subset=key_col, inplace=True)
    print('分布情况：', df_id.groupby(target)[key_col].count().sort_index())
    # df_id.groupby(target)['tdid'].count().sort_index().to_excel(
    #     '{}{}_id_distributed.xlsx'.format(data_dir, client_batch))
    print('样本drop_duplicates情况：', df_id.shape)

    df_id = df_id.loc[df_id[target].isin([0, 1])]
    print('样本y值在0，1的样本情况：', df_id.shape)

    # --------- bad rate per application month ---------------------
    df_id['apply_month'] = df_id[apply_time].map(lambda s: s[:7])
    print(df_id.groupby('apply_month').describe()[target])

    # --------- sample split ----------------------------
    ## begin: variant used when an OOT (out-of-time) split is needed
    # df_selected = df_id #can filter records here
    # # df_oot = df_selected[df_selected['apply_time']>= '2019-04-01']
    # # X_train = df_selected[df_selected['apply_time']<= '2019-01-31']
    # # X_test = df_selected[(df_selected['apply_time']> '2019-01-31') & (df_selected['apply_time']< '2019-04-01')]

    # df_oot = df_selected[df_selected['apply_time']>= '2019-03-01']
    # X_train = df_selected[df_selected['apply_time']<= '2018-12-31']
    # X_test = df_selected[(df_selected['apply_time']> '2018-12-31') & (df_selected['apply_time']< '2019-03-01')]

    # #X_train, X_test, y_train, y_test = geo_train_test_split(df_not_oot,label=label)

    # df_id.loc[df_oot.index,'type'] = 'oot'
    ## end: variant used when an OOT split is needed

    # Variant used when no OOT split is needed: per-class random split.
    X_train, X_test, y_train, y_test = train_test_split_(df_id, target=target, test_size=test_size)
    # X_train, X_test, y_train, y_test = train_test_split(df_id.drop(columns=target), df_id[target], test_size=test_size,
    #                                                     random_state=123)
    # End of the no-OOT variant.

    # Tag rows through the index labels returned by the split.
    df_id.loc[X_train.index, 'type'] = 'train'
    df_id.loc[X_test.index, 'type'] = 'test'

    print(df_id.groupby('type').describe()[target])

    # ---------- output ---------------------------------
    # df_id.to_csv(data_dir + '{}_split.csv'.format(client_batch), index=False)
    return df_id


def select_features_dtypes(df, exclude=None):
    '''
    Classify the columns of a data set by data type.

    Numeric columns are detected by dtype. Every object column is tentatively
    parsed as datetime: columns that parse completely count as date columns,
    the rest stay categorical (string) columns.

    :param df: data set
    :param exclude: column(s) that must not take part in the classification
    :return: three lists - numeric columns, string columns, date columns
    :raises AssertionError: when some column is neither numeric nor object
                            (e.g. bool, category or native datetime columns)
    '''
    if exclude is not None:
        df = df.drop(columns=exclude)

    def _to_datetime_or_keep(column):
        # Replacement for ``pd.to_datetime(..., errors='ignore')``, which is
        # deprecated in recent pandas: return the column untouched when it
        # cannot be parsed as dates.
        try:
            return pd.to_datetime(column)
        except (ValueError, TypeError, OverflowError):
            return column

    # Numeric columns.
    numeric_df = df.select_dtypes([np.number])

    no_numeric_df = df.select_dtypes(include=['object'])
    # Try to convert every object column to datetime.
    dates_objs_df = no_numeric_df.apply(_to_datetime_or_keep)
    # Columns that are still object are plain strings.
    objs_df = dates_objs_df.select_dtypes(include=['object'])
    # Everything else was successfully parsed as dates (original column order).
    obj_cols = set(objs_df.columns)
    dates_df = [col for col in dates_objs_df.columns if col not in obj_cols]

    assert len(numeric_df.columns) + len(objs_df.columns) + len(dates_df) == df.shape[1], \
        'some columns are neither numeric nor object dtype'

    return numeric_df.columns.tolist(), objs_df.columns.tolist(), dates_df


def category_2_woe(df, category_cols=None, target='target'):
    '''
    Compute the WOE value of every category of the given columns.

    Missing values are not mapped (they stay missing). To cope with categories
    that only show up in the future, every mapping also gets an 'other' key
    whose WOE is 0.

    :param df: data set containing the categorical columns and the target
    :param category_cols: names of the categorical columns (None means none)
    :param target: name of the 0/1 label column (1 = bad)
    :return: dict ``{column: {category: woe, ..., 'other': 0}}``
    '''
    var_value_woe = {}
    for col in category_cols or []:
        bin_g = df.groupby(by=col)[target].agg(total_cnt='count', bad_cnt='sum')
        bin_g['good_cnt'] = bin_g['total_cnt'] - bin_g['bad_cnt']
        bin_g['bad_rate'] = bin_g['bad_cnt'] / bin_g['bad_cnt'].sum()
        # A good_rate of 0 would give a WOE of -inf, so it is replaced by a
        # tiny number. The result is assigned back explicitly: an in-place
        # ``replace`` on the selected column does not reach ``bin_g`` when
        # pandas Copy-on-Write is active.
        bin_g['good_rate'] = (bin_g['good_cnt'] / bin_g['good_cnt'].sum()).replace(0, 0.0000000001)
        # WOE = ln(good_rate / bad_rate); categories without any bad get 0.
        woe = np.log(bin_g['good_rate'] / bin_g['bad_rate'])
        bin_g['woe'] = woe.where(bin_g['bad_rate'] != 0, 0.0)

        value_woe = bin_g['woe'].to_dict()
        value_woe['other'] = 0  # unseen future categories get a WOE of 0
        var_value_woe[col] = value_woe

    return var_value_woe


def category_2_woe_save(var_value_woe, path=None):
    """Dump the category -> WOE mapping to ``category_var_value_woe.json``.

    :param var_value_woe: mapping as returned by ``category_2_woe``
    :param path: prefix prepended verbatim to the file name, so a directory
                 must end with a path separator. When None, the file is
                 written into the directory ``sys.path[0]``.
    """
    if path is None:
        # Joining with '' appends the trailing separator that plain string
        # concatenation below would otherwise be missing.
        path = os.path.join(sys.path[0], '')

    with open(path + 'category_var_value_woe.json', 'w') as f:
        json.dump(var_value_woe, f)


def category_2_woe_load(path=None):
    """Load the category -> WOE mapping from ``category_var_value_woe.json``.

    :param path: prefix prepended verbatim to the file name, so a directory
                 must end with a path separator. When None, the file is read
                 from the directory ``sys.path[0]`` (previously None raised a
                 TypeError).
    :return: the decoded mapping
    """
    if path is None:
        # Joining with '' appends the trailing path separator.
        path = os.path.join(sys.path[0], '')

    with open(path + 'category_var_value_woe.json', 'r') as f:
        var_value_woe = json.load(f)
    return var_value_woe


def filter_miss(df, miss_threshold=0.9):
    '''
    List the columns whose share of missing values is below a threshold.

    :param df: data set
    :param miss_threshold: columns with a missing rate greater than or equal
                           to this value are left out
    :return: list of the column names that are kept
    '''
    return [
        column
        for column, values in df.items()
        if values.isnull().sum() / values.size < miss_threshold
    ]


# =============================

class WoeTransformer(TransformerMixin):
    """Map raw or binned feature values to WOE values."""

    def __init__(self, n_jobs=2):
        """
        :param n_jobs: number of joblib workers used by ``data_to_bin``
        """
        self.n_jobs = n_jobs

    def bin_to_woe(self, df, var_bin_woe_dict):
        '''
        Map the values of ``df`` through ``var_bin_woe_dict``.

        A value that is not a key of the mapping (a category that is new in
        this data set) is treated as 'other', so the mapping should contain an
        'other' entry. Without one, new categories end up as missing values.
        Missing values stay missing.

        Note: ``df`` is modified in place and returned.

        :param df: data set containing every feature of ``var_bin_woe_dict``
        :param var_bin_woe_dict: e.g. {"Sex": {"female": -1.5298770033401874, "male": 0.9838327092415774}, "Embarked": {"C": -0.694264203516269, "S": 0.1977338357888416, "other": -0.030202603851420356}}
        :return: ``df`` with the listed features replaced by their WOE values
        '''
        for feature, bin_woe in var_bin_woe_dict.items():
            df[feature] = df[feature].map(
                lambda x: x if (x in bin_woe or pd.isna(x)) else 'other')
            df[feature] = df[feature].map(bin_woe)

        return df

    def transform(self, df, var_bin_woe_dict, bins_dict=None):
        '''
        Return a WOE-encoded copy of the data set.

        :param df: data set
        :param var_bin_woe_dict: e.g. {"Sex": {"female": -1.5298770033401874, "male": 0.9838327092415774}, "Embarked": {"C": -0.694264203516269, "S": 0.1977338357888416, "other": -0.030202603851420356}}
        :param bins_dict: optional bin edges per feature; when given, the raw
                          data is binned first and only the binned columns are
                          carried into the result
        :return: the WOE-encoded data set
        '''
        df_ = df.copy()
        if bins_dict:
            print('需要将原始数据转bin')
            df_ = self.data_to_bin(df, bins_dict=bins_dict)
        return self.bin_to_woe(df_, var_bin_woe_dict)

    def data_to_bin(self, df, bins_dict=None):
        '''
        Bin the raw data according to ``bins_dict`` (left-closed intervals).

        :param df: data set containing every feature of ``bins_dict``
        :param bins_dict: bin edges per feature, e.g. {'D157': [-999, 1.0, 2.0, 3.0, 5.0, inf]}
        :return: frame with one column of interval labels (as strings) per
                 feature of ``bins_dict``
        :raises TypeError: when ``bins_dict`` is not a dict
        '''
        if bins_dict is None:
            bins_dict = {}
        # The original ``assert '<message>'`` could never fail because a
        # non-empty string is always truthy; raise explicitly instead.
        if not isinstance(bins_dict, dict):
            raise TypeError('请传入类似 {\'D157\': [-999, 1.0, 2.0, 3.0, 5.0, inf]}')

        data_with_bins = Parallel(n_jobs=self.n_jobs)(
            delayed(pd.cut)(df[col], bins=bins, right=False, retbins=True) for col, bins in bins_dict.items())
        # NOTE(review): astype(str) turns values outside every bin (and
        # missing values) into the string 'nan' - confirm this is intended.
        data_bin = pd.DataFrame([binned.astype(str) for binned, _ in data_with_bins]).T
        b_dict = {binned.name: edges.tolist() for binned, edges in data_with_bins}
        if not operator.eq(bins_dict, b_dict):
            # This check used to be a no-op assert. It is reported rather than
            # raised so that inputs which worked before keep working.
            logging.getLogger(__name__).warning('传入的分箱和应用后的分箱不对等，请联系开发者')

        return data_bin


# =============================

class ModelTune():
    """Base class for Bayesian hyper-parameter tuning of a model.

    Subclasses provide ``base_model``, ``model_params``, ``default_params``
    and ``int_params``; ``fit`` then searches ``model_params`` and keeps the
    best fitted model in ``best_model``.
    """

    def __init__(self):
        # Model class, instantiated as base_model(**tuned, **default_params).
        self.base_model = None
        # Fitted model with the lowest loss seen so far.
        self.best_model = None
        # Search bounds handed to BayesianOptimization.
        self.model_params = None
        # Lowest loss seen so far (1 - objective value).
        self.loss = np.inf
        # Metric names; set by subclasses, not read inside this class.
        self.metrics = None
        # Fixed keyword arguments added to every model instance.
        self.default_params = None
        # Names of the tuned parameters that must be rounded to integers.
        self.int_params = None

    def get_model(self):
        """Return the best fitted model found by ``fit`` (None before fitting)."""
        return self.best_model

    def fit(self, train_data=(), test_data=()
            , init_points=10, iterations=15):
        """Run the Bayesian search and remember the best model.

        :param train_data: ``(X_train, y_train)``
        :param test_data: ``(X_test, y_test)``
        :param init_points: forwarded to ``BayesianOptimization.maximize``
        :param iterations: forwarded to ``maximize`` as ``n_iter``
        :return: None; the result is available through ``get_model()``
        """

        X_train, y_train = train_data
        X_test, y_test = test_data

        # def loss_fun(train_result, test_result):
        #     train_result = train_result * 100
        #     test_result = test_result * 100
        #     if train_result == test_result:
        #         return test_result
        #
        #     import math
        #     return test_result - math.log(abs(test_result - train_result))

        def loss_fun(train_result, test_result):
            # Score to maximise: the test metric (in percent) minus an
            # exponential penalty on the train/test gap.
            train_result = train_result * 100
            test_result = test_result * 100

            return test_result - 2 ** abs(test_result - train_result)

        # def loss_fun(train_result, test_result):
        #     train_result = train_result * 100
        #     test_result = test_result * 100
        #
        #     return train_result - 2 ** abs(train_result - test_result)

        def obj_fun(**params):
            # Objective evaluated by the optimizer for one parameter set.
            # The optimizer proposes floats; integer parameters are rounded.
            for param in self.int_params:
                params[param] = int(round(params[param]))

            model = self.base_model(**params, **self.default_params)
            model.fit(X_train, y_train)

            pred_test = model.predict_proba(X_test)[:, 1]
            pred_train = model.predict_proba(X_train)[:, 1]

            test_auc = get_roc_auc_score(y_test, pred_test)
            train_auc = get_roc_auc_score(y_train, pred_train)
            print('test_auc is : ', test_auc)
            print('train_auc is : ', train_auc)

            # KS is computed but only used by the commented-out objective below.
            test_ks = get_ks(y_test, pred_test)
            train_ks = get_ks(y_train, pred_train)

            max_result = loss_fun(train_auc, test_auc)
            # max_result = loss_fun(train_ks, test_ks) * 2 + loss_fun(train_auc, test_auc)

            # Keep the model with the highest objective (i.e. the lowest loss).
            loss = 1 - max_result
            if loss < self.loss:
                self.loss = loss
                self.best_model = model
                print('best model result is {}'.format(1 - loss))
                print('best model result is : ')
                print(self.best_model.get_params())
            print('current obj_fun result is : ', max_result)

            return max_result

        params_optimizer = BayesianOptimization(obj_fun, self.model_params, random_state=1)
        print('params_optimizer is : ', params_optimizer.space.keys)

        print('begain optimizer params!!!')
        start = time.time()
        # NOTE(review): passing acq/xi to maximize() may not be accepted by
        # newer bayes_opt releases - confirm against the installed version.
        params_optimizer.maximize(init_points=init_points, n_iter=iterations, acq='ei', xi=0.0)
        # params_optimizer.maximize(init_points=init_points, n_iter=iterations, acq='ucb', xi=0.0, alpha=1e-6)
        end = time.time()
        print('optimizer params over!!! 共耗时{} 分钟'.format((end - start) / 60))
        print('the best params is : {}'.format(params_optimizer.max['params']))
        print('Maximum xgb value is : {}'.format(params_optimizer.max['target']))


class ClassifierModel(ModelTune):
    """Tuner base for classifiers; sets ``metrics`` to AUC and KS."""

    def __init__(self):
        super().__init__()
        # Metric names associated with classification tuners.
        self.metrics = ['auc', 'ks']


class RegressorModel(ModelTune):
    """Tuner base for regressors; sets ``metrics`` to R2 and RMSE."""

    def __init__(self):
        super().__init__()
        # Metric names associated with regression tuners.
        self.metrics = ['r2', 'rmse']


class XGBClassifierTuner(ClassifierModel):
    """Tuner configuration for ``XGBClassifier``."""

    def __init__(self):
        super().__init__()  # run the parent initializer first

        self.base_model = XGBClassifier

        # Parameters that the optimizer proposes as floats but must be integers.
        self.int_params = ['max_depth', 'n_estimators']

        # Fixed keyword arguments used for every candidate model.
        self.default_params = dict(
            objective='binary:logistic',
            n_jobs=-1,
            nthread=-1,
        )

        # (lower, upper) search bounds per tuned parameter.
        self.model_params = dict(
            min_child_weight=(1, 300),
            max_depth=(2, 10),
            n_estimators=(50, 300),
            learning_rate=(0.01, 0.2),
            subsample=(0.4, 1.0),
            colsample_bytree=(0.3, 1.0),
            gamma=(0, 2.0),
            reg_alpha=(0, 2.0),
            reg_lambda=(0, 2.0),
            # max_delta_step=(0, 10),
        )


class LGBClassifierTuner(ClassifierModel):
    """Tuner configuration for ``LGBMClassifier``."""

    def __init__(self):
        super().__init__()  # run the parent initializer first

        self.base_model = LGBMClassifier

        # Parameters that the optimizer proposes as floats but must be integers.
        self.int_params = ['max_depth', 'num_leaves', 'min_data_in_leaf', 'n_estimators']

        # Fixed keyword arguments used for every candidate model.
        self.default_params = dict(
            objective='binary',
            # max_depth=-1,
            boosting_type='gbdt',
            bagging_seed=11,
            metric='auc',
            verbosity=-1,
            random_state=47,
            num_threads=-1,
        )

        # (lower, upper) search bounds per tuned parameter.
        self.model_params = dict(
            max_depth=(500, 1500),
            num_leaves=(200, 800),
            min_data_in_leaf=(50, 250),
            n_estimators=(750, 1800),
            min_child_weight=(0.01, 0.05),
            bagging_fraction=(0.2, 1.0),
            feature_fraction=(0.15, 1.0),
            learning_rate=(0.005, 0.01),
            reg_alpha=(0.2, 0.6),
            reg_lambda=(0.25, 1.0),
        )


# Registry of the available tuners: model key -> tuner class.
# Commented-out entries are currently disabled.
classifiers_dic = {
    # 'logistic_regression': LogisticRegressionTuner,
    # 'random_forest': RandomForestClassifierTuner,
    'xgboost': XGBClassifierTuner,
    # 'lgb': LGBClassifierTuner
}


def classifiers_model(models=None, metrics=None, train_data=(), test_data=()
                      , init_points=10, iterations=25, verbose=1):
    """Tune the requested classifiers and return the best fitted model.

    :param models: list of keys of ``classifiers_dic``; None or an empty list
                   selects every registered tuner, unknown keys are ignored
    :param metrics: unused, kept for interface compatibility
    :param train_data: ``(X_train, y_train)``
    :param test_data: ``(X_test, y_test)``
    :param init_points: random exploration steps per tuner
    :param iterations: Bayesian optimisation steps per tuner
    :param verbose: truthy to print which tuner is being optimised
    :return: the fitted model of the tuner with the lowest loss (with a single
             tuner this is simply that tuner's best model)
    :raises AttributeError: when ``models`` is not a list
    :raises ValueError: when none of the requested keys is registered
    """
    if models is None:
        models = []
    if not isinstance(models, list):
        raise AttributeError('Argument `models` must be a list, '
                             'but given {}'.format(type(models)))
    if not models:
        models = list(classifiers_dic)

    tuner_classes = [classifiers_dic[key] for key in models if key in classifiers_dic]
    if not tuner_classes:
        raise ValueError('none of the requested models {} is registered in '
                         'classifiers_dic'.format(models))

    best_tuner = None
    for tuner_cls in tuner_classes:
        tuner = tuner_cls()
        if verbose:
            print("Optimizing {}...".format(tuner))
        tuner.fit(train_data=train_data,
                  test_data=test_data
                  , init_points=init_points, iterations=iterations)
        # Keep the tuner whose best model reached the lowest loss, instead of
        # blindly returning whichever tuner happened to run last.
        if best_tuner is None or tuner.loss < best_tuner.loss:
            best_tuner = tuner

    return best_tuner.get_model()


def sigle_feature_fit_model(train_data=(), test_data=(), is_noise=False, is_only_return_auc=True):
    """Fit one XGBoost model per feature and report its train AUC / KS.

    Features whose train AUC is not above 0.51 are dropped from the report.
    The report is also written to an Excel file in the working directory.

    :param train_data: ``(X_train, y_train)``
    :param test_data: ``(X_test, y_test)``; unpacked but otherwise unused
    :param is_noise: selects the name of the Excel file that is written
    :param is_only_return_auc: when True the 'ks' column is left out of the result
    :return: frame with columns 'features', 'auc' (and 'ks')
    """
    X_train, y_train = train_data
    X_test, y_test = test_data

    booster_kwargs = dict(objective='binary:logistic', n_jobs=-1, nthread=-1)

    rows = []
    for feature in X_train:
        single_feature = X_train[[feature]]
        # xclf = xgb.XGBClassifier(colsample_bytree=0.3, seed=123, random_state=1234)
        clf = xgb.XGBClassifier(**booster_kwargs)
        clf.fit(single_feature, y_train)
        train_scores = clf.predict_proba(single_feature)[:, 1]
        feature_auc = get_roc_auc_score(y_train, train_scores)
        feature_ks = get_ks(y_train, train_scores)
        rows.append((feature, feature_auc, feature_ks))

    report = pd.DataFrame(rows, columns=['features', 'auc', 'ks'])
    report = report[report['auc'] > 0.51]

    out_file = 'xgb_not_del_corr_var_auc_ks_df.xlsx' if is_noise else 'xgb_var_auc_ks_df.xlsx'
    report.to_excel(out_file)

    return report.drop(columns='ks') if is_only_return_auc else report


def sigle_feature_auc_ks(train_data=(), test_data=(), is_noise=False, is_only_return_auc=True):
    """Report AUC / KS of every raw feature used directly as a score.

    :param train_data: ``(X_train, y_train)``
    :param test_data: ``(X_test, y_test)``; unpacked but otherwise unused
    :param is_noise: unused
    :param is_only_return_auc: when True the 'ks' column is left out of the result
    :return: frame with columns 'features', 'auc' (and 'ks')
    """
    X_train, y_train = train_data
    X_test, y_test = test_data

    rows = [
        (feature, get_roc_auc_score(y_train, X_train[feature]), get_ks(y_train, X_train[feature]))
        for feature in X_train
    ]
    report = pd.DataFrame(rows, columns=['features', 'auc', 'ks'])

    if not is_only_return_auc:
        return report
    return report.drop(columns='ks')


def change_col_subsample_fit_model(train_data=(), test_data=()):
    """Average XGBoost feature importance over eight sampling ratios.

    One model is fitted for each ratio 0.3, 0.4, ..., 1.0, used both as
    ``subsample`` and as ``colsample_bytree``. Train/test AUC and KS of every
    model are printed.

    :param train_data: ``(X_train, y_train)``
    :param test_data: ``(X_test, y_test)``
    :return: frame with columns 'features' and 'mean_imp'
    """
    X_train, y_train = train_data
    X_test, y_test = test_data

    ratios = [step / 10 for step in range(3, 11)]

    importances = []
    for sub_ratio, col_ratio in zip(ratios, ratios):
        booster = xgb.XGBClassifier(
            min_child_weight=10,
            subsample=sub_ratio,
            colsample_bytree=col_ratio,
            objective='binary:logistic',
            n_jobs=-1,
            nthread=-1,
        )
        booster.fit(X_train, y_train)

        score_items = list(booster.get_booster().get_score().items())
        importance = pd.DataFrame(score_items, columns=['features', 'feature_importances'])
        importances.append(importance.set_index('features'))

        train_scores = booster.predict_proba(X_train)[:, 1]
        test_scores = booster.predict_proba(X_test)[:, 1]

        print('subsample is {} and colsample_bytree is {} model result is : '.format(sub_ratio, col_ratio))
        print('xgb train auc is ：', get_roc_auc_score(y_train, train_scores))
        print('xgb train ks is ：', get_ks(y_train, train_scores))
        print('xgb test auc is ：', get_roc_auc_score(y_test, test_scores))
        print('xgb test ks is ：', get_ks(y_test, test_scores))

    merged = pd.concat(importances, axis=1)
    merged['mean_imp'] = merged.mean(axis=1)
    # Every per-model column carries the same name, so this removes all of
    # them and leaves only the mean.
    merged = merged.drop(columns=['feature_importances'])
    return merged.reset_index()


def kfold_xgb_model(train_data=(), is_noise=False, cv=StratifiedKFold(10, shuffle=True)):
    """Cross-validate a default XGBoost model and average its feature importance.

    Train/validation AUC of every fold and their means are printed.

    :param train_data: ``(X, y)``
    :param is_noise: unused
    :param cv: splitter providing ``split(X, y)``
    :return: frame with columns 'features' and 'mean_imp'
    """
    X, y = train_data

    train_aucs = []
    valid_aucs = []
    fold_importances = []
    for fold_num, (train_rows, valid_rows) in enumerate(cv.split(X, y)):
        X_fit, y_fit = X.iloc[train_rows], y.iloc[train_rows]
        X_hold, y_hold = X.iloc[valid_rows], y.iloc[valid_rows]

        booster = xgb.XGBClassifier(objective='binary:logistic', n_jobs=-1, nthread=-1)
        booster.fit(X_fit, y_fit)

        fit_scores = booster.predict_proba(X_fit)[:, 1]
        hold_scores = booster.predict_proba(X_hold)[:, 1]

        train_auc = get_roc_auc_score(y_fit, fit_scores)
        valid_auc = get_roc_auc_score(y_hold, hold_scores)

        score_items = list(booster.get_booster().get_score().items())
        importance = pd.DataFrame(score_items, columns=['features', 'feature_importances'])
        fold_importances.append(importance.set_index('features'))

        print('Fold {} , train auc is {}, valid_auc is {}'.format(fold_num, train_auc, valid_auc))

        train_aucs.append(train_auc)
        valid_aucs.append(valid_auc)

    train_mean_auc = np.array(train_aucs).mean()
    valid_mean_auc = np.array(valid_aucs).mean()
    merged = pd.concat(fold_importances, axis=1)
    print('train_mean_auc is : {}'.format(train_mean_auc))
    print('valid_mean_auc is : {}'.format(valid_mean_auc))

    merged['mean_imp'] = merged.mean(axis=1)
    # Every per-fold column carries the same name, so this removes all of
    # them and leaves only the mean.
    merged = merged.drop(columns='feature_importances')
    return merged.reset_index()


def unpack_tuple(x):
    """Return the only element of a one-item sequence, otherwise ``x`` itself."""
    return x[0] if len(x) == 1 else x


def drop_corr(frame, by='auc', threshold=0.95, return_drop=False):
    """Drop the less important feature of every highly correlated pair.

    Features are visited from most to least important. For each feature that
    is still kept, every other feature whose absolute correlation with it
    exceeds ``threshold`` and whose importance is not higher is removed.
    Missing values are filled with -999 before correlations are computed.

    :param frame: data set holding the features
    :param by: feature importances - a Series indexed by feature name, a
               two-column DataFrame (name, importance) or an array-like
               aligned with ``frame.columns``. The caller's object is not
               modified.
    :param threshold: absolute correlation above which two features count as
                      duplicates
    :param return_drop: when True also return the list of dropped features
    :return: the reduced frame, or ``(reduced_frame, dropped)`` when
             ``return_drop`` is True
    :raises ValueError: when ``by`` is a string; importances cannot be derived
                        from a metric name here
    """
    if isinstance(by, str):
        raise ValueError(
            '`by` must hold feature importances (Series, DataFrame or '
            'array-like), but given the string {!r}'.format(by))

    if isinstance(by, pd.DataFrame):
        by = pd.Series(by.iloc[:, 1].values, index=by.iloc[:, 0].values)
    elif not isinstance(by, pd.Series):
        by = pd.Series(by, index=frame.columns)

    # Most important first; sort_values returns a new Series, so neither this
    # nor the index assignment below touches the caller's object.
    by = by.sort_values(ascending=False)
    by.index = by.index.astype(type(frame.columns.to_list()[0]))

    df_corr = frame[by.index.to_list()].fillna(-999).corr().abs()

    dropped = []
    seen = set()
    for feature in df_corr:
        if feature in seen:
            continue
        column = df_corr[feature]
        # Exclude the feature itself by label rather than by "corr != 1", so
        # perfectly correlated duplicates are removed as well.
        related = [name for name in column[column > threshold].index if name != feature]
        if not related:
            continue
        related_importance = by.loc[related]
        for name in related_importance[related_importance <= by.loc[feature]].index:
            if name not in seen:
                seen.add(name)
                dropped.append(name)

    r = frame.drop(columns=dropped)

    res = (r,)
    if return_drop:
        res += (dropped,)

    return res[0] if len(res) == 1 else res


def forward_corr_delete(df, col_list, threshold=0.5):
    """Forward selection that rejects features correlated with kept ones.

    ``col_list`` is walked in order (callers pass it sorted by importance).
    The first feature is always kept; each later feature is kept only if its
    absolute correlation with every feature kept so far is below
    ``threshold``. A rejected feature is not used for later comparisons, so a
    feature is never removed merely for resembling one that was itself removed.

    :param df: data set holding the features
    :param col_list: feature names, most important first
    :param threshold: absolute correlation at or above which a feature is rejected
    :return: the kept feature names, in their original order
    """
    if len(col_list) == 0:
        return []

    kept = [col_list[0]]
    rejected = set()
    for col in col_list[1:]:
        # Correlation of the candidate with the features kept so far.
        against_kept = df.loc[:, kept + [col]].corr()[col].drop(col)
        if (against_kept.abs() >= threshold).any():
            rejected.add(col)
        else:
            kept.append(col)

    return [col for col in col_list if col not in rejected]


def xgb_model(train_data=(), test_data=(), is_noise=False):
    """Fit a fixed-parameter XGBoost classifier, export importances and print metrics.

    Steps:
        1. Fit ``XGBClassifier`` on the training set.
        2. Collect the booster's ``weight``, ``gain`` and ``cover`` importances,
           each sorted in descending order, and write them side by side to an
           Excel file in the current working directory.
        3. Print the mean 5-fold stratified cross-validated accuracy on the
           training set.
        4. Print AUC and KS on the training and test sets.

    Args:
        train_data: ``(X_train, y_train)`` pair.
        test_data: ``(X_test, y_test)`` pair.
        is_noise: selects the output file name:
            ``xgb_not_del_corr_five_importance.xlsx`` when True, otherwise
            ``xgb_five_importance.xlsx``.

    Returns:
        None. Results are printed and written to disk.
    """
    X_train, y_train = train_data
    X_test, y_test = test_data

    params = {
        'min_child_weight': 10,
        'subsample': 0.5,
        'colsample_bytree': 0.5,
        'objective': 'binary:logistic',
        'n_jobs': -1,
        'nthread': -1
    }
    xclf = xgb.XGBClassifier(**params)
    xclf.fit(X_train, y_train)

    print('===============xgb 不同的重要性===============')
    booster = xclf.get_booster()
    importance_frames = []
    for importance_type in ('weight', 'gain', 'cover'):
        print(importance_type)
        score_col = 'feature_importances_{}'.format(importance_type)
        feature_importance = pd.DataFrame(
            list(booster.get_score(importance_type=importance_type).items()),
            columns=['features', score_col])
        # reset_index() keeps the pre-sort position as an 'index' column and
        # renumbers rows, so the frames line up by rank when concatenated.
        feature_importance = feature_importance.sort_values(by=score_col, ascending=False).reset_index()
        if importance_type == 'weight':
            # only the 'weight' table is echoed to stdout
            print(feature_importance)
        importance_frames.append(feature_importance)

    five_importance = pd.concat(importance_frames, axis=1)
    if is_noise:
        five_importance.to_excel('xgb_not_del_corr_five_importance.xlsx')
    else:
        five_importance.to_excel('xgb_five_importance.xlsx')

    # random_state is only valid together with shuffle=True; recent
    # scikit-learn raises ValueError otherwise. Without shuffling the seed
    # never had any effect, so dropping it keeps the folds unchanged.
    print(np.mean(
        cross_val_score(estimator=xclf, X=X_train, y=y_train, scoring='accuracy',
                        cv=StratifiedKFold(n_splits=5))))

    pred_y_train = xclf.predict_proba(X_train)[:, 1]
    pred_y_test = xclf.predict_proba(X_test)[:, 1]

    print('xgb train auc is ：', get_roc_auc_score(y_train, pred_y_train))
    print('xgb train ks is ：', get_ks(y_train, pred_y_train))
    print('xgb test auc is ：', get_roc_auc_score(y_test, pred_y_test))
    print('xgb test ks is ：', get_ks(y_test, pred_y_test))


if __name__ == '__main__':

    # ========================= step 1: configuration =========================
    log.info('step 1 相关配置')
    feature_type = 'lhpdat'  # name of the feature set; also used as the name of the model-probability column
    cust_id = 'apply_no'  # primary key column
    target = 'target'  # target column
    data_type = 'type'  # column that tells the data sets (train/test/...) apart
    apply_time = 'apply_time'  # application time column

    client = 'lhp09'
    batch = 'p23'

    to_model_var_num = 30  # number of top features kept for the second training round; set to None for no limit
    is_model_data_to_woe = False  # whether the data fed to the model must be converted to WOE; False = raw data
    fillna_value = -999999  # value used to fill missing values

    # Threshold settings
    exclude_cols = [apply_time, cust_id, target, data_type, 'apply_month']
    feature_missing_threshould = 0.95  # features whose missing rate is >= this threshold are removed

    # Columns that are dropped right after loading
    need_drop_cols = ['applthst_loan_amount', 'tzre_report_info_report_no', 'xy_black_trade_no', 'tzre_id',
                      'xy_black_version', 'tzre_version', 'tzre_bi_phone_number']

    # Data used to train the model.
    # label_encoder_dict must stay empty for the WOE encoding of categorical columns (step 5.1) to run.
    label_encoder_dict = {}
    to_model_data_path = '/Users/ryanzheng/PycharmProjects/data_to_treemodel_v1/to_model_data/lhp_amount_rule_jm.csv'

    # ========================= the code below normally needs no changes =========================

    # Derived names (both currently resolve to the same '<client><batch>' string).
    project_name = '{}{}'.format(client, batch)
    client_batch = '{}{}'.format(client, batch)
    # Output directory setup is currently disabled:
    # project_dir = 'model_result_data/{}/{}/'.format(client, batch)
    # output_dir = '{}model/{}/'.format(project_dir, feature_type)
    #
    # os.makedirs(project_dir, exist_ok=True)
    # os.makedirs(output_dir, exist_ok=True)
    # os.makedirs(project_dir + 'data/score/', exist_ok=True)
    # os.makedirs(project_dir + 'data/xgb_score/', exist_ok=True)
    # ========================= end of configuration =========================

    # In[38]:

    # ========================= step 2: load the dataset =========================
    log.info('step 2 开始读取数据集')
    # Read the wide table: key, application time and target are sample
    # columns, everything else is treated as a feature.
    log.info('读取样本&特征数据集：{}|{}|{}为样本数据，其他为特征数据'.format(cust_id, apply_time, target))
    all_data = pd.read_csv(to_model_data_path)
    # Binarise the raw target: values <= 200 become 1, everything else
    # (including NaN, which compares False) becomes 0.
    all_data[target] = np.where(all_data[target] <= 200, 1, 0)

    all_data.drop(need_drop_cols, axis=1, inplace=True)

    all_data.set_index(cust_id, inplace=True)
    # Use the real column labels. Index.format() is deprecated in recent
    # pandas and returns rendered strings, which would not match non-string
    # labels in the dropna(subset=...) call below.
    selected_features = all_data.columns.tolist()
    selected_features = list(set(selected_features) - set(exclude_cols))
    log.info('特征的个数：{}'.format(len(selected_features)))

    # Optional, currently disabled: rename the features through an external
    # feature dictionary (feature_code -> feature_id) and keep only the
    # features listed in that dictionary.

    # Drop samples for which every feature is missing.
    log.info('删除特征全为空的样本量')
    print('删除特征全为空的样本之前的数据集行列：', all_data.shape)
    all_data.dropna(subset=selected_features, how='all', inplace=True)
    print('删除特征全为空的样本之后的数据集行列：', all_data.shape)

    log.info('样本数据集情况：')
    log.info(all_data[target].value_counts())
    # ========================= end of data loading =========================

    log.info('EDA，整体数据探索性数据分析')
    # Exporting the EDA report of the whole data set is currently disabled:
    # all_data_eda = detect(all_data)
    # all_data_eda.to_excel('{}{}_{}_all_data_eda.xlsx'.format(
    #     output_dir, project_name, feature_type))

    # ========================= step 3: train/test split =========================
    log.info('step 3 划分训练集和测试集')
    if data_type not in all_data.columns:
        # reset_index() returns a fresh frame, avoiding an in-place
        # modification of a slice of all_data.
        df_sample = all_data[[target, apply_time]].reset_index()

        # Random train/test split.
        df_sample = split_data_type(df_sample, key_col=cust_id, target=target, apply_time=apply_time, test_size=0.25)

        # A time-based split (train / test / oot by apply_time cut-offs) and
        # saving the split to '<project_dir>data/<client_batch>_split.csv'
        # were used in earlier versions and are currently disabled.

        df_sample.set_index(cust_id, inplace=True)
        print(df_sample['type'].value_counts())

        # Attach the freshly assigned data-set label to the feature table.
        # This merge only makes sense (and df_sample only exists) when the
        # label column was not already present in all_data; running it
        # unconditionally raised NameError for pre-labelled data.
        all_data = pd.merge(df_sample[['type']], all_data, left_index=True, right_index=True, how='inner')

    # In[39]:

    log.info('分开训练集和测试集为两个df')
    train_data = all_data[all_data['type'] == 'train']
    # test_data = all_data[all_data['type'] == 'test']

    log.info('EDA，训练集探索性数据分析')
    # Exporting EDA reports for the train/test sets is currently disabled:
    # detect(train_data).to_excel('{}{}_{}_train_data_eda.xlsx'.format(
    #     output_dir, project_name, feature_type))
    # detect(test_data).to_excel('{}{}_{}_test_data_eda.xlsx'.format(
    #     output_dir, project_name, feature_type))

    # ========================= step 4: coarse feature screening =========================
    log.info('step 4 变量初筛')
    print('删除缺失率前变量数量：', len(selected_features))
    selected_features = filter_miss(train_data[selected_features], miss_threshold=feature_missing_threshould)
    print('删除缺失率后变量数量：', len(selected_features))
    train_data = train_data[selected_features + [target]]
    # test_data = test_data[selected_features + [target]]
    # ========================= end of screening =========================

    # ========================= step 5: data preparation =========================
    log.info('step 5 数据woe处理')

    # Split the features by dtype; only continuous and categorical columns
    # are used below.
    continuous_cols, category_cols, date_cols = select_features_dtypes(train_data[selected_features])

    # Fill missing continuous values with the configured sentinel, both in
    # the training frame and in the full data set.
    train_data.loc[:, continuous_cols] = train_data.loc[:, continuous_cols].fillna(fillna_value)
    all_data.loc[:, continuous_cols] = all_data.loc[:, continuous_cols].fillna(fillna_value)

    # An alternative label-encoding path for categorical columns (fit on the
    # training set, applied to all data, unseen values mapped to -1) existed
    # here and is currently disabled.

    if category_cols and not label_encoder_dict:
        log.info('step 5.1 类别变量数据处理')

        # Learn the value -> WOE mapping on the training set only, then
        # apply it to both the training set and the full data set.
        var_value_woe = category_2_woe(train_data, category_cols, target=target)
        # category_2_woe_save(var_value_woe, '{}'.format(output_dir))
        # var_value_woe = category_2_woe_load('{}'.format(output_dir))
        train_data = WoeTransformer().transform(train_data, var_value_woe)
        all_data = WoeTransformer().transform(all_data, var_value_woe)

    # In[40]:

    if is_model_data_to_woe:
        log.info('将箱子转woe')
        log.info('============入模数据需要转化为woe值===========')
        # The binning + WOE conversion that used to build all_data_to_model
        # is not part of this script any more. Fail here with a clear message
        # instead of hitting a NameError on all_data_to_model in step 6.
        raise NotImplementedError(
            'is_model_data_to_woe=True is not supported: the bin-to-WOE '
            'transformation of the modelling data is not implemented in this script')
    else:
        log.info('============入模数据不需要转化为woe值===========')
        all_data_to_model = all_data.copy()


    # In[41]:

    def statistics_model_result(all_data=pd.DataFrame()):
        """Score the model output and report AUC/KS, monthly performance and PSI.

        Relies on names from the enclosing script: ``feature_type``,
        ``data_type``, ``target``, ``apply_time``, ``project_name``,
        ``client_batch``, ``selected_features`` and the fitted ``model``.

        Side effects on ``all_data``: a ``'score'`` column is added and the
        ``data_type`` column is lower-cased in place. The helper columns
        ``'month'`` and ``'client_batch'`` are added and removed again.

        :param all_data: frame holding the probability column ``feature_type``,
            the target, the data-set label and the application time.
        :return: summary frame indexed by (project_name, client_batch, feature)
            with ``<split>_auc`` / ``<split>_ks`` columns plus ``feature_cnt``
            and ``n_estimators``.
        """

        def _auc_ks(group):
            # AUC and KS of the score within one group of rows.
            return pd.Series({'auc': get_roc_auc_score(group[target], group['score']),
                              'ks': get_ks(group[target], group['score'])})

        # ----- score conversion -----
        all_data['score'] = all_data[feature_type].map(lambda prob: to_score(prob))
        log.info('模型相关结果统计！！！')

        # ----- performance per data set -----
        split_order = ['train', 'test', 'oot', 'cv']
        df_splitted_type_auc_ks = all_data.groupby(data_type).apply(_auc_ks).reindex(split_order)

        log.info('模型效果：')
        print(df_splitted_type_auc_ks)

        # ----- performance per month (first 7 characters of apply_time) -----
        all_data['month'] = all_data[apply_time].map(lambda stamp: stamp[:7])
        df_monthly_auc_ks = all_data.groupby('month').apply(_auc_ks)
        all_data.drop(columns=['month'], inplace=True)
        log.info('不同月份的模型效果：')
        print(df_monthly_auc_ks)

        # ----- distribution of probability and score -----
        df_desc = all_data[[feature_type, 'score']].describe()
        df_desc.loc['coverage'] = df_desc.loc['count'] / len(all_data)
        log.info('分数describe')
        print(df_desc)

        # ----- PSI between train and test -----
        all_data[data_type] = all_data[data_type].map(lambda label: label.lower())
        all_data['client_batch'] = client_batch
        df_psi, _ = psi_statis(all_data, splitted_types=['train', 'test'], scores=[feature_type])
        all_data.drop(columns=['client_batch'], inplace=True)
        log.info('模型psi：')
        print(df_psi[['train_test_psi']])

        # ----- one summary record for this run -----
        df_output_statis = df_splitted_type_auc_ks.reset_index().assign(
            feature=feature_type,
            project_name=project_name,
            client_batch=client_batch,
        )
        df_output_statis = df_output_statis.pivot_table(
            index=['project_name', 'client_batch', 'feature'],
            columns=data_type,
            values=['auc', 'ks'])
        # Flatten ('auc', 'train') style column tuples into 'train_auc'.
        df_output_statis.columns = ['_'.join(pair[::-1]) for pair in df_output_statis.columns]
        df_output_statis['feature_cnt'] = len(selected_features)
        df_output_statis['n_estimators'] = model.get_params()['n_estimators']

        log.info('统计结束')
        return df_output_statis


    # In[42]:

    # ========================= step 6: train the model =========================
    X_all, y_all, X_train, y_train, X_test, y_test, X_oot, y_oot = get_splitted_data(
        all_data_to_model, target=target, selected_features=selected_features)

    print('整体数据集大小：', X_all.shape)
    print('训练集大小：', X_train.shape)
    print('测试集大小：', X_test.shape)
    if X_oot is None:
        print('无oot数据集')
    else:
        print('oot集大小：', X_oot.shape)

    log.info('step 6 开始训练模型')
    start = datetime.now()

    log.info('step 6.1 ===筛选变量===')

    # ----- feature importance on all candidate features -----
    log.info('step 6.1 ===筛选变量===10折交叉后，计算变量的平均重要性')
    log.info('筛选前数据集大小：{}'.format(X_train.shape))
    feature_imp = change_col_subsample_fit_model(train_data=(X_train, y_train),
                                                 test_data=(X_test, y_test))

    # NOTE: only the log line remains; writing the importances to CSV is disabled.
    log.info('将特征重要性持久化')

    # ----- correlation filter driven by those importances -----
    log.info('根据10折拟合模型处理后的变量重要性进行变量相关性筛选')
    del_corr_df = drop_corr(X_train, by=feature_imp, threshold=0.9)
    log.info('筛选后数据集大小：{}'.format(del_corr_df.shape))

    selected_features = list(del_corr_df.columns)
    log.info('最终入模变量的数量：{}'.format(len(selected_features)))
    log.info('最终入模变量：{}'.format(selected_features))

    # Importance of the surviving features (result is only kept in memory).
    feature_imp = change_col_subsample_fit_model(train_data=(del_corr_df, y_train),
                                                 test_data=(X_test[del_corr_df.columns], y_test))

    # NOTE: only the log line remains; writing the importances to CSV is disabled.
    log.info('将待入模特征重要性持久化')

    # ----- first model: Bayesian hyper-parameter search -----
    log.info('贝叶斯进行模型调参')
    model = classifiers_model(train_data=(X_train[selected_features], y_train),
                              test_data=(X_test[selected_features], y_test),
                              init_points=5, iterations=8, verbose=1)
    log.info('模型调参完成！！！')
    log.info('模型参数：{}'.format(model.get_xgb_params()))
    log.info('模型参数：{}'.format(model.get_params()))

    # Split-count importance of the first model, most used feature first.
    # get_booster() is the public accessor for the fitted booster.
    df_featurescore = pd.DataFrame(list(model.get_booster().get_fscore().items()), columns=['特征名称', '特征权重值']
                                   ).sort_values('特征权重值', ascending=False)

    end = datetime.now()
    # total_seconds() covers the whole duration; timedelta.seconds would
    # silently drop full days.
    log.info('模型训练完成, 使用 {} 秒'.format(int((end - start).total_seconds())))

    X_all[feature_type] = model.predict_proba(X_all[selected_features])[:, 1]
    all_data = pd.concat([all_data_to_model, X_all[feature_type]], axis=1)

    statistics_model_result(all_data=all_data)

    if to_model_var_num:
        start = datetime.now()

        # ----- second model: retrain on the top-N features of the first model -----
        print('过滤前{}个特征出来，再次训练'.format(to_model_var_num))

        selected_features = df_featurescore.iloc[:to_model_var_num]['特征名称'].tolist()

        print('过滤后的特征：', selected_features)

        X_all, y_all, X_train, y_train, X_test, y_test, X_oot, y_oot = get_splitted_data(
            all_data_to_model, target=target, selected_features=selected_features)
        print('整体数据集大小：', X_all.shape)
        print('训练集大小：', X_train.shape)
        print('测试集大小：', X_test.shape)
        if X_oot is None:
            print('无oot数据集')
        else:
            print('oot集大小：', X_oot.shape)

        # Manual alternative (disabled):
        # model = xgb.XGBClassifier(**ini_params)
        # model.fit(X_train, y_train)

        # Bayesian hyper-parameter search.
        log.info('贝叶斯进行模型调参')
        model = classifiers_model(train_data=(X_train[selected_features], y_train),
                                  test_data=(X_test[selected_features], y_test),
                                  init_points=5, iterations=8, verbose=1)
        log.info('模型调参完成！！！')
        log.info('模型参数：{}'.format(model.get_xgb_params()))
        log.info('模型参数：{}'.format(model.get_params()))

        end = datetime.now()
        log.info('模型训练完成, 使用 {} 秒'.format(int((end - start).total_seconds())))

    # Score all samples with the final model and compute the final statistics.
    X_all[feature_type] = model.predict_proba(X_all[selected_features])[:, 1]
    all_data = pd.concat([all_data_to_model, X_all[feature_type]], axis=1)

    df_output_statis = statistics_model_result(all_data=all_data)

    # Saving X_all / all_data to CSV is currently disabled:
    # X_all.to_csv('{}{}_{}_X_all.csv'.format(output_dir, project_name, feature_type))
    # all_data.to_csv('{}{}_{}_all_data.csv'.format(output_dir, project_name, feature_type))

    # ==========================训练模型=========================

    # In[43]:

    # =========================== step 7: persist the model ===========================

    log.info('模型相关结果持久化')
    # The former file outputs are all disabled. They wrote, under
    # project_dir / output_dir: the per-sample score CSVs, the booster
    # (joblib dump and text dump), the model parameters as JSON, the feature
    # score table, the feature correlation matrix, a 100-row raw data sample
    # and the AUC/KS summary (df_output_statis).


    # NOTE(review): `ms` is imported but never used, and neither
    # `save_pythonmodel` nor `script_id` is defined in the visible part of
    # this file - confirm they exist, otherwise this call raises NameError
    # after the whole training run has finished.
    # NOTE(review): the hard-coded file name starts with 'td06p1', while
    # client/batch above are 'lhp09'/'p23' - confirm this is intended.
    from model_evaluator import model_save as ms
    save_pythonmodel('td06p1_xgb_model_v1.ml',model,script_id=script_id)

    log.info('模型相关结果持久化完成')
    # =========================== end of model persistence ===========================

    # In[ ]:

    # In[ ]:
