In [101]:
# coding: utf-8
import multiprocessing
from collections import Counter
import xgboost as xgb
import pandas as pd
import numpy as np
import warnings

from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from tqdm import tqdm
from sklearn.model_selection import KFold
import gc
from sklearn import preprocessing
from scipy.stats import entropy
# from imblearn.over_sampling import SMOTE
# from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.metrics import make_scorer, roc_auc_score
from sklearn.metrics import roc_auc_score, roc_curve
import datetime
import time
from itertools import product

nowtime = datetime.date.today()
nowtime = str(nowtime)[-5:]
print(nowtime)
warnings.filterwarnings('ignore')

DATA_PATH = '/Users/senlinlidewo/pyprogram/天池竞赛-零基础金融风控/数据/'

def load_dataset(DATA_PATH):
    train_label = pd.read_csv(DATA_PATH + 'train.csv')['isDefault']
    train = pd.read_csv(DATA_PATH + 'train.csv')
    test = pd.read_csv(DATA_PATH + 'testA.csv')
#     train = train.sample(40000)
#     test = test.sample(5000)
#     train.to_csv(DATA_PATH + 'target.csv')
#     train_label = pd.read_csv(DATA_PATH + 'target.csv')['isDefault']
    
    feats = [f for f in train.columns if f not in ['isDefault']]
    # train = train[feats]
    test = test[feats]
    print('train.shape', train.shape)
    print('test.shape', test.shape)

    return train_label, train, test


# 处理时间
def transform_time(x):
    day = int(x.split(' ')[0])
    hour = int(x.split(' ')[2].split('.')[0].split(':')[0])
    minute = int(x.split(' ')[2].split('.')[0].split(':')[1])
    second = int(x.split(' ')[2].split('.')[0].split(':')[2])
    return 86400 * day + 3600 * hour + 60 * minute + second


def transform_day(date1):
    date2 = "2020-01-01"
    date1 = time.strptime(date1, "%Y-%m-%d")
    date2 = time.strptime(date2, "%Y-%m-%d")

    # 根据上面需要计算日期还是日期时间，来确定需要几个数组段。下标0表示年，小标1表示月，依次类推...
    # date1=datetime.datetime(date1[0],date1[1],date1[2],date1[3],date1[4],date1[5])
    # date2=datetime.datetime(date2[0],date2[1],date2[2],date2[3],date2[4],date2[5])
    date1 = datetime.datetime(date1[0], date1[1], date1[2])
    date2 = datetime.datetime(date2[0], date2[1], date2[2])
    # 返回两个变量相差的值，就是相差天数
    # print((date2 - date1).days)  # 将天数转成int型
    return (date2 - date1).days


# transform_day('2007-09-01')

def labelEncoder_df(df, features):
    for i in features:
        encoder = preprocessing.LabelEncoder()
        df[i] = encoder.fit_transform(df[i])



class MeanEncoder:
    def __init__(self, categorical_features, n_splits=5, target_type='classification', prior_weight_func=None):
        """
        :param categorical_features: list of str, the name of the categorical columns to encode

        :param n_splits: the number of splits used in mean encoding

        :param target_type: str, 'regression' or 'classification'

        :param prior_weight_func:
        a function that takes in the number of observations, and outputs prior weight
        when a dict is passed, the default exponential decay function will be used:
        k: the number of observations needed for the posterior to be weighted equally as the prior
        f: larger f --> smaller slope
        """

        self.categorical_features = categorical_features
        self.n_splits = n_splits
        self.learned_stats = {}

        if target_type == 'classification':
            self.target_type = target_type
            self.target_values = []
        else:
            self.target_type = 'regression'
            self.target_values = None

        if isinstance(prior_weight_func, dict):
            self.prior_weight_func = eval('lambda x: 1 / (1 + np.exp((x - k) / f))', dict(prior_weight_func, np=np))
        elif callable(prior_weight_func):
            self.prior_weight_func = prior_weight_func
        else:
            self.prior_weight_func = lambda x: 1 / (1 + np.exp((x - 2) / 1))

    @staticmethod
    def mean_encode_subroutine(X_train, y_train, X_test, variable, target, prior_weight_func):
        X_train = X_train[[variable]].copy()
        X_test = X_test[[variable]].copy()

        if target is not None:
            nf_name = '{}_pred_{}'.format(variable, target)
            X_train['pred_temp'] = (y_train == target).astype(int)  # classification
        else:
            nf_name = '{}_pred'.format(variable)
            X_train['pred_temp'] = y_train  # regression
        prior = X_train['pred_temp'].mean()

        col_avg_y = X_train.groupby(by=variable, axis=0)['pred_temp'].agg([('mean','mean'), ('beta','size')])
        col_avg_y['beta'] = prior_weight_func(col_avg_y['beta'])
        col_avg_y[nf_name] = col_avg_y['beta'] * prior + (1 - col_avg_y['beta']) * col_avg_y['mean']
        col_avg_y.drop(['beta', 'mean'], axis=1, inplace=True)

        nf_train = X_train.join(col_avg_y, on=variable)[nf_name].values
        nf_test = X_test.join(col_avg_y, on=variable).fillna(prior, inplace=False)[nf_name].values

        return nf_train, nf_test, prior, col_avg_y

    def fit_transform(self, X, y):
        """
        :param X: pandas DataFrame, n_samples * n_features
        :param y: pandas Series or numpy array, n_samples
        :return X_new: the transformed pandas DataFrame containing mean-encoded categorical features
        """
        X_new = X.copy()
        if self.target_type == 'classification':
            skf = StratifiedKFold(self.n_splits)
        else:
            skf = KFold(self.n_splits)

        if self.target_type == 'classification':
            self.target_values = sorted(set(y))
            self.learned_stats = {'{}_pred_{}'.format(variable, target): [] for variable, target in
                                  product(self.categorical_features, self.target_values)}
            for variable, target in product(self.categorical_features, self.target_values):
                nf_name = '{}_pred_{}'.format(variable, target)
                X_new.loc[:, nf_name] = np.nan
                for large_ind, small_ind in skf.split(y, y):
                    nf_large, nf_small, prior, col_avg_y = MeanEncoder.mean_encode_subroutine(
                        X_new.iloc[large_ind], y.iloc[large_ind], X_new.iloc[small_ind], variable, target,
                        self.prior_weight_func)
                    X_new.iloc[small_ind, -1] = nf_small
                    self.learned_stats[nf_name].append((prior, col_avg_y))
        else:
            self.learned_stats = {'{}_pred'.format(variable): [] for variable in self.categorical_features}
            for variable in self.categorical_features:
                nf_name = '{}_pred'.format(variable)
                X_new.loc[:, nf_name] = np.nan
                for large_ind, small_ind in skf.split(y, y):
                    nf_large, nf_small, prior, col_avg_y = MeanEncoder.mean_encode_subroutine(
                        X_new.index.get_loc[large_ind], y.index.get_loc[large_ind], X_new.index.get_loc[small_ind], variable, None,
                        self.prior_weight_func)
                    X_new.iloc[small_ind, -1] = nf_small
                    self.learned_stats[nf_name].append((prior, col_avg_y))
        return X_new

    def transform(self, X):
        """
        :param X: pandas DataFrame, n_samples * n_features
        :return X_new: the transformed pandas DataFrame containing mean-encoded categorical features
        """
        X_new = X.copy()

        if self.target_type == 'classification':
            for variable, target in product(self.categorical_features, self.target_values):
                nf_name = '{}_pred_{}'.format(variable, target)
                X_new[nf_name] = 0
                for prior, col_avg_y in self.learned_stats[nf_name]:
                    X_new[nf_name] += X_new[[variable]].join(col_avg_y, on=variable).fillna(prior, inplace=False)[
                        nf_name]
                X_new[nf_name] /= self.n_splits
        else:
            for variable in self.categorical_features:
                nf_name = '{}_pred'.format(variable)
                X_new[nf_name] = 0
                for prior, col_avg_y in self.learned_stats[nf_name]:
                    X_new[nf_name] += X_new[[variable]].join(col_avg_y, on=variable).fillna(prior, inplace=False)[
                        nf_name]
                X_new[nf_name] /= self.n_splits

        return X_new



def employmentLength_deal(x):
    if x == r'\N':
        result = -999
    elif x == -999:
        result = -999
    elif x == '-999':
        result = -999
    elif x == '< 1 year':
        result = 0.5
    elif x == '10+ years':
        result = 12
    else:
        result = int(x.split(' ')[0][0])
    # print(result)
    return result


def earliesCreditLine_month_deal(x):
    x = x.split('-')[0]
    # print(x)
    dict = {'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6, 'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10,
            'Nov': 11, 'Dec': 12}
    result = dict[x]
    return result


def gradeTrans(x):
    dict = {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7}
    result = dict[x]
    return result


def subGradeTrans(x):
    dict = {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7}
    result = dict[x[0]]
    result = result * 5 + int(x[1])
    return result




def myEntro(x):
    """
        calculate shanno ent of x
    """
    x = np.array(x)
    x_value_list = set([x[i] for i in range(x.shape[0])])
    ent = 0.0
    for x_value in x_value_list:
        p = float(x[x == x_value].shape[0]) / x.shape[0]
        logp = np.log2(p)
        ent -= p * logp
    #     print(x_value,p,logp)
    # print(ent)
    return ent


def myRms(records):
    records = list(records)
    """
    均方根值 反映的是有效值而不是平均值
    """
    return np.math.sqrt(sum([x ** 2 for x in records]) / len(records))


def myMode(x):
    return np.mean(pd.Series.mode(x))


def myQ25(x):
    return x.quantile(0.25)


def myQ75(x):
    return x.quantile(0.75)


def myRange(x):
    return pd.Series.max(x) - pd.Series.min(x)


# 预处理
def data_preprocess(DATA_PATH):
    train_label, train, test = load_dataset(DATA_PATH=DATA_PATH)
    # 拼接数据

    data = pd.concat([train, test], axis=0, ignore_index=True)
    print('初始拼接后：', data.shape)
    # n_feat = [f for f in data.columns if f[0] == 'n']

    n_feat = ['n0', 'n1', 'n2', 'n4', 'n5', 'n6', 'n7', 'n8', 'n9', 'n10', 'n11', 'n12', 'n13', 'n14', ]
    # nameList = ['min', 'max', 'sum', 'mean', 'median', 'skew', 'std', 'mode', 'range']
    # statList = ['min', 'max', 'sum', 'mean', 'median', 'skew', 'std', myMode, myRange]
    nameList = ['max', 'sum', 'mean', 'median', 'skew', 'std']
    statList = ['max', 'sum', 'mean', 'median', 'skew', 'std']


    for i in range(len(nameList)):
        data['n_feat_{}'.format(nameList[i])] = data[n_feat].agg(statList[i], axis=1)
    print('n特征处理后：', data.shape)


    # count编码
    count_list = ['subGrade', 'grade', 'postCode', 'regionCode','homeOwnership','title','employmentTitle','employmentLength']
    data = count_coding(data, count_list)
    print('count编码后：', data.shape)
    ### 用数值特征对类别特征做统计刻画，随便挑了几个跟price相关性最高的匿名特征
    cross_cat = ['subGrade', 'grade', 'employmentLength', 'term', 'homeOwnership', 'postCode', 'regionCode','employmentTitle','title']
    cross_num = ['dti', 'revolBal','revolUtil', 'ficoRangeHigh', 'interestRate', 'loanAmnt', 'installment', 'annualIncome', 'n14',
                 'n2', 'n6', 'n9', 'n5', 'n8']

    data[['employmentLength']].fillna(-999, inplace=True)

#     data = cross_cat_num(data, cross_num, cross_cat)  # 一阶交叉
#     print('一阶特征处理后：', data.shape)
#     data = cross_qua_cat_num(data)  # 二阶交叉
#     print('二阶特征处理后：', data.shape)
    # 缺失值处理
    for temp in count_list:
        del data[temp+'_count']
    # num_fill_col = ['employmentLength', 'postCode', ]
    cols = ['employmentTitle', 'employmentLength', 'postCode', 'dti', 'pubRecBankruptcies', 'revolUtil', 'title',
            'n0', 'n1', 'n2', 'n4', 'n5', 'n6', 'n7', 'n8', 'n9', 'n10', 'n11', 'n12', 'n13', 'n14']
    for col in cols:
        data[col].fillna(r'\N', inplace=True)
    cols = [f for f in cols if f not in ['employmentLength']]
    for col in cols:
        data[col].replace({r'\N': -999}, inplace=True)
        data[col] = data[col]
    # print('缺失值情况：', data.isnull().sum())

    data['grade'] = data['grade'].apply(lambda x: gradeTrans(x))
    data['subGrade'] = data['subGrade'].apply(lambda x: subGradeTrans(x))
    print('1data.shape', data.shape)

    data['employmentLength'] = data['employmentLength'].apply(lambda x: employmentLength_deal(x))
    data['issueDate_year'] = data['issueDate'].apply(lambda x: int(x.split('-')[0]))
    data['issueDate_month'] = data['issueDate'].apply(lambda x: int(x.split('-')[1]))
    data['issueDate_day'] = data['issueDate'].apply(lambda x: transform_day(x))
    data['issueDate_week'] = data['issueDate_day'].apply(lambda x: int(x % 7) + 1)

    print('2_data.shape', data.shape)
    data['earliesCreditLine_year'] = data['earliesCreditLine'].apply(lambda x: 2020 - (int(x.split('-')[-1])))
    data['earliesCreditLine_month'] = data['earliesCreditLine'].apply(lambda x: earliesCreditLine_month_deal(x))
    data['earliesCreditLine_Allmonth'] = data['earliesCreditLine_year'] * 12 - data['earliesCreditLine_month']
    del data['issueDate'], data['earliesCreditLine']

    print('预处理完毕', data.shape)

    return data, train_label





def kfold_stats_feature(train, test, feats, k):
    folds = StratifiedKFold(n_splits=k, shuffle=True, random_state=6666)  # 这里最好和后面模型的K折交叉验证保持一致

    train['fold'] = None
    for fold_, (trn_idx, val_idx) in enumerate(folds.split(train, train['isDefault'])):
        train.loc[val_idx, 'fold'] = fold_

    kfold_features = []
    for feat in feats:
        nums_columns = ['isDefault']
        for f in nums_columns:
            colname = feat + '_' + f + '_kfold_mean'
            kfold_features.append(colname)
            train[colname] = None
            for fold_, (trn_idx, val_idx) in enumerate(folds.split(train, train['isDefault'])):
                tmp_trn = train.iloc[trn_idx]
                order_label = tmp_trn.groupby([feat])[f].mean()
                tmp = train.loc[train.fold == fold_, [feat]]
                train.loc[train.fold == fold_, colname] = tmp[feat].map(order_label)
                # fillna
                global_mean = train[f].mean()
                train.loc[train.fold == fold_, colname] = train.loc[train.fold == fold_, colname].fillna(global_mean)
            train[colname] = train[colname].astype(float)

        for f in nums_columns:
            colname = feat + '_' + f + '_kfold_mean'
            test[colname] = None
            order_label = train.groupby([feat])[f].mean()
            test[colname] = test[feat].map(order_label)
            # fillna
            global_mean = train[f].mean()
            test[colname] = test[colname].fillna(global_mean)
            test[colname] = test[colname].astype(float)
    del train['fold']
    return train, test

def GridSearch(clf, params, X, y):
    cscv = GridSearchCV(clf, params, scoring='roc_auc', n_jobs=4, cv=10)
    cscv.fit(X, y)
    print(cscv.cv_results_)
    print(cscv.best_params_)
    print(cscv.best_score_)

### count编码
def count_coding(df, fea_col):
    for f in fea_col:
        df[f + '_count'] = df[f].map(df[f].value_counts())
    return (df)


# 定义交叉特征统计
def cross_cat_num(df, num_col, cat_col):
    for f1 in tqdm(cat_col):
        g = df.groupby(f1, as_index=False)
        for f2 in tqdm(num_col):
            feat = g[f2].agg({
                '{}_{}_max'.format(f1, f2): 'max', '{}_{}_min'.format(f1, f2): 'min',
                '{}_{}_median'.format(f1, f2): 'median',
            })
            df = df.merge(feat, on=f1, how='left')
    return (df)


def cross_qua_cat_num(df):
    for f_pair in tqdm([
        ['subGrade', 'regionCode'], ['grade', 'regionCode'], ['subGrade', 'postCode'], ['grade', 'postCode'], ['employmentTitle','title'],
        ['regionCode','title'], ['postCode','title'], ['homeOwnership','title'], ['homeOwnership','employmentTitle'],['homeOwnership','employmentLength'],
        ['regionCode', 'postCode']
    ]):
        ### 共现次数
        df['_'.join(f_pair) + '_count'] = df.groupby(f_pair)['id'].transform('count')
        ### n unique、熵
        df = df.merge(df.groupby(f_pair[0], as_index=False)[f_pair[1]].agg({
            '{}_{}_nunique'.format(f_pair[0], f_pair[1]): 'nunique',
            '{}_{}_ent'.format(f_pair[0], f_pair[1]): lambda x: entropy(x.value_counts() / x.shape[0])
        }), on=f_pair[0], how='left')
        df = df.merge(df.groupby(f_pair[1], as_index=False)[f_pair[0]].agg({
            '{}_{}_nunique'.format(f_pair[1], f_pair[0]): 'nunique',
            '{}_{}_ent'.format(f_pair[1], f_pair[0]): lambda x: entropy(x.value_counts() / x.shape[0])
        }), on=f_pair[1], how='left')
        ### 比例偏好
        df['{}_in_{}_prop'.format(f_pair[0], f_pair[1])] = df['_'.join(f_pair) + '_count'] / df[f_pair[1] + '_count']
        df['{}_in_{}_prop'.format(f_pair[1], f_pair[0])] = df['_'.join(f_pair) + '_count'] / df[f_pair[0] + '_count']
    return (df)


### count编码
def count_coding(df, fea_col):
    for f in fea_col:
        df[f + '_count'] = df[f].map(df[f].value_counts())
    return (df)

def gen_basicFea(data):
    data['avg_income'] = data['annualIncome'] / data['employmentLength']
    data['total_income'] = data['annualIncome'] * data['employmentLength']
    data['avg_loanAmnt'] = data['loanAmnt'] / data['term']
    data['mean_interestRate'] = data['interestRate'] / data['term']
    data['all_installment'] = data['installment'] * data['term']

    data['rest_money_rate'] = data['avg_loanAmnt'] / (data['annualIncome'] + 0.1)  # 287个收入为0
    data['rest_money'] = data['annualIncome'] - data['avg_loanAmnt']

    data['closeAcc'] = data['totalAcc'] - data['openAcc']
    data['ficoRange_mean'] = (data['ficoRangeHigh'] + data['ficoRangeLow']) / 2
    del data['ficoRangeHigh'], data['ficoRangeLow']

    data['rest_pubRec'] = data['pubRec'] - data['pubRecBankruptcies']

    data['rest_Revol'] = data['loanAmnt'] - data['revolBal']

    data['dis_time'] = data['issueDate_year'] - (2020 - data['earliesCreditLine_year'])
    for col in ['employmentTitle', 'grade', 'subGrade', 'regionCode', 'issueDate_month', 'postCode']:
        data['{}_count'.format(col)] = data.groupby([col])['id'].transform('count')

    return data


def plotroc(train_y, train_pred, test_y, val_pred):
    lw = 2
    ##train
    fpr, tpr, thresholds = roc_curve(train_y.values, train_pred, pos_label=1.0)
    train_auc_value = roc_auc_score(train_y.values, train_pred)
    ##valid
    fpr, tpr, thresholds = roc_curve(test_y.values, val_pred, pos_label=1.0)
    valid_auc_value = roc_auc_score(test_y.values, val_pred)

    return train_auc_value, valid_auc_value


def xgb_model(train, target, test, k):
    # saveFeature_df = pd.read_csv('../feature/xgb_920_556_5_score.csv',header=None)
#     saveFeature_df2 = pd.read_csv('../feature/xgb_09-20_74_0.8_0.6_5_score.csv',header=None)
    
    # saveFeature_df.columns=['feature','score']
#     saveFeature_df2.columns=['feature','score']

#     saveFeature_list = list(saveFeature_df['feature'].values)
#     saveFeature_list2 = list(saveFeature_df2['feature'].values)
    
    # saveFeature_list = list(saveFeature_df[saveFeature_df['score']>10]['feature'])
    saveFeature_list=list(train.columns)
    feats = [f for f in saveFeature_list if f not in ['id', 'isDefault']]
    feaNum = len(feats)
    print('Current num of features:', len(feats))

    seeds = [2020,666666,188888]
    output_preds = 0
    xgb_oof_probs = np.zeros(train.shape[0])

    for seed in seeds:
        folds = StratifiedKFold(n_splits=k, shuffle=True, random_state=seed)
        oof_probs = np.zeros(train.shape[0])

        offline_score = []
        feature_importance_df = pd.DataFrame()
        params = {'booster': 'gbtree',
                  'objective': 'binary:logistic',
                  'eval_metric': 'auc',
                  'min_child_weight': 5,
                  'max_depth': 8,
                  'subsample': ss,
                  'colsample_bytree': fs,
                  'eta': 0.01,

                  'seed': seed,
                  'nthread': -1,

                  'tree_method': 'hist'
                  }
        for i, (train_index, test_index) in enumerate(folds.split(train, target)):
            
            train_y, test_y = target[train_index], target[test_index]
            train_X, test_X = train[feats].iloc[train_index, :], train[feats].iloc[test_index, :]
            train_matrix = xgb.DMatrix(train_X, label=train_y, missing=np.nan)
            valid_matrix = xgb.DMatrix(test_X, label=test_y, missing=np.nan)
            test_matrix = xgb.DMatrix(test[feats], missing=np.nan)
            watchlist = [(train_matrix, 'train'), (valid_matrix, 'eval')]
            model = xgb.train(params, train_matrix, num_boost_round=1000, evals=watchlist, verbose_eval=100,
                              early_stopping_rounds=600)
            val_pred = model.predict(valid_matrix, ntree_limit=model.best_ntree_limit)
            train_pred = model.predict(train_matrix, ntree_limit=model.best_ntree_limit)
            xgb_oof_probs[test_index] += val_pred / len(seeds)
            # oof_probs[test_index] += val_pred
            test_pred = model.predict(test_matrix, ntree_limit=model.best_ntree_limit)

            # 绘制roc曲线
            train_auc_value, valid_auc_value = plotroc(train_y, train_pred, test_y, val_pred)
            print('train_auc:{},valid_auc{}'.format(train_auc_value, valid_auc_value))
            offline_score.append(valid_auc_value)
            print(offline_score)
            output_preds += test_pred / k / len(seeds)
            
#             sub_df = test[['id']].copy()
#             sub_df['isDefault'] = output_preds
#             off = test[['id']].copy()
#             subVal_df = train[['id']].copy()
#             subVal_df.loc[test_index,'isDefault'] = xgb_oof_probs[test_index]
#             outpath = '../user_data/fold/'
#             fold_score = round(valid_auc_value, 5)
#             sub_df.to_csv( outpath + str(fold_score) + '_' + str(feaNum) + '_' + nowtime + '_{}_{}_xgb.csv'.format(i, kflod_num),
#             index=False)
#             subVal_df.to_csv(outpath + str(fold_score) + '_' + str(feaNum) + '_' + nowtime + '_{}_{}_xgbVal.csv'.format(i, kflod_num),
#             index=False)

            fold_importance_df = pd.DataFrame()
            fold_importance_df["Feature"] = model.get_fscore().keys()
            fold_importance_df["importance"] = model.get_fscore().values()
            fold_importance_df["fold"] = i + 1

            feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
#             print(feature_importance_df.groupby(['Feature'])['importance'].mean().sort_values(ascending=False).head(50))
#             feature_sorted = feature_importance_df.groupby(['Feature'])['importance'].mean().sort_values(ascending=False)
#             feature_sorted.to_csv('../feature/xgb_{}_{}_{}_{}_score.csv'.format(i+1,nowtime, feaNum, kflod_num))

#             if i==3:
#                 break
#             gc.collect()


        print('all_auc:', roc_auc_score(target.values, oof_probs))
        print('OOF-MEAN-AUC:%.6f, OOF-STD-AUC:%.6f' % (np.mean(offline_score), np.std(offline_score)))
        feature_sorted = feature_importance_df.groupby(['Feature'])['importance'].mean().sort_values(ascending=False)
        feature_sorted.to_csv('../feature/xgb_importance.csv')
        top_features = feature_sorted.index
        print(feature_importance_df.groupby(['Feature'])['importance'].mean().sort_values(ascending=False).head(50))
    return output_preds, xgb_oof_probs, np.mean(offline_score), feaNum



03-15


In [102]:
if __name__ == '__main__':
    DATA_PATH ='/Users/senlinlidewo/pyprogram/天池竞赛-零基础金融风控/数据/'
    print('读取数据...')
    data, train_label = data_preprocess(DATA_PATH=DATA_PATH)

    print('开始特征工程...')
    data = gen_basicFea(data)


    print('data.shape', data.shape)
    print('开始模型训练...')
    train = data[~data['isDefault'].isnull()].copy()
    target = train_label
    test = data[data['isDefault'].isnull()].copy()

    target_encode_cols = ['postCode', 'regionCode', 'homeOwnership', 'employmentTitle','title']

    kflod_num = 5
    ss = 0.8
    fs = 0.4

    class_list = ['postCode', 'regionCode', 'homeOwnership', 'employmentTitle','title']
    MeanEnocodeFeature = class_list  # 声明需要平均数编码的特征
    ME = MeanEncoder(MeanEnocodeFeature, target_type='classification')  # 声明平均数编码的类
    train = ME.fit_transform(train, target)  # 对训练数据集的X和y进行拟合
    # x_train_fav = ME.fit_transform(x_train,y_train_fav)#对训练数据集的X和y进行拟合
    test = ME.transform(test)  # 对测试集进行编码
    print('num0:mean_encode train.shape', train.shape, test.shape)

    train, test = kfold_stats_feature(train, test, target_encode_cols, kflod_num)
    print('num1:target_encode train.shape', train.shape, test.shape)
    ### target encoding目标编码，回归场景相对来说做目标编码的选择更多，不仅可以做均值编码，还可以做标准差编码、中位数编码等
    enc_cols = []
    stats_default_dict = {
        'max': train['isDefault'].max(),
        'min': train['isDefault'].min(),
        'median': train['isDefault'].median(),
        'mean': train['isDefault'].mean(),
        'sum': train['isDefault'].sum(),
        'std': train['isDefault'].std(),
        'skew': train['isDefault'].skew(),
        'kurt': train['isDefault'].kurt(),
        'mad': train['isDefault'].mad()
    }
    ### 暂且选择这三种编码
    enc_stats = ['max', 'min', 'skew', 'std']
    skf = KFold(n_splits=kflod_num, shuffle=True, random_state=6666)
    for f in tqdm(['postCode', 'regionCode', 'homeOwnership', 'employmentTitle','title']):
        enc_dict = {}
        for stat in enc_stats:
            enc_dict['{}_target_{}'.format(f, stat)] = stat
            train['{}_target_{}'.format(f, stat)] = 0
            test['{}_target_{}'.format(f, stat)] = 0
            enc_cols.append('{}_target_{}'.format(f, stat))
        for i, (trn_idx, val_idx) in enumerate(skf.split(train, target)):
            trn_x, val_x = train.iloc[trn_idx].reset_index(drop=True), train.iloc[val_idx].reset_index(drop=True)
            enc_df = trn_x.groupby(f, as_index=False)['isDefault'].agg(enc_dict)
            val_x = val_x[[f]].merge(enc_df, on=f, how='left')
            test_x = test[[f]].merge(enc_df, on=f, how='left')
            for stat in enc_stats:
                val_x['{}_target_{}'.format(f, stat)] = val_x['{}_target_{}'.format(f, stat)].fillna(
                    stats_default_dict[stat])
                test_x['{}_target_{}'.format(f, stat)] = test_x['{}_target_{}'.format(f, stat)].fillna(
                    stats_default_dict[stat])
                train.loc[val_idx, '{}_target_{}'.format(f, stat)] = val_x['{}_target_{}'.format(f, stat)].values
                test['{}_target_{}'.format(f, stat)] += test_x['{}_target_{}'.format(f, stat)].values / skf.n_splits

    print('num2:target_encode train.shape', train.shape, test.shape)

    train.drop(['postCode', 'regionCode', 'homeOwnership', 'employmentTitle','title'], axis=1, inplace=True)
    test.drop(['postCode', 'regionCode', 'homeOwnership', 'employmentTitle','title'], axis=1, inplace=True)
    print('输入数据维度：', train.shape, test.shape)
    

读取数据...
train.shape (800000, 47)
test.shape (200000, 46)
初始拼接后： (1000000, 47)
n特征处理后： (1000000, 53)
count编码后： (1000000, 61)
1data.shape (1000000, 53)
2_data.shape (1000000, 57)
预处理完毕 (1000000, 58)
开始特征工程...
data.shape (1000000, 74)
开始模型训练...
num0:mean_encode train.shape (800000, 84) (200000, 84)


  0%|          | 0/5 [00:00<?, ?it/s]

num1:target_encode train.shape (800000, 89) (200000, 89)


100%|██████████| 5/5 [02:41<00:00, 32.25s/it]

num2:target_encode train.shape (800000, 109) (200000, 109)
输入数据维度： (800000, 104) (200000, 104)





In [3]:
#train and test 含有target的值

train = train.apply(lambda x: pd.to_numeric(x,errors='ignore'))
test = test.apply(lambda x: pd.to_numeric(x,errors='ignore'))

In [103]:
train =train[~train.isin([np.nan, np.inf, -np.inf]).any(1)]
# test =test[~test.isin([np.nan, np.inf, -np.inf]).any(1)]
# powerful tools 使用skearn会对dataset有特定的要求，当出现nan，inf时，可以用这行代码进行数据集处理

In [104]:
train1 = train.drop(['isDefault','id'],axis=1)
# test1 = train.drop(['isDefault','id'],axis=1)
target = train['isDefault']

In [67]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(train1,target,test_size = 0.33,random_state=6666)

In [105]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_absolute_error, accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedShuffleSplit, cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import make_scorer, roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import RocCurveDisplay


FPR, TPR, _ = roc_curve(y_test, y_score)
ROC_AUC = auc(FPR, TPR)
print (ROC_AUC)

In [37]:
## 多元逻辑回归
logreg = LogisticRegression(solver='newton-cg',
                           penalty= 'l2',random_state = 6666,
                            class_weight={0:0.25, 1:0.75},
                            warm_start=True,
                            max_iter =1000
                           )
C_vals = [0.1,0.3,0.5,0.7,0.9]
cv = StratifiedShuffleSplit(n_splits = 3, test_size = .25)
param = {'C': C_vals}
grid = GridSearchCV(
    estimator=LogisticRegression(), 
    param_grid = param,
#    scoring = 'accuracy',
    scoring = 'roc_auc',
    n_jobs =-1,
    cv = cv
)
grid.fit(train1, target)
grid.best_score_
log_grid= grid.best_estimator_
# 到这里log_grid 可以用作predict test 数据集

In [96]:
#knn model 
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=-1,
                                            p=2
                                            )
k_range = range(1,31)
weights_options=['uniform','distance']
param = {'n_neighbors':k_range, 'weights':weights_options}
cv = StratifiedShuffleSplit(n_splits = 3, test_size = .25)

grid_knn = GridSearchCV(knn, param,cv=cv,verbose = False,scoring = 'roc_auc', n_jobs=-1)
grid_knn.fit(train1,target)
knn_grid = grid_knn.best_estimator_
grid.best_score_

0.5932676869013956

In [94]:
# Gaussian Naive Bayes
gaussian = GaussianNB()
gau = gaussian.fit(train1,target)

In [100]:
#支持向量机
Cs = [0.001, 0.01, 0.1, 1,1.5,2,2.5,3,4,5, 10]
gammas = [0.0001,0.001, 0.01, 0.1, 1]
param_grid = {'C': Cs, 'gamma' : gammas}
cv = StratifiedShuffleSplit(n_splits=3, test_size=.30, random_state=66)
grid_search = GridSearchCV(SVC(kernel = 'rbf', probability=True),param_grid, cv=cv,scoring = 'roc_auc') ## 'rbf' stands for gaussian kernel
grid_search.fit(train1,target)
grid_svc = grid_search.best_estimator_
grid_search.best_score_

0.5

In [16]:
##Decision tree
dt = DecisionTreeClassifier(class_weight='balanced', 
                                              max_leaf_nodes=3,
                                              min_impurity_decrease=1.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=4,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              random_state=666,
                                              splitter='best')
max_depth = range(1,30)
max_feature = [21,22,23,24,25,26,28,29,30,'auto']
criterion=["entropy", "gini"]
cv = StratifiedShuffleSplit(n_splits=3, random_state=15)
param = {'max_depth':max_depth, 
         'max_features':max_feature, 
         'criterion': criterion}
grid_d = GridSearchCV(dt, 
                                param_grid = param, 
                                 verbose=False, 
                                 cv=cv,
                                n_jobs = -1,scoring = 'roc_auc'
                   )
grid_d.fit(train1, target) 
grid_dt = grid_d.best_estimator_
grid_d.best_score_

KeyboardInterrupt: 

In [18]:
# random forest
from sklearn.ensemble import RandomForestClassifier
n_estimators = [140,145,150,155,160]
max_depth = range(1,10)
criterions = ['gini', 'entropy']
cv = StratifiedShuffleSplit(n_splits=3, test_size=.30, random_state=15)


parameters = {'n_estimators':n_estimators,
              'max_depth':max_depth,
              'criterion': criterions
              
        }
grid_r = GridSearchCV(estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
                                 param_grid=parameters,
                                 cv=cv,
                                 n_jobs = -1,scoring = 'roc_auc')
grid_r.fit(train1,target)
grid_rf = grid_r.best_estimator_
grid_r.best_score_

KeyboardInterrupt: 

In [121]:
#bagging 
from sklearn.ensemble import BaggingClassifier
n_estimators = [10,30,50,70,80,150,160, 170,175,180,185]
cv = StratifiedShuffleSplit(n_splits=3, test_size=.30, random_state=15)

parameters = {'n_estimators':n_estimators}
grid_ba = GridSearchCV(BaggingClassifier(base_estimator=None, bootstrap=True,
                                         bootstrap_features=False,
                                         max_features=1.0, max_samples=1.0,
                                         n_jobs=None,
                                         oob_score=False, random_state=None,
                                         verbose=0, warm_start=False),
                                 param_grid=parameters,
                                 cv=cv,
                                 n_jobs = -1,scoring = 'roc_auc')
grid_ba.fit(train1,target)
grid_bag = grid_ba.best_estimator_
grid_ba.best_score_

0.6993834324677208

In [127]:
# adaboost ensemble model 
from sklearn.ensemble import AdaBoostClassifier
n_estimators = [100,140,145,150,160, 170,175,180,185]
learning_r = [0.1,1,0.01,0.5]
cv = StratifiedShuffleSplit(n_splits=3, test_size=.30, random_state=15)


parameters = {'n_estimators':n_estimators,
              'learning_rate':learning_r
              
        }
grid_ad = GridSearchCV(AdaBoostClassifier(algorithm='SAMME.R',
                                          base_estimator=None,
                                          random_state=None),
                                param_grid=parameters,
                                 cv=cv,
                                 n_jobs = -1,scoring = 'roc_auc')
grid_ad.fit(train1,target)
grid_ada = grid_ad.best_estimator_
grid_ad.best_score_

0.6934462498186565

In [165]:
# gradient boost model 
from sklearn.ensemble import GradientBoostingClassifier
gradient_boost = GradientBoostingClassifier()
cv = StratifiedShuffleSplit(n_splits=3, test_size=.30, random_state=15)
grad_accuracies = cross_val_score(gradient_boost, train1,target, cv = cv, scoring='roc_auc')
grad_accuracies.mean()

0.70668794429131

In [99]:
param_test2 = {'min_samples_split=1200':range(800,2000,200)}
gsearch1 = GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=0.09, min_samples_split=800,n_estimators = 70,min_samples_leaf=60,max_depth=4,max_features=17,subsample=0.9,random_state=10), 
param_grid = param_test2, scoring='roc_auc',n_jobs=4, cv=5)
gsearch1.fit(train1,target)

ValueError: Invalid parameter min_samples_split=1200 for estimator GradientBoostingClassifier(learning_rate=0.02, max_depth=4, max_features=17,
                           min_samples_leaf=60, min_samples_split=800,
                           n_estimators=70, random_state=10, subsample=0.9). Check the list of available parameters with `estimator.get_params().keys()`.

In [106]:
from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier(learning_rate=0.09, min_samples_split=800,n_estimators = 70,min_samples_leaf=60,max_depth=4,max_features=17,subsample=0.9,random_state=10)
gbc_model = gbc.fit(train1,target)

In [116]:
test1 = test.drop(['isDefault','id'],axis=1)

In [121]:
test1 = test1.fillna(method='ffill')
test1.shape

(200000, 102)

In [125]:
probs = gbc_model.predict_proba(test1)
probs = probs[:,1]
ids = test.id.values
prediction = pd.DataFrame({'id':ids,'isDefault':probs})
prediction.head(5)
prediction.to_csv('/Users/senlinlidewo/pyprogram/天池竞赛-零基础金融风控/数据/submission2.csv')

ModuleNotFoundError: No module named 'torch'

In [115]:
import pickle
f = open('/Users/senlinlidewo/pyprogram/天池竞赛-零基础金融风控/数据/gbc_model.pickle','wb')
pickle.dump(gbc_model,f)
f.close

<function BufferedWriter.close>

In [111]:
f = open('/Users/senlinlidewo/pyprogram/天池竞赛-零基础金融风控/数据/gbc_model.pickle','rb')
gbc_model1 = pickle.load(f)
f.close()
print(gbc_model1.predict(train1[0:1,:]))

EOFError: Ran out of input

In [95]:

print(gsearch1.best_params_)
print(gsearch1.best_score_)

{'subsample': 0.9}
0.7268963954709584


In [89]:
model  = gsearch1.best_estimator_
model.fit(train1,target)


GradientBoostingClassifier(learning_rate=0.09, max_depth=4, max_features='sqrt',
                           min_samples_leaf=60, min_samples_split=800,
                           n_estimators=70, random_state=10, subsample=0.8)

In [53]:
train.id

0       499065
1       338178
2        97768
3        80881
5       281517
         ...  
9995    727332
9996    717380
9997    578410
9998    726522
9999    365564
Name: id, Length: 9465, dtype: int64

In [173]:
# XGB
from xgboost import XGBClassifier
cv = StratifiedShuffleSplit(n_splits=3, test_size=.30, random_state=15)
XGBClassifier = XGBClassifier()
xgb = XGBClassifier.fit(train1, target)



In [167]:
#extra tree
from sklearn.ensemble import ExtraTreesClassifier
ExtraTreesClassifier = ExtraTreesClassifier()
cv = StratifiedShuffleSplit(n_splits=10, test_size=.30, random_state=15)
extra_accuracies = cross_val_score(ExtraTreesClassifier, train1,target, cv = cv, scoring='roc_auc')
extra = ExtraTreesClassifier.fit(train1, target)
print(extra_accuracies.mean())

0.7120883505005077


In [193]:
# gaussian process 

from sklearn.gaussian_process import GaussianProcessClassifier
cv = StratifiedShuffleSplit(n_splits=3, test_size=.30, random_state=15)
gaussianp_accuracies = cross_val_score(GaussianProcessClassifier(), train1,target, cv = cv, scoring='roc_auc')
GaussianProcessClassifier = GaussianProcessClassifier()
gaussianprocess = GaussianProcessClassifier.fit(train1, target)
gaussianp_accuracies.mean()

0.5

In [197]:
# voting classifer 
from sklearn.ensemble import VotingClassifier

voting_classifier = VotingClassifier(estimators=[
    ('lr_grid', log_grid),
    ('svc', grid_svc),
    ('random_forest', grid_rf),
    ('gradient_boosting', gradident),
    ('decision_tree_grid',grid_dt),
    ('knn_classifier', knn_grid),
    ('XGB_Classifier', xgb),
    ('bagging_classifier', grid_bag),
    ('adaBoost_classifier',grid_ada),
    ('ExtraTrees_Classifier', extra),
    ('gaussian_classifier',gau),
    ('gaussian_process_classifier', gaussianprocess)
],voting='hard')

#voting_classifier = voting_classifier.fit(train_x,train_y)
voting_classifier = voting_classifier.fit(train1,target)



array([nan, nan, nan])

In [None]:
all_models = [log_grid,grid_svc,grid_rf,grid_dt,knn_grid,xgb,grid_bag,grid_ada,extra,gau,gaussianprocess,voting_classifier]

In [199]:
all_models = [log_grid,grid_svc,grid_rf,grid_dt,knn_grid,xgb,grid_bag,grid_ada,extra,gau,gaussianprocess,voting_classifier]
c = {}
for i in all_models:
    if i == log_grid:
        a = i.decision_function(train1)
        FPR, TPR, _ = roc_curve(target,a)
        b = auc(FPR, TPR)
        c[i] = b
    elif i==grid_rf:
        a = i.predict_proba(train1)[:,1]
        FPR, TPR, _ = roc_curve(target,a)
        b = auc(FPR, TPR)
        c[i] = b
    elif i == grid_bag:
        c[i] = grid_ba.best_score_
    elif i == grid_ada:
        c[i] = grid_ad.best_score_
    elif i == gradident:
        c[i] = grad_accuracies.mean()
    elif i == xgb:
        c[i] = xgb_accuracies.mean()
    elif i == extra:
        c[i] = extra_accuracies.mean()
    elif i ==gaussianprocess:
        c[i] = gaussianp_accuracies.mean()
    else:
        a = i.predict(train1)
        FPR, TPR, _ = roc_curve(target,a)
        b = auc(FPR, TPR)
        c[i] = b
#a = i.predict(train1)
#b = accuracy_score(a, target)
    
display(c)

{LogisticRegression(C=0.15): 0.5696746067179662,
 SVC(C=0.001, gamma=0.0001, probability=True): 0.5,
 RandomForestClassifier(criterion='entropy', max_depth=2, n_estimators=140): 0.7872928539538303,
 DecisionTreeClassifier(class_weight='balanced', criterion='entropy',
                        max_depth=1, max_features=21, max_leaf_nodes=3,
                        min_impurity_decrease=1.0, min_samples_leaf=4,
                        random_state=666): 0.5,
 KNeighborsClassifier(n_jobs=-1, n_neighbors=1): 1.0,
 XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
               colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
               importance_type='gain', interaction_constraints='',
               learning_rate=0.300000012, max_delta_step=0, max_depth=6,
               min_child_weight=1, missing=nan, monotone_constraints='()',
               n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=0,
               reg_alpha=0, reg_lambda=1, scale

#这个输出格式暂时不对。
test_prediction = (max(c, key=c.get)).predict(test1)
submission = pd.DataFrame({
        "id": id,
        "isdefault": test_prediction
    })

#submission.id = submission.id.astype(int)
#submission.isdefault = submission.isDefault.astype(int)

submission.to_csv("/Users/senlinlidewo/pyprogram/天池竞赛-零基础金融风控/数据/submission.csv", index=False)

result = pd.read_csv('/Users/senlinlidewo/pyprogram/天池竞赛-零基础金融风控/数据/submission.csv')