<h2>定义部分：参考<a href="https://github.com/WangliLin/xunfei2021_car_loan_top1">https://github.com/WangliLin/xunfei2021_car_loan_top1</a></h2>

In [31]:
import pickle
import logging
import os
from datetime import datetime
import warnings
import pandas as pd
from pandas import DataFrame, Series
import numpy as np
import lightgbm as lgb
from lightgbm import early_stopping, log_evaluation
from tqdm import tqdm
import scipy.signal as sg
from scipy.stats import ks_2samp, kstatvar
from scipy.stats import variation
from sklearn.metrics import roc_auc_score, auc, roc_curve, accuracy_score, f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, QuantileTransformer, KBinsDiscretizer, LabelEncoder, MinMaxScaler, PowerTransformer, OrdinalEncoder


warnings.filterwarnings('ignore')
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
np.random.seed(1024)

In [32]:
def train_lgb_kfold(X_train, y_train, X_test, n_fold=5, cate_feats=None):
    '''train lightgbm with k-fold split'''
    gbms = []
    kfold = StratifiedKFold(n_splits=n_fold, random_state=1024, shuffle=True)
    oof_preds = np.zeros((X_train.shape[0],))
    test_preds = np.zeros((X_test.shape[0],))

    for fold, (train_index, val_index) in enumerate(kfold.split(X_train, y_train)):
        logging.info(f'############ fold {fold} ###########')
        X_tr, X_val, y_tr, y_val = X_train.iloc[train_index], X_train.iloc[val_index], y_train[train_index], y_train[val_index]
        dtrain = lgb.Dataset(X_tr, y_tr, categorical_feature=cate_feats)
        dvalid = lgb.Dataset(X_val, y_val, categorical_feature=cate_feats, reference=dtrain)

        params = {
            'objective': 'binary',
            'metric': 'auc',
            'num_leaves': 31,
            'learning_rate': 0.05,
            'min_data_in_leaf': 50,
            'feature_fraction': 0.8,
            'bagging_fraction': 0.8,
            'n_jobs': -1,
            'seed': 1024,
        }

        gbm = lgb.train(params,
                        dtrain,
                        num_boost_round=300,
                        valid_sets=[dtrain, dvalid],
                        verbose_eval=50,
                        early_stopping_rounds=20)

        oof_preds[val_index] = gbm.predict(X_val, num_iteration=gbm.best_iteration)
        test_preds += gbm.predict(X_test, num_iteration=gbm.best_iteration) / kfold.n_splits
        gbms.append(gbm)

    return gbms, oof_preds, test_preds

In [33]:
def lgb_model(train, target, test, k, seed):
    feats = [f for f in train.columns if f not in ['zhdh', 'black_flag']]
    print('Current num of features:', len(feats))

    oof_probs = np.zeros((train.shape[0],))
    output_preds = 0
    offline_score = []
    feature_importance_df = pd.DataFrame()
    parameters = {
         'boosting_type': 'gbdt',
            'objective': 'binary',
            'tree_learner':'serial',
            'metric': 'auc',
            'min_child_weight': 4,
            'num_leaves': 64,
            'feature_fraction': 0.8,
            'bagging_fraction': 0.8,
            'bagging_freq': 4,
            'learning_rate': 0.02,
            'seed': seed,
            'nthread': 32,
            'n_jobs':8,
            'silent': True,
            'verbose': -1,
    }

    seeds = [2]
    for seed in seeds:
        folds = StratifiedKFold(n_splits=k, shuffle=True, random_state=seed)
        for i, (train_index, test_index) in enumerate(folds.split(train, target)):
            train_y, test_y = target.iloc[train_index], target.iloc[test_index]
            train_X, test_X = train[feats].iloc[train_index, :], train[feats].iloc[test_index, :]

            dtrain = lgb.Dataset(train_X,
                                 label=train_y)
            dval = lgb.Dataset(test_X,
                               label=test_y)

            lgb_model = lgb.train(
                parameters,
                dtrain,
                num_boost_round=8000,
                valid_sets=[dval],
                callbacks=[early_stopping(100), log_evaluation(100)],

            )

            oof_probs[test_index] = lgb_model.predict(test_X[feats], num_iteration=lgb_model.best_iteration) / len(
                seeds)

            offline_score.append(lgb_model.best_score['valid_0']['auc'])
            output_preds += lgb_model.predict(test[feats],
                                              num_iteration=lgb_model.best_iteration) / folds.n_splits / len(seeds)
            print(offline_score)
            # feature importance
            fold_importance_df = pd.DataFrame()
            fold_importance_df["feature"] = feats
            fold_importance_df["importance"] = lgb_model.feature_importance(importance_type='gain')
            fold_importance_df["fold"] = i + 1
            feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    print('OOF-MEAN-AUC:%.6f, OOF-STD-AUC:%.6f' % (np.mean(offline_score), np.std(offline_score)))
    print('feature importance:')
    print(feature_importance_df.groupby(['feature'])['importance'].mean().sort_values(ascending=False).head(50))

    return output_preds, oof_probs, np.mean(offline_score), feature_importance_df

<h2>读取数据部分</h2>

In [34]:
data = pd.read_csv('../data/账户交易信息.csv')
data_static = pd.read_csv('../data/账户静态信息.csv')
data_label = pd.read_csv('../data/训练集标签.csv')
data = pd.merge(data, data_label, on="zhdh", how="left")

<h2>特征工程部分</h2>

In [35]:
data_trade = data[["zhdh", "dfzh"]]
data_trade = data_trade.value_counts().reset_index()
data_trade.columns = ["zhdh", "dfzh", "number"]
data_trade = pd.merge(data_trade, data_label, on="zhdh", how="left")
data_trade

Unnamed: 0,zhdh,dfzh,number,black_flag
0,A596886AF5381E59,0F76A363EACBDEAF,5770,0.0
1,A596886AF5381E59,B5D082CF912AAD2C,4791,0.0
2,0065E0EC09A64440,14BEFED1370B730A,2463,
3,E8FD8174DE7EE23E,0F76A363EACBDEAF,2058,
4,37C924CD58503D52,810310CDF06F7157,1671,0.0
...,...,...,...,...
194258,7D6F0F5480F9B910,741E5D9F2AE6EB7C,1,
194259,7D6F0F5480F9B910,77131FDB1D2900A2,1,
194260,7D6F0F5480F9B910,778315B2EF658590,1,
194261,E0DE80CE53CD84CB,2ADB29A2758717BD,1,


In [36]:
def get_ks(_data: DataFrame) -> float:
    """
    检验每个月的入账出账是否同分布\n
    """
    _df = pd.pivot_table(_data, values="jyje", index="日期", columns="年月")
    _df = _df.fillna(0)
    if _df.shape[1] < 2:
        return 1
    result = 0
    for j in range(_df.shape[1] - 1):
        result = result + ks_2samp(_df.iloc[:, j], _df.iloc[:, j+1]).statistic
    return result / (_df.shape[1] - 1)

In [37]:
in_jyjesum_name = data[(data["jdbj"] == 1) & (data["black_flag"] == 1)].groupby("jyrq")["jyje"].sum().sort_values(ascending=False).index.tolist()[0:25]
out_jyjesum_name = data[(data["jdbj"] == 0) & (data["black_flag"] == 1)].groupby("jyrq")["jyje"].sum().sort_values(ascending=False).index.tolist()[0:25]

def get_in_jyjesum(_df: DataFrame) -> [float]:
    """
    获取交易金额按日汇总\n
    """
    ser = _df[_df["jdbj"] == 1].groupby("jyrq")["jyje"].sum().reindex(in_jyjesum_name).fillna(0)
    return ser.tolist()

def get_out_jyjesum(_df: DataFrame) -> [float]:
    """
    获取出账交易金额的按日汇总\n
    """
    ser = _df[_df["jdbj"] == 0].groupby("jyrq")["jyje"].sum().reindex(in_jyjesum_name).fillna(0)
    return ser.tolist()

in_jyjesum_name = list(map(lambda x: "in_jyjesum_" + x, in_jyjesum_name))
out_jyjesum_name = list(map(lambda x: "out_jyjesum_" + x, out_jyjesum_name))

In [38]:
def get_tfidf(colname: str, topk: int, jdbj: int, _df: DataFrame) -> ([str], [int]):
    """
    获取特征的tfidf\n
    """
    top_values = data[(data["jdbj"] == jdbj) & (data["black_flag"] == 1)][colname].value_counts().to_frame()
    top_values = top_values.reset_index()
    top_values = top_values.sort_values([colname, "index"], ascending=(False, True))
    top_values = top_values.set_index("index").index.tolist()[0:topk]
    ser = _df[_df["jdbj"] == jdbj][colname].value_counts().reindex(top_values).fillna(0)
    if jdbj == 1:
        return list(map(lambda x: "in_" + colname + "_" + x, top_values)), ser.tolist()
    elif jdbj == 0:
        return list(map(lambda x: "out_" + colname + "_" + x, top_values)), ser.tolist()

In [39]:
def get_tfidf_sum(colname: str, sumcol: str, topk: int, jdbj: int, _df: DataFrame) -> ([str], [float]):
    """
    找到最频繁的几个colname，按照colname汇总sumcol\n
    """
    top_values = data[(data["jdbj"] == jdbj) & (data["black_flag"] == 1)][[colname, sumcol]].groupby(colname).sum().sort_values(sumcol, ascending=False).index.tolist()[0:topk]
    ser = _df[_df["jdbj"] == jdbj][[colname, sumcol]].groupby(colname)[sumcol].sum().reindex(top_values).fillna(0)
    if jdbj == 1:
        return list(map(lambda x: "in_" + colname + "_" + sumcol + "_sum_" + x, top_values)), ser.tolist()
    elif jdbj == 0:
        return list(map(lambda x: "out_" + colname + "_" + sumcol + "_sum_" + x, top_values)), ser.tolist()

In [40]:
def get_time_feature(_data: DataFrame, colnames: [str]) -> ([str], [float]):
    """
    计算时间特征\n
    """
    result1 = []
    result2 = []
    features = ["mean", "max", "min", "median"]
    for colname in colnames:
        if colname in ("jy_is_month_start", "jy_is_month_end", "jy_is_wknd"):
            features = ["mean"]
        elif colname == "jy_dayofweek":
            features = ["mean", "median"]
        else:
            features = ["mean", "max", "min", "median"]
        result1.extend(list(map(lambda x: colname + "_" + str(x), features)))
        result2.extend(_data[colname].agg(features))
    return result1, result2

In [41]:
def get_manytoone(_df: DataFrame, colname: str, numval: float) -> (float, float, float, float, float, float, int):
    """
    多次转入一次转出、多次转出一次转入\n
    :return 多次转入一次转出, 多次转出一次转入, 算在一起, 区间段长度, 区间段个数
    """
    min_idxs = sg.argrelmin(_df[colname].values)[0]
    if numval is not None:
        min_idxs = _df.iloc[min_idxs].loc[_df.iloc[min_idxs, _df.columns.to_list().index(colname)] < numval].index.values
    if len(min_idxs) == 0:
        return 0, 0, 0, 0, 0, 0, 0
    min_idxs = min_idxs + 1
    if data_sample.shape[0] != min_idxs[-1]:
        min_idxs = np.append(min_idxs, [data_sample.shape[0]])
    if min_idxs[0] != 0:
        min_idxs = np.insert(min_idxs, 0, [0])
    min_idxs = min_idxs[min_idxs <= _df.shape[0]]
    min_idxs = np.unique(min_idxs)
    idx_pairs = np.hstack([np.roll(min_idxs.reshape(-1, 1), 1), min_idxs.reshape(-1, 1)])[1:]
    nin_1out_ratio = []
    nout_1in_ratio = []
    max_inlen = []
    max_outlen = []
    for idx_pair in idx_pairs:
        _temp = _df.iloc[idx_pair[0]:idx_pair[1]]
        in_num = _temp["jdbj"].sum()
        out_num = (_temp["jdbj"] == 0).sum()
        # 分段的最大0长度，最大1长度
        if in_num > out_num:
            nin_1out_ratio.append(in_num / (in_num + out_num))
            max_inlen.append(len(max("".join(list(map(lambda x: str(x), _temp["jdbj"].values.tolist()))).split("0"), key=len)))
        elif in_num < out_num:
            nout_1in_ratio.append(out_num / (in_num + out_num))
            max_outlen.append(len(max("".join(list(map(lambda x: str(x), _temp["jdbj"].values.tolist()))).split("1"), key=len)))
    mean_nin_1out_ratio = np.sum(nin_1out_ratio)
    mean_nout_1in_ratio = np.mean(nout_1in_ratio)
    mean_n_1_ratio = np.mean(nin_1out_ratio + nout_1in_ratio)
    mean_max_inlen = np.mean(max_inlen)
    mean_max_outlen = np.mean(max_outlen)
    mean_max_len = np.mean(max_inlen + max_outlen)
    len_mins = len(min_idxs)
    return mean_nin_1out_ratio, mean_nout_1in_ratio, mean_n_1_ratio, mean_max_inlen, mean_max_outlen, mean_max_len, len_mins

In [42]:
d = {}
users = data.zhdh.unique().tolist()
for user_id in tqdm(users):
    data_sample = data[data.zhdh == user_id].reset_index(drop=True)
    data_sample["转账日期"] = data_sample["jyrq"] + " " + data_sample["jysj"]
    data_sample["转账日期"] = pd.to_datetime(data_sample["转账日期"])
    data_sample = data_sample.sort_values("转账日期")
    data_sample["年月"] = data_sample["转账日期"].apply(lambda x: x.year * 100 + x.month)
    data_sample["日期"] = data_sample["转账日期"].apply(lambda x: x.day)
    
    # 交易金额是否为负数
    neg_num = (data_sample["jyje"] < 0).sum()
    neg_sum = data_sample[data_sample["jyje"] < 0]["jyje"].sum()
    if neg_num == 0:
        neg_per = 0
    else:
        neg_per = neg_sum / neg_num
        
    # 对冲交易额为负的数据
    del_index = data_sample[data_sample["jyje"] < 0].index.to_list()
    if len(del_index) > 0:
        del_index.extend(list(map(lambda x: x - 1, del_index)))
    data_sample = data_sample.drop(index=del_index)
    
    # 最大进账频次，最小进账频次，最大出账频次，最小出账频次
    max_in_cnt = data_sample[data_sample["jdbj"] == 1]["jyrq"].value_counts().max()
    min_in_cnt = data_sample[data_sample["jdbj"] == 1]["jyrq"].value_counts().min()
    median_in_cnt = data_sample[data_sample["jdbj"] == 1]["jyrq"].value_counts().median()
    max_out_cnt = data_sample[data_sample["jdbj"] == 0]["jyrq"].value_counts().max()
    min_out_cnt = data_sample[data_sample["jdbj"] == 0]["jyrq"].value_counts().min()
    median_out_cnt = data_sample[data_sample["jdbj"] == 0]["jyrq"].value_counts().median()
    
    # 对方账户的出现频次的最大值、最小值、中位数、个数
    max_df_cnt = data_sample["dfzh"].value_counts().max()
    min_df_cnt = data_sample["dfzh"].value_counts().min()
    median_df_cnt = data_sample["dfzh"].value_counts().median()
    num_df_cnt = data_sample["dfzh"].nunique()
    
    # 转入金额、转出金额的变异系数
    in_amt_variation = variation(data_sample[data_sample.jdbj == 1].jyje.values) 
    out_amt_variation = variation(data_sample[data_sample.jdbj == 0].jyje)
    in_amt_max = data_sample[data_sample.jdbj == 1]["jyje"].max()
    in_amt_min = data_sample[data_sample.jdbj == 1]["jyje"].max()
    in_amt_mean = data_sample[data_sample.jdbj == 1]["jyje"].mean()
    in_amt_ptp = in_amt_max - in_amt_min
    out_amt_max = data_sample[data_sample.jdbj == 0]["jyje"].max()
    out_amt_min = data_sample[data_sample.jdbj == 0]["jyje"].min()
    out_amt_mean = data_sample[data_sample.jdbj == 0]["jyje"].mean()
    out_amt_ptp = out_amt_max - out_amt_min
    
    # 临近月的转账分布一致性
    in_ks = get_ks(data_sample[data_sample["jdbj"] == 1])
    out_ks = get_ks(data_sample[data_sample["jdbj"] == 0])
    
    # 交易渠道
    in_jyqd_name, in_jyqd = get_tfidf("jyqd", 10, 1, data_sample)
    out_jyqd_name, out_jyqd = get_tfidf("jyqd", 10, 0, data_sample)
    
    # 摘要代号
    in_zydh_name, in_zydh = get_tfidf("zydh", 10, 1, data_sample)
    out_zydh_name, out_zydh = get_tfidf("zydh", 10, 0, data_sample)
    
    # 交易日期
    in_jyrq_name, in_jyrq = get_tfidf("jyrq", 25, 1, data_sample)
    out_jyrq_name, out_jyrq = get_tfidf("jyrq", 25, 0, data_sample)
    
    # 转账日期的时间特征
    data_sample['jy_month'] = data_sample["转账日期"].dt.month
    data_sample['jy_day'] = data_sample["转账日期"].dt.day
    data_sample['jy_weekofyear'] = data_sample["转账日期"].dt.weekofyear
    data_sample['jy_dayofyear'] = data_sample["转账日期"].dt.dayofyear
    data_sample['jy_dayofweek'] = data_sample["转账日期"].dt.dayofweek
    data_sample['jy_is_wknd'] = data_sample["转账日期"].dt.dayofweek // 6
    data_sample['jy_is_month_start'] = data_sample["转账日期"].dt.is_month_start.astype(int)
    data_sample['jy_is_month_end'] = data_sample["转账日期"].dt.is_month_end.astype(int)
    data_sample['jy_hour'] = data_sample['jysj'].apply(lambda x:int(x.split(':')[0]))
    data_sample['jy_minu'] = data_sample['jysj'].apply(lambda x:int(x.split(':')[1]))
    data_sample['jy_date'] = data_sample['jysj'].apply(lambda x:int(x.split(':')[0])*60 + int(x.split(':')[1]))
    in_timefeature_name, in_timefeature = get_time_feature(data_sample[data_sample["jdbj"] == 1], ["jy_month", "jy_day", "jy_weekofyear", "jy_dayofyear", "jy_dayofweek", "jy_is_wknd", "jy_is_month_start", "jy_is_month_end", "jy_hour", "jy_minu", "jy_date"])
    out_timefeature_name, out_timefeature = get_time_feature(data_sample[data_sample["jdbj"] == 0], ["jy_month", "jy_day", "jy_weekofyear", "jy_dayofyear", "jy_dayofweek", "jy_is_wknd", "jy_is_month_start", "jy_is_month_end", "jy_hour", "jy_minu", "jy_date"])
    
    # 按日汇总的交易金额
    in_jyjesum = get_in_jyjesum(data_sample)
    out_jyjesum = get_out_jyjesum(data_sample)
    
    # 余额
    in_max_ye = (data_sample[data_sample["jdbj"] == 1]["jyje"] / data_sample[data_sample["jdbj"] == 1]["zhye"]).max()
    in_min_ye = (data_sample[data_sample["jdbj"] == 1]["jyje"] / data_sample[data_sample["jdbj"] == 1]["zhye"]).min()
    in_mean_ye = (data_sample[data_sample["jdbj"] == 1]["jyje"] / data_sample[data_sample["jdbj"] == 1]["zhye"]).mean()
    in_variation_ye = variation(data_sample[data_sample["jdbj"] == 1]["jyje"] / data_sample[data_sample["jdbj"] == 1]["zhye"])
    out_max_ye = (data_sample[data_sample["jdbj"] == 0]["jyje"] / data_sample[data_sample["jdbj"] == 0]["zhye"]).max()
    out_min_ye = (data_sample[data_sample["jdbj"] == 0]["jyje"] / data_sample[data_sample["jdbj"] == 0]["zhye"]).min()
    out_mean_ye = (data_sample[data_sample["jdbj"] == 0]["jyje"] / data_sample[data_sample["jdbj"] == 0]["zhye"]).mean()
    out_variation_ye = variation(data_sample[data_sample["jdbj"] == 0]["jyje"] / data_sample[data_sample["jdbj"] == 0]["zhye"])
    
    # 最常用的对方账号的汇款记录
    in_dfzh_tfidf_name, in_dfzh_tfidf = get_tfidf("dfzh", 6, 1, data_sample)
    out_dfzh_tfidf_name, out_dfzh_tfidf = get_tfidf("dfzh", 6, 0, data_sample)
    
    # 对方名称长度
    in_max_dfmccd = data_sample[data_sample["jdbj"] == 1]["dfmccd"].max()
    in_min_dfmccd = data_sample[data_sample["jdbj"] == 1]["dfmccd"].min()
    in_mean_dfmccd = data_sample[data_sample["jdbj"] == 1]["dfmccd"].mean()
    in_variation_dfmccd = variation(data_sample[data_sample["jdbj"] == 1]["dfmccd"])
    in_ptp_dfmccd = in_max_dfmccd - in_min_dfmccd
    out_max_dfmccd = data_sample[data_sample["jdbj"] == 0]["dfmccd"].max()
    out_min_dfmccd = data_sample[data_sample["jdbj"] == 0]["dfmccd"].min()
    out_mean_dfmccd = data_sample[data_sample["jdbj"] == 0]["dfmccd"].mean()
    out_variation_dfmccd = variation(data_sample[data_sample["jdbj"] == 0]["dfmccd"])
    out_ptp_dfmccd = out_max_dfmccd - out_min_dfmccd
    
    # 对方账户的最大年龄、最小年龄、平均年龄
    dfzhs = pd.merge(DataFrame({"dfzh": data_sample["dfzh"].unique()}), data_static, how="inner", left_on="dfzh", right_on="zhdh")
    if dfzhs.shape[0] > 0:
        max_df_age = dfzhs["年龄"].max()
        min_df_age = dfzhs["年龄"].min()
        mean_df_age = dfzhs["年龄"].mean()
    else:
        max_df_age = 0
        min_df_age = 0
        mean_df_age = 0
        
    # 对方行号
    in_dfhh_nunique = data_sample[data_sample["jdbj"] == 1]["dfhh"].nunique()
    out_dfhh_nunique = data_sample[data_sample["jdbj"] == 0]["dfhh"].nunique()
    # 交易渠道
    in_jyqd_nunique = data_sample[data_sample["jdbj"] == 1]["jyqd"].nunique()
    out_jyqd_nunique = data_sample[data_sample["jdbj"] == 0]["jyqd"].nunique()
    # 摘要代号
    in_zydh_nunique = data_sample[data_sample["jdbj"] == 1]["zydh"].nunique()
    out_zydh_nunique = data_sample[data_sample["jdbj"] == 0]["zydh"].nunique()
    
    # 按照交易渠道汇总交易额
    in_jyqd_sum_jyje_name, in_jyqd_sum_jyje = get_tfidf_sum("jyqd", "jyje", 7, 1, data_sample)
    out_jyqd_sum_jyje_name, out_jyqd_sum_jyje = get_tfidf_sum("jyqd", "jyje", 5, 0, data_sample)
    
    # 按照交易日期汇总交易额
    in_jyrq_sum_jyje_name, in_jyrq_sum_jyje = get_tfidf_sum("jyrq", "jyje", 10, 1, data_sample)
    out_jyrq_sum_jyje_name, out_jyrq_sum_jyje = get_tfidf_sum("jyrq", "jyje", 10, 0, data_sample)
    
    # 分箱统计交易金额
    labels = ["A", "B", "C", "D", "E", "F"]
    in_jyje_bins_name = list(map(lambda x: "in_jyje_" + x, labels))
    out_jyje_bins_name = list(map(lambda x: "out_jyje_" + x, labels))
    data_sample["jyje分箱"] = pd.cut(data_sample["jyje"], bins=[0, 10, 100, 1000, 10000, 100000, np.inf], right=False, include_lowest=True, labels=labels)
    in_jyje_bins = data_sample[data_sample["jdbj"] == 1]["jyje分箱"].value_counts().reindex(labels).fillna(0).tolist()
    out_jyje_bins = data_sample[data_sample["jdbj"] == 0]["jyje分箱"].value_counts().reindex(labels).fillna(0).tolist()
    
    # 多次转入一次转出，多次转出一次转入
    mean_nin_1out_ratio, mean_nout_1in_ratio, mean_n_1_ratio, mean_max_inlen, mean_max_outlen, mean_max_len, len_mins = get_manytoone(data_sample, "zhye", 100)
    
    # 多次转入一次转出、多次转出一次转入的特征计数
    count_110 = len(list(filter(lambda x: len(x) > 4, "".join(list(map(str, data_sample["jdbj"].values.tolist()))).split("0"))))
    count_100 = len(list(filter(lambda x: len(x) > 4, "".join(list(map(str, data_sample["jdbj"].values.tolist()))).split("1"))))
    
    # 多次转入一次转出、多次转出一次转入的特征计数
    count_110 = len(list(filter(lambda x: len(x) > 4, "".join(list(map(str, data_sample["jdbj"].values.tolist()))).split("0"))))
    count_100 = len(list(filter(lambda x: len(x) > 4, "".join(list(map(str, data_sample["jdbj"].values.tolist()))).split("1"))))
    
    # 交易频繁的对手账户个数
    pinfan_dfzh = (data_sample["dfzh"].value_counts() > 5).sum()
    
    # 小额转账次数和大额转账次数之比
    in_data_sample = data_sample[data_sample["jdbj"] == 1]
    in_small_ratio = (in_data_sample["jyje"] < 100).sum() / (in_data_sample["jyje"] >= 100).sum()
    out_data_sample = data_sample[data_sample["jdbj"] == 0]
    out_small_ratio = (out_data_sample["jyje"] < 100).sum() / (out_data_sample["jyje"] >= 100).sum()
    small_ratio = (data_sample["jyje"] < 100).sum() / (data_sample["jyje"] >= 100).sum()
    
    all_cnt = data_sample.shape[0] # 操作次数
    
    # 转入次数、转出次数、转入次数占比
    in_cnt = data_sample.jdbj.sum()
    out_cnt = all_cnt - in_cnt
    in_ratio = in_cnt / all_cnt
    
    # 转入金额、转出金额、转入金额占比
    in_amt = data_sample[data_sample.jdbj == 1].jyje.sum()
    out_amt = data_sample[data_sample.jdbj == 0].jyje.sum()
    in_amt_ratio = in_amt / (in_amt + out_amt)
    
    # 转入人数、转出人数、转入人数占比
    in_user_cnt = data_sample[data_sample.jdbj == 1].dfzh.nunique()
    out_user_cnt = data_sample[data_sample.jdbj == 0].dfzh.nunique()
    in_user_ratio = in_user_cnt / (in_user_cnt + out_user_cnt)
    
    date_cnt = data_sample.jyrq.nunique() # 转账日期数
    
    d[user_id] = [all_cnt, in_cnt, out_cnt, in_ratio, in_amt, out_amt, in_amt_ratio, in_amt_variation, out_amt_variation, in_user_cnt, out_user_cnt, in_user_ratio, date_cnt, max_in_cnt, min_in_cnt, max_out_cnt, min_out_cnt, max_df_cnt, min_df_cnt, median_df_cnt, in_ks, out_ks] + in_jyqd + out_jyqd + in_zydh + out_zydh + in_jyrq + out_jyrq + in_timefeature + out_timefeature + [in_max_ye, in_min_ye, in_mean_ye, in_variation_ye, out_max_ye, out_min_ye, out_mean_ye, out_variation_ye] + [median_in_cnt, median_out_cnt, num_df_cnt, in_amt_max, in_amt_min, in_amt_mean, in_amt_ptp, out_amt_max, out_amt_min, out_amt_mean, out_amt_ptp] + in_dfzh_tfidf + in_dfzh_tfidf + [in_max_dfmccd, in_min_dfmccd, in_mean_dfmccd, in_variation_dfmccd, in_ptp_dfmccd, out_max_dfmccd, out_min_dfmccd, out_mean_dfmccd, out_variation_dfmccd, out_ptp_dfmccd] + [neg_num, neg_per] + [in_dfhh_nunique, out_dfhh_nunique, in_jyqd_nunique, out_jyqd_nunique, in_zydh_nunique, out_zydh_nunique] + in_jyqd_sum_jyje + out_jyqd_sum_jyje + in_jyrq_sum_jyje + out_jyrq_sum_jyje + in_jyje_bins + out_jyje_bins + [mean_nin_1out_ratio, mean_nout_1in_ratio, mean_n_1_ratio, mean_max_inlen, mean_max_outlen, mean_max_len, len_mins] + [count_100, count_110] + [pinfan_dfzh, in_small_ratio, out_small_ratio, small_ratio]

data_df = pd.DataFrame.from_dict(d).T.reset_index()
data_df.columns = ['zhdh', 'all_cnt', 'in_cnt', 'out_cnt', 'in_ratio', 'in_amt', 'out_amt', 'in_amt_ratio', "in_amt_variation", "out_amt_variation", 'in_user_cnt', 'out_user_cnt', 'in_user_ratio', 'date_cnt', "max_in_cnt", "min_in_cnt", "max_out_cnt", "min_out_cnt", "max_df_cnt", "min_df_cnt", "median_df_cnt", "in_ks", "out_ks"] + in_jyqd_name + out_jyqd_name + in_zydh_name + out_zydh_name + in_jyrq_name + out_jyrq_name + list(map(lambda x: "in_" + x, in_timefeature_name)) + list(map(lambda x: "out_" + x, out_timefeature_name)) + ["in_max_ye", "in_min_ye", "in_mean_ye", "in_variation_ye", "out_max_ye", "out_min_ye", "out_mean_ye", "out_variation_ye"] + ["median_in_cnt", "median_out_cnt", "num_df_cnt", "in_amt_max", "in_amt_min", "in_amt_mean", "in_amt_ptp", "out_amt_max", "out_amt_min", "out_amt_mean", "out_amt_ptp"] + in_dfzh_tfidf_name + out_dfzh_tfidf_name + ["in_max_dfmccd", "in_min_dfmccd", "in_mean_dfmccd", "in_variation_dfmccd", "in_ptp_dfmccd", "out_max_dfmccd", "out_min_dfmccd", "out_mean_dfmccd", "out_variation_dfmccd", "out_ptp_dfmccd"] + ["neg_num", "neg_per"] + ["in_dfhh_nunique", "out_dfhh_nunique", "in_jyqd_nunique", "out_jyqd_nunique", "in_zydh_nunique", "out_zydh_nunique"] + in_jyqd_sum_jyje_name + out_jyqd_sum_jyje_name + in_jyrq_sum_jyje_name + out_jyrq_sum_jyje_name + in_jyje_bins_name + out_jyje_bins_name + ["mean_nin_1out_ratio", "mean_nout_1in_ratio", "mean_n_1_ratio", "mean_max_inlen", "mean_max_outlen", "mean_max_len", "len_mins"] + ["count_100", "count_110"] + ["pinfan_dfzh", "in_small_ratio", "out_small_ratio", "small_ratio"]

100%|████████████████████████████████████████████████████████████████████████████████████████████| 6000/6000 [21:30<00:00,  4.65it/s]


<h2>合并标签部分</h2>

In [43]:
def label_encode(series):
    unique = list(series.unique())
    return series.map(dict(zip(
        unique, range(series.nunique())
    )))

In [44]:
data_static.columns = ['zhdh', 'khrq', 'khjgdh', 'xb', 'age']
data_static["khrq"] = pd.to_datetime(data_static["khrq"], format="%Y-%m-%d")
data_static['year']  = data_static['khrq'].dt.year
data_static['month'] = data_static['khrq'].dt.month
data_static['day']   = data_static['khrq'].dt.day
data_static["khjgdh"] = label_encode(data_static["khjgdh"])
df_feats = pd.merge(left=data_df, right=data_static[['zhdh', 'xb', 'age', "year", "month", "day"]], on='zhdh', how='left')
df_final = pd.merge(left=df_feats, right=data_label, on='zhdh', how='left')

<h2>准备数据集</h2>

In [45]:
df_train = df_final[df_final.black_flag.notnull()].reset_index(drop=True)
df_test = df_final[df_final.black_flag.isnull()].reset_index(drop=True)

feats = df_train.columns[1:-1].tolist()
X_train = df_train[feats]
y_train = df_train['black_flag']
X_test = df_test[feats]

<h2>开始训练</h2>

In [46]:
gbms, oof_preds, test_preds = train_lgb_kfold(X_train, y_train, X_test, n_fold=5)

2023-03-11 21:29:54,295 : INFO : ############ fold 0 ###########


[LightGBM] [Info] Number of positive: 240, number of negative: 720
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 23654
[LightGBM] [Info] Number of data points in the train set: 960, number of used features: 274
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.250000 -> initscore=-1.098612
[LightGBM] [Info] Start training from score -1.098612
Training until validation scores don't improve for 20 rounds
[50]	training's auc: 0.995463	valid_1's auc: 0.971019
[100]	training's auc: 0.999988	valid_1's auc: 0.973333
[150]	training's auc: 1	valid_1's auc: 0.97537


2023-03-11 21:29:54,542 : INFO : ############ fold 1 ###########
2023-03-11 21:29:54,662 : INFO : ############ fold 2 ###########


Early stopping, best iteration is:
[176]	training's auc: 1	valid_1's auc: 0.976852
[LightGBM] [Info] Number of positive: 240, number of negative: 720
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 23769
[LightGBM] [Info] Number of data points in the train set: 960, number of used features: 276
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.250000 -> initscore=-1.098612
[LightGBM] [Info] Start training from score -1.098612
Training until validation scores don't improve for 20 rounds
[50]	training's auc: 0.9957	valid_1's auc: 0.967037
Early stopping, best iteration is:
[61]	training's auc: 0.997685	valid_1's auc: 0.970463
[LightGBM] [Info] Number of positive: 240, number of negative: 720


2023-03-11 21:29:54,787 : INFO : ############ fold 3 ###########


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 23902
[LightGBM] [Info] Number of data points in the train set: 960, number of used features: 274
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.250000 -> initscore=-1.098612
[LightGBM] [Info] Start training from score -1.098612
Training until validation scores don't improve for 20 rounds
[50]	training's auc: 0.996343	valid_1's auc: 0.937407
Early stopping, best iteration is:
[55]	training's auc: 0.997147	valid_1's auc: 0.940093
[LightGBM] [Info] Number of positive: 240, number of negative: 720
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 23667
[LightGBM] [Info] Number of data points in the train set: 960, number of used features: 276
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.250000 -> initscore=-1.098612
[LightGBM] [Info] Start training from score -1.098612
Training until validation scores don't improve for 20 rounds
[50]	training's auc: 0.996071	

2023-03-11 21:29:54,924 : INFO : ############ fold 4 ###########


Early stopping, best iteration is:
[75]	training's auc: 0.999369	valid_1's auc: 0.9575
[LightGBM] [Info] Number of positive: 240, number of negative: 720
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 23614
[LightGBM] [Info] Number of data points in the train set: 960, number of used features: 275
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.250000 -> initscore=-1.098612
[LightGBM] [Info] Start training from score -1.098612
Training until validation scores don't improve for 20 rounds
[50]	training's auc: 0.994497	valid_1's auc: 0.965185
Early stopping, best iteration is:
[79]	training's auc: 0.99934	valid_1's auc: 0.96963


In [47]:
lgb_preds, lgb_oof, lgb_score, feature_importance_df = lgb_model(train=X_train, target=y_train, test=X_test, k=5, seed=2020)

Current num of features: 289
Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.944352
Early stopping, best iteration is:
[26]	valid_0's auc: 0.951759
[0.9517592592592593]
Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.939722
Early stopping, best iteration is:
[7]	valid_0's auc: 0.942593
[0.9517592592592593, 0.9425925925925925]
Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.964167
[200]	valid_0's auc: 0.968796
[300]	valid_0's auc: 0.967685
Early stopping, best iteration is:
[200]	valid_0's auc: 0.968796
[0.9517592592592593, 0.9425925925925925, 0.9687962962962963]
Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.963426
[200]	valid_0's auc: 0.96537
Early stopping, best iteration is:
[140]	valid_0's auc: 0.968241
[0.9517592592592593, 0.9425925925925925, 0.9687962962962963, 0.9682407407407407]
Training until validation scores don't improve for 10

In [48]:
test_preds = (test_preds + lgb_preds) / 2
oof_preds = (oof_preds + lgb_oof) / 2

<h2>搜索最优阈值</h2>

In [49]:
def gen_thres_new(df_train, oof_preds):
    df_train['oof_preds'] = oof_preds
    quantile_point = df_train['black_flag'].mean()
    thres = df_train['oof_preds'].quantile(1 - quantile_point)

    _thresh = []
    for thres_item in np.arange(thres - 0.2, thres + 0.2, 0.01):
        _thresh.append(
            [thres_item, f1_score(df_train['black_flag'], np.where(oof_preds > thres_item, 1, 0), average='macro')])

    _thresh = np.array(_thresh)
    best_id = _thresh[:, 1].argmax()
    best_thresh = _thresh[best_id][0]

    print("阈值: {}\n训练集的f1: {}".format(best_thresh, _thresh[best_id][1]))
    return best_thresh

best_thresh = gen_thres_new(df_train, oof_preds)

阈值: 0.34158381939791266
训练集的f1: 0.9024644732514489


<h2>生成提交结果</h2>

In [50]:
df_test['black_flag'] = np.where(test_preds > best_thresh, 1, 0)
df_test[['zhdh', 'black_flag']].to_csv("../data/submit_%s.csv" % datetime.now().strftime("%Y-%m-%dT%H-%M"))