<h2>定义部分：参考<a href="https://github.com/WangliLin/xunfei2021_car_loan_top1">https://github.com/WangliLin/xunfei2021_car_loan_top1</a></h2>

In [36]:
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
from pandas import DataFrame, Series
import numpy as np
import lightgbm as lgb
from sklearn.metrics import roc_auc_score, auc, roc_curve, accuracy_score, f1_score
from sklearn.model_selection import StratifiedKFold
from scipy.stats import variation
from tqdm import tqdm
from scipy.stats import ks_2samp, kstatvar
import pickle
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
import os
from sklearn.preprocessing import StandardScaler, QuantileTransformer, KBinsDiscretizer, LabelEncoder, MinMaxScaler, PowerTransformer
from sklearn.preprocessing import OrdinalEncoder


np.random.seed(1024)

def train_lgb_kfold(X_train, y_train, X_test, n_fold=5, cate_feats=None):
    '''train lightgbm with k-fold split'''
    gbms = []
    kfold = StratifiedKFold(n_splits=n_fold, random_state=1024, shuffle=True)
    oof_preds = np.zeros((X_train.shape[0],))
    test_preds = np.zeros((X_test.shape[0],))

    for fold, (train_index, val_index) in enumerate(kfold.split(X_train, y_train)):
        logging.info(f'############ fold {fold} ###########')
        X_tr, X_val, y_tr, y_val = X_train.iloc[train_index], X_train.iloc[val_index], y_train[train_index], y_train[val_index]
        dtrain = lgb.Dataset(X_tr, y_tr, categorical_feature=cate_feats)
        dvalid = lgb.Dataset(X_val, y_val, categorical_feature=cate_feats, reference=dtrain)

        params = {
            'objective': 'binary',
            'metric': 'auc',
            'num_leaves': 31,
            'learning_rate': 0.05,
            'min_data_in_leaf': 50,
            'feature_fraction': 0.8,
            'bagging_fraction': 0.8,
            'n_jobs': -1,
            'seed': 1024,
        }

        gbm = lgb.train(params,
                        dtrain,
                        num_boost_round=300,
                        valid_sets=[dtrain, dvalid],
                        verbose_eval=50,
                        early_stopping_rounds=20)

        oof_preds[val_index] = gbm.predict(X_val, num_iteration=gbm.best_iteration)
        test_preds += gbm.predict(X_test, num_iteration=gbm.best_iteration) / kfold.n_splits
        gbms.append(gbm)

    return gbms, oof_preds, test_preds

<h2>读取数据部分</h2>

In [37]:
data = pd.read_csv('../data/账户交易信息.csv')
data_static = pd.read_csv('../data/账户静态信息.csv')
data_label = pd.read_csv('../data/训练集标签.csv')
data = pd.merge(data, data_label, on="zhdh", how="left")

<h2>特征工程部分</h2>

In [38]:
data_trade = data[["zhdh", "dfzh"]]
data_trade = data_trade.value_counts().reset_index()
data_trade.columns = ["zhdh", "dfzh", "number"]
data_trade = pd.merge(data_trade, data_label, on="zhdh", how="left")
data_trade

Unnamed: 0,zhdh,dfzh,number,black_flag
0,A596886AF5381E59,0F76A363EACBDEAF,5770,0.0
1,A596886AF5381E59,B5D082CF912AAD2C,4791,0.0
2,0065E0EC09A64440,14BEFED1370B730A,2463,
3,E8FD8174DE7EE23E,0F76A363EACBDEAF,2058,
4,37C924CD58503D52,810310CDF06F7157,1671,0.0
...,...,...,...,...
194258,7D6F0F5480F9B910,741E5D9F2AE6EB7C,1,
194259,7D6F0F5480F9B910,77131FDB1D2900A2,1,
194260,7D6F0F5480F9B910,778315B2EF658590,1,
194261,E0DE80CE53CD84CB,2ADB29A2758717BD,1,


In [39]:
def get_ks(_data: DataFrame) -> float:
    """
    检验每个月的入账出账是否同分布\n
    """
    _df = pd.pivot_table(_data, values="jyje", index="日期", columns="年月")
    _df = _df.fillna(0)
    if _df.shape[1] < 2:
        return 1
    result = 0
    for j in range(_df.shape[1] - 1):
        result = result + ks_2samp(_df.iloc[:, j], _df.iloc[:, j+1]).statistic
    return result / (_df.shape[1] - 1)

In [40]:
top_in_jyqd = data[(data["jdbj"] == 1) & (data["black_flag"] == 1)]["jyqd"].value_counts().index.tolist()[0:10]
top_out_jyqd = data[(data["jdbj"] == 0) & (data["black_flag"] == 1)]["jyqd"].value_counts().index.tolist()[0:10]
def get_in_jyqd(_df: DataFrame) -> [int]:
    """
    获取入账交易渠道频次\n
    """
    ser = _df[_df["jdbj"] == 1]["jyqd"].value_counts().reindex(top_in_jyqd).fillna(0)
    return ser.tolist()

def get_out_jyqd(_df: DataFrame) -> [int]:
    """
    获取入账交易渠道频次\n
    """
    ser = _df[_df["jdbj"] == 0]["jyqd"].value_counts().reindex(top_out_jyqd).fillna(0)
    return ser.tolist()

In [41]:
in_jyjesum_name = data[(data["jdbj"] == 1) & (data["black_flag"] == 1)].groupby("jyrq")["jyje"].sum().sort_values(ascending=False).index.tolist()[0:25]
out_jyjesum_name = data[(data["jdbj"] == 0) & (data["black_flag"] == 1)].groupby("jyrq")["jyje"].sum().sort_values(ascending=False).index.tolist()[0:25]

def get_in_jyjesum(_df: DataFrame) -> [float]:
    """
    获取交易金额按日汇总\n
    """
    ser = _df[_df["jdbj"] == 1].groupby("jyrq")["jyje"].sum().reindex(in_jyjesum_name).fillna(0)
    return ser.tolist()

def get_out_jyjesum(_df: DataFrame) -> [float]:
    """
    获取出账交易金额的按日汇总\n
    """
    ser = _df[_df["jdbj"] == 0].groupby("jyrq")["jyje"].sum().reindex(in_jyjesum_name).fillna(0)
    return ser.tolist()

in_jyjesum_name = list(map(lambda x: "in_jyjesum_" + x, in_jyjesum_name))
out_jyjesum_name = list(map(lambda x: "out_jyjesum_" + x, out_jyjesum_name))

In [42]:
def get_tfidf(colname: str, topk: int, jdbj: int, _df: DataFrame) -> ([str], [int]):
    """
    获取特征的tfidf\n
    """
    top_values = data[(data["jdbj"] == jdbj) & (data["black_flag"] == 1)][colname].value_counts().index.tolist()[0:topk]
    ser = _df[_df["jdbj"] == jdbj][colname].value_counts().reindex(top_values).fillna(0)
    if jdbj == 1:
        return list(map(lambda x: "in_" + colname + "_" + x, top_values)), ser.tolist()
    elif jdbj == 0:
        return list(map(lambda x: "out_" + colname + "_" + x, top_values)), ser.tolist()

In [43]:
def get_tfidf_sum(colname: str, sumcol: str, topk: int, jdbj: int, _df: DataFrame) -> ([str], [float]):
    """
    找到最频繁的几个colname，按照colname汇总sumcol\n
    """
    top_values = data[(data["jdbj"] == jdbj) & (data["black_flag"] == 1)][[colname, sumcol]].groupby(colname).sum().sort_values(sumcol, ascending=False).index.tolist()[0:topk]
    ser = _df[_df["jdbj"] == jdbj][[colname, sumcol]].groupby(colname)[sumcol].sum().reindex(top_values).fillna(0)
    if jdbj == 1:
        return list(map(lambda x: "in_" + colname + "_" + sumcol + "_sum_" + x, top_values)), ser.tolist()
    elif jdbj == 0:
        return list(map(lambda x: "out_" + colname + "_" + sumcol + "_sum_" + x, top_values)), ser.tolist()

In [44]:
def get_time_feature(_data: DataFrame, colnames: [str]) -> ([str], [float]):
    """
    计算时间特征\n
    """
    result1 = []
    result2 = []
    features = ["mean", "max", "min", "median"]
    for colname in colnames:
        result1.extend(list(map(lambda x: colname + "_" + str(x), features)))
        result2.extend(_data[colname].agg(features))
    return result1, result2

In [45]:
d = {}
users = data.zhdh.unique().tolist()
for user_id in tqdm(users):
    data_sample = data[data.zhdh == user_id].reset_index(drop=True)
    data_sample["转账日期"] = data_sample["jyrq"] + " " + data_sample["jysj"]
    data_sample["转账日期"] = pd.to_datetime(data_sample["转账日期"])
    data_sample = data_sample.sort_values("转账日期")
    data_sample["年月"] = data_sample["转账日期"].apply(lambda x: x.year * 100 + x.month)
    data_sample["日期"] = data_sample["转账日期"].apply(lambda x: x.day)
    
    # 交易金额是否为负数
    neg_num = (data_sample["jyje"] < 0).sum()
    neg_sum = data_sample[data_sample["jyje"] < 0]["jyje"].sum()
    if neg_num == 0:
        neg_per = 0
    else:
        neg_per = neg_sum / neg_num
        
    # 对冲交易额为负的数据
    del_index = data_sample[data_sample["jyje"] < 0].index.to_list()
    if len(del_index) > 0:
        del_index.extend(list(map(lambda x: x - 1, del_index)))
    data_sample = data_sample.drop(index=del_index)
    
    # 最大进账频次，最小进账频次，最大出账频次，最小出账频次
    max_in_cnt = data_sample[data_sample["jdbj"] == 1]["jyrq"].value_counts().max()
    min_in_cnt = data_sample[data_sample["jdbj"] == 1]["jyrq"].value_counts().min()
    median_in_cnt = data_sample[data_sample["jdbj"] == 1]["jyrq"].value_counts().median()
    max_out_cnt = data_sample[data_sample["jdbj"] == 0]["jyrq"].value_counts().max()
    min_out_cnt = data_sample[data_sample["jdbj"] == 0]["jyrq"].value_counts().min()
    median_out_cnt = data_sample[data_sample["jdbj"] == 0]["jyrq"].value_counts().median()
    
    # 对方账户的出现频次的最大值、最小值、中位数、个数
    max_df_cnt = data_sample["dfzh"].value_counts().max()
    min_df_cnt = data_sample["dfzh"].value_counts().min()
    median_df_cnt = data_sample["dfzh"].value_counts().median()
    num_df_cnt = data_sample["dfzh"].nunique()
    
    # 转入金额、转出金额的变异系数
    in_amt_variation = variation(data_sample[data_sample.jdbj == 1].jyje.values) 
    out_amt_variation = variation(data_sample[data_sample.jdbj == 0].jyje)
    in_amt_max = data_sample[data_sample.jdbj == 1]["jyje"].max()
    in_amt_min = data_sample[data_sample.jdbj == 1]["jyje"].max()
    in_amt_mean = data_sample[data_sample.jdbj == 1]["jyje"].mean()
    in_amt_ptp = in_amt_max - in_amt_min
    out_amt_max = data_sample[data_sample.jdbj == 0]["jyje"].max()
    out_amt_min = data_sample[data_sample.jdbj == 0]["jyje"].min()
    out_amt_mean = data_sample[data_sample.jdbj == 0]["jyje"].mean()
    out_amt_ptp = out_amt_max - out_amt_min
    
    # 临近月的转账分布一致性
    in_ks = get_ks(data_sample[data_sample["jdbj"] == 1])
    out_ks = get_ks(data_sample[data_sample["jdbj"] == 0])
    
    # 交易渠道
    in_jyqd = get_in_jyqd(data_sample)
    out_jyqd = get_out_jyqd(data_sample)
    
    # 摘要代号
    in_zydh_name, in_zydh = get_tfidf("zydh", 10, 1, data_sample)
    out_zydh_name, out_zydh = get_tfidf("zydh", 10, 0, data_sample)
    
    # 交易日期
    in_jyrq_name, in_jyrq = get_tfidf("jyrq", 25, 1, data_sample)
    out_jyrq_name, out_jyrq = get_tfidf("jyrq", 25, 0, data_sample)
    
    # 转账日期的时间特征
    data_sample['jy_month'] = data_sample["转账日期"].dt.month
    data_sample['jy_day'] = data_sample["转账日期"].dt.day
    data_sample['jy_weekofyear'] = data_sample["转账日期"].dt.weekofyear
    data_sample['jy_dayofyear'] = data_sample["转账日期"].dt.dayofyear
    data_sample['jy_dayofweek'] = data_sample["转账日期"].dt.dayofweek
    data_sample['jy_is_wknd'] = data_sample["转账日期"].dt.dayofweek // 6
    data_sample['jy_is_month_start'] = data_sample["转账日期"].dt.is_month_start.astype(int)
    data_sample['jy_is_month_end'] = data_sample["转账日期"].dt.is_month_end.astype(int)
    data_sample['jy_hour'] = data_sample['jysj'].apply(lambda x:int(x.split(':')[0]))
    data_sample['jy_minu'] = data_sample['jysj'].apply(lambda x:int(x.split(':')[1]))
    data_sample['jy_date'] = data_sample['jysj'].apply(lambda x:int(x.split(':')[0])*60 + int(x.split(':')[1]))
    in_timefeature_name, in_timefeature = get_time_feature(data_sample[data_sample["jdbj"] == 1], ["jy_month", "jy_day", "jy_weekofyear", "jy_dayofyear", "jy_dayofweek", "jy_is_wknd", "jy_is_month_start", "jy_is_month_end", "jy_hour", "jy_minu", "jy_date"])
    out_timefeature_name, out_timefeature = get_time_feature(data_sample[data_sample["jdbj"] == 0], ["jy_month", "jy_day", "jy_weekofyear", "jy_dayofyear", "jy_dayofweek", "jy_is_wknd", "jy_is_month_start", "jy_is_month_end", "jy_hour", "jy_minu", "jy_date"])
    
    # 按日汇总的交易金额
    in_jyjesum = get_in_jyjesum(data_sample)
    out_jyjesum = get_out_jyjesum(data_sample)
    
    # 余额
    in_max_ye = (data_sample[data_sample["jdbj"] == 1]["jyje"] / data_sample[data_sample["jdbj"] == 1]["zhye"]).max()
    in_min_ye = (data_sample[data_sample["jdbj"] == 1]["jyje"] / data_sample[data_sample["jdbj"] == 1]["zhye"]).min()
    in_mean_ye = (data_sample[data_sample["jdbj"] == 1]["jyje"] / data_sample[data_sample["jdbj"] == 1]["zhye"]).mean()
    in_variation_ye = variation(data_sample[data_sample["jdbj"] == 1]["jyje"] / data_sample[data_sample["jdbj"] == 1]["zhye"])
    out_max_ye = (data_sample[data_sample["jdbj"] == 0]["jyje"] / data_sample[data_sample["jdbj"] == 0]["zhye"]).max()
    out_min_ye = (data_sample[data_sample["jdbj"] == 0]["jyje"] / data_sample[data_sample["jdbj"] == 0]["zhye"]).min()
    out_mean_ye = (data_sample[data_sample["jdbj"] == 0]["jyje"] / data_sample[data_sample["jdbj"] == 0]["zhye"]).mean()
    out_variation_ye = variation(data_sample[data_sample["jdbj"] == 0]["jyje"] / data_sample[data_sample["jdbj"] == 0]["zhye"])
    
    # 自己给自己转账
    in_toself_cnt = data_sample[(data_sample["dfzh"] == user_id) & (data_sample["jdbj"] == 1)].shape[0]
    in_toself_amt = data_sample[(data_sample["dfzh"] == user_id) & (data_sample["jdbj"] == 1)]["jyje"].sum()
    out_toself_cnt = data_sample[(data_sample["dfzh"] == user_id) & (data_sample["jdbj"] == 0)].shape[0]
    out_toself_amt = data_sample[(data_sample["dfzh"] == user_id) & (data_sample["jdbj"] == 0)]["jyje"].sum()
    
    # 最常用的对方账号的汇款记录
    in_dfzh_tfidf_name, in_dfzh_tfidf = get_tfidf("dfzh", 10, 1, data_sample)
    out_dfzh_tfidf_name, out_dfzh_tfidf = get_tfidf("dfzh", 10, 0, data_sample)
    
    # 对方名称长度
    in_max_dfmccd = data_sample[data_sample["jdbj"] == 1]["dfmccd"].max()
    in_min_dfmccd = data_sample[data_sample["jdbj"] == 1]["dfmccd"].min()
    in_mean_dfmccd = data_sample[data_sample["jdbj"] == 1]["dfmccd"].mean()
    in_variation_dfmccd = variation(data_sample[data_sample["jdbj"] == 1]["dfmccd"])
    in_ptp_dfmccd = in_max_dfmccd - in_min_dfmccd
    out_max_dfmccd = data_sample[data_sample["jdbj"] == 0]["dfmccd"].max()
    out_min_dfmccd = data_sample[data_sample["jdbj"] == 0]["dfmccd"].min()
    out_mean_dfmccd = data_sample[data_sample["jdbj"] == 0]["dfmccd"].mean()
    out_variation_dfmccd = variation(data_sample[data_sample["jdbj"] == 0]["dfmccd"])
    out_ptp_dfmccd = out_max_dfmccd - out_min_dfmccd
    
    # 对方账户的最大年龄、最小年龄、平均年龄
    dfzhs = pd.merge(DataFrame({"dfzh": data_sample["dfzh"].unique()}), data_static, how="inner", left_on="dfzh", right_on="zhdh")
    if dfzhs.shape[0] > 0:
        max_df_age = dfzhs["年龄"].max()
        min_df_age = dfzhs["年龄"].min()
        mean_df_age = dfzhs["年龄"].mean()
    else:
        max_df_age = 0
        min_df_age = 0
        mean_df_age = 0
        
    # 对方行号
    in_dfhh_nunique = data_sample[data_sample["jdbj"] == 1]["dfhh"].nunique()
    out_dfhh_nunique = data_sample[data_sample["jdbj"] == 0]["dfhh"].nunique()
    # 交易渠道
    in_jyqd_nunique = data_sample[data_sample["jdbj"] == 1]["jyqd"].nunique()
    out_jyqd_nunique = data_sample[data_sample["jdbj"] == 0]["jyqd"].nunique()
    # 摘要代号
    in_zydh_nunique = data_sample[data_sample["jdbj"] == 1]["zydh"].nunique()
    out_zydh_nunique = data_sample[data_sample["jdbj"] == 0]["zydh"].nunique()
    
    # 按照交易渠道汇总交易额
    in_jyqd_sum_jyje_name, in_jyqd_sum_jyje = get_tfidf_sum("jyqd", "jyje", None, 1, data_sample)
    out_jyqd_sum_jyje_name, out_jyqd_sum_jyje = get_tfidf_sum("jyqd", "jyje", None, 0, data_sample)
    
    # 按照交易日期汇总交易额
    in_jyrq_sum_jyje_name, in_jyrq_sum_jyje = get_tfidf_sum("jyrq", "jyje", 10, 1, data_sample)
    out_jyrq_sum_jyje_name, out_jyrq_sum_jyje = get_tfidf_sum("jyrq", "jyje", 10, 0, data_sample)
    
    # 分箱统计交易金额
    labels = ["A", "B", "C", "D", "E", "F"]
    in_jyje_bins_name = list(map(lambda x: "in_jyje_" + x, labels))
    out_jyje_bins_name = list(map(lambda x: "out_jyje_" + x, labels))
    data_sample["jyje分箱"] = pd.cut(data_sample["jyje"], bins=[0, 10, 100, 1000, 10000, 100000, np.inf], right=False, include_lowest=True, labels=labels)
    in_jyje_bins = data_sample[data_sample["jdbj"] == 1]["jyje分箱"].value_counts().reindex(labels).fillna(0).tolist()
    out_jyje_bins = data_sample[data_sample["jdbj"] == 0]["jyje分箱"].value_counts().reindex(labels).fillna(0).tolist()
    
    all_cnt = data_sample.shape[0] # 操作次数
    
    # 转入次数、转出次数、转入次数占比
    in_cnt = data_sample.jdbj.sum()
    out_cnt = all_cnt - in_cnt
    in_ratio = in_cnt / all_cnt
    
    # 转入金额、转出金额、转入金额占比
    in_amt = data_sample[data_sample.jdbj == 1].jyje.sum()
    out_amt = data_sample[data_sample.jdbj == 0].jyje.sum()
    in_amt_ratio = in_amt / (in_amt + out_amt)
    
    # 转入人数、转出人数、转入人数占比
    in_user_cnt = data_sample[data_sample.jdbj == 1].dfzh.nunique()
    out_user_cnt = data_sample[data_sample.jdbj == 0].dfzh.nunique()
    in_user_ratio = in_user_cnt / (in_user_cnt + out_user_cnt)
    
    date_cnt = data_sample.jyrq.nunique() # 转账日期数
    
    d[user_id] = [all_cnt, in_cnt, out_cnt, in_ratio, in_amt, out_amt, in_amt_ratio, in_amt_variation, out_amt_variation, in_user_cnt, out_user_cnt, in_user_ratio, date_cnt, max_in_cnt, min_in_cnt, max_out_cnt, min_out_cnt, max_df_cnt, min_df_cnt, median_df_cnt, in_ks, out_ks] + in_jyqd + out_jyqd + in_zydh + out_zydh + in_jyrq + out_jyrq + in_timefeature + out_timefeature + [in_max_ye, in_min_ye, in_mean_ye, in_variation_ye, out_max_ye, out_min_ye, out_mean_ye, out_variation_ye] + [median_in_cnt, median_out_cnt, num_df_cnt, in_amt_max, in_amt_min, in_amt_mean, in_amt_ptp, out_amt_max, out_amt_min, out_amt_mean, out_amt_ptp] + [in_toself_cnt, in_toself_amt, out_toself_cnt, out_toself_amt] + in_dfzh_tfidf + in_dfzh_tfidf + [in_max_dfmccd, in_min_dfmccd, in_mean_dfmccd, in_variation_dfmccd, in_ptp_dfmccd, out_max_dfmccd, out_min_dfmccd, out_mean_dfmccd, out_variation_dfmccd, out_ptp_dfmccd] + [max_df_age, min_df_age, mean_df_age] + [neg_num, neg_per] + [in_dfhh_nunique, out_dfhh_nunique, in_jyqd_nunique, out_jyqd_nunique, in_zydh_nunique, out_zydh_nunique] + in_jyqd_sum_jyje + out_jyqd_sum_jyje + in_jyrq_sum_jyje + out_jyrq_sum_jyje + in_jyje_bins + out_jyje_bins

data_df = pd.DataFrame.from_dict(d).T.reset_index()
data_df.columns = ['zhdh', 'all_cnt', 'in_cnt', 'out_cnt', 'in_ratio', 'in_amt', 'out_amt', 'in_amt_ratio', "in_amt_variation", "out_amt_variation", 'in_user_cnt', 'out_user_cnt', 'in_user_ratio', 'date_cnt', "max_in_cnt", "min_in_cnt", "max_out_cnt", "min_out_cnt", "max_df_cnt", "min_df_cnt", "median_df_cnt", "in_ks", "out_ks"] + list(map(lambda x: "in_jyqd_" + x, top_in_jyqd)) + list(map(lambda x: "out_jyqd_" + x, top_out_jyqd)) + in_zydh_name + out_zydh_name + in_jyrq_name + out_jyrq_name + list(map(lambda x: "in_" + x, in_timefeature_name)) + list(map(lambda x: "out_" + x, out_timefeature_name)) + ["in_max_ye", "in_min_ye", "in_mean_ye", "in_variation_ye", "out_max_ye", "out_min_ye", "out_mean_ye", "out_variation_ye"] + ["median_in_cnt", "median_out_cnt", "num_df_cnt", "in_amt_max", "in_amt_min", "in_amt_mean", "in_amt_ptp", "out_amt_max", "out_amt_min", "out_amt_mean", "out_amt_ptp"] + ["in_toself_cnt", "in_toself_amt", "out_toself_cnt", "out_toself_amt"] + in_dfzh_tfidf_name + out_dfzh_tfidf_name + ["in_max_dfmccd", "in_min_dfmccd", "in_mean_dfmccd", "in_variation_dfmccd", "in_ptp_dfmccd", "out_max_dfmccd", "out_min_dfmccd", "out_mean_dfmccd", "out_variation_dfmccd", "out_ptp_dfmccd"] + ["max_df_age", "min_df_age", "mean_df_age"] + ["neg_num", "neg_per"] + ["in_dfhh_nunique", "out_dfhh_nunique", "in_jyqd_nunique", "out_jyqd_nunique", "in_zydh_nunique", "out_zydh_nunique"] + in_jyqd_sum_jyje_name + out_jyqd_sum_jyje_name + in_jyrq_sum_jyje_name + out_jyrq_sum_jyje_name + in_jyje_bins_name + out_jyje_bins_name

100%|████████████████████████████████████████████████████████████████████████████████| 6000/6000 [26:51<00:00,  3.72it/s]


<h2>合并标签部分</h2>

In [46]:
def label_encode(series):
    unique = list(series.unique())
    return series.map(dict(zip(
        unique, range(series.nunique())
    )))

In [47]:
data_static.columns = ['zhdh', 'khrq', 'khjgdh', 'xb', 'age']
data_static["khrq"] = pd.to_datetime(data_static["khrq"], format="%Y-%m-%d")
data_static['year']  = data_static['khrq'].dt.year
data_static['month'] = data_static['khrq'].dt.month
data_static['day']   = data_static['khrq'].dt.day
data_static["khjgdh"] = label_encode(data_static["khjgdh"])
df_feats = pd.merge(left=data_df, right=data_static[['zhdh', 'xb', 'age', "year", "month", "day"]], on='zhdh', how='left')
df_final = pd.merge(left=df_feats, right=data_label, on='zhdh', how='left')

<h2>准备数据集</h2>

In [48]:
df_train = df_final[df_final.black_flag.notnull()].reset_index(drop=True)
df_test = df_final[df_final.black_flag.isnull()].reset_index(drop=True)

feats = df_train.columns[1:-1].tolist()
X_train = df_train[feats]
y_train = df_train['black_flag']
X_test = df_test[feats]

<h2>使用SMOTE增强</h2>

In [49]:
# smo = SMOTE(random_state=42)
# X_train, y_train = smo.fit_sample(X_train, y_train)

<h2>开始训练</h2>

In [50]:
gbms, oof_preds, test_preds = train_lgb_kfold(X_train, y_train, X_test, n_fold=5)

2023-03-05 13:33:45,063 : INFO : ############ fold 0 ###########


[LightGBM] [Info] Number of positive: 240, number of negative: 720
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22149
[LightGBM] [Info] Number of data points in the train set: 960, number of used features: 283
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.250000 -> initscore=-1.098612
[LightGBM] [Info] Start training from score -1.098612
Training until validation scores don't improve for 20 rounds
[50]	training's auc: 0.995457	valid_1's auc: 0.973611

2023-03-05 13:33:45,406 : INFO : ############ fold 1 ###########



[100]	training's auc: 0.999994	valid_1's auc: 0.975648
Early stopping, best iteration is:
[88]	training's auc: 0.999919	valid_1's auc: 0.976204
[LightGBM] [Info] Number of positive: 240, number of negative: 720
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22271
[LightGBM] [Info] Number of data points in the train set: 960, number of used features: 286
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.250000 -> initscore=-1.098612

2023-03-05 13:33:45,646 : INFO : ############ fold 2 ###########



[LightGBM] [Info] Start training from score -1.098612
Training until validation scores don't improve for 20 rounds
[50]	training's auc: 0.995347	valid_1's auc: 0.964537
Early stopping, best iteration is:
[45]	training's auc: 0.994236	valid_1's auc: 0.966296
[LightGBM] [Info] Number of positive: 240, number of negative: 720
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22358
[LightGBM] [Info] Number of data points in the train set: 960, number of used features: 283
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.250000 -> initscore=-1.098612
[LightGBM] [Info] Start training from score -1.098612
Training until validation scores don't improve for 20 rounds
[50]	training's auc: 0.9964	valid_1's auc: 0.933519

2023-03-05 13:33:46,001 : INFO : ############ fold 3 ###########



[100]	training's auc: 0.999983	valid_1's auc: 0.936481
Early stopping, best iteration is:
[96]	training's auc: 0.999971	valid_1's auc: 0.938426
[LightGBM] [Info] Number of positive: 240, number of negative: 720
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22182
[LightGBM] [Info] Number of data points in the train set: 960, number of used features: 285
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.250000 -> initscore=-1.098612
[LightGBM] [Info] Start training from score -1.098612
Training until validation scores don't improve for 20 rounds
[50]	training's auc: 0.995694	valid_1's auc: 0.951759

2023-03-05 13:33:46,306 : INFO : ############ fold 4 ###########



Early stopping, best iteration is:
[73]	training's auc: 0.999253	valid_1's auc: 0.958333
[LightGBM] [Info] Number of positive: 240, number of negative: 720
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22119
[LightGBM] [Info] Number of data points in the train set: 960, number of used features: 284
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.250000 -> initscore=-1.098612
[LightGBM] [Info] Start training from score -1.098612
Training until validation scores don't improve for 20 rounds
[50]	training's auc: 0.99467	valid_1's auc: 0.964352
[100]	training's auc: 1	valid_1's auc: 0.970833
Early stopping, best iteration is:
[88]	training's auc: 0.999826	valid_1's auc: 0.971852


<h2>搜索最优阈值</h2>

In [51]:
def gen_thres_new(df_train, oof_preds):
    df_train['oof_preds'] = oof_preds
    quantile_point = df_train['black_flag'].mean()
    thres = df_train['oof_preds'].quantile(1 - quantile_point)

    _thresh = []
    for thres_item in np.arange(thres - 0.2, thres + 0.2, 0.01):
        _thresh.append(
            [thres_item, f1_score(df_train['black_flag'], np.where(oof_preds > thres_item, 1, 0), average='macro')])

    _thresh = np.array(_thresh)
    best_id = _thresh[:, 1].argmax()
    best_thresh = _thresh[best_id][0]

    print("阈值: {}\n训练集的f1: {}".format(best_thresh, _thresh[best_id][1]))
    return best_thresh

best_thresh = gen_thres_new(df_train, oof_preds)

阈值: 0.4159764778153206
训练集的f1: 0.8961849884634143


<h2>生成提交结果</h2>

In [52]:
df_test['black_flag'] = np.where(test_preds > best_thresh, 1, 0)
df_test[['zhdh', 'black_flag']].to_csv("../data/submit_%s.csv" % datetime.now().strftime("%Y-%m-%dT%H-%M"))