<h2>定义部分：参考<a href="https://github.com/WangliLin/xunfei2021_car_loan_top1">https://github.com/WangliLin/xunfei2021_car_loan_top1</a></h2>

In [29]:
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
from pandas import DataFrame, Series
import numpy as np
import lightgbm as lgb
from sklearn.metrics import roc_auc_score, auc, roc_curve, accuracy_score, f1_score
from sklearn.model_selection import StratifiedKFold
from scipy.stats import variation
from tqdm import tqdm
from scipy.stats import ks_2samp, kstatvar
import pickle
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
import os
from sklearn.preprocessing import StandardScaler, QuantileTransformer, KBinsDiscretizer, LabelEncoder, MinMaxScaler, PowerTransformer
from sklearn.preprocessing import OrdinalEncoder


# Seed NumPy's legacy global random generator so NumPy-based randomness repeats across runs.
np.random.seed(1024)

def train_lgb_kfold(X_train, y_train, X_test, n_fold=5, cate_feats=None):
    '''Train one LightGBM binary classifier per stratified fold.

    Args:
        X_train: training feature DataFrame (rows are selected with ``.iloc``).
        y_train: binary labels aligned row-by-row with ``X_train``. Any
            array-like works because it is converted with ``np.asarray``.
        X_test: feature matrix scored by every fold model.
        n_fold: number of stratified folds.
        cate_feats: forwarded as ``categorical_feature`` to ``lgb.Dataset``.

    Returns:
        ``(gbms, oof_preds, test_preds)``: the fitted boosters, the
        out-of-fold predictions for ``X_train`` and the fold-averaged
        predictions for ``X_test``.
    '''
    # The splitter yields positional indices. Indexing a pandas Series with
    # them (`y_train[train_index]`) is a label lookup, which fails or silently
    # misaligns whenever the Series index is not exactly 0..n-1. Converting to
    # an ndarray makes the selection positional in every case.
    y_values = np.asarray(y_train)

    # Hyper-parameters do not depend on the fold, so build them once.
    params = {
        'objective': 'binary',
        'metric': 'auc',
        'num_leaves': 31,
        'learning_rate': 0.05,
        'min_data_in_leaf': 50,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'n_jobs': -1,
        'seed': 1024,
    }

    gbms = []
    kfold = StratifiedKFold(n_splits=n_fold, random_state=1024, shuffle=True)
    oof_preds = np.zeros((X_train.shape[0],))
    test_preds = np.zeros((X_test.shape[0],))

    for fold, (train_index, val_index) in enumerate(kfold.split(X_train, y_values)):
        logging.info(f'############ fold {fold} ###########')
        X_tr, X_val = X_train.iloc[train_index], X_train.iloc[val_index]
        y_tr, y_val = y_values[train_index], y_values[val_index]
        dtrain = lgb.Dataset(X_tr, y_tr, categorical_feature=cate_feats)
        dvalid = lgb.Dataset(X_val, y_val, categorical_feature=cate_feats, reference=dtrain)

        # NOTE(review): `verbose_eval` and `early_stopping_rounds` were removed
        # from `lgb.train` in LightGBM 4.0. On newer versions pass
        # `callbacks=[lgb.early_stopping(20), lgb.log_evaluation(50)]` instead.
        gbm = lgb.train(params,
                        dtrain,
                        num_boost_round=300,
                        valid_sets=[dtrain, dvalid],
                        verbose_eval=50,
                        early_stopping_rounds=20)

        # Each training row is predicted by the one model that never saw it.
        oof_preds[val_index] = gbm.predict(X_val, num_iteration=gbm.best_iteration)
        # Test predictions are the plain average over the fold models.
        test_preds += gbm.predict(X_test, num_iteration=gbm.best_iteration) / kfold.n_splits
        gbms.append(gbm)

    return gbms, oof_preds, test_preds

<h2>读取数据部分</h2>

In [17]:
# Read the three input tables. Going by the file names these are the per-account
# transaction log, the static account attributes and the training labels.
data_label = pd.read_csv('../data/训练集标签.csv')
data_static = pd.read_csv('../data/账户静态信息.csv')
# Left-join the labels onto the transactions by account id (`zhdh`); transactions
# of accounts absent from the label file keep NaN in the label columns.
data = pd.read_csv('../data/账户交易信息.csv').merge(data_label, on="zhdh", how="left")

<h2>特征工程部分</h2>

In [18]:
def get_ks(_data: DataFrame) -> float:
    """
    Average two-sample KS statistic between consecutive months.

    The rows are pivoted into a day-of-month ("日期") by year-month ("年月")
    table of `jyje` values (pandas' default pivot aggregation, the mean), with
    missing cells filled with 0. Each month column is compared with the next
    one using `ks_2samp` and the statistics are averaged.

    Returns 1 when fewer than two months are present.
    """
    monthly = pd.pivot_table(_data, values="jyje", index="日期", columns="年月").fillna(0)
    n_months = monthly.shape[1]
    if n_months < 2:
        return 1
    ks_total = sum(
        ks_2samp(monthly.iloc[:, k], monthly.iloc[:, k + 1]).statistic
        for k in range(n_months - 1)
    )
    return ks_total / (n_months - 1)

In [19]:
# The ten most frequent transaction channels (`jyqd`) among rows labelled
# black_flag == 1, computed separately for jdbj == 1 and jdbj == 0 rows.
top_in_jyqd = data.loc[(data["black_flag"] == 1) & (data["jdbj"] == 1), "jyqd"].value_counts().index[:10].tolist()
top_out_jyqd = data.loc[(data["black_flag"] == 1) & (data["jdbj"] == 0), "jyqd"].value_counts().index[:10].tolist()
def get_in_jyqd(_df: DataFrame) -> [int]:
    """
    Count incoming (jdbj == 1) transactions per channel.

    Returns one count for each channel in the module-level `top_in_jyqd`
    list, in that order; channels absent from `_df` count as 0.
    """
    incoming_channels = _df.loc[_df["jdbj"] == 1, "jyqd"]
    channel_counts = incoming_channels.value_counts()
    return channel_counts.reindex(top_in_jyqd).fillna(0).tolist()

def get_out_jyqd(_df: DataFrame) -> [int]:
    """
    Count outgoing (jdbj == 0) transactions per channel.

    Returns one count for each channel in the module-level `top_out_jyqd`
    list, in that order; channels absent from `_df` count as 0.
    """
    outgoing_channels = _df.loc[_df["jdbj"] == 0, "jyqd"]
    channel_counts = outgoing_channels.value_counts()
    return channel_counts.reindex(top_out_jyqd).fillna(0).tolist()

In [20]:
# Raw `jyrq` values of the 25 days with the largest total amount among
# black_flag == 1 rows, per direction. They are kept under their own names
# because the helpers below read them at call time. Previously the same globals
# were rebound to the prefixed column names further down, so every reindex
# missed and both helpers returned all zeros.
in_jyjesum_dates = data[(data["jdbj"] == 1) & (data["black_flag"] == 1)].groupby("jyrq")["jyje"].sum().sort_values(ascending=False).index.tolist()[0:25]
out_jyjesum_dates = data[(data["jdbj"] == 0) & (data["black_flag"] == 1)].groupby("jyrq")["jyje"].sum().sort_values(ascending=False).index.tolist()[0:25]


def get_in_jyjesum(_df: DataFrame) -> [float]:
    """
    Sum the incoming (jdbj == 1) amounts of `_df` per day.

    Returns one total for each day in `in_jyjesum_dates`, in that order;
    days without incoming transactions yield 0.
    """
    ser = _df[_df["jdbj"] == 1].groupby("jyrq")["jyje"].sum().reindex(in_jyjesum_dates).fillna(0)
    return ser.tolist()


def get_out_jyjesum(_df: DataFrame) -> [float]:
    """
    Sum the outgoing (jdbj == 0) amounts of `_df` per day.

    Returns one total for each day in `out_jyjesum_dates`, in that order;
    days without outgoing transactions yield 0.
    """
    # Reindex on the outgoing day list (the original used the incoming one).
    ser = _df[_df["jdbj"] == 0].groupby("jyrq")["jyje"].sum().reindex(out_jyjesum_dates).fillna(0)
    return ser.tolist()


# Feature column names matching the two helpers' outputs.
in_jyjesum_name = [f"in_jyjesum_{day}" for day in in_jyjesum_dates]
out_jyjesum_name = [f"out_jyjesum_{day}" for day in out_jyjesum_dates]

In [21]:
# Cache of the most frequent values among black_flag == 1 rows, keyed by
# (colname, topk, jdbj). Each entry also stores the `data` object it was
# computed from, so rebinding `data` to a new frame invalidates it. In-place
# edits of `data` are not detected.
_TFIDF_TOP_VALUES = {}


def get_tfidf(colname: str, topk: int, jdbj: int, _df: DataFrame) -> ([str], [int]):
    """
    Count how often the globally most frequent values of `colname` occur in `_df`.

    Despite the name, this returns raw occurrence counts, not TF-IDF weights.

    Args:
        colname: column to count.
        topk: number of most frequent values (taken from rows of the
            module-level `data` with black_flag == 1 and the given `jdbj`).
        jdbj: direction flag; 1 yields "in_" feature names, 0 yields "out_".
        _df: rows to count in (filtered to the same `jdbj`).

    Returns:
        (names, counts): feature names "<in_|out_><colname>_<value>" and the
        matching counts, with 0 for values absent from `_df`.

    Raises:
        ValueError: if `jdbj` is neither 0 nor 1 (the original silently
            returned None in that case).
    """
    if jdbj == 1:
        prefix = "in_"
    elif jdbj == 0:
        prefix = "out_"
    else:
        raise ValueError("jdbj must be 0 or 1, got {!r}".format(jdbj))

    # The top-k list depends only on `data`, not on `_df`, so compute it once
    # instead of re-scanning the whole transaction table on every call.
    cache_key = (colname, topk, jdbj)
    cached = _TFIDF_TOP_VALUES.get(cache_key)
    if cached is None or cached[0] is not data:
        black_rows = data[(data["jdbj"] == jdbj) & (data["black_flag"] == 1)]
        cached = (data, black_rows[colname].value_counts().index.tolist()[0:topk])
        _TFIDF_TOP_VALUES[cache_key] = cached
    top_values = cached[1]

    ser = _df[_df["jdbj"] == jdbj][colname].value_counts().reindex(top_values).fillna(0)
    # str() keeps name building working when the column holds non-string values.
    return [prefix + colname + "_" + str(x) for x in top_values], ser.tolist()

In [22]:
def get_time_feature(_data: DataFrame, colnames: [str]) -> ([str], [float]):
    """
    Aggregate each listed column with mean, max and min.

    Returns (names, values): names are "<column>_<stat>" and values are the
    matching aggregates, both ordered column by column with the statistics
    in the order mean, max, min.
    """
    stat_names = ["mean", "max", "min"]
    names = [colname + "_" + str(stat) for colname in colnames for stat in stat_names]
    values = []
    for colname in colnames:
        values.extend(_data[colname].agg(stat_names))
    return names, values

In [23]:
# Build one feature row per account (`zhdh`). `d` maps account id -> feature list.
d = {}
users = data.zhdh.unique().tolist()
# Per-transaction time columns that are aggregated (mean/max/min) per direction.
time_cols = ["jy_month", "jy_day", "jy_weekofyear", "jy_dayofyear", "jy_dayofweek", "jy_is_wknd", "jy_is_month_start", "jy_is_month_end", "jy_hour", "jy_minu", "jy_date"]
for user_id in tqdm(users):
    data_sample = data[data.zhdh == user_id].reset_index(drop=True)
    # Full transaction timestamp from the date and time columns, then sort by it.
    data_sample["转账日期"] = pd.to_datetime(data_sample["jyrq"] + " " + data_sample["jysj"])
    data_sample = data_sample.sort_values("转账日期")
    ts = data_sample["转账日期"]
    data_sample["年月"] = ts.dt.year * 100 + ts.dt.month
    data_sample["日期"] = ts.dt.day
    # Counterparty key: counterparty account concatenated with `dfhh`.
    data_sample["对方账户"] = data_sample["dfzh"] + data_sample["dfhh"]

    # Calendar / clock features of every transaction.
    data_sample['jy_month'] = ts.dt.month
    data_sample['jy_day'] = ts.dt.day
    # `Series.dt.weekofyear` was removed in pandas 2.0. The ISO week is the same
    # quantity; the cast keeps a plain numeric dtype (float only if NaT is present).
    iso_week = ts.dt.isocalendar().week
    data_sample['jy_weekofyear'] = iso_week.astype("float64" if iso_week.hasnans else "int64")
    data_sample['jy_dayofyear'] = ts.dt.dayofyear
    data_sample['jy_dayofweek'] = ts.dt.dayofweek
    # NOTE(review): `// 6` flags only dayofweek == 6 (Sunday); Saturday (5) maps
    # to 0. If a Saturday+Sunday weekend flag is intended, use `>= 5` -- confirm.
    data_sample['jy_is_wknd'] = ts.dt.dayofweek // 6
    data_sample['jy_is_month_start'] = ts.dt.is_month_start.astype(int)
    data_sample['jy_is_month_end'] = ts.dt.is_month_end.astype(int)
    data_sample['jy_hour'] = data_sample['jysj'].apply(lambda x:int(x.split(':')[0]))
    data_sample['jy_minu'] = data_sample['jysj'].apply(lambda x:int(x.split(':')[1]))
    # Minutes since midnight.
    data_sample['jy_date'] = data_sample['jysj'].apply(lambda x:int(x.split(':')[0])*60 + int(x.split(':')[1]))

    # Split once by direction flag instead of re-filtering for every feature.
    in_rows = data_sample[data_sample["jdbj"] == 1]
    out_rows = data_sample[data_sample["jdbj"] == 0]

    # Largest / smallest number of transactions on a single day, per direction.
    in_day_counts = in_rows["jyrq"].value_counts()
    out_day_counts = out_rows["jyrq"].value_counts()
    max_in_cnt = in_day_counts.max()
    min_in_cnt = in_day_counts.min()
    max_out_cnt = out_day_counts.max()
    min_out_cnt = out_day_counts.min()

    # Max / min / median number of occurrences per counterparty.
    df_counts = data_sample["对方账户"].value_counts()
    max_df_cnt = df_counts.max()
    min_df_cnt = df_counts.min()
    median_df_cnt = df_counts.median()

    # Coefficient of variation of incoming and outgoing amounts.
    in_amt_variation = variation(in_rows.jyje.values)
    out_amt_variation = variation(out_rows.jyje)

    # Similarity of the amount distribution between consecutive months.
    in_ks = get_ks(in_rows)
    out_ks = get_ks(out_rows)

    # Transaction channel counts.
    in_jyqd = get_in_jyqd(data_sample)
    out_jyqd = get_out_jyqd(data_sample)

    # Counts of the most frequent `zydh` codes.
    in_zydh_name, in_zydh = get_tfidf("zydh", 10, 1, data_sample)
    out_zydh_name, out_zydh = get_tfidf("zydh", 10, 0, data_sample)

    # Counts of the most frequent transaction dates.
    in_jyrq_name, in_jyrq = get_tfidf("jyrq", 25, 1, data_sample)
    out_jyrq_name, out_jyrq = get_tfidf("jyrq", 25, 0, data_sample)

    # Mean / max / min of the time columns, per direction.
    in_timefeature_name, in_timefeature = get_time_feature(in_rows, time_cols)
    out_timefeature_name, out_timefeature = get_time_feature(out_rows, time_cols)

    # Daily amount totals.
    # NOTE(review): these two lists are computed but never appended to the
    # feature row below, so they do not reach the model -- confirm intent.
    in_jyjesum = get_in_jyjesum(data_sample)
    out_jyjesum = get_out_jyjesum(data_sample)

    # Ratio of transaction amount to `zhye`, per direction. A zero `zhye`
    # yields inf here, as in the original computation.
    in_ye_ratio = in_rows["jyje"] / in_rows["zhye"]
    out_ye_ratio = out_rows["jyje"] / out_rows["zhye"]
    in_max_ye = in_ye_ratio.max()
    in_min_ye = in_ye_ratio.min()
    in_mean_ye = in_ye_ratio.mean()
    in_variation_ye = variation(in_ye_ratio)
    out_max_ye = out_ye_ratio.max()
    out_min_ye = out_ye_ratio.min()
    out_mean_ye = out_ye_ratio.mean()
    out_variation_ye = variation(out_ye_ratio)

    all_cnt = data_sample.shape[0]  # number of transactions

    # Incoming count, outgoing count, incoming share.
    in_cnt = data_sample.jdbj.sum()
    out_cnt = all_cnt - in_cnt
    in_ratio = in_cnt / all_cnt

    # Incoming amount, outgoing amount, incoming share of the amount.
    in_amt = in_rows.jyje.sum()
    out_amt = out_rows.jyje.sum()
    in_amt_ratio = in_amt / (in_amt + out_amt)

    # Distinct counterparties per direction and the incoming share.
    in_user_cnt = in_rows.dfzh.nunique()
    out_user_cnt = out_rows.dfzh.nunique()
    # `nunique` ignores NaN, so both counts can be 0; avoid ZeroDivisionError.
    user_cnt_total = in_user_cnt + out_user_cnt
    in_user_ratio = in_user_cnt / user_cnt_total if user_cnt_total else np.nan

    date_cnt = data_sample.jyrq.nunique()  # number of distinct transaction dates

    d[user_id] = [all_cnt, in_cnt, out_cnt, in_ratio, in_amt, out_amt, in_amt_ratio, in_amt_variation, out_amt_variation, in_user_cnt, out_user_cnt, in_user_ratio, date_cnt, max_in_cnt, min_in_cnt, max_out_cnt, min_out_cnt, max_df_cnt, min_df_cnt, median_df_cnt, in_ks, out_ks] + in_jyqd + out_jyqd + in_zydh + out_zydh + in_jyrq + out_jyrq + in_timefeature + out_timefeature + [in_max_ye, in_min_ye, in_mean_ye, in_variation_ye, out_max_ye, out_min_ye, out_mean_ye, out_variation_ye]

# One row per account. The *_name lists are those left over from the last loop
# iteration; they depend only on the global `data` / the column list, so they
# are the same for every account.
data_df = pd.DataFrame.from_dict(d).T.reset_index()
data_df.columns = ['zhdh', 'all_cnt', 'in_cnt', 'out_cnt', 'in_ratio', 'in_amt', 'out_amt', 'in_amt_ratio', "in_amt_variation", "out_amt_variation", 'in_user_cnt', 'out_user_cnt', 'in_user_ratio', 'date_cnt', "max_in_cnt", "min_in_cnt", "max_out_cnt", "min_out_cnt", "max_df_cnt", "min_df_cnt", "median_df_cnt", "in_ks", "out_ks"] + list(map(lambda x: "in_jyqd_" + x, top_in_jyqd)) + list(map(lambda x: "out_jyqd_" + x, top_out_jyqd)) + in_zydh_name + out_zydh_name + in_jyrq_name + out_jyrq_name + list(map(lambda x: "in_" + x, in_timefeature_name)) + list(map(lambda x: "out_" + x, out_timefeature_name)) + ["in_max_ye", "in_min_ye", "in_mean_ye", "in_variation_ye", "out_max_ye", "out_min_ye", "out_mean_ye", "out_variation_ye"]

100%|████████████████████████████████████████████████████████████████████████████████████████████| 6000/6000 [09:36<00:00, 10.40it/s]


<h2>合并标签部分</h2>

In [34]:
# Rename the static-info columns in place (the frame is expected to have
# exactly these five columns, in this order).
data_static.columns = ['zhdh', 'khrq', 'khjgdh', 'xb', 'age']
# Attach `xb` and `age` to the engineered features, then attach the labels.
# Both are left joins on the account id, so every feature row is kept.
df_feats = data_df.merge(data_static[['zhdh', 'xb', 'age']], on='zhdh', how='left')
df_final = df_feats.merge(data_label, on='zhdh', how='left')

<h2>准备数据集</h2>

In [35]:
# Rows with a label form the training set; rows without one are the test set.
has_label = df_final["black_flag"].notna()
df_train = df_final.loc[has_label].reset_index(drop=True)
df_test = df_final.loc[~has_label].reset_index(drop=True)

# Model inputs: every column except the first (`zhdh`) and the last (`black_flag`).
feats = list(df_train.columns[1:-1])
X_train, X_test = df_train[feats], df_test[feats]
y_train = df_train['black_flag']

<h2>使用SMOTE增强</h2>

In [36]:
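# SMOTE oversampling is currently disabled (commented out). Before enabling it:
#   - `SMOTE` is not imported anywhere above (`from imblearn.over_sampling import SMOTE`);
#   - recent imbalanced-learn releases name the method `fit_resample`, not `fit_sample`.
# smo = SMOTE(random_state=42)
# X_train, y_train = smo.fit_sample(X_train, y_train)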

<h2>开始训练</h2>

In [37]:
# Train 5 stratified-fold LightGBM models; keep the boosters, the out-of-fold
# predictions for the training rows and the fold-averaged test predictions.
gbms, oof_preds, test_preds = train_lgb_kfold(X_train, y_train, X_test, n_fold=5)

2023-02-22 15:18:04,187 : INFO : ############ fold 0 ###########
2023-02-22 15:18:04,322 : INFO : ############ fold 1 ###########


[LightGBM] [Info] Number of positive: 240, number of negative: 720
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 13466
[LightGBM] [Info] Number of data points in the train set: 960, number of used features: 172
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.250000 -> initscore=-1.098612
[LightGBM] [Info] Start training from score -1.098612
Training until validation scores don't improve for 20 rounds
[50]	training's auc: 0.994867	valid_1's auc: 0.977315
Early stopping, best iteration is:
[50]	training's auc: 0.994867	valid_1's auc: 0.977315
[LightGBM] [Info] Number of positive: 240, number of negative: 720
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 13520
[LightGBM] [Info] Number of data points in the train set: 960, number of used features: 174
[LightGBM] [Info] [binary:Boos

2023-02-22 15:18:04,493 : INFO : ############ fold 2 ###########


[50]	training's auc: 0.994722	valid_1's auc: 0.959722
[100]	training's auc: 0.999988	valid_1's auc: 0.960093
Early stopping, best iteration is:
[85]	training's auc: 0.999682	valid_1's auc: 0.963333
[LightGBM] [Info] Number of positive: 240, number of negative: 720
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 13557
[LightGBM] [Info] Number of data points in the train set: 960, number of used features: 172
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.250000 -> initscore=-1.098612
[LightGBM] [Info] Start training from score -1.098612
Training until validation scores don't improve for 20 rounds


2023-02-22 15:18:04,649 : INFO : ############ fold 3 ###########


[50]	training's auc: 0.994728	valid_1's auc: 0.92463
Early stopping, best iteration is:
[61]	training's auc: 0.996962	valid_1's auc: 0.928333
[LightGBM] [Info] Number of positive: 240, number of negative: 720
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13476
[LightGBM] [Info] Number of data points in the train set: 960, number of used features: 173
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.250000 -> initscore=-1.098612
[LightGBM] [Info] Start training from score -1.098612
Training until validation scores don't improve for 20 rounds
[50]	training's auc: 0.995064	valid_1's auc: 0.945185


2023-02-22 15:18:04,807 : INFO : ############ fold 4 ###########


Early stopping, best iteration is:
[68]	training's auc: 0.998461	valid_1's auc: 0.948148
[LightGBM] [Info] Number of positive: 240, number of negative: 720
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 13375
[LightGBM] [Info] Number of data points in the train set: 960, number of used features: 173
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.250000 -> initscore=-1.098612
[LightGBM] [Info] Start training from score -1.098612
Training until validation scores don't improve for 20 rounds
[50]	training's auc: 0.994253	valid_1's auc: 0.953889


[100]	training's auc: 0.999988	valid_1's auc: 0.964167
[150]	training's auc: 1	valid_1's auc: 0.968056
Early stopping, best iteration is:
[137]	training's auc: 1	valid_1's auc: 0.970185


<h2>搜索最优阈值</h2>

In [38]:
def gen_thres_new(df_train, oof_preds):
    """
    Pick the decision threshold that maximises macro F1 on out-of-fold predictions.

    The search is centred on the prediction quantile that matches the positive
    rate of `black_flag` and scans +/- 0.2 around it in steps of 0.01. The best
    threshold and its F1 are printed and the threshold is returned.

    Side effect: writes `oof_preds` into `df_train['oof_preds']` in place.
    """
    df_train['oof_preds'] = oof_preds
    positive_rate = df_train['black_flag'].mean()
    centre = df_train['oof_preds'].quantile(1 - positive_rate)

    candidates = np.arange(centre - 0.2, centre + 0.2, 0.01)
    macro_f1 = np.array([
        f1_score(df_train['black_flag'], np.where(oof_preds > cut, 1, 0), average='macro')
        for cut in candidates
    ])

    # argmax returns the first (lowest) threshold among ties.
    best_id = macro_f1.argmax()
    best_thresh = candidates[best_id]

    print("阈值: {}\n训练集的f1: {}".format(best_thresh, macro_f1[best_id]))
    return best_thresh

# Choose the classification threshold from the out-of-fold predictions
# (this also adds an `oof_preds` column to `df_train`).
best_thresh = gen_thres_new(df_train, oof_preds)

阈值: 0.4923403360935121
训练集的f1: 0.8938574938574939


<h2>生成提交结果</h2>

In [39]:
# Binarise the averaged test predictions with the tuned threshold.
df_test['black_flag'] = (test_preds > best_thresh).astype(int)
# Write account id and predicted label to a timestamped CSV.
# NOTE(review): `to_csv` also writes the row index as a first column here;
# pass `index=False` if the submission format does not expect it -- confirm.
submit_stamp = datetime.now().strftime("%Y-%m-%dT%H-%M")
df_test[['zhdh', 'black_flag']].to_csv("../data/submit_%s.csv" % submit_stamp)