In [1]:
import os
import time
import datetime
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import warnings
from sklearn.metrics import roc_auc_score
from bayes_opt import BayesianOptimization
warnings.filterwarnings('ignore')
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"

In [2]:
def tpr_weight_funtion(y_true,y_predict):
    """Weighted true-positive rate at three small false-positive rates.

    Samples are ranked by descending predicted score; cumulative positive and
    negative fractions are computed along that ranking. For each target
    false-positive rate (0.001, 0.005, 0.01) the first ranked position whose
    cumulative negative fraction is closest to the target is selected, and the
    cumulative positive fraction there is read off.

    Parameters
    ----------
    y_true : iterable of 0/1 labels (both classes must be present, otherwise
        the class-count lookup raises KeyError).
    y_predict : iterable of scores, same length as ``y_true``.

    Returns
    -------
    float : 0.4 * TPR@0.001 + 0.3 * TPR@0.005 + 0.3 * TPR@0.01
    """
    ranked = pd.DataFrame({'prob': list(y_predict), 'y': list(y_true)})
    ranked = ranked.sort_values('prob', ascending=False)

    labels = ranked['y']
    class_counts = labels.value_counts()
    n_pos = class_counts[1]
    n_neg = class_counts[0]

    # Running counts of positives / negatives among the top-k ranked samples.
    tp_running = labels.cumsum()
    fp_running = np.arange(len(labels)) - tp_running + 1

    tpr_curve = tp_running / n_pos
    fpr_curve = fp_running / n_neg

    def tpr_at(target_fpr):
        # idxmin returns the index *label* of the closest point; the lookup
        # below is label-based as well, so the two stay consistent.
        return tpr_curve[abs(fpr_curve - target_fpr).idxmin()]

    return 0.4 * tpr_at(0.001) + 0.3 * tpr_at(0.005) + 0.3 * tpr_at(0.01)

In [3]:
# Raw CSV column code -> Chinese column name used throughout the notebook.
# The trailing comments are English glosses of the Chinese names.
col_dict = {
    'id': '主键',                      # primary key
    'XINGBIE': '性别',                 # gender
    'CSNY': '出生年月',                # birth year/month
    'HYZK': '婚姻状况',                # marital status
    'ZHIYE': '职业',                   # occupation
    'ZHICHEN': '职称',                 # professional title
    'ZHIWU': '职务',                   # job position
    'XUELI': '学历',                   # education
    'DWJJLX': '单位经济类型',          # employer economic type
    'DWSSHY': '单位所属行业',          # employer industry
    'GRJCJS': '个人缴存基数',          # personal contribution base
    'GRZHZT': '个人账户状态',          # personal account status
    'GRZHYE': '个人账户余额',          # personal account balance
    'GRZHSNJZYE': '个人账户上年结转余额',  # balance carried over from last year
    'GRZHDNGJYE': '个人账户当年归集余额',  # balance collected this year
    'GRYJCE': '个人月缴存额',          # personal monthly contribution
    'DWYJCE': '单位月缴存额',          # employer monthly contribution
    'DKFFE': '贷款发放额',             # loan amount issued
    'DKYE': '贷款余额',                # outstanding loan balance
    'DKLL': '贷款利率',                # loan interest rate
    'label': '是否逾期',               # overdue or not (the target)
}

# Columns listed for exclusion (only referenced from commented-out code here).
drop_cols = [
    '主键', '婚姻状况', '职业', '职称',
    '职务', '学历', '单位月缴存额', '是否逾期',
]

# Raw categorical feature columns.
raw_cate_cols = [
    '单位经济类型',
    '单位所属行业',
    '个人账户状态',
    '性别',
]

# Raw numeric feature columns.
raw_num_cols = [
    '个人缴存基数',
    '个人账户余额',
    '个人账户上年结转余额',
    '个人账户当年归集余额',
    '个人月缴存额',
    '贷款发放额',
    '贷款余额',
    '贷款利率',
]

# Directory holding train.csv / test.csv / submit.csv.
root = '../data'

# Reference instant used for the age computation, as a POSIX timestamp
# (the date string is interpreted in the machine's local timezone by mktime).
dt = "2021-1-01 00:00:00"
timestamp_assign = time.mktime(time.strptime(dt, "%Y-%m-%d %H:%M:%S"))


In [4]:
# Read the three input tables from `root`: the training rows, the test rows
# and the frame that later receives the predicted 'label' column.
train, test, submit = (
    pd.read_csv(f'{root}/{table}.csv') for table in ('train', 'test', 'submit')
)

In [5]:
# Stack train and test into one frame with a fresh 0..n-1 index so the feature
# engineering runs over both, then switch to the Chinese column names.
# Any column code missing from col_dict is mapped to NaN by Index.map.
df = pd.concat([train, test], axis=0, ignore_index=True)
df.columns = df.columns.map(col_dict)

In [6]:
# Rate conversion table (left: value found in the data, right: normalised value)
# First home
#   over 5 years : 3.25  -> 2.708
#   1-5 years    : 2.75  -> 2.292
# Second home
#   over 5 years : 3.575 -> 2.979
#   1-5 years    : 3.025 -> 2.521
# NOTE(review): each mapped value equals the key * 10 / 12, which looks like an
# annual percent rate re-expressed as a monthly per-mille rate - confirm.
rate_dict = {3.025: 2.521, 3.575: 2.979, 3.25: 2.708, 2.75: 2.292}

def rate_func(x):
    """Normalise a loan rate: the four rates listed in ``rate_dict`` are
    replaced by their mapped value, every other value is returned unchanged."""
    return rate_dict.get(x, x)

def loan_years(x):
    """Loan-term category derived from the normalised rate.

    Returns 0 for 2.708 and 2.979 (the two rates the rate-table comments list
    as "over 5 years") and 1 for every other value.
    """
    return int(x != 2.708 and x != 2.979)

def num_house(x):
    """Home-count category derived from the normalised rate.

    Returns 0 for 2.521 and 2.979 (the two rates the rate-table comments list
    under "second home") and 1 for every other value.
    """
    return 0 if x in (2.521, 2.979) else 1

def monthly_house_payments(p, i):
    """Level instalment for principal ``p`` at normalised rate ``i``.

    Uses the annuity formula p * r * (1 + r)**n / ((1 + r)**n - 1) with
    r = i / 100 and n = 360 periods when ``i`` is 2.708 or 2.979, otherwise
    n = 60 periods.

    NOTE(review): the values produced by the rate table equal annual rate
    * 10 / 12, i.e. they look like per-mille monthly rates; dividing by 100
    would then overstate the periodic rate tenfold - confirm intended units.
    """
    n = 30 * 12 if i in (2.708, 2.979) else 5 * 12
    rate = i / 100
    growth = (1 + rate) ** n
    return p * rate * growth / (growth - 1)

def ability_pay(gz, gjj, yg):
    """Repayment-capacity coefficient: (gz + gjj) / yg.

    In this notebook it is called with the personal contribution base, the
    personal monthly contribution and the monthly instalment, in that order.
    """
    resources = gz + gjj
    return resources / yg

def convert(x):
    """Turn a POSIX timestamp into a year count relative to ``timestamp_assign``.

    Both timestamps are converted with ``datetime.fromtimestamp`` (local time);
    the whole-day difference is floor-divided by 365 and 1 is added.
    """
    reference_time = datetime.datetime.fromtimestamp(timestamp_assign)
    event_time = datetime.datetime.fromtimestamp(x)
    elapsed_days = (reference_time - event_time).days
    return elapsed_days // 365 + 1

In [7]:
def deal_table(df):
    """Add the hand-crafted loan / account features to ``df``.

    The columns are written directly onto the frame that is passed in
    (``get_bin`` also returns the same frame object), so the caller's frame is
    modified.

    Parameters
    ----------
    df : DataFrame using the Chinese column names from ``col_dict``.

    Returns
    -------
    (df, genFeats2, genFeats3) : the frame plus the threshold-indicator column
    names generated for '贷款余额' and '贷款发放额'. The indicator names for
    '出生年月' (``genFeats1``) are computed but not returned.
    """
    # Birth timestamp -> year count relative to the reference date; rate -> normalised rate.
    df['出生年月'] = df['出生年月'].apply(convert)
    df['贷款利率'] = df['贷款利率'].apply(rate_func)
    # Threshold indicators (value > bin edge) for age, loan balance and loan amount.
    df, genFeats1 = get_bin(df, col = '出生年月', bins=[18,25,30,35,40,45])
    df, genFeats2 = get_bin(df, col = '贷款余额', bins=[100000, 120000, 140000, 180000, 220000, 260000, 300000])
    df, genFeats3 = get_bin(df, col = '贷款发放额', bins=[100000, 120000, 140000, 180000, 220000, 260000, 300000])
    # Sum of issued amount and outstanding balance, and products with the rate.
    df['贷款发放额_贷款余额'] = df['贷款发放额'] + df['贷款余额']
    df['贷款发放额_贷款余额_multi_贷款利率'] = (df['贷款发放额'] + df['贷款余额']) * df['贷款利率']
    df['贷款发放额_multi_贷款利率'] = df['贷款发放额'] * df['贷款利率']
    df['贷款余额_multi_贷款利率'] = df['贷款余额'] * df['贷款利率']

    # Shares of each component in the totals above.
    df['贷款发放额_multi_贷款利率_ratio'] = df['贷款发放额'] * df['贷款利率'] / df['贷款发放额_贷款余额_multi_贷款利率']
    df['贷款余额_multi_贷款利率_ratio'] = df['贷款余额'] * df['贷款利率'] / df['贷款发放额_贷款余额_multi_贷款利率']
    df['贷款余额_贷款发放额_ratio'] = df['贷款余额'] / df['贷款发放额_贷款余额']
    df['贷款发放额_贷款余额_ratio'] = df['贷款发放额'] / df['贷款发放额_贷款余额']
    # Account balance minus this-year / last-year components.
    df['个人账户余额_diff_个人账户当年归集余额'] = df['个人账户余额'] - df['个人账户当年归集余额']
    df['个人账户余额_diff_个人账户上年结转余额'] = df['个人账户余额'] - df['个人账户上年结转余额']
    # Instalment computed row by row from the issued amount ('月供') and from the
    # outstanding balance ('月供2').
    df['月供'] = df.apply(lambda row: monthly_house_payments(row['贷款发放额'], row['贷款利率']), axis=1)
    df['月供2'] = df.apply(lambda row: monthly_house_payments(row['贷款余额'], row['贷款利率']), axis=1)
    # (contribution base + monthly contribution) / instalment.
    df['还贷能力系数'] = df.apply(lambda row: ability_pay(row['个人缴存基数'], row['个人月缴存额'], row['月供']), axis=1)
    df['月供/个人缴存基数'] = df['月供']/df['个人缴存基数']
    df['月供/个人月缴存额'] = df['月供']/df['个人月缴存额']
    # NOTE(review): the next three columns are created here but are not listed in
    # hand_gen_feats, so the aggregate/pairwise feature steps do not use them.
    df['归集余额+结转余额'] = df['个人账户上年结转余额']+df['个人账户当年归集余额']
    # NOTE(review): the factor 24 is unexplained in this file - confirm its meaning.
    df['公积金+结转余额'] = df['个人月缴存额']*24+df['个人账户上年结转余额']
    df['公积金-归集余额'] = df['个人月缴存额']*24-df['个人账户当年归集余额']
    df['归集余额+结转余额-个人账户余额'] = df['归集余额+结转余额']-df['个人账户余额']
    df['个人月缴存额/贷款余额'] = df['个人月缴存额']/df['贷款余额']
    df['个人账户余额/贷款余额'] = df['个人账户余额']/df['贷款余额']
    df['个人月缴存额/贷款发放额'] = df['个人月缴存额']/df['贷款发放额']
    # These two repeat '贷款发放额_multi_贷款利率' and '贷款余额_multi_贷款利率' computed above.
    df['贷款利率*贷款发放额'] = df['贷款利率']*df['贷款发放额']
    df['贷款利率*贷款余额'] = df['贷款利率']*df['贷款余额']
    # NOTE(review): the next four column names contain '/' but the values are
    # products ('*') - confirm which was intended before changing either.
    df['个人缴存基数/个人账户余额'] = df['个人缴存基数']*df['个人账户余额']
    df['个人缴存基数/个人账户上年结转余额'] = df['个人缴存基数']*df['个人账户上年结转余额']
    df['个人缴存基数/贷款发放额'] = df['个人缴存基数']*df['贷款发放额']
    df['个人缴存基数/贷款余额'] = df['个人缴存基数']*df['贷款余额']
    df['贷款利率/个人缴存基数'] = df['贷款利率']/df['个人缴存基数']
    df['公积金比例'] = df['个人月缴存额']/df['个人缴存基数']
    # Categories derived from the normalised rate.
    df['贷款年限类别'] = df['贷款利率'].apply(loan_years)
    df['第N房'] = df['贷款利率'].apply(num_house)
    # Repaid amount = issued amount - outstanding balance.
    df['已还贷款'] = df['贷款发放额']-df['贷款余额']
    # Divides by the contribution base ('个人缴存基数'), although the name says '公积金'.
    df['贷款余额/公积金'] = df['贷款余额']/df['个人缴存基数']
    df['已还贷款比例'] = df['已还贷款']/df['贷款发放额']
    # Repaid amount divided by the instalment, i.e. a count of instalments
    # (the column name says 年限, "years").
    df['已还贷款年限'] = df['已还贷款']/df['月供']
    # NOTE(review): the name contains '-' but the value is a ratio ('/').
    df['贷款余额-个人账户余额'] = df['贷款余额']/df['个人账户余额']
    
    return df, genFeats2, genFeats3

# Names of the hand-crafted columns produced by deal_table that are fed into
# the group-aggregate and pairwise feature generators (order is significant:
# it determines the order of the generated columns).
hand_gen_feats = [
    # loan amount / balance / rate combinations
    '贷款发放额_贷款余额',
    '贷款发放额_贷款余额_multi_贷款利率',
    '贷款发放额_multi_贷款利率',
    '贷款余额_multi_贷款利率',
    '贷款发放额_multi_贷款利率_ratio',
    '贷款余额_multi_贷款利率_ratio',
    '贷款余额_贷款发放额_ratio',
    '贷款发放额_贷款余额_ratio',
    # account balance differences
    '个人账户余额_diff_个人账户当年归集余额',
    '个人账户余额_diff_个人账户上年结转余额',
    # instalment based features
    '月供',
    '月供2',
    '还贷能力系数',
    '月供/个人缴存基数',
    '月供/个人月缴存额',
    # contribution / balance combinations
    '归集余额+结转余额-个人账户余额',
    '个人月缴存额/贷款余额',
    '个人账户余额/贷款余额',
    '个人月缴存额/贷款发放额',
    '贷款利率*贷款发放额',
    '贷款利率*贷款余额',
    '个人缴存基数/个人账户余额',
    '个人缴存基数/个人账户上年结转余额',
    '个人缴存基数/贷款发放额',
    '个人缴存基数/贷款余额',
    '贷款利率/个人缴存基数',
    '公积金比例',
    # rate-derived categories and repayment progress
    '贷款年限类别',
    '第N房',
    '已还贷款',
    '贷款余额/公积金',
    '已还贷款比例',
    '已还贷款年限',
    '贷款余额-个人账户余额',
]

In [8]:
def cat_feat_engin(df, raw_cate_cols):
    """Encode the categorical columns and derive count / co-occurrence features.

    For every column in ``raw_cate_cols``:
      * missing values are replaced by the string '-1',
      * the column is label-encoded in place,
      * ``<col>_count`` holds the frequency of each encoded value,
      * one-hot columns ``<col>_<code>`` are appended.

    For every unordered pair (f1, f2) of categorical columns:
      * ``f1_f2_count`` is the number of rows sharing the (f1, f2) combination
        (counted on the non-null '主键' column),
      * ``f1_in_f2_prop`` / ``f2_in_f1_prop`` divide that count by the frequency
        of the row's f2 / f1 value.

    Parameters
    ----------
    df : DataFrame containing ``raw_cate_cols`` and the '主键' key column.
    raw_cate_cols : list of categorical column names.

    Returns
    -------
    DataFrame with the extra columns (a new object, because of the concat).
    """
    for f in tqdm(raw_cate_cols):
        # Assign the filled column back. The chained `df[f].fillna(..., inplace=True)`
        # operates on a temporary under pandas Copy-on-Write and would leave
        # the NaNs in place (it is already deprecated in pandas 2.x).
        df[f] = df[f].fillna('-1')
        le = LabelEncoder()
        le.fit(df[f])
        df[f] = le.transform(df[f])
        df[f + '_count'] = df[f].map(df[f].value_counts())
        df = pd.concat([df, pd.get_dummies(df[f], prefix=f"{f}")], axis=1)

    # Every unordered pair of categorical columns, in list order.
    cate_cols_combine = [
        [raw_cate_cols[i], raw_cate_cols[j]]
        for i in range(len(raw_cate_cols))
        for j in range(i + 1, len(raw_cate_cols))
    ]

    for f1, f2 in tqdm(cate_cols_combine):
        pair_count = '{}_{}_count'.format(f1, f2)
        df[pair_count] = df.groupby([f1, f2])['主键'].transform('count')
        df['{}_in_{}_prop'.format(f1, f2)] = df[pair_count] / df[f2 + '_count']
        df['{}_in_{}_prop'.format(f2, f1)] = df[pair_count] / df[f1 + '_count']

    return df

def cat_num_feat_engin(df, raw_cate_cols, raw_num_cols, hand_gen_feats, genFeats2, genFeats3):
    """Add per-category aggregates of the numeric and indicator columns.

    For every categorical column f1:
      * each column f2 in ``raw_num_cols + hand_gen_feats`` gets
        ``f1_f2_{sum,mean,std,max,min}`` (group statistic broadcast to rows),
      * each indicator column f3 in ``genFeats2 + genFeats3`` gets
        ``f1_f3_{sum,mean}``.

    Parameters
    ----------
    df : DataFrame holding all referenced columns; new columns are written onto it.
    raw_cate_cols : categorical columns used as group keys.
    raw_num_cols, hand_gen_feats : numeric columns to aggregate.
    genFeats2, genFeats3 : threshold-indicator columns to aggregate.

    Returns
    -------
    The same DataFrame with the aggregate columns added.
    """
    # 'std' used to be listed twice, which only recomputed the same column.
    numeric_stats = ['sum', 'mean', 'std', 'max', 'min']
    indicator_stats = ['sum', 'mean']

    for f1 in tqdm(raw_cate_cols):
        g = df.groupby(f1)
        for f2 in raw_num_cols + hand_gen_feats:
            for stat in numeric_stats:
                df['{}_{}_{}'.format(f1, f2, stat)] = g[f2].transform(stat)
        for f3 in genFeats2 + genFeats3:
            for stat in indicator_stats:
                # Aggregate the indicator column itself. The previous code
                # reused the stale `f2` here, so no indicator aggregate was
                # ever produced.
                df['{}_{}_{}'.format(f1, f3, stat)] = g[f3].transform(stat)

    return df

def num_feat_engin(df, raw_num_cols, hand_gen_feats):
    """Add numeric-by-numeric aggregates and pairwise arithmetic features.

    With ``cols = raw_num_cols + hand_gen_feats``:
      * for every ordered pair f1 != f2, rows are grouped by the exact value of
        f1 and ``f1_f2_{sum,mean,std,max,min}`` is broadcast back to the rows;
      * for every unordered pair (a, b) in list order, ``numsOf_a_b_add``,
        ``_diff``, ``_multi`` and ``_div`` are added; the division adds 1e-10
        to the denominator to avoid dividing by exactly zero.

    Parameters
    ----------
    df : DataFrame holding all referenced columns; new columns are written onto it.
    raw_num_cols, hand_gen_feats : numeric column names.

    Returns
    -------
    The same DataFrame with the extra columns added.
    """
    num_cols_gen_feats = raw_num_cols + hand_gen_feats
    # 'std' used to appear twice, so every std transform was computed twice and
    # written to the same column; listing it once yields identical output.
    stats = ['sum', 'mean', 'std', 'max', 'min']

    for f1 in tqdm(num_cols_gen_feats):
        g = df.groupby(f1)
        for f2 in num_cols_gen_feats:
            if f1 == f2:
                continue
            for stat in stats:
                df['{}_{}_{}'.format(f1, f2, stat)] = g[f2].transform(stat)

    for i in tqdm(range(len(num_cols_gen_feats))):
        left = num_cols_gen_feats[i]
        for j in range(i + 1, len(num_cols_gen_feats)):
            right = num_cols_gen_feats[j]
            df[f'numsOf_{left}_{right}_add'] = df[left] + df[right]
            df[f'numsOf_{left}_{right}_diff'] = df[left] - df[right]
            df[f'numsOf_{left}_{right}_multi'] = df[left] * df[right]
            df[f'numsOf_{left}_{right}_div'] = df[left] / (df[right] + 0.0000000001)

    return df

In [9]:
# Target encoding
def kfold_mean(df_train, df_test, target, target_mean_list):
    """Out-of-fold target (mean) encoding for categorical columns.

    Steps, for every column ``col`` in ``target_mean_list``:
      1. With 10 stratified folds, each validation row receives the mean of
         ``target`` for its category computed on the other folds
         (``<col>_target_enc``). Categories absent from the training part of a
         fold map to NaN.
      2. The training encoding is then replaced by the per-category mean of
         those out-of-fold values.
      3. Test rows receive the per-category mean of the training encoding;
         categories unseen in training map to NaN.

    Parameters
    ----------
    df_train : labelled rows (``target`` not null).
    df_test : unlabelled rows.
    target : name of the label column.
    target_mean_list : categorical columns to encode.

    Returns
    -------
    DataFrame: train rows followed by test rows, re-indexed 0..n-1.
    """
    # Work on private copies: the caller typically passes boolean-mask slices,
    # and writing new columns into those triggers SettingWithCopy behaviour.
    df_train = df_train.copy()
    df_test = df_test.copy()

    # No random_state: it only has meaning together with shuffle=True, and
    # scikit-learn >= 0.24 raises ValueError when it is set with shuffle=False.
    # Earlier versions ignored it, so the fold assignment is unchanged.
    folds = StratifiedKFold(n_splits=10)

    for trn_idx, val_idx in tqdm(folds.split(df_train, y=df_train[target])):
        tr_x = df_train.iloc[trn_idx, :]
        vl_x = df_train.iloc[val_idx, :]

        for col in target_mean_list:
            fold_means = tr_x.groupby(col)[target].mean()
            df_train.loc[vl_x.index, f'{col}_target_enc'] = vl_x[col].map(fold_means)

    for col in target_mean_list:
        enc = f'{col}_target_enc'
        df_train[enc] = df_train[col].map(df_train.groupby(col)[enc].mean())
        df_test[enc] = df_test[col].map(df_train.groupby(col)[enc].mean())

    return pd.concat([df_train, df_test], ignore_index=True)


In [10]:
def get_bin(df, col, bins):
    """Add one 0/1 indicator column per threshold in ``bins``.

    Column ``<col>_genFeat<i>`` is 1 where ``df[col] > bins[i]`` and 0
    elsewhere. The columns are written onto ``df`` itself.

    Returns
    -------
    (df, names) : the same frame and the list of generated column names.
    """
    generated = []
    for position, threshold in enumerate(bins):
        name = col + "_genFeat" + str(position)
        df[name] = df[col].gt(threshold).astype(int)
        generated.append(name)

    return df, generated

In [None]:
# Full feature pipeline. Rows with a label are the training rows; rows whose
# label is missing are the test rows.
# 1) target-encode the raw categorical columns
df = kfold_mean(
    df[df['是否逾期'].notna()],
    df[df['是否逾期'].isna()],
    '是否逾期',
    raw_cate_cols,
)
# 2) hand-crafted loan/account features
df, genFeats2, genFeats3 = deal_table(df)
# 3) categorical encodings and co-occurrence counts
df = cat_feat_engin(df, raw_cate_cols)
# 4) per-category aggregates of numeric columns
df = cat_num_feat_engin(df, raw_cate_cols, raw_num_cols, hand_gen_feats, genFeats2, genFeats3)
# 5) numeric-by-numeric aggregates and pairwise arithmetic
df = num_feat_engin(df, raw_num_cols, hand_gen_feats)

# Split back into train / test by label availability.
train_data = df[df['是否逾期'].notna()].reset_index(drop=True)
test_data = df[df['是否逾期'].isna()].reset_index(drop=True)

10it [00:00, 52.25it/s]
100%|██████████| 4/4 [00:00<00:00, 18.05it/s]
100%|██████████| 6/6 [00:00<00:00, 23.42it/s]
100%|██████████| 4/4 [00:05<00:00,  1.32s/it]
100%|██████████| 42/42 [06:49<00:00,  9.75s/it]
 10%|▉         | 4/42 [00:52<07:58, 12.60s/it]

In [None]:
# Columns with at most one distinct non-null value in the training rows are
# dropped; the key and the label are excluded from the feature list as well.
drop_feats = [f for f in train_data.columns if train_data[f].nunique() <= 1]
cols = [
    col for col in train_data.columns
    if col != '主键' and col != '是否逾期' and col not in drop_feats
]
len(drop_feats), drop_feats

In [None]:
# raw_feat_cols = [col for col in list(train_data.columns) if col not in drop_cols]
# Pull the label out first, then restrict both frames to the selected features.
train_label = train_data['是否逾期']
train_data, test_data = train_data[cols], test_data[cols]


In [None]:
import lightgbm as lgb
from lightgbm import LGBMClassifier
# Keyword arguments forwarded to LGBMClassifier(**params).
# NOTE(review): per the LightGBM parameter docs 'subsample' and
# 'bagging_fraction' are aliases of one parameter, and 'eval_metric' is
# normally an argument of fit() rather than a constructor parameter - confirm
# these entries are intended.
params = dict(
    # boosting_type='gbdt',
    # objective='binary',
    metric=['auc'],
    eval_metric=['auc'],
    num_leaves=31,
    # max_bin=50,
    # max_depth=6,
    learning_rate=0.01,
    subsample=0.8,
    colsample_bytree=0.8,   # fraction of features randomly selected in each iteration
    bagging_fraction=0.8,   # fraction of the data used in each iteration
    min_child_samples=25,
    n_jobs=-1,
    silent=True,            # suppress informational output
    seed=1208,
    n_estimators=45000,
    scale_pos_weight=0.1,
    # lambda_l1=0.3,
    # lambda_l2=0.2,
    # min_split_gain=0.2,
    verbose=-1,
)  # parameter set used for every fold


In [None]:
# Stratified K-fold training of LightGBM with out-of-fold (OOF) evaluation.
n_folds = 5
oof_lgb = np.zeros(len(train_data))          # OOF probability for every training row
predictions_lgb = np.zeros(len(test_data))   # running SUM of the per-fold test probabilities (not yet averaged)
feat_importance_table = pd.DataFrame(columns=['feat', 'importance'])

# for seed in [617, 1208, 916]:
# No random_state: it only has meaning together with shuffle=True, and
# scikit-learn >= 0.24 raises ValueError when it is set with shuffle=False.
# Earlier versions ignored it, so the fold assignment is unchanged.
KF = StratifiedKFold(n_splits=n_folds)
for fold_, (trn_idx, val_idx) in enumerate(KF.split(train_data.values, train_label.values)):
#     trn_data = lgb.Dataset(train_data.iloc[trn_idx],label=train_label.iloc[trn_idx])
#     val_data = lgb.Dataset(train_data.iloc[val_idx],label=train_label.iloc[val_idx])
    clf = LGBMClassifier(**params)

    # NOTE(review): `verbose` / `early_stopping_rounds` as fit() keywords belong
    # to the LightGBM 3.x API; LightGBM 4 expects callbacks instead - confirm
    # the installed version before upgrading.
    hist = clf.fit(
        X = train_data.iloc[trn_idx],
        y = train_label.iloc[trn_idx],
#                     fobj=logloss,
        eval_set = [(train_data.iloc[val_idx], train_label.iloc[val_idx])],
        verbose=500,
        early_stopping_rounds=1000,
#         eval_metric = tpr_weight_funtion
#         categorical_feature=feature_list
    )
#         feat_importance_table['importance'+str(fold_)] = clf.feature_importances_
    val_pred = clf.predict_proba(train_data.iloc[val_idx], num_iteration=clf.best_iteration_)[:,1]
    oof_lgb[val_idx] = val_pred
    predictions_lgb[:] += clf.predict_proba(test_data, num_iteration=clf.best_iteration_)[:,1]
    print('--------------------------------'*2)
    print("TPR: {}".format(tpr_weight_funtion(train_label.iloc[val_idx], val_pred)))
    print('--------------------------------'*2)
# feat_importance_table['importance'] = feat_importance_table.mean(axis=1)
# feat_importance_table['feat'] = clf.

# Overall OOF metrics are reported once, after every fold has filled its slice
# of oof_lgb. Inside the loop they were computed on a vector still holding
# zeros for the folds not yet processed.
print("AUC score: {}".format(roc_auc_score(train_label, oof_lgb)))
print("TPR weight: {}".format(tpr_weight_funtion(train_label,oof_lgb)))

In [None]:
# Number of feature columns fed to the model.
train_data.shape[1]

In [None]:
def tpr_weight_funtion(y_true,y_predict):
    """Weighted TPR at false-positive rates 0.001, 0.005 and 0.01.

    NOTE: this repeats the definition from the cell near the top of the
    notebook; re-running this cell simply rebinds the same name.

    Rows are ordered by descending score. At each target false-positive rate
    the first position whose cumulative negative fraction is nearest to the
    target is chosen and the cumulative positive fraction there is used.
    Both classes (0 and 1) must occur in ``y_true``.

    Returns 0.4 * TPR@0.001 + 0.3 * TPR@0.005 + 0.3 * TPR@0.01.
    """
    table = pd.DataFrame({'prob': list(y_predict), 'y': list(y_true)})
    table = table.sort_values(by='prob', ascending=False)

    truth = table['y']
    counts = truth.value_counts()
    positives_total = counts[1]
    negatives_total = counts[0]

    positives_seen = truth.cumsum()
    # (rank index + 1) minus positives seen = negatives seen so far.
    negatives_seen = np.arange(len(truth)) - positives_seen + 1

    positive_share = positives_seen / positives_total
    negative_share = negatives_seen / negatives_total

    # idxmin yields an index label; the lookups below are label-based too.
    at_0001 = positive_share[abs(negative_share - 0.001).idxmin()]
    at_0005 = positive_share[abs(negative_share - 0.005).idxmin()]
    at_0010 = positive_share[abs(negative_share - 0.01).idxmin()]
    return 0.4 * at_0001 + 0.3 * at_0005 + 0.3 * at_0010

In [None]:
# Score recorded for each LightGBM objective / metric setting that was tried
# (presumably the weighted TPR printed by the training cell - TODO confirm):
# rmse: 0.4793978962640551
# l2: 0.4793978962640551
# quantile: 0.47653246282190787
# huber: 0.47979688066739207
# fair: 0.4786361987667755
# poisson: 0.47279651795429817
# tweedie: 0.4684439608269858
# binary_error: 0.4763511062749365
# cross_entropy: 0.473449401523395
# cross_entropy_lambda: 0.4729778745012695
# kullback_leibler: 0.473449401523395
# binary_logloss: 0.473449401523395

In [None]:
0.47435618425825177
0.4803409503083061

In [None]:
# feat_importance_table = pd.DataFrame(columns=['feat', 'importance'])
# feat_importance_table['feat'] = gbm.feature_name()
# feat_importance_table['importance'] = gbm.feature_importance()
# feat_importance_table.sort_values('importance', ascending=False)

In [None]:
# test_pre = gbm.predict(test_data, num_iteration=gbm.best_iteration)
# predictions_lgb is the sum of the test probabilities of the n_folds fold
# models, so the average divides by n_folds. The former divisor 15 matched
# 3 seeds x 5 folds, but the seed loop is commented out.
submit['label'] = predictions_lgb / n_folds
submit.to_csv('../result//0114-02.csv', index=False)

In [None]:
# Preview the first five rows of the submission frame.
submit.head(5)