In [1]:
import os
import time
import datetime
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import warnings
from sklearn.metrics import roc_auc_score
from bayes_opt import BayesianOptimization

warnings.filterwarnings('ignore')
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
drop_cols = ['主键', '婚姻状况', '职业', '职称', '职务', '学历', '单位月缴存额', '是否逾期']
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
# drop_cols = ['主键', '是否逾期']

In [2]:
root = '../data'
col_dict = {
    'id': '主键',
    'XINGBIE': '性别',
    'CSNY': '出生年月',
    'HYZK': '婚姻状况',
    'ZHIYE': '职业',
    'ZHICHEN': '职称',
    'ZHIWU': '职务',
    'XUELI': '学历',
    'DWJJLX': '单位经济类型',
    'DWSSHY': '单位所属行业',
    'GRJCJS': '个人缴存基数',
    'GRZHZT': '个人账户状态',
    'GRZHYE': '个人账户余额',
    'GRZHSNJZYE': '个人账户上年结转余额',
    'GRZHDNGJYE': '个人账户当年归集余额', 
    'GRYJCE': '个人月缴存额',
    'DWYJCE': '单位月缴存额',
    'DKFFE': '贷款发放额',
    'DKYE': '贷款余额',
    'DKLL': '贷款利率',
    'label': '是否逾期'
}

In [3]:
# 目标编码
def kfold_mean(df_train, df_test, target, target_mean_list):
    folds = StratifiedKFold(n_splits=10, random_state=42)

    mean_of_target = df_train[target].mean()

    for fold_, (trn_idx, val_idx) in tqdm(
            enumerate(folds.split(df_train, y=df_train[target]))):
        tr_x = df_train.iloc[trn_idx, :]
        vl_x = df_train.iloc[val_idx, :]

        for col in target_mean_list:
            df_train.loc[vl_x.index, f'{col}_target_enc'] = vl_x[col].map(
                tr_x.groupby(col)[target].mean())

    for col in target_mean_list:
        df_train[f'{col}_target_enc'] = df_train[col].map(
            df_train.groupby(col)[f'{col}_target_enc'].mean())
        
        df_test[f'{col}_target_enc'] = df_test[col].map(
            df_train.groupby(col)[f'{col}_target_enc'].mean())

    return pd.concat([df_train, df_test], ignore_index=True)


In [4]:
def tpr_weight_funtion(y_true,y_predict):
    d = pd.DataFrame()
    d['prob'] = list(y_predict)
    d['y'] = list(y_true)
    d = d.sort_values(['prob'], ascending=[0])
    y = d.y
    PosAll = pd.Series(y).value_counts()[1]
    NegAll = pd.Series(y).value_counts()[0]
    pCumsum = d['y'].cumsum()
    nCumsum = np.arange(len(y)) - pCumsum + 1
    pCumsumPer = pCumsum / PosAll
    nCumsumPer = nCumsum / NegAll
    TR1 = pCumsumPer[abs(nCumsumPer-0.001).idxmin()]
    TR2 = pCumsumPer[abs(nCumsumPer-0.005).idxmin()]
    TR3 = pCumsumPer[abs(nCumsumPer-0.01).idxmin()]
    return 0.4 * TR1 + 0.3 * TR2 + 0.3 * TR3

In [5]:
def get_age(df,col):
    df[col+"_genFeat1"]=(df[col] > 18).astype(int)
    df[col+"_genFeat2"]=(df[col] > 25).astype(int)
    df[col+"_genFeat3"]=(df[col] > 30).astype(int)
    df[col+"_genFeat4"]=(df[col] > 35).astype(int)
    df[col+"_genFeat5"]=(df[col] > 40).astype(int)
    df[col+"_genFeat6"]=(df[col] > 45).astype(int)
    return df, [col + f'_genFeat{i}' for i in range(1, 7)]

def get_daikuanYE(df,col):
    df[col + '_genFeat1'] = (df[col] > 100000).astype(int)
    df[col + '_genFeat2'] = (df[col] > 120000).astype(int)
    df[col + '_genFeat3'] = (df[col] > 140000).astype(int)
    df[col + '_genFeat4'] = (df[col] > 180000).astype(int)
    df[col + '_genFeat5'] = (df[col] > 220000).astype(int)
    df[col + '_genFeat6'] = (df[col] > 260000).astype(int)
    df[col + '_genFeat7'] = (df[col] > 300000).astype(int)
    return df, [col + f'_genFeat{i}' for i in range(1, 8)]



In [6]:
def logloss(y_pred, y):
    y_true = y.get_label()
    residual = (y_true - y_pred).astype("float")
#     grad = np.where(residual>0, -y_true/(np.exp(y_true*y_pred)+1), -y_true/(np.exp(y_true*y_pred)+1))#对预估里程低于实际里程的情况加大惩罚
#     hess = np.where(residual>0, np.exp(y_true*y_pred)/((np.exp(y_true*y_pred)+1)**2), np.exp(y_true*y_pred)/((np.exp(y_true*y_pred)+1)**2))#对预估里程低于实际里程的情况加大惩罚
    grad = np.where(residual<0, -2*(residual)/(y_true+1), -5*2*(residual)/(y_true+1))#对预估里程低于实际里程的情况加大惩罚
    hess = np.where(residual<0, 2/(y_true+1), 5*2/(y_true+1))#对预估里程低于实际里程的情况加大惩罚
    return grad, hess

In [7]:
def monthly_house_payments(p, i):
    if i == 2.708 or i == 2.979:
        n = 30*12
    else:
        n = 5*12
    i = i/100
    return p*i*(1+i)**n/((1+i)**n-1)

In [8]:
dt = "2021-1-01 00:00:00"
timestamp_assign = time.strptime(dt, "%Y-%m-%d %H:%M:%S")
timestamp_assign=time.mktime(timestamp_assign)
def convert(x):
    age = (datetime.datetime.fromtimestamp(timestamp_assign)-datetime.datetime.fromtimestamp(x)).days
    return age//365+1

In [9]:
train = pd.read_csv(root+'/train.csv')
test = pd.read_csv(root+'/test.csv')
submit = pd.read_csv(root+'/submit.csv')
train['CSNY'] = train['CSNY'].apply(convert)
test['CSNY'] = test['CSNY'].apply(convert)

In [10]:
train.columns=train.columns.map(col_dict)
test.columns=test.columns.map(col_dict)
for col in [f for f in train.select_dtypes('int64').columns if f not in ['是否逾期', '贷款发放额']]:
    print(col)
    train[col].fillna('-1', inplace=True)
    test[col].fillna('-1', inplace=True)
    le = LabelEncoder()
    le.fit(pd.concat([train[[col]], test[[col]]], axis=0, ignore_index=True))
    train[col] = le.transform(train[col])
    test[col] = le.transform(test[col])

性别
出生年月
婚姻状况
职业
职称
职务
学历
单位经济类型
单位所属行业
个人账户状态


In [11]:
#首套
#5年以上3.25： 2.708
#1-5年 2.75： 2.292
#二套
#5年以上 3.575： 2.979
#1-5年 3.025： 2.521
rate_dict = {3.025: 2.521, 3.575: 2.979, 3.25: 2.708, 2.75: 2.292}

def rate_func(x):
    if x == 3.025:
        return 2.521
    if x == 3.575:
        return 2.979
    if x == 3.25:
        return 2.708
    if x == 2.75:
        return 2.292
    return x

def loan_years(x):
    if x == 2.708 or x == 2.979:
        return 0
    return 1

def num_house(x):
    if x == 2.521 or x== 2.979:
        return 0
    return 1

In [12]:
def ability_pay(gz, gjj, yg):
    #还贷能力系数
    return (gz+gjj)/yg

In [13]:
def combine_feat(x, y):
    feat_dic = {}
    for i in range(len(x)):
        pair = (x[i], y[i])
        feat_dic[pair] = feat_dic.get(pair, 0) + 1

    return feat_dic

def HYJJLX(x, y, company_feat):
    #行业+经济类型
    if (x, y) in company_feat:
        return company_feat[(x, y)]
    return -1

company_feat = combine_feat(list(train['单位经济类型']), list(train['单位所属行业']))
company_feat = {k: v for k, v in company_feat.items() if v>50}

In [14]:
train['贷款利率'] = train['贷款利率'].apply(rate_func)
test['贷款利率'] = test['贷款利率'].apply(rate_func)
feature_list =  ['单位经济类型', '单位所属行业', '个人账户状态']
data = pd.concat([train, test], ignore_index=True)
# data = kfold_mean(data[~data['是否逾期'].isna()], data[data['是否逾期'].isna()],
#                   '是否逾期',
#                   feature_list)

# 频数统计

for col in feature_list:
    data[col + '_COUNT'] = data[col].map(data[col].value_counts())
    col_idx = data[col].value_counts()
    for idx in col_idx[col_idx < 5].index:
        data[col] = data[col].replace(idx, -1)  
        
# 偏离值特征
group_list = ['单位经济类型', '单位所属行业', '个人账户状态']
num_feature_list = ['个人月缴存额', '贷款发放额', '贷款余额', '个人缴存基数',
                    '个人账户上年结转余额', '个人账户当年归集余额']                   
for group in group_list:
    for feature in num_feature_list:
        tmp = data.groupby(group)[feature].agg([sum, min, max, np.mean]).reset_index()
        tmp = pd.merge(data, tmp, on=group, how='left')
        data['{}-mean_gb_{}'.format(feature, group)] = data[feature] - tmp['mean']
        data['{}-min_gb_{}'.format(feature, group)] = data[feature] - tmp['min']
        data['{}-max_gb_{}'.format(feature, group)] = data[feature] - tmp['max']
        data['{}/sum_gb_{}'.format(feature, group)] = data[feature] / tmp['sum']  
train, test = data[~data['是否逾期'].isna()], data[data['是否逾期'].isna()]


In [15]:

raw_feat_cols = [col for col in list(train.columns) if col not in drop_cols ]
train_data = train[raw_feat_cols]
test_data = test[raw_feat_cols]
train_label = train['是否逾期']

In [16]:
root = '../data'
col_dict = {
    'id': '主键',
    'XINGBIE': '性别',
    'CSNY': '出生年月',
    'HYZK': '婚姻状况',
    'ZHIYE': '职业',
    'ZHICHEN': '职称',
    'ZHIWU': '职务',
    'XUELI': '学历',
    'DWJJLX': '单位经济类型',
    'DWSSHY': '单位所属行业',
    'GRJCJS': '个人缴存基数',
    'GRZHZT': '个人账户状态',
    'GRZHYE': '个人账户余额',
    'GRZHSNJZYE': '个人账户上年结转余额',
    'GRZHDNGJYE': '个人账户当年归集余额', 
    'GRYJCE': '个人月缴存额',
    'DWYJCE': '单位月缴存额',
    'DKFFE': '贷款发放额',
    'DKYE': '贷款余额',
    'DKLL': '贷款利率',
    'label': '是否逾期'
}

In [None]:
def deal_table(df):
    df, genFeats1 = get_age(df, col = '出生年月')
    df, genFeats2 = get_daikuanYE(df, col = '贷款余额')
    df, genFeats3 = get_daikuanYE(df, col = '贷款发放额')
    df['贷款发放额_贷款余额'] = df['贷款发放额'] + df['贷款余额']
    df['贷款发放额_贷款余额_multi_贷款利率'] = (df['贷款发放额'] + df['贷款余额']) * df['贷款利率']
    df['贷款发放额_multi_贷款利率'] = df['贷款发放额'] * df['贷款利率']
    df['贷款余额_multi_贷款利率'] = df['贷款余额'] * df['贷款利率']

    df['贷款发放额_multi_贷款利率_ratio'] = df['贷款发放额'] * df['贷款利率'] / df['贷款发放额_贷款余额_multi_贷款利率']
    df['贷款余额_multi_贷款利率_ratio'] = df['贷款余额'] * df['贷款利率'] / df['贷款发放额_贷款余额_multi_贷款利率']
    df['贷款余额_贷款发放额_ratio'] = df['贷款余额'] / df['贷款发放额_贷款余额']
    df['贷款发放额_贷款余额_ratio'] = df['贷款发放额'] / df['贷款发放额_贷款余额']
    df['个人账户余额_diff_个人账户当年归集余额'] = df['个人账户余额'] - df['个人账户当年归集余额']
    df['个人账户余额_diff_个人账户上年结转余额'] = df['个人账户余额'] - df['个人账户上年结转余额']
    df['行业+经济类型'] = df.apply(lambda row: HYJJLX(row['单位经济类型'], row['单位经济类型'], company_feat), axis=1)
    df['月供'] = df.apply(lambda row: monthly_house_payments(row['贷款发放额'], row['贷款利率']), axis=1)
    df['月供2'] = df.apply(lambda row: monthly_house_payments(row['贷款余额'], row['贷款利率']), axis=1)
    df['还贷能力系数'] = df.apply(lambda row: ability_pay(row['个人缴存基数'], row['个人月缴存额'], row['月供']), axis=1)
    df['月供/个人缴存基数'] = df['月供']/df['个人缴存基数']
    df['月供/个人月缴存额'] = df['月供']/df['个人月缴存额']
    df['归集余额+结转余额'] = df['个人账户上年结转余额']+df['个人账户当年归集余额']
    df['公积金+结转余额'] = df['个人月缴存额']*24+df['个人账户上年结转余额']
    df['公积金-归集余额'] = df['个人月缴存额']*24-df['个人账户当年归集余额']
    df['归集余额+结转余额-个人账户余额'] = df['归集余额+结转余额']-df['个人账户余额']
    df['个人月缴存额/贷款余额'] = df['个人月缴存额']/df['贷款余额']
    df['个人账户余额/贷款余额'] = df['个人账户余额']/df['贷款余额']
    df['个人月缴存额/贷款发放额'] = df['个人月缴存额']/df['贷款发放额']
    df['贷款利率*贷款发放额'] = df['贷款利率']*df['贷款发放额']
    df['贷款利率*贷款余额'] = df['贷款利率']*df['贷款余额']
    df['个人缴存基数/个人账户余额'] = df['个人缴存基数']*df['个人账户余额']
    df['个人缴存基数/个人账户上年结转余额'] = df['个人缴存基数']*df['个人账户上年结转余额']
    df['个人缴存基数/贷款发放额'] = df['个人缴存基数']*df['贷款发放额']
    df['个人缴存基数/贷款余额'] = df['个人缴存基数']*df['贷款余额']
    df['贷款利率/个人缴存基数'] = df['贷款利率']/df['个人缴存基数']
    df['公积金比例'] = df['个人月缴存额']/df['个人缴存基数']
    df['贷款年限类别'] = df['贷款利率'].apply(loan_years)
    df['第N房'] = df['贷款利率'].apply(num_house)
    df['已还贷款'] = df['贷款发放额']-df['贷款余额']
    df['贷款余额/公积金'] = df['贷款余额']/df['个人缴存基数']
    df['已还贷款比例'] = df['已还贷款']/df['贷款发放额']
    df['已还贷款年限'] = df['已还贷款']/df['月供']
    df['贷款余额-个人账户余额'] = df['贷款余额']/df['个人账户余额']
    
    return df

train_data = deal_table(train_data)
test_data = deal_table(test_data)

In [19]:
import lightgbm as lgb
from lightgbm import LGBMClassifier
params = {
# 'boosting_type': 'gbdt',
#     'objective': 'binary',
    'metric': ['auc'],
    'eval_metric': ['auc'],
    'num_leaves': 31,
#     'max_bin': 50,
#         'max_depth': 6,
    "learning_rate": 0.01,
    "subsample": 0.8,
    "colsample_bytree": 0.8,  # 每次迭代中随机选择特征的比例
    "bagging_fraction": 0.8,  # 每次迭代时用的数据比例
    'min_child_samples': 25,
    'n_jobs': -1,
    'silent': True,  # 信息输出设置成1则没有信息输出
    'seed': 1208,
    'n_estimators':45000,
    'scale_pos_weight':0.1,
#         'lambda_l1':0.3,
#         'lambda_l2':0.2,
#     'min_split_gain':0.2,
    'verbose' : -1
    }  #设置出参数


In [20]:
n_folds = 5
oof_lgb = np.zeros(len(train_data))
predictions_lgb = np.zeros(len(test_data))
feat_importance_table = pd.DataFrame(columns=['feat', 'importance'])

for seed in [617, 1208, 916]:
    KF = StratifiedKFold(n_splits=n_folds, random_state=seed)
    for fold_, (trn_idx, val_idx) in enumerate(KF.split(train_data.values, train_label.values)):
    #     trn_data = lgb.Dataset(train_data.iloc[trn_idx],label=train_label.iloc[trn_idx])    
    #     val_data = lgb.Dataset(train_data.iloc[val_idx],label=train_label.iloc[val_idx])
        num_round = 45000
        clf = LGBMClassifier(**params)

        hist=clf.fit(

            X = train_data.iloc[trn_idx],
            y = train_label.iloc[trn_idx],
    #                     fobj=logloss,
            eval_set = [(train_data.iloc[val_idx],train_label.iloc[val_idx])],
            verbose=500,
            early_stopping_rounds=1000,  
    #         eval_metric = tpr_weight_funtion
    #         categorical_feature=feature_list
        )
#         feat_importance_table['importance'+str(fold_)] = clf.feature_importances_
        val_pred = clf.predict_proba(train_data.iloc[val_idx], num_iteration=clf.best_iteration_)[:,1]
        oof_lgb[val_idx] = val_pred
        predictions_lgb[:] += clf.predict_proba(test_data, num_iteration=clf.best_iteration_)[:,1]
        print('--------------------------------'*2)
        print("TPR: {}".format(tpr_weight_funtion(train_label.iloc[val_idx], val_pred)))
        print('--------------------------------'*2)
# feat_importance_table['importance'] = feat_importance_table.mean(axis=1)
# feat_importance_table['feat'] = clf.

    print("AUC score: {}".format(roc_auc_score(train_label, oof_lgb)))
    print("TPR weight: {}".format(tpr_weight_funtion(train_label,oof_lgb)))

Training until validation scores don't improve for 1000 rounds.
[500]	valid_0's auc: 0.939598
[1000]	valid_0's auc: 0.946424
[1500]	valid_0's auc: 0.946754
[2000]	valid_0's auc: 0.946752
Early stopping, best iteration is:
[1101]	valid_0's auc: 0.94705
----------------------------------------------------------------
TPR: 0.4860507246376812
----------------------------------------------------------------
Training until validation scores don't improve for 1000 rounds.
[500]	valid_0's auc: 0.936434
[1000]	valid_0's auc: 0.940579
[1500]	valid_0's auc: 0.94033
Early stopping, best iteration is:
[816]	valid_0's auc: 0.941038
----------------------------------------------------------------
TPR: 0.4759057971014493
----------------------------------------------------------------
Training until validation scores don't improve for 1000 rounds.
[500]	valid_0's auc: 0.921696
[1000]	valid_0's auc: 0.926733
[1500]	valid_0's auc: 0.927297
[2000]	valid_0's auc: 0.927048
[2500]	valid_0's auc: 0.927271
[3

In [21]:
def tpr_weight_funtion(y_true,y_predict):
    d = pd.DataFrame()
    d['prob'] = list(y_predict)
    d['y'] = list(y_true)
    d = d.sort_values(['prob'], ascending=[0])
    y = d.y
    PosAll = pd.Series(y).value_counts()[1]
    NegAll = pd.Series(y).value_counts()[0]
    pCumsum = d['y'].cumsum()
    nCumsum = np.arange(len(y)) - pCumsum + 1
    pCumsumPer = pCumsum / PosAll
    nCumsumPer = nCumsum / NegAll
    TR1 = pCumsumPer[abs(nCumsumPer-0.001).idxmin()]
    TR2 = pCumsumPer[abs(nCumsumPer-0.005).idxmin()]
    TR3 = pCumsumPer[abs(nCumsumPer-0.01).idxmin()]
    return 0.4 * TR1 + 0.3 * TR2 + 0.3 * TR3

In [22]:
# rmse：0.4793978962640551
# l2: 0.4793978962640551
# quantiled:0.47653246282190787
# huber：0.47979688066739207
# fair：0.4786361987667755
# poisson：0.47279651795429817
# tweedie：0.4684439608269858
# binary_error：0.4763511062749365
# cross_entropy：0.473449401523395
# cross_entropy_lambda：0.4729778745012695
# kullback_leibler：0.473449401523395
# binary_logloss：0.473449401523395

In [23]:
0.47435618425825177
0.4803409503083061

0.4803409503083061

In [24]:
# feat_importance_table = pd.DataFrame(columns=['feat', 'importance'])
# feat_importance_table['feat'] = gbm.feature_name()
# feat_importance_table['importance'] = gbm.feature_importance()
# feat_importance_table.sort_values('importance', ascending=False)

In [27]:
# test_pre = gbm.predict(test_data, num_iteration=gbm.best_iteration)
submit['label'] = predictions_lgb / 15
submit.to_csv('../result//0114-02.csv', index=False)

In [26]:
submit.head()

Unnamed: 0,id,label
0,test_0,0.000225
1,test_1,0.001134
2,test_2,0.002371
3,test_3,0.000598
4,test_4,0.001753
