In [41]:
%config ZMQInteractiveShell.ast_node_interactivity='all'
%matplotlib inline
import warnings;warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
import os
import matplotlib.pyplot as plt

In [72]:
target = '标签'
uid = '申请编号'
columns = {}
columns['no_fea'] = [uid, target, '申请时点']
columns['01'] = [
    '贷款类型',
    '是否提供手机号',
    '是否提供电话',
    '手机号是否有效',
    '是否提供email',
    '申请人是否额外提供了文件2',
    '申请人是否额外提供了文件3',
    '申请人是否额外提供了文件4',
    '申请人是否额外提供了文件5',
    '申请人是否额外提供了文件6',
    '申请人是否额外提供了文件7',
    '申请人是否额外提供了文件8',
    '申请人是否额外提供了文件9',
    '是否有车',
    '是否有房',
    '地址是否一致标志1',
    '地址是否一致标志2',
    '地址是否一致标志3',
    '地址是否一致标志4',
    '地址是否一致标志5',
    '地址是否一致标志6']
columns['time'] = [
    '申请周内日',
    '申请时点',
    '出生日期距申请日期天数']
columns['_dt'] = [
    '最近一次换手机号码距申请日天数',
    '出生日期距申请日期天数',
    '工作日期距申请日期天数',
    '注册日期距申请日期天数',
    '身份认证日期距申请日期天数']
columns['money'] = [
    '贷款年金',
    '客户收入',
    '商品价格',
    '信用额度']

In [67]:
target = '标签'
uid = '申请编号'

def calc_auc(y_test, y_proba):
    auc = round(metrics.roc_auc_score(y_test, y_proba), 3)
    return auc

def calc_ks(y_test, y_proba):
    s = pd.qcut(y_proba, 10, labels=False, duplicates='drop')
    fpr, tpr, thresholds = metrics.roc_curve(y_test, s)
    KS = round(max(abs(fpr-tpr)) * 100, 1) # 横坐标倒序
    return KS

def ks_score(y_test, y_proba):
    scale = 4
    fpr, tpr, thresholds = metrics.roc_curve(y_test, y_proba, pos_label=1)
    KS = round(max(list(tpr-fpr)), scale)
    return KS

In [96]:
############## Load

# data_ori= pd.read_csv('./data/0_merge11.csv', header=0, index_col=0)
data_ori= pd.read_csv('./tmp/3_poly.csv', header=0, index_col=0)
data_ori.shape
data_ori.head()

# Drop uid
data = data_ori.drop(uid, axis=1)
''' Ori '''
data.shape

(140000, 168)

Unnamed: 0,申请编号,标签,贷款类型,信用额度,贷款年金,商品价格,陪同申请人,出生日期距申请日期天数,工作日期距申请日期天数,注册日期距申请日期天数,...,是否提供手机号^2,是否提供手机号 地址是否一致标志2,是否提供手机号 注册日期距申请日期天数,是否提供手机号 有车时间,地址是否一致标志2^2,地址是否一致标志2 注册日期距申请日期天数,地址是否一致标志2 有车时间,注册日期距申请日期天数^2,注册日期距申请日期天数 有车时间,有车时间^2
0,0,1,0,460190.889355,17463.042019,419951.511045,7.0,-10107.0,-342.0,-5421.0,...,1.0,0.0,-5421.0,4.0,0.0,-0.0,0.0,29387241.0,-21684.0,16.0
1,1,0,0,424370.659603,15585.046388,384597.975692,7.0,-13980.0,-1110.0,-3387.0,...,1.0,1.0,-3387.0,4.0,1.0,-3387.0,4.0,11471769.0,-13548.0,16.0
2,2,0,0,469330.587153,18837.558252,419951.511045,1.0,-13331.0,-2246.0,-3870.0,...,1.0,0.0,-3870.0,1.0,0.0,-0.0,0.0,14976900.0,-3870.0,1.0
3,3,0,0,464188.35314,16754.587069,409850.500944,7.0,-16540.0,,-970.0,...,0.0,0.0,-0.0,0.0,0.0,-0.0,0.0,940900.0,-3880.0,16.0
4,5,0,0,434196.988738,16165.0092,394698.985793,7.0,-17919.0,-11037.0,-9350.0,...,1.0,0.0,-9350.0,4.0,0.0,-0.0,0.0,87422500.0,-37400.0,16.0


' Ori '

(140000, 167)

In [97]:
############## Train, Test

from sklearn.model_selection import train_test_split

# Split
X, X_test = train_test_split(data, test_size=0.3, random_state=123)
y = X.pop(target)
y_test = X_test.pop(target)

''' Training, Test '''
X.shape
X_test.shape

' Training, Test '

(98000, 166)

(42000, 166)

In [98]:
# ### Sample Balance
#
# data_ori = data_ori.drop(uid, axis=1)
# ''' Ori '''
# data_ori.shape
# 
# data1 = data.loc[data[target]==1, :]
# data0 = data.loc[data[target]==0, :]
# frac = len(data1) / len(data0)
# data0 = data0.sample(frac=frac, random_state=123)
# data = pd.concat([data1, data0], axis=0)
# ''' Balance '''
# data[target].value_counts()

# # Fill Null
# data = data[data.columns.difference(columns['_dt'] + columns['money'])]
# data.fillna(-1, inplace=True)

In [99]:
############### LGB Training

def train_lgb(X, y):
    # Data input
    train = lgb.Dataset(X, y)
    params = {'num_leaves' : 2**5,
              'feature_fraction' : 0.9,
              'bagging_fraction' : 0.8,
              'bagging_freq' : 5,
              'random_state' : 123,
              'is_unbalance' : True}
    clf = lgb.train(params,
                    train)
    pred = clf.predict(X)
    ks = ks_score(y, pred)
    print(f'Training KS: {ks}')
    return clf

def test_lgb(clf, X):
    pred = clf.predict(X)
    return pred


### Training 
print('************************ Training ************************')

gbm = train_lgb(X, y)
gbm.save_model(f'./model/1_model_lgb.txt')

print('************************ Test ************************')

# print('* Training *')
# train['score_lgb'] = test_lgb(gbm, X)
# auc = calc_auc(train, 'score_lgb', target)
# ks = calc_ks(train, 'score_lgb', target)
# print(f'AUC:{auc}\tKS:{ks}')

X_test['score_lgb'] = test_lgb(gbm, X_test)
ks = ks_score(y_test, X_test['score_lgb'])
print(f'Test KS: {ks}')

************************ Training ************************
Training KS: 0.4052


<lightgbm.basic.Booster at 0x1a2065e240>

************************ Test ************************
Test KS: 0.3083
