In [1]:
%config ZMQInteractiveShell.ast_node_interactivity='all'
%matplotlib inline
import warnings;warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
import os
import matplotlib.pyplot as plt

In [25]:
target = '标签'
uid = '申请编号'

def calc_auc(y_test, y_proba):
    auc = round(metrics.roc_auc_score(y_test, y_proba), 3)
    return auc

def calc_ks(y_test, y_proba):
    s = pd.qcut(y_proba, 10, labels=False, duplicates='drop')
    fpr, tpr, thresholds = metrics.roc_curve(y_test, s)
    KS = round(max(abs(fpr-tpr)) * 100, 1) # 横坐标倒序
    return KS

def ks_score(y_test, y_proba):
    scale = 4
    fpr, tpr, thresholds = metrics.roc_curve(y_test, y_proba, pos_label=1)
    KS = round(max(list(tpr-fpr)), scale)
    return KS

In [3]:
############## Load

data_ori= pd.read_csv('./data/0_merge11.csv', header=0, index_col=0)
data_ori.shape
data_ori.head()

(140000, 63)

Unnamed: 0,申请编号,标签,贷款类型,信用额度,贷款年金,商品价格,陪同申请人,出生日期距申请日期天数,工作日期距申请日期天数,注册日期距申请日期天数,...,地址是否一致标志3,地址是否一致标志4,地址是否一致标志5,地址是否一致标志6,单位类型,社交圈违约信息2_2,社交圈违约信息2_1,社交圈违约信息1_2,社交圈违约信息1_1,最近一次换手机号码距申请日天数
0,0,1,0,460190.889355,17463.042019,419951.511045,7.0,-10107,-342,-5421,...,0,0,0,0,5,0.0,0.0,0.0,0.0,-887.0
1,1,0,0,424370.659603,15585.046388,384597.975692,7.0,-13980,-1110,-3387,...,0,0,0,1,5,0.0,0.0,0.0,0.0,-271.0
2,2,0,0,469330.587153,18837.558252,419951.511045,1.0,-13331,-2246,-3870,...,1,0,0,0,5,0.0,0.0,0.0,0.0,-332.0
3,3,0,0,464188.35314,16754.587069,409850.500944,7.0,-16540,292204,-970,...,0,0,0,0,57,1.0,0.0,1.0,0.0,-204.0
4,5,0,0,434196.988738,16165.0092,394698.985793,7.0,-17919,-11037,-9350,...,1,1,0,1,53,0.0,0.0,0.0,0.0,-1049.0


In [10]:
############## Train, Test

from sklearn.model_selection import train_test_split

data = data_ori.drop(uid, axis=1)
data.shape

X, X_test = train_test_split(data, test_size=0.3, random_state=123)
y = X.pop(target)
y_test = X_test.pop(target)

X.shape
X_test.shape

(140000, 62)

(98000, 61)

(42000, 61)

In [26]:
############### LGB Training

def train_lgb(X, y):
    # Data input
    train = lgb.Dataset(X, y)
    params = {'num_leaves' : 2**5,
              'feature_fraction' : 0.9,
              'bagging_fraction' : 0.8,
              'bagging_freq' : 5,
              'random_state' : 123}
    clf = lgb.train(params,
                    train)
    pred = clf.predict(X)
    ks = ks_score(y, pred)
    print(f'Training KS: {ks}')
    return clf


### Training 
print('************************ Training ************************')

gbm = train_lgb(X, y)
gbm.save_model(f'./model/1_model_lgb.txt')

************************ Training ************************
Training KS: 0.3941


<lightgbm.basic.Booster at 0x1a202c05f8>

In [27]:
############### LGB Test

def test_lgb(clf, X):
    pred = clf.predict(X)
    return pred
    
print('************************ Test ************************')

# print('* Training *')
# train['score_lgb'] = test_lgb(gbm, X)
# auc = calc_auc(train, 'score_lgb', target)
# ks = calc_ks(train, 'score_lgb', target)
# print(f'AUC:{auc}\tKS:{ks}')

print('* Test *')
X_test['score_lgb'] = test_lgb(gbm, X_test)
ks = ks_score(y_test, X_test['score_lgb'])
print(f'Test KS: {ks}')

************************ Test ************************
* Test *
Test KS: 0.3143
