In [2]:
%config ZMQInteractiveShell.ast_node_interactivity='all'
%matplotlib inline
import warnings;warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
import os
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Name of the label column and of the per-application identifier column.
target = '标签'
uid = '申请编号'

# Feature-group registry: group key -> list of column names in that group.
# Built as one literal so every group is visible at a glance; key order matches
# the original incremental construction.
columns = {
    # Columns that are not model features (identifier, label, application hour).
    'no_fea': [uid, target, '申请时点'],
    # NOTE(review): presumably binary 0/1 indicator columns — confirm.
    '01': [
        '贷款类型',
        '是否提供手机号',
        '是否提供电话',
        '手机号是否有效',
        '是否提供email',
        '申请人是否额外提供了文件2',
        '申请人是否额外提供了文件3',
        '申请人是否额外提供了文件4',
        '申请人是否额外提供了文件5',
        '申请人是否额外提供了文件6',
        '申请人是否额外提供了文件7',
        '申请人是否额外提供了文件8',
        '申请人是否额外提供了文件9',
        '是否有车',
        '是否有房',
        '地址是否一致标志1',
        '地址是否一致标志2',
        '地址是否一致标志3',
        '地址是否一致标志4',
        '地址是否一致标志5',
        '地址是否一致标志6',
    ],
    # NOTE(review): meaning of the '100' key is not evident from this cell — confirm.
    '100': [
        '性别',
        '客户居住地评分1',
        '客户居住地评分2',
        '教育程度',
        '婚姻状态',
        '居住状态',
        '贷款申请前1小时内征信查询次数',
        '贷款申请前1天内征信查询次数',
        '贷款申请前1周内征信查询次数',
        '贷款申请前1个月内征信查询次数',
        '贷款申请前1个季度内征信查询次数',
        '贷款申请前1年内征信查询次数',
        '收入类型',
        '陪同申请人',
        '孩子个数',
        '职业',
        '家庭成员数',
        '社交圈违约信息1_1',
        '社交圈违约信息2_1',
        '有车时间',
        '单位类型',
    ],
    # Time-related columns.
    'time': [
        '申请周内日',
        '申请时点',
        '出生日期距申请日期天数',
    ],
    # "Days relative to the application date" columns.
    '_dt': [
        '最近一次换手机号码距申请日天数',
        '出生日期距申请日期天数',
        '工作日期距申请日期天数',
        '注册日期距申请日期天数',
        '身份认证日期距申请日期天数',
    ],
    # Monetary amount columns.
    'money': [
        '贷款年金',
        '客户收入',
        '商品价格',
        '信用额度',
    ],
}

In [4]:
###################### Load 

# Read the two input tables; each bare `.shape` expression is echoed because
# ast_node_interactivity is set to 'all' in the first cell.
d1 = pd.read_csv('./data/A_Application.csv')
d1.shape
d2 = pd.read_csv('./data/A_Personas.csv')
d2.shape

# Inner-join the two tables on the application id column.
d11 = d1.merge(d2, on='申请编号', how='inner')
d11.shape
d11.head()

' Load '

(21511, 31)

(21511, 32)

(21511, 62)

Unnamed: 0,申请编号,贷款类型,信用额度,贷款年金,商品价格,陪同申请人,出生日期距申请日期天数,工作日期距申请日期天数,注册日期距申请日期天数,身份认证日期距申请日期天数,...,地址是否一致标志3,地址是否一致标志4,地址是否一致标志5,地址是否一致标志6,单位类型,社交圈违约信息2_2,社交圈违约信息2_1,社交圈违约信息1_2,社交圈违约信息1_1,最近一次换手机号码距申请日天数
0,122687,0,409022.429715,14982.794125,374496.965591,7.0,-11679,-6017,-5258,-3590,...,0,0,0,0,30,0.0,0.0,0.0,0.0,-355.0
1,32425,0,431340.833176,16800.914167,394698.985793,1.0,-14944,-714,-925,-1789,...,0,1,0,1,5,0.0,0.0,0.0,0.0,5.0
2,2024,0,538169.129203,18860.721801,465406.0565,7.0,-12810,-1082,-6178,-3032,...,0,0,0,0,47,0.0,0.0,0.0,0.0,-765.0
3,25019,1,425570.82194,15367.833489,389648.480742,7.0,-6509,-392,-370,-651,...,0,0,0,0,11,0.0,0.0,0.0,0.0,-129.0
4,162532,0,499323.105558,17338.046266,445204.036298,,-15222,292204,-890,-2055,...,0,0,0,0,57,6.0,0.0,6.0,0.0,-75.0


In [5]:
def set_outlier(col, lower=None, upper=None):
    """Clamp a single value into the range [lower, upper].

    Parameters
    ----------
    col : scalar
        The value to clamp (one element of a column, e.g. via ``Series.apply``).
    lower, upper : scalar, optional
        Explicit bounds. When omitted, the notebook-level globals ``col_min``
        and ``col_max`` are used, which keeps the original one-argument call
        ``series.apply(set_outlier)`` working unchanged.

    Returns
    -------
    ``lower`` if ``col < lower``, ``upper`` if ``col > upper``, otherwise
    ``col`` itself. NaN compares false against both bounds and is therefore
    returned as-is.
    """
    # Fall back to the globals only for bounds the caller did not supply, so the
    # function can be used (and tested) without relying on hidden kernel state.
    if lower is None:
        lower = col_min
    if upper is None:
        upper = col_max

    if col < lower:
        return lower
    if col > upper:
        return upper
    return col

###################### Data Prepare 

# Work on a copy so the raw merged frame d11 is left untouched by the cleaning.
d12 = d11.copy()

### Set sentinel / invalid values to missing
# A child count of -1 is replaced with NaN. Series.mask is used instead of
# assigning np.NaN (that alias was removed in NumPy 2.0).
d12['孩子个数'] = d12['孩子个数'].mask(d12['孩子个数'] == -1)
# "Days relative to application date" columns: values > 0 are replaced with NaN.
for col in columns['_dt']:
    d12[col] = d12[col].mask(d12[col] > 0)

### Outlier
# Per-column clipping bounds: one row per column name, with 'min' and 'max' columns.
outliers = pd.read_csv('./tmp/0_outlier1.csv', index_col=0)
outliers.shape

for col in outliers.index:
    col_min = outliers.loc[col, 'min']
    col_max = outliers.loc[col, 'max']
    # Vectorized clamp into [col_min, col_max]; NaN values are left as NaN.
    d12[col] = d12[col].clip(lower=col_min, upper=col_max)

(61, 2)

In [14]:
def test_lgb(clf, X):
    pred = clf.predict(X)
    return pred

def norm_score(score):
    """Clamp a score into the range [0, 1].

    Returns 0 for values below 0, 1 for values above 1, and the input itself
    otherwise (including NaN, which compares false against both limits).
    """
    return 0 if score < 0 else (1 if score > 1 else score)

###################### Predict

gbm = lgb.Booster(model_file='./model/model1.txt')

# NOTE(review): scoring uses the raw merged frame d11, not the cleaned frame d12
# built in the data-prepare cell — confirm this matches how the model was trained.
# Drop any 'score' column left behind by a previous run of this cell, so that
# re-executing it always feeds the model the same set of columns.
features = d11.drop(columns=['score'], errors='ignore')
d11['score'] = test_lgb(gbm, features)

# .copy() makes `out` an independent frame, so the assignment below is not a
# chained write into a slice of d11.
out = d11[[uid, 'score']].copy()
# Clamp scores into [0, 1] (vectorized; NaN is left as NaN).
out['score'] = out['score'].clip(lower=0, upper=1)
out.shape
out.head()

# Written without header or index: two columns per row, application id then score.
out.to_csv('./model/predict.csv', header=False, index=False)

(21511, 2)

Unnamed: 0,申请编号,score
0,122687,0.075338
1,32425,0.076775
2,2024,0.217198
3,25019,0.09689
4,162532,0.11001
