In [1]:
import os
import time
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
drop_cols = ['主键', '婚姻状况', '职业', '职称', '职务', '学历', '单位月缴存额', '是否逾期']
# os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
# drop_cols = ['主键', '是否逾期']

In [2]:
root = '../data'
col_dict = {
    'id': '主键',
    'XINGBIE': '性别',
    'CSNY': '出生年月',
    'HYZK': '婚姻状况',
    'ZHIYE': '职业',
    'ZHICHEN': '职称',
    'ZHIWU': '职务',
    'XUELI': '学历',
    'DWJJLX': '单位经济类型',
    'DWSSHY': '单位所属行业',
    'GRJCJS': '个人缴存基数',
    'GRZHZT': '个人账户状态',
    'GRZHYE': '个人账户余额',
    'GRZHSNJZYE': '个人账户上年结转余额',
    'GRZHDNGJYE': '个人账户当年归集余额', 
    'GRYJCE': '个人月缴存额',
    'DWYJCE': '单位月缴存额',
    'DKFFE': '贷款发放额',
    'DKYE': '贷款余额',
    'DKLL': '贷款利率',
    'label': '是否逾期'
}

In [3]:
def tpr_weight_funtion(y_predict,y_true):
    d = pd.DataFrame()
    d['prob'] = list(y_predict)
    d['y'] = list(y_true.get_label())
    d = d.sort_values(['prob'], ascending=[0])
    y = d.y
    PosAll = pd.Series(y).value_counts()[1]
    NegAll = pd.Series(y).value_counts()[0]
    pCumsum = d['y'].cumsum()
    nCumsum = np.arange(len(y)) - pCumsum + 1
    pCumsumPer = pCumsum / PosAll
    nCumsumPer = nCumsum / NegAll
    TR1 = pCumsumPer[abs(nCumsumPer-0.001).idxmin()]
    TR2 = pCumsumPer[abs(nCumsumPer-0.005).idxmin()]
    TR3 = pCumsumPer[abs(nCumsumPer-0.01).idxmin()]

    return 'tpr', 0.4 * TR1 + 0.3 * TR2 + 0.3 * TR3, True

In [4]:
def convert(x):
    try:
        timeArray = time.localtime(x)
        otherStyleTime = time.strftime("%Y-%m", timeArray)
        return otherStyleTime
    except:
        return time.strftime("%Y-%m",time.localtime(time.time()))

In [5]:
train = pd.read_csv(root+'/train.csv')
test = pd.read_csv(root+'/test.csv')
submit = pd.read_csv(root+'/submit.csv')
train['CSNY'] = train['CSNY'].apply(convert)
test['CSNY'] = test['CSNY'].apply(convert)

In [6]:
train.columns=train.columns.map(col_dict)
test.columns=test.columns.map(col_dict)
for col in [f for f in train.select_dtypes('int64').columns if f not in ['是否逾期', '贷款发放额']]:
    train[col].fillna('-1', inplace=True)
    test[col].fillna('-1', inplace=True)
    le = LabelEncoder()
    le.fit(pd.concat([train[[col]], test[[col]]], axis=0, ignore_index=True))
    train[col] = le.transform(train[col])
    test[col] = le.transform(test[col])

In [7]:
#首套
#5年以上3.25
#1-5年 2.75
#二套
#5年以上 3.575
#1-5年 3.025
rate_dict = {3.025: 2.521, 3.575: 2.979, 3.25: 2.708, 2.75: 2.292}

def rate_func(x):
    if x == 3.025:
        return 2.521
    if x == 3.575:
        return 2.979
    if x == 3.25:
        return 2.708
    if x == 2.75:
        return 2.292
    return x

def loan_years(x):
    if x == 2.708 or x == 2.979:
        return 0
    return 1

def num_house(x):
    if x == 2.521 or x== 2.979:
        return 0
    return 1

In [8]:
def combine_feat(x, y):
    feat_dic = {}
    for i in range(len(x)):
        pair = (x[i], y[i])
        feat_dic[pair] = feat_dic.get(pair, 0) + 1

    return feat_dic

def HYJJLX(x, y, company_feat):
    #行业+经济类型
    if (x, y) in company_feat:
        return company_feat[(x, y)]
    return -1

company_feat = combine_feat(list(train['单位经济类型'])+list(test['单位经济类型']), list(train['单位所属行业'])+list(test['单位所属行业']))
company_feat = {k: v for k, v in company_feat.items() if v>50}

In [9]:
train['贷款利率'] = train['贷款利率'].apply(rate_func)
test['贷款利率'] = test['贷款利率'].apply(rate_func)
raw_feat_cols = [col for col in list(train.columns) if col not in drop_cols ]
train_data = train[raw_feat_cols]
train_data['出生年月'] = train_data['出生年月'].apply(lambda x: int(str(x).split('-')[0]))
test_data = test[raw_feat_cols]
test_data['出生年月'] = test_data['出生年月'].apply(lambda x: int(str(x).split('-')[0]))
train_label = train['是否逾期']

In [10]:
train_data['行业+经济类型'] = train_data.apply(lambda row: HYJJLX(row['单位经济类型'], row['单位经济类型'], company_feat), axis=1)
train_data['归集余额+结转余额'] = train_data['个人账户上年结转余额']+train_data['个人账户当年归集余额']
train_data['归集余额+结转余额-个人账户余额'] = train_data['归集余额+结转余额']-train_data['个人账户余额']
train_data['个人月缴存额/贷款余额'] = train_data['个人月缴存额']/train_data['贷款余额']
train_data['个人账户余额/贷款余额'] = train_data['个人账户余额']/train_data['贷款余额']
train_data['个人月缴存额/贷款发放额'] = train_data['个人月缴存额']/train_data['贷款发放额']
train_data['贷款利率*贷款发放额'] = train_data['贷款利率']*train_data['贷款发放额']
train_data['贷款利率*贷款余额'] = train_data['贷款利率']*train_data['贷款余额']
train_data['个人缴存基数/个人账户余额'] = train_data['个人缴存基数']*train_data['个人账户余额']
train_data['个人缴存基数/个人账户上年结转余额'] = train_data['个人缴存基数']*train_data['个人账户上年结转余额']
train_data['个人缴存基数/贷款发放额'] = train_data['个人缴存基数']*train_data['贷款发放额']
train_data['个人缴存基数/贷款余额'] = train_data['个人缴存基数']*train_data['贷款余额']
train_data['贷款利率/个人缴存基数'] = train_data['贷款利率']*train_data['个人缴存基数']
train_data['公积金比例'] = train_data['个人月缴存额']/train_data['个人缴存基数']
train_data['贷款年限类别'] = train_data['贷款利率'].apply(loan_years)
train_data['第N房'] = train_data['贷款利率'].apply(num_house)

test_data['行业+经济类型'] = test_data.apply(lambda row: HYJJLX(row['单位经济类型'], row['单位经济类型'], company_feat), axis=1)
test_data['归集余额+结转余额'] = test_data['个人账户上年结转余额']+test_data['个人账户当年归集余额']
test_data['归集余额+结转余额-个人账户余额'] = test_data['归集余额+结转余额']-test_data['个人账户余额']
test_data['个人月缴存额/贷款余额'] = test_data['个人月缴存额']/test_data['贷款余额']
test_data['个人账户余额/贷款余额'] = test_data['个人账户余额']/test_data['贷款余额']
test_data['个人月缴存额/贷款发放额'] = test_data['个人月缴存额']/test_data['贷款发放额']
test_data['贷款利率*贷款发放额'] = test_data['贷款利率']*test_data['贷款发放额']
test_data['贷款利率*贷款余额'] = test_data['贷款利率']*test_data['贷款余额']
test_data['个人缴存基数/个人账户余额'] = test_data['个人缴存基数']*test_data['个人账户余额']
test_data['个人缴存基数/个人账户上年结转余额'] = test_data['个人缴存基数']*test_data['个人账户上年结转余额']
test_data['个人缴存基数/贷款发放额'] = test_data['个人缴存基数']*test_data['贷款发放额']
test_data['个人缴存基数/贷款余额'] = test_data['个人缴存基数']*test_data['贷款余额']
test_data['贷款利率/个人缴存基数'] = test_data['贷款利率']*test_data['个人缴存基数']
test_data['公积金比例'] = test_data['个人月缴存额']/test_data['个人缴存基数']
test_data['贷款年限类别'] = test_data['贷款利率'].apply(loan_years)
test_data['第N房'] = test_data['贷款利率'].apply(num_house)


In [11]:
X_train, X_val, y_train, y_val = train_test_split(train_data, train_label, test_size=0.2)

In [12]:
import lightgbm as lgb
params = {
        'objective': 'binary',
        'metric': ['binary_logloss', 'auc'],
        'num_leaves': 31,
        'max_bin': 50,
        'max_depth': 6,
        "learning_rate": 0.01,
        "colsample_bytree": 0.8,  # 每次迭代中随机选择特征的比例
        "bagging_fraction": 0.8,  # 每次迭代时用的数据比例
        'min_child_samples': 25,
        'n_jobs': -1,
        'silent': True,  # 信息输出设置成1则没有信息输出
        'seed': 1208,
        'scale_pos_weight':0.1,
        'verbose' : -1
    }  #设置出参数


In [13]:
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval  = lgb.Dataset(X_val, y_val, reference=lgb_train)
# , feval = tpr_weight_funtion
gbm = lgb.train(params, lgb_train, num_boost_round=40000, valid_sets=[lgb_train, lgb_eval],verbose_eval=100,
                    early_stopping_rounds=200)

Training until validation scores don't improve for 200 rounds
[100]	training's binary_logloss: 0.203043	training's auc: 0.883875	valid_1's binary_logloss: 0.207106	valid_1's auc: 0.874546
[200]	training's binary_logloss: 0.188004	training's auc: 0.915476	valid_1's binary_logloss: 0.193936	valid_1's auc: 0.906907
[300]	training's binary_logloss: 0.183867	training's auc: 0.922495	valid_1's binary_logloss: 0.192213	valid_1's auc: 0.911657
[400]	training's binary_logloss: 0.177641	training's auc: 0.930129	valid_1's binary_logloss: 0.188595	valid_1's auc: 0.916282
[500]	training's binary_logloss: 0.171557	training's auc: 0.939345	valid_1's binary_logloss: 0.185674	valid_1's auc: 0.920912
[600]	training's binary_logloss: 0.16547	training's auc: 0.94726	valid_1's binary_logloss: 0.183488	valid_1's auc: 0.923844
[700]	training's binary_logloss: 0.160144	training's auc: 0.953072	valid_1's binary_logloss: 0.181505	valid_1's auc: 0.925566
[800]	training's binary_logloss: 0.155366	training's auc: 

In [14]:
tpr_weight_funtion(gbm.predict(X_val), lgb_eval)

('tpr', 0.47486437613019894, True)

In [15]:
feat_importance_table = pd.DataFrame(columns=['feat', 'importance'])
feat_importance_table['feat'] = gbm.feature_name()
feat_importance_table['importance'] = gbm.feature_importance()
feat_importance_table

Unnamed: 0,feat,importance
0,性别,629
1,出生年月,4213
2,单位经济类型,3885
3,单位所属行业,6285
4,个人缴存基数,1550
5,个人账户状态,937
6,个人账户余额,1921
7,个人账户上年结转余额,2414
8,个人账户当年归集余额,3345
9,个人月缴存额,2489


In [16]:
test_pre = gbm.predict(test_data, num_iteration=gbm.best_iteration)
submit['label'] = test_pre
submit.to_csv('../submit_0108_02.csv', index=False)