In [1]:
import os
import time
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
drop_cols = ['主键', '婚姻状况', '职业', '职称', '职务', '学历', '是否逾期']
# drop_cols = ['主键', '是否逾期']

In [2]:
root = '../data'
col_dict = {
    'id': '主键',
    'XINGBIE': '性别',
    'CSNY': '出生年月',
    'HYZK': '婚姻状况',
    'ZHIYE': '职业',
    'ZHICHEN': '职称',
    'ZHIWU': '职务',
    'XUELI': '学历',
    'DWJJLX': '单位经济类型',
    'DWSSHY': '单位所属行业',
    'GRJCJS': '个人缴存基数',
    'GRZHZT': '个人账户状态',
    'GRZHYE': '个人账户余额',
    'GRZHSNJZYE': '个人账户上年结转余额',
    'GRZHDNGJYE': '个人账户当年归集余额', 
    'GRYJCE': '个人月缴存额',
    'DWYJCE': '单位月缴存额',
    'DKFFE': '贷款发放额',
    'DKYE': '贷款余额',
    'DKLL': '贷款利率',
    'label': '是否逾期'
}

In [3]:
def tpr_weight_funtion(y_predict,y_true):
    d = pd.DataFrame()
    d['prob'] = list(y_predict)
    d['y'] = list(y_true.get_label())
    d = d.sort_values(['prob'], ascending=[0])
    y = d.y
    PosAll = pd.Series(y).value_counts()[1]
    NegAll = pd.Series(y).value_counts()[0]
    pCumsum = d['y'].cumsum()
    nCumsum = np.arange(len(y)) - pCumsum + 1
    pCumsumPer = pCumsum / PosAll
    nCumsumPer = nCumsum / NegAll
    TR1 = pCumsumPer[abs(nCumsumPer-0.001).idxmin()]
    TR2 = pCumsumPer[abs(nCumsumPer-0.005).idxmin()]
    TR3 = pCumsumPer[abs(nCumsumPer-0.01).idxmin()]

    return 'tpr', 0.4 * TR1 + 0.3 * TR2 + 0.3 * TR3, True

In [4]:
def convert(x):
    timeArray = time.localtime(x)
    otherStyleTime = time.strftime("%Y-%m", timeArray)
    return otherStyleTime

In [5]:
train = pd.read_csv(root+'/train.csv')
test = pd.read_csv(root+'/test.csv')
submit = pd.read_csv(root+'/submit.csv')
train['CSNY'] = train['CSNY'].apply(convert)
test['CSNY'] = test['CSNY'].apply(convert)

In [6]:
train.columns=train.columns.map(col_dict)
test.columns=test.columns.map(col_dict)
for col in [f for f in train.select_dtypes('int64').columns if f not in ['是否逾期', '贷款发放额']]:
    train[col].fillna('-1', inplace=True)
    test[col].fillna('-1', inplace=True)
    le = LabelEncoder()
    le.fit(pd.concat([train[[col]], test[[col]]], axis=0, ignore_index=True))
    train[col] = le.transform(train[col])
    test[col] = le.transform(test[col])

  y = column_or_1d(y, warn=True)


In [7]:
raw_feat_cols = [col for col in list(train.columns) if col not in drop_cols ]
train_data = train[raw_feat_cols]
train_data['出生年月'] = train_data['出生年月'].apply(lambda x: int(str(x).split('-')[0]))
test_data = test[raw_feat_cols]
test_data['出生年月'] = test_data['出生年月'].apply(lambda x: int(str(x).split('-')[0]))
train_label = train['是否逾期']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [8]:
X_train, X_val, y_train, y_val = train_test_split(train_data, train_label, test_size=0.3)

In [9]:
import lightgbm as lgb
params = {
        'objective': 'binary',
        'metric': ['binary_logloss', 'auc'],
        'num_leaves': 31,
        'max_bin': 50,
        'max_depth': 6,
        "learning_rate": 0.01,
        "colsample_bytree": 0.8,  # 每次迭代中随机选择特征的比例
        "bagging_fraction": 0.8,  # 每次迭代时用的数据比例
        'min_child_samples': 25,
        'n_jobs': -1,
        'silent': True,  # 信息输出设置成1则没有信息输出
        'seed': 1208,
    }  #设置出参数


This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [10]:
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval  = lgb.Dataset(X_val, y_val, reference=lgb_train)
# , feval = tpr_weight_funtion
gbm = lgb.train(params, lgb_train, num_boost_round=40000, valid_sets=[lgb_train, lgb_eval],verbose_eval=100,
                    early_stopping_rounds=200)

Please use silent argument of the Dataset constructor to pass this parameter.
  .format(key))


Training until validation scores don't improve for 200 rounds.
[100]	training's auc: 0.917245	training's binary_logloss: 0.161104	valid_1's auc: 0.91347	valid_1's binary_logloss: 0.166004
[200]	training's auc: 0.928318	training's binary_logloss: 0.139786	valid_1's auc: 0.921318	valid_1's binary_logloss: 0.146726
[300]	training's auc: 0.938577	training's binary_logloss: 0.128897	valid_1's auc: 0.926566	valid_1's binary_logloss: 0.138358
[400]	training's auc: 0.945451	training's binary_logloss: 0.122182	valid_1's auc: 0.929525	valid_1's binary_logloss: 0.134036
[500]	training's auc: 0.950939	training's binary_logloss: 0.11723	valid_1's auc: 0.931237	valid_1's binary_logloss: 0.131409
[600]	training's auc: 0.955711	training's binary_logloss: 0.113107	valid_1's auc: 0.932494	valid_1's binary_logloss: 0.129662
[700]	training's auc: 0.959714	training's binary_logloss: 0.10948	valid_1's auc: 0.933285	valid_1's binary_logloss: 0.128358
[800]	training's auc: 0.963387	training's binary_logloss: 

In [11]:
feat_importance_table = pd.DataFrame(columns=['feat', 'importance'])
feat_importance_table['feat'] = gbm.feature_name()
feat_importance_table['importance'] = gbm.feature_importance()
feat_importance_table

Unnamed: 0,feat,importance
0,性别,326
1,出生年月,2472
2,单位经济类型,2293
3,单位所属行业,3556
4,个人缴存基数,2220
5,个人账户状态,569
6,个人账户余额,2038
7,个人账户上年结转余额,2115
8,个人账户当年归集余额,1808
9,个人月缴存额,2518


In [12]:
test_pre = gbm.predict(test_data, num_iteration=gbm.best_iteration)
submit['label'] = test_pre
submit.to_csv('../result/submit_0108_01.csv', index=False)