In [2]:
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

In [3]:
from category_encoders.target_encoder import TargetEncoder

# Load data and data preprocessing

In [3]:
# Fixed seed so the shuffled K-fold partition is identical on every run.
seed = 42 # for the same data division

# Shuffled 5-fold splitter; the fold-building cell iterates over kf.split(...).
kf = KFold(n_splits=5, random_state=seed,shuffle=True)
# Read the train and test tables from CSV files in the working directory.
df_train = pd.read_csv('train_final.csv')
df_test = pd.read_csv('test_final.csv')

# Split each table into a feature matrix (every column except 'loan_status')
# and an integer label vector taken from the 'loan_status' column.
X_train = df_train.drop(columns=['loan_status']).values
Y_train = df_train['loan_status'].values.astype(int)
X_test = df_test.drop(columns=['loan_status']).values
Y_test = df_test['loan_status'].values.astype(int)

In [4]:
# Sanity check: display the shapes of the training features and labels.
X_train.shape, Y_train.shape

((50000, 145), (50000,))

In [5]:
# Materialise every (train, eval) partition yielded by `kf` once, so that all
# models trained afterwards are compared on exactly the same folds.
five_fold_data = []
for train_index, eval_index in kf.split(X_train):
    # Training portion of this fold.
    x_train = X_train[train_index]
    y_train = Y_train[train_index]
    # Held-out portion of this fold.
    x_eval = X_train[eval_index]
    y_eval = Y_train[eval_index]
    five_fold_data.append([(x_train, y_train), (x_eval, y_eval)])

# Algorithm

In [6]:
def get_model(param):
    """Train one model per fold of the module-level ``five_fold_data``.

    Args:
        param: parameter dict handed to ``lgb.train`` as-is.

    Returns:
        A list with the object returned by ``lgb.train`` for each fold, in
        fold order.
    """
    boosters = []
    for fold_no, fold in enumerate(five_fold_data):
        (fit_x, fit_y), (held_x, held_y) = fold
        print('{}-th model is training:'.format(fold_no))
        fit_set = lgb.Dataset(fit_x, label=fit_y)
        # The held-out part of the fold is only used as the validation set.
        held_set = lgb.Dataset(held_x, label=held_y)
        boosters.append(lgb.train(param, fit_set, valid_sets=[held_set]))
    return boosters

# train

In [15]:
# Baseline configuration: binary objective, 31 leaves, 1000 rounds.
param_base = {'num_leaves': 31, 'objective': 'binary', 'metric': 'binary', 'num_round':1000}

# Tuned configuration: more leaves (128), a small learning rate (3e-3) and
# feature/bagging sub-sampling fractions.
# NOTE(review): per the LightGBM parameter docs, 'bagging_fraction' appears to
# take effect only when 'bagging_freq' is set to a non-zero value; no
# 'bagging_freq' is given here, so bagging is presumably inactive - confirm.
param_fine_tuning = {'num_thread': 8,'num_leaves': 128, 'metric': 'binary', 'objective': 'binary', 'num_round': 1000, 
                     'learning_rate': 3e-3, 'feature_fraction': 0.6, 'bagging_fraction': 0.8}

In [26]:
# Train the five fold models with the baseline parameters.
param_base_model = get_model(param_base)

# Train the five fold models with the tuned parameters.
param_fine_tuning_model = get_model(param_fine_tuning)

0-th model is training:
[LightGBM] [Info] Number of positive: 31851, number of negative: 8149
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2583
[LightGBM] [Info] Number of data points in the train set: 40000, number of used features: 141
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.796275 -> initscore=1.363174
[LightGBM] [Info] Start training from score 1.363174
[1]	valid_0's binary_logloss: 0.448173
[2]	valid_0's binary_logloss: 0.405734
[3]	valid_0's binary_logloss: 0.372961
[4]	valid_0's binary_logloss: 0.34683
[5]	valid_0's binary_logloss: 0.325461
[6]	valid_0's binary_logloss: 0.307594
[7]	valid_0's binary_logloss: 0.292607
[8]	valid_0's binary_logloss: 0.279725
[9]	valid_0's binary_logloss: 0.268803
[10]	valid_0's binary_logloss: 0.259461
[11]	valid_0's binary_logloss: 0.251464
[12]	valid_0's binary_logloss: 0.244436
[13]	valid_0's binary_logloss: 0.23844
[14]	valid_0's 

# test

In [27]:
def test_model(model_list):
    """Score an ensemble of fold models on the module-level test set.

    Each model predicts on the global ``X_test``; the per-model predictions
    are averaged, thresholded at 0.5 and compared with the global ``Y_test``.

    Args:
        model_list: sequence of trained models exposing ``predict`` and
            ``best_iteration`` (as returned by ``get_model``).

    Returns:
        Accuracy of the averaged, thresholded predictions against ``Y_test``.
    """
    # Size the buffer from the number of models actually supplied. The former
    # hard-coded 5 rows diluted the mean with all-zero rows when fewer models
    # were passed and raised IndexError when more were passed.
    fold_pred = np.zeros((len(model_list), len(X_test)))
    for i, bst in enumerate(model_list):
        fold_pred[i] = bst.predict(X_test, num_iteration=bst.best_iteration)
    # Average over the model axis, then turn the mean score into a 0/1 label.
    ypred_mean = (fold_pred.mean(axis=0) > 0.5).astype(int)
    # accuracy_score expects (y_true, y_pred); the value is the same either
    # way for accuracy, but the conventional order avoids confusion.
    return accuracy_score(Y_test, ypred_mean)

In [28]:
# Evaluate both five-model ensembles on the held-out test set.
base_score = test_model(param_base_model)
fine_tuning_score = test_model(param_fine_tuning_model)

# Report the two test accuracies side by side.
print('base: {}, fine tuning: {}'.format(base_score, fine_tuning_score))

base: 0.91552, fine tuning: 0.91756


## 将one-hot转为target-encoder

In [4]:
def inverse_onehot(df, cols_prefix):
    """Collapse a group of one-hot columns into a single ordinal code.

    Columns are selected by name prefix. The k-th selected column (1-based,
    in ``df`` column order) contributes the code ``k``, so a row whose only
    active indicator is the k-th column gets the value ``k`` and a row with
    no active indicator gets ``0``.

    Args:
        df: DataFrame holding the one-hot indicator columns.
        cols_prefix: string that the names of the wanted columns start with.

    Returns:
        ``(cols, codes)`` where ``cols`` is the list of matched column names
        and ``codes`` is an array of shape ``(len(df), 1)``.

    Raises:
        ValueError: if no column name starts with ``cols_prefix``.
    """
    # Match on a true prefix: a substring test would let e.g. 'grade' also
    # capture 'sub_grade_*' columns and corrupt the codes.
    cols = [name for name in df.columns
            if isinstance(name, str) and name.startswith(cols_prefix)]
    if not cols:
        raise ValueError(
            'no column name starts with {!r}'.format(cols_prefix))
    onehots = df.loc[:, cols].values
    # Column vector [1, 2, ..., k]; the dot product picks the active position.
    trans = np.arange(1, len(cols) + 1).reshape(-1, 1)
    return cols, np.dot(onehots, trans)

def inverse_onehot_mat(df, col_prefixes):
    """Replace several one-hot column groups with one code column each.

    For every entry of ``col_prefixes`` the matching one-hot columns are
    collapsed with ``inverse_onehot`` into a new column named after that
    entry, and the original indicator columns are removed. ``df`` itself is
    left untouched; the work happens on a deep copy.

    Args:
        df: source DataFrame containing the one-hot columns.
        col_prefixes: iterable of strings identifying the column groups.

    Returns:
        A new DataFrame with the code columns added and the one-hot columns
        dropped.
    """
    collapsed = df.copy(deep=True)
    consumed = []
    for prefix in col_prefixes:
        matched, codes = inverse_onehot(collapsed, prefix)
        consumed += matched
        collapsed[prefix] = codes
    # Remove all collapsed indicator columns in one pass at the end.
    return collapsed.drop(columns=consumed)

In [5]:
def split_kfold(k, X_train, Y_train, seed=42):
    """Split arrays into ``k`` shuffled (train, eval) folds.

    Args:
        k: number of folds.
        X_train: feature array, indexable with an integer index array.
        Y_train: label array with the same length as ``X_train``.
        seed: random state for the shuffled ``KFold``; the default of 42
            reproduces the previously hard-coded partition.

    Returns:
        A list of ``k`` items, each ``[(x_train, y_train), (x_eval, y_eval)]``.

    Raises:
        ValueError: if ``X_train`` and ``Y_train`` differ in length.
    """
    # Fail early: mismatched inputs would otherwise pair rows with the wrong
    # labels or raise a far less obvious IndexError.
    if len(X_train) != len(Y_train):
        raise ValueError(
            'X_train and Y_train must have the same length, got {} and {}'
            .format(len(X_train), len(Y_train)))

    kf = KFold(n_splits=k, random_state=seed, shuffle=True)
    return [
        [(X_train[train_index], Y_train[train_index]),
         (X_train[eval_index], Y_train[eval_index])]
        for train_index, eval_index in kf.split(X_train)
    ]

In [6]:
def train_model(param, kfold_data):
    """Train one model for every fold in ``kfold_data``.

    Args:
        param: parameter dict handed to ``lgb.train`` as-is.
        kfold_data: iterable of ``[(x_train, y_train), (x_eval, y_eval)]``
            items, e.g. the output of ``split_kfold``.

    Returns:
        A list with the object returned by ``lgb.train`` for each fold, in
        fold order.
    """
    trained = []
    for fold_no, fold in enumerate(kfold_data):
        (fit_x, fit_y), (held_x, held_y) = fold
        print('{}-th model is training:'.format(fold_no))
        fit_set = lgb.Dataset(fit_x, label=fit_y)
        # The held-out part of the fold is only used as the validation set.
        held_set = lgb.Dataset(held_x, label=held_y)
        trained.append(lgb.train(param, fit_set, valid_sets=[held_set]))
    return trained

In [7]:
def test_model_x(model_list, X_test, Y_test):
    """Score an ensemble of models on the given test data.

    Args:
        model_list: sequence of trained models exposing ``predict`` and
            ``best_iteration``.
        X_test: feature matrix to predict on.
        Y_test: labels the thresholded predictions are compared with.

    Returns:
        The value of ``accuracy_score`` for the averaged predictions,
        thresholded at 0.5.
    """
    per_model = np.zeros((len(model_list), len(X_test)))
    for row, booster in enumerate(model_list):
        per_model[row] = booster.predict(
            X_test, num_iteration=booster.best_iteration)
    # Average across models (first axis of the 2-D buffer), then binarise.
    averaged = per_model.mean(axis=-2)
    labels = (averaged > 0.5).astype(int)
    return accuracy_score(labels, Y_test)

In [16]:
def train_and_test(params, base_score, X_train, y_train, X_test, y_test):
    """Train a 5-fold ensemble with ``params`` and report its test accuracy.

    Args:
        params: LightGBM parameter dict. It is not modified; a copy with
            ``verbosity`` forced to -1 is used for training.
        base_score: reference accuracy printed next to the new score.
        X_train: training feature array.
        y_train: training label array.
        X_test: test feature array.
        y_test: test label array.

    Returns:
        The test accuracy of the trained ensemble (also printed).
    """
    # Work on a copy so the caller's dict is not silently changed;
    # verbosity=-1 silences the intermediate training log output.
    quiet_params = dict(params)
    quiet_params['verbosity'] = -1
    kfold_data = split_kfold(5, X_train, y_train)
    models = train_model(quiet_params, kfold_data)
    score = test_model_x(models, X_test, y_test)
    print('base: {}, tuning: {}'.format(base_score, score))
    # Hand the score back so callers can store or compare it.
    return score

In [19]:
def impute_nan(x, method='median'):
    """Fill missing entries of a 2-D array column by column.

    For each column the median of the entries that are neither null nor
    +/-inf is computed, and every null entry of that column is replaced by
    it. Infinite entries are excluded from the median but are not replaced.
    The input array is not modified.

    Args:
        x: 2-D array.
        method: imputation strategy; only ``'median'`` is implemented.

    Returns:
        A new array with the same shape and dtype as ``x``.

    Raises:
        NotImplementedError: if ``method`` is not ``'median'``.
    """
    # Validate once up front so an unsupported method is reported even when
    # the array has no columns to iterate over.
    if method != 'median':
        raise NotImplementedError(
            'unsupported imputation method: {!r}'.format(method))

    _, ncol = x.shape
    result = np.empty_like(x)

    for col in range(ncol):
        data = x[:, col]
        missing = pd.isnull(data)
        if not missing.any():
            # Nothing to fill: copy the column through unchanged.
            result[:, col] = data
            continue
        usable = data[~missing & (data != np.inf) & (data != -np.inf)]
        # A column without any usable value has no median; keep it as NaN.
        impute_value = np.median(usable) if usable.size else np.nan
        # Vectorised fill instead of a per-element Python callback.
        result[:, col] = np.where(missing, impute_value, data)
    return result

In [8]:
# Reload the train and test tables as DataFrames for the target-encoding
# experiment.
df_train = pd.read_csv('train_final.csv')
df_test = pd.read_csv('test_final.csv')

In [9]:
# Separate features from the 'loan_status' label, keeping DataFrames so the
# column names stay available for the encoding steps.
# NOTE(review): y_train is selected with a list (['loan_status']) while y_test
# is selected with a scalar label, so the two are presumably a one-column
# DataFrame and a Series respectively; later cells call .squeeze() on both.
x_train, y_train = df_train.drop(columns='loan_status'), df_train.loc[:, ['loan_status']]
x_test, y_test = df_test.drop(columns='loan_status'), df_test.loc[:, 'loan_status']

In [10]:
# Features whose one-hot columns are collapsed back into a single column so
# that they can be target-encoded instead.
cols = ['discrete_addr_state', 'discrete_grade'] #, 'discrete_sub_grade', 'discrete_emp_length']
# Apply the same collapsing to the train and the test features.
x_train_ronehot = inverse_onehot_mat(x_train, cols)
x_test_ronehot = inverse_onehot_mat(x_test, cols)

In [11]:
# Build a target encoder for the collapsed columns and fit it using the
# training features and training labels only.
encoder = TargetEncoder(cols=cols, 
                        handle_unknown='value',  
                        handle_missing='value').fit(x_train_ronehot, y_train) # fit on the training set

  elif pd.api.types.is_categorical(cols):


In [12]:
# Two application-type one-hot columns are removed from the model input.
drop_cols = ['discrete_application_type_1_one_hot', 'discrete_application_type_2_one_hot']
# Encode train and test with the encoder fitted on the training set, then
# drop the columns listed above.
x_train_dealed = encoder.transform(x_train_ronehot).drop(columns=drop_cols)
x_test_dealed = encoder.transform(x_test_ronehot).drop(columns=drop_cols)

In [14]:
# Baseline parameters, now trained on the target-encoded features.
param_base = {'num_leaves': 31, 'objective': 'binary', 'metric': 'binary', 'num_round':1000}
# Nulls are median-imputed before the 5-fold split. impute_nan derives the
# medians from whichever array it is given, so the train and test matrices
# are each filled with their own column medians.
models_base = train_model(param_base, split_kfold(5, impute_nan(x_train_dealed.values), y_train.values.astype(int).squeeze()))
score_base = test_model_x(models_base, impute_nan(x_test_dealed.values), y_test.values.squeeze())

0-th model is training:
[LightGBM] [Info] Number of positive: 31851, number of negative: 8149
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2525
[LightGBM] [Info] Number of data points in the train set: 40000, number of used features: 85
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.796275 -> initscore=1.363174
[LightGBM] [Info] Start training from score 1.363174
[1]	valid_0's binary_logloss: 0.448173
[2]	valid_0's binary_logloss: 0.405685
[3]	valid_0's binary_logloss: 0.372945
[4]	valid_0's binary_logloss: 0.346858
[5]	valid_0's binary_logloss: 0.325477
[6]	valid_0's binary_logloss: 0.307579
[7]	valid_0's binary_logloss: 0.292439
[8]	valid_0's binary_logloss: 0.279671
[9]	valid_0's binary_logloss: 0.268726
[10]	valid_0's binary_logloss: 0.259412
[11]	valid_0's binary_logloss: 0.251414
[12]	valid_0's binary_logloss: 0.244402
[13]	valid_0's binary_logloss: 0.238227




[14]	valid_0's binary_logloss: 0.233052
[15]	valid_0's binary_logloss: 0.228563
[16]	valid_0's binary_logloss: 0.224707
[17]	valid_0's binary_logloss: 0.22128
[18]	valid_0's binary_logloss: 0.218265
[19]	valid_0's binary_logloss: 0.215627
[20]	valid_0's binary_logloss: 0.213387
[21]	valid_0's binary_logloss: 0.211459
[22]	valid_0's binary_logloss: 0.209627
[23]	valid_0's binary_logloss: 0.208233
[24]	valid_0's binary_logloss: 0.207005
[25]	valid_0's binary_logloss: 0.205791
[26]	valid_0's binary_logloss: 0.2049
[27]	valid_0's binary_logloss: 0.203982
[28]	valid_0's binary_logloss: 0.20326
[29]	valid_0's binary_logloss: 0.202784
[30]	valid_0's binary_logloss: 0.202312
[31]	valid_0's binary_logloss: 0.201855
[32]	valid_0's binary_logloss: 0.201454
[33]	valid_0's binary_logloss: 0.201102
[34]	valid_0's binary_logloss: 0.200732
[35]	valid_0's binary_logloss: 0.200311
[36]	valid_0's binary_logloss: 0.200016
[37]	valid_0's binary_logloss: 0.199858
[38]	valid_0's binary_logloss: 0.199698
[39]

## 构造衍生变量
1. 'discrete_addr_state', 'discrete_grade' 进行target encoding, 超参采用.
```python
{'num_thread': 8,'num_leaves': 100, 'metric': 'binary', 'objective': 'binary', 'num_round': 1000, 'learning_rate': 3e-3, 'feature_fraction': 0.6, 'bagging_fraction': 0.8}
```
准确率： base: 0.91634, tuning: 0.91766

In [20]:
# Tuning run 1: 100 leaves, learning rate 3e-3, feature/bagging fractions,
# trained and scored on median-imputed features.
# NOTE(review): 'bagging_fraction' is given without 'bagging_freq'; per the
# LightGBM parameter docs bagging is presumably inactive - confirm.
param_tuning1= {'num_thread': 8,'num_leaves': 100, 'metric': 'binary', 'objective': 'binary', 'num_round': 1000, 
                'learning_rate': 3e-3, 'feature_fraction': 0.6, 'bagging_fraction': 0.8}
train_and_test(param_tuning1, score_base, impute_nan(x_train_dealed.values), y_train.values.squeeze(), impute_nan(x_test_dealed.values), y_test.values.squeeze())

0-th model is training:
[1]	valid_0's binary_logloss: 0.507101
[2]	valid_0's binary_logloss: 0.505184




[3]	valid_0's binary_logloss: 0.50491
[4]	valid_0's binary_logloss: 0.503017
[5]	valid_0's binary_logloss: 0.501145
[6]	valid_0's binary_logloss: 0.499294
[7]	valid_0's binary_logloss: 0.497472
[8]	valid_0's binary_logloss: 0.495658
[9]	valid_0's binary_logloss: 0.493858
[10]	valid_0's binary_logloss: 0.492072
[11]	valid_0's binary_logloss: 0.490317
[12]	valid_0's binary_logloss: 0.490037
[13]	valid_0's binary_logloss: 0.4883
[14]	valid_0's binary_logloss: 0.486583
[15]	valid_0's binary_logloss: 0.484874
[16]	valid_0's binary_logloss: 0.483185
[17]	valid_0's binary_logloss: 0.481514
[18]	valid_0's binary_logloss: 0.479861
[19]	valid_0's binary_logloss: 0.47822
[20]	valid_0's binary_logloss: 0.476594
[21]	valid_0's binary_logloss: 0.474987
[22]	valid_0's binary_logloss: 0.473414
[23]	valid_0's binary_logloss: 0.471837
[24]	valid_0's binary_logloss: 0.470294
[25]	valid_0's binary_logloss: 0.468746
[26]	valid_0's binary_logloss: 0.467225
[27]	valid_0's binary_logloss: 0.466974
[28]	valid_

In [18]:
# Tuning run 2: same parameter values as param_tuning1, but the feature
# matrices are passed as-is, without the impute_nan step.
param_tuning2= {'num_thread': 8,'num_leaves': 100, 'metric': 'binary', 'objective': 'binary', 'num_round': 1000, 
                'learning_rate': 3e-3, 'feature_fraction': 0.6, 'bagging_fraction': 0.8}
train_and_test(param_tuning2, score_base, x_train_dealed.values, y_train.values.squeeze(), x_test_dealed.values, y_test.values.squeeze())

0-th model is training:
[1]	valid_0's binary_logloss: 0.507104




[2]	valid_0's binary_logloss: 0.505182
[3]	valid_0's binary_logloss: 0.504908
[4]	valid_0's binary_logloss: 0.503016
[5]	valid_0's binary_logloss: 0.501142
[6]	valid_0's binary_logloss: 0.499294
[7]	valid_0's binary_logloss: 0.497466
[8]	valid_0's binary_logloss: 0.495649
[9]	valid_0's binary_logloss: 0.49385
[10]	valid_0's binary_logloss: 0.492067
[11]	valid_0's binary_logloss: 0.49031
[12]	valid_0's binary_logloss: 0.490029
[13]	valid_0's binary_logloss: 0.488286
[14]	valid_0's binary_logloss: 0.486569
[15]	valid_0's binary_logloss: 0.484862
[16]	valid_0's binary_logloss: 0.483171
[17]	valid_0's binary_logloss: 0.481506
[18]	valid_0's binary_logloss: 0.479852
[19]	valid_0's binary_logloss: 0.478212
[20]	valid_0's binary_logloss: 0.476591
[21]	valid_0's binary_logloss: 0.474984
[22]	valid_0's binary_logloss: 0.47341
[23]	valid_0's binary_logloss: 0.47183
[24]	valid_0's binary_logloss: 0.470289
[25]	valid_0's binary_logloss: 0.46873
[26]	valid_0's binary_logloss: 0.467217
[27]	valid_0'

In [37]:
# Redefines param_fine_tuning for the feature-engineered data: 100 leaves
# and an explicit 'verbosity' of 0; the other values match the tuning runs.
param_fine_tuning = {'verbosity':0, 'num_thread': 8,'num_leaves': 100, 'metric': 'binary', 'objective': 'binary', 'num_round': 1000, 
                     'learning_rate': 3e-3, 'feature_fraction': 0.6, 'bagging_fraction': 0.8}

In [38]:
# models_base = train_model(param_base, split_kfold(5, x_train_dealed.values, y_train.values.astype(int).squeeze()))

# Train the 5-fold ensemble on the target-encoded features (no imputation)
# with the current param_fine_tuning.
models_fe = train_model(param_fine_tuning, split_kfold(5, x_train_dealed.values, y_train.values.astype(int).squeeze()))

0-th model is training:
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.




[1]	valid_0's binary_logloss: 0.507133
[2]	valid_0's binary_logloss: 0.505227
[3]	valid_0's binary_logloss: 0.50334
[4]	valid_0's binary_logloss: 0.501456
[5]	valid_0's binary_logloss: 0.499592
[6]	valid_0's binary_logloss: 0.49925
[7]	valid_0's binary_logloss: 0.497418
[8]	valid_0's binary_logloss: 0.495608
[9]	valid_0's binary_logloss: 0.493807
[10]	valid_0's binary_logloss: 0.492026
[11]	valid_0's binary_logloss: 0.491705
[12]	valid_0's binary_logloss: 0.491386
[13]	valid_0's binary_logloss: 0.491108
[14]	valid_0's binary_logloss: 0.489354
[15]	valid_0's binary_logloss: 0.489024
[16]	valid_0's binary_logloss: 0.48732
[17]	valid_0's binary_logloss: 0.485607
[18]	valid_0's binary_logloss: 0.483906
[19]	valid_0's binary_logloss: 0.482234
[20]	valid_0's binary_logloss: 0.480577
[21]	valid_0's binary_logloss: 0.478931
[22]	valid_0's binary_logloss: 0.478622
[23]	valid_0's binary_logloss: 0.478316
[24]	valid_0's binary_logloss: 0.476688
[25]	valid_0's binary_logloss: 0.475098
[26]	valid_0

In [67]:
#     'bagging_fraction': 0.9,
#     #'bagging_freq':5,
#  'boosting': 'goss', # try goss
#  #'drop_rate': 0.1,
#  'extra_trees': True,
#  'feature_fraction': 0.8,
#  #'lambda_l1': 5.039051326772622,
#  #'lambda_l2': 9.176853482302416,
#  'learning_rate': 0.01,
#  'metric': 'binary_error',
#  'min_gain_to_split': 0.5,
#  'num_leaves': 31,
#  'num_round': 1000,
#  'num_threads': 4,
#  'objective': 'binary'
#  #'uniform_drop': False
# }

# Another variant: 127 leaves, learning rate 1e-2, and both sub-sampling
# fractions set to 1. This overwrites param_fine_tuning and models_fe.
param_fine_tuning = {'num_thread': 8,'num_leaves': 127, 'metric': 'binary', 'objective': 'binary', 'num_round': 1000, 
#                      'boosting': 'goss', 'extra_trees': True,
                     'learning_rate': 1e-2, 'feature_fraction': 1, 'bagging_fraction': 1}
models_fe = train_model(param_fine_tuning, split_kfold(5, x_train_dealed.values, y_train.values.astype(int).squeeze()))

0-th model is training:
[LightGBM] [Info] Number of positive: 31851, number of negative: 8149
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2486
[LightGBM] [Info] Number of data points in the train set: 40000, number of used features: 43




[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.796275 -> initscore=1.363174
[LightGBM] [Info] Start training from score 1.363174
[1]	valid_0's binary_logloss: 0.502623
[2]	valid_0's binary_logloss: 0.496428
[3]	valid_0's binary_logloss: 0.490439
[4]	valid_0's binary_logloss: 0.484672
[5]	valid_0's binary_logloss: 0.479082
[6]	valid_0's binary_logloss: 0.473693
[7]	valid_0's binary_logloss: 0.468419
[8]	valid_0's binary_logloss: 0.463321
[9]	valid_0's binary_logloss: 0.458391
[10]	valid_0's binary_logloss: 0.453588
[11]	valid_0's binary_logloss: 0.448947
[12]	valid_0's binary_logloss: 0.444421
[13]	valid_0's binary_logloss: 0.440018
[14]	valid_0's binary_logloss: 0.435735
[15]	valid_0's binary_logloss: 0.431559
[16]	valid_0's binary_logloss: 0.427481
[17]	valid_0's binary_logloss: 0.423524
[18]	valid_0's binary_logloss: 0.419643
[19]	valid_0's binary_logloss: 0.415837
[20]	valid_0's binary_logloss: 0.412127
[21]	valid_0's binary_logloss: 0.408502
[22]	valid_0's binary_logloss: 0.4049

In [39]:
# Score the feature-engineered ensemble on the (un-imputed) encoded test set.
# NOTE(review): the execution counts suggest this cell ran right after the
# In [38] training cell, so models_fe here presumably holds those models and
# not the ones from the In [67] cell above - confirm before trusting the score.
score_fe = test_model_x(models_fe, x_test_dealed.values, y_test.values.squeeze())

# Print the baseline and feature-engineered accuracies side by side.
print('base: {}, tuning: {}'.format(score_base, score_fe))

base: 0.91634, tuning: 0.91766
