In [2]:
import pandas as pd
import numpy as np
import lightgbm as lgb

In [3]:
import copy

In [4]:
import io
import multiprocessing
import hyperopt.pyll

In [5]:
from sklearn.model_selection import KFold
from contextlib import redirect_stdout
from copy import deepcopy
from dataclasses import dataclass, asdict

from hyperopt import fmin, tpe, hp
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score

cpu_count = 4
use_gpu = False

# EDA

In [6]:
df_train = pd.read_csv('train_final.csv')
df_test = pd.read_csv('test_final.csv')

In [12]:
df_train.columns.to_list()

['continuous_annual_inc',
 'continuous_annual_inc_joint',
 'continuous_delinq_2yrs',
 'continuous_dti',
 'continuous_dti_joint',
 'continuous_fico_range_high',
 'continuous_fico_range_low',
 'continuous_funded_amnt',
 'continuous_funded_amnt_inv',
 'continuous_inq_last_6mths',
 'continuous_installment',
 'continuous_int_rate',
 'continuous_last_fico_range_high',
 'continuous_last_fico_range_low',
 'continuous_loan_amnt',
 'loan_status',
 'continuous_mths_since_last_delinq',
 'continuous_mths_since_last_major_derog',
 'continuous_mths_since_last_record',
 'continuous_open_acc',
 'continuous_pub_rec',
 'discrete_addr_state_1_one_hot',
 'discrete_addr_state_2_one_hot',
 'discrete_addr_state_3_one_hot',
 'discrete_addr_state_4_one_hot',
 'discrete_addr_state_5_one_hot',
 'discrete_addr_state_6_one_hot',
 'discrete_addr_state_7_one_hot',
 'discrete_addr_state_8_one_hot',
 'discrete_addr_state_9_one_hot',
 'discrete_addr_state_10_one_hot',
 'discrete_addr_state_11_one_hot',
 'discrete_addr_s

In [16]:
# 50000 samples
df_train['loan_status'].value_counts()

1    39788
0    10212
Name: loan_status, dtype: int64

In [85]:
cols = df_train.columns.to_list()
x_cols = [v for v in cols if (v not in ['loan_status'])]

In [9]:
# 创建训练集
train_x = df_train.loc[:, x_cols]
train_y = df_train.loc[:, ['loan_status']]
train_dataset = lgb.Dataset(train_x, train_y)

In [10]:
#创建测试集
test_x = df_test.loc[:, x_cols]
test_y = df_test.loc[:, ['loan_status']]
test_dataset = lgb.Dataset(test_x, test_y)

In [104]:
# 设置算法参数
hyp_params = {'num_leaves': 31, 'objective': 'binary', 'metric':'binary_error'}
num_round = 2000

In [41]:
model = lgb.train(hyp_params, train_dataset, num_boost_round=2000, valid_sets=[train_dataset])

[LightGBM] [Info] Number of positive: 39788, number of negative: 10212
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2611
[LightGBM] [Info] Number of data points in the train set: 50000, number of used features: 141
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.795760 -> initscore=1.360002
[LightGBM] [Info] Start training from score 1.360002
[1]	training's binary_error: 0.20424
[2]	training's binary_error: 0.20424
[3]	training's binary_error: 0.20424
[4]	training's binary_error: 0.20424
[5]	training's binary_error: 0.14838
[6]	training's binary_error: 0.11022
[7]	training's binary_error: 0.099
[8]	training's binary_error: 0.09388
[9]	training's binary_error: 0.08814
[10]	training's binary_error: 0.08404
[11]	training's binary_error: 0.08162
[12]	training's binary_error: 0.08022
[13]	training's binary_error: 0.07902
[14]	training's binary_error: 0.0782
[15]	training's binary_erro

In [26]:
@dataclass
class LGBOpt:
    num_threads: any = hp.choice('num_threads', [cpu_count])
    num_leaves: any = hp.choice('num_leaves', [64])
    metric: any = hp.choice('metric', ['binary_error'])
    num_round: any = hp.choice('num_rounds', [1000])
    objective: any = hp.choice('objective', ['binary'])
    learning_rate: any = hp.uniform('learning_rate', 0.01, 0.1)
    feature_fraction: any = hp.uniform('feature_fraction', 0.5, 1.0)
    bagging_fraction: any = hp.uniform('bagging_fraction', 0.8, 1.0)
    device_type: any = hp.choice('device_tpye', ['gpu']) if use_gpu else hp.choice('device_type',
                                                                                   ['cpu'])
    boosting: any = hp.choice('boosting', ['gbdt', 'dart', 'goss'])
    extra_trees: any = hp.choice('extra_tress', [False, True])
    drop_rate: any = hp.uniform('drop_rate', 0, 0.2)
    uniform_drop: any = hp.choice('uniform_drop', [True, False])
    lambda_l1: any = hp.uniform('lambda_l1', 0, 10)  # TODO: Check range
    lambda_l2: any = hp.uniform('lambda_l2', 0, 10)  # TODO: Check range
    min_gain_to_split: any = hp.uniform('min_gain_to_split', 0, 1)  # TODO: Check range
    min_data_in_bin = hp.choice('min_data_in_bin', [3, 5, 10, 15, 20, 50])

    @staticmethod
    def get_common_params():
        return {'num_thread': 4, 'num_leaves': 12, 'metric': 'binary', 'objective': 'binary',
                'num_round': 1000, 'learning_rate': 0.01, 'feature_fraction': 0.8, 'bagging_fraction': 0.8}

In [12]:
class FitterBase(object):
    def __init__(self, label, metric, max_eval=100, opt=None):
        self.label = label
        self.metric = metric
        self.opt_params = dict()
        self.max_eval = max_eval
        self.opt = opt
        
    def get_loss(self, y, y_pred):
        return 1 - accuracy_score(y, y_pred)
    

In [95]:
class LGBFitter(FitterBase):
    def __init__(self, label='label', metric='error', opt: LGBOpt = None, max_eval=100):
        super(LGBFitter, self).__init__(label, metric, max_eval)
        if opt is not None:
            self.opt = opt
        else:
            self.opt = LGBOpt()
        self.best_round = None
        self.clf = None

    def train(self, train_df, eval_df, params=None, use_best_eval=True):
        self.best_round = None
        dtrain = lgb.Dataset(train_df.drop(columns=[self.label]), train_df[self.label])
        deval = lgb.Dataset(eval_df.drop(columns=[self.label]), eval_df[self.label])
        evallist = [dtrain, deval]
        if params is None:
            use_params = deepcopy(self.opt_params)
        else:
            use_params = deepcopy(params)
        print(use_params)
        num_round = use_params.pop('num_round')
        if use_best_eval:
            with io.StringIO() as buf, redirect_stdout(buf):
                self.clf = lgb.train(use_params, dtrain, num_round, valid_sets=evallist)
                output = buf.getvalue().split("\n")
            min_error = np.inf
            min_index = 0
            for idx in range(len(output) - 1):
                if len(output[idx].split("\t")) == 3:
                    temp = float(output[idx].split("\t")[2].split(":")[1])
                    if min_error > temp:
                        min_error = temp
                        min_index = int(output[idx].split("\t")[0][1:-1])
            print("The minimum is attained in round %d, min_error:%f" % (min_index + 1, min_error))
            self.best_round = min_index + 1
            return output
        else:
            with io.StringIO() as buf, redirect_stdout(buf):
                self.clf = lgb.train(use_params, dtrain, num_round, valid_sets=evallist)
                output = buf.getvalue().split("\n")
            self.best_round = num_round
            return output

    def search(self, train_df, eval_df, use_best_eval=True):
        self.opt_params = dict()

        def train_impl(params):
            self.train(train_df, eval_df, params, use_best_eval)
            if self.metric == 'auc':
                y_pred = self.clf.predict(eval_df.drop(columns=[self.label]), num_iteration=self.best_round)
            else:
                y_pred = (self.clf.predict(eval_df.drop(columns=[self.label]),
                                           num_iteration=self.best_round) > 0.5).astype(int)
            return self.get_loss(eval_df[self.label], y_pred)

        self.opt_params = fmin(train_impl, asdict(self.opt), algo=tpe.suggest, max_evals=self.max_eval)

    def search_k_fold(self, k_fold, data, use_best_eval=True):
        self.opt_params = dict()

        def train_impl_nfold(params):
            loss = list()
            for train_id, eval_id in k_fold.split(data):
                train_df = data.loc[train_id]
                eval_df = data.loc[eval_id]
                self.train(train_df, eval_df, params, use_best_eval)
                if self.metric == 'auc':
                    y_pred = self.clf.predict(eval_df.drop(columns=[self.label]), num_iteration=self.best_round)
                else:
                    y_pred = (self.clf.predict(eval_df.drop(columns=[self.label]),
                                               num_iteration=self.best_round) > 0.5).astype(int)
                loss.append(self.get_loss(eval_df[self.label], y_pred))
            return np.mean(loss)

        self.opt_params = fmin(train_impl_nfold, asdict(self.opt), algo=tpe.suggest, max_evals=self.max_eval)

    def train_k_fold(self, k_fold, train_data, test_data, params=None, drop_test_y=True, use_best_eval=True):
        acc_result = list()
        train_pred = np.empty(train_data.shape[0])
        test_pred = np.empty(test_data.shape[0])
        if drop_test_y:
            dtest = test_data.drop(columns=self.label)
        else:
            dtest = test_data

        models = list()
        for train_id, eval_id in k_fold.split(train_data):
            train_df = train_data.loc[train_id]
            eval_df = train_data.loc[eval_id]
            self.train(train_df, eval_df, params, use_best_eval)
            models.append(copy.deepcopy(self.clf))
            train_pred[eval_id] = self.clf.predict(eval_df.drop(columns=self.label), num_iteration=self.best_round)
            if self.metric == 'auc':
                y_pred = self.clf.predict(eval_df.drop(columns=[self.label]), num_iteration=self.best_round)
            else:
                y_pred = (self.clf.predict(eval_df.drop(columns=[self.label]), num_iteration=self.best_round) > 0.5).astype(int)
            acc_result.append(self.get_loss(eval_df[self.label], y_pred))
            test_pred += self.clf.predict(dtest, num_iteration=self.best_round)
        test_pred /= k_fold.n_splits
        return train_pred, test_pred, acc_result, models        

In [97]:
fitter = LGBFitter(label='loan_status')

# 缺省变量及参数
1. 变量共 145个
2. params = {'num_thread': 4, 'num_leaves': 12, 'metric': 'binary_error', 'objective': 'binary',
                'num_round': 2000, 'learning_rate': 0.02, 'feature_fraction': 0.8, 'bagging_fraction': 0.8}

In [100]:

params = {'num_thread': 4, 'num_leaves': 12, 'metric': 'binary_error', 'objective': 'binary',
                'num_round': 2000, 'learning_rate': 0.02, 'feature_fraction': 0.8, 'bagging_fraction': 0.8}
#fitter.train_k_fold(kfold, df_train, df_test, params = params)
fitter.train(df_train, df_test, params=params)

{'num_thread': 4, 'num_leaves': 12, 'metric': 'binary_error', 'objective': 'binary', 'num_round': 2000, 'learning_rate': 0.02, 'feature_fraction': 0.8, 'bagging_fraction': 0.8}
The minimum is attained in round 430, min_error:0.082020


['[LightGBM] [Info] Number of positive: 39788, number of negative: 10212',
 'You can set `force_col_wise=true` to remove the overhead.',
 '[LightGBM] [Info] Total Bins 2611',
 '[LightGBM] [Info] Number of data points in the train set: 50000, number of used features: 141',
 '[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.795760 -> initscore=1.360002',
 '[LightGBM] [Info] Start training from score 1.360002',
 "[1]\ttraining's binary_error: 0.20424\tvalid_1's binary_error: 0.19548",
 "[2]\ttraining's binary_error: 0.20424\tvalid_1's binary_error: 0.19548",
 "[3]\ttraining's binary_error: 0.20424\tvalid_1's binary_error: 0.19548",
 "[4]\ttraining's binary_error: 0.20424\tvalid_1's binary_error: 0.19548",
 "[5]\ttraining's binary_error: 0.20424\tvalid_1's binary_error: 0.19548",
 "[6]\ttraining's binary_error: 0.20424\tvalid_1's binary_error: 0.19548",
 "[7]\ttraining's binary_error: 0.20424\tvalid_1's binary_error: 0.19548",
 "[8]\ttraining's binary_error: 0.20424\tvalid_1's binary_erro

# 各变量的贡献度查看
1. 各变量的贡献较为均匀(除application_type外，其余没有明显不相干变量）。
2. 部分分类变量 可适当减少其分类取值。

In [59]:
importance_df = pd.DataFrame({
    'feature': fitter.clf.feature_name(),
    'importance': fitter.clf.feature_importance('gain')
}).sort_values('importance', ascending=False)

In [89]:
importance_df[importance_df['feature'].str.contains('application_type')]

Unnamed: 0,feature,importance
69,discrete_application_type_1_one_hot,5.96574
70,discrete_application_type_2_one_hot,0.0


In [65]:
# importance_df.to_csv('feature_importance.csv', index=False)

# 特征工程
1. discrete_purpose_11_one_hot, discrete_purpose_8_one_hot, discrete_purpose_7_one_hot, discrete_purpose_12_one_hot 几类合并？
2. application_type 去掉后，最小误差  0.081540

In [138]:
opt_params = {
    'bagging_fraction': 0.9,
 'boosting': 'goss', # try goss
 #'drop_rate': 0.1,
 'extra_trees': True,
 'feature_fraction': 0.75,
 #'lambda_l1': 5.039051326772622,
 #'lambda_l2': 9.176853482302416,
 'learning_rate': 0.01,
 'metric': 'binary_error',
 'min_gain_to_split': 0.5,
 'num_leaves': 31,
 'num_round': 1000,
 'num_threads': 4,
 'objective': 'binary',
 #'uniform_drop': False
}
#fitter.train_k_fold(kfold, df_train, df_test, params = opt_params)

In [140]:
drop_cols=['discrete_application_type_1_one_hot', 'discrete_application_type_2_one_hot', 'discrete_purpose_12_one_hot']
fitter.train(df_train.drop(columns=drop_cols), df_test.drop(columns=drop_cols), params=opt_params)

{'bagging_fraction': 0.9, 'boosting': 'goss', 'extra_trees': True, 'feature_fraction': 0.75, 'learning_rate': 0.01, 'metric': 'binary_error', 'min_gain_to_split': 0.5, 'num_leaves': 31, 'num_round': 1000, 'num_threads': 4, 'objective': 'binary'}
The minimum is attained in round 881, min_error:0.081780


['[LightGBM] [Info] Number of positive: 39788, number of negative: 10212',
 'You can set `force_row_wise=true` to remove the overhead.',
 'And if memory is not enough, you can set `force_col_wise=true`.',
 '[LightGBM] [Info] Total Bins 2611',
 '[LightGBM] [Info] Number of data points in the train set: 50000, number of used features: 141',
 '[LightGBM] [Info] Using GOSS',
 '[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.795760 -> initscore=1.360002',
 '[LightGBM] [Info] Start training from score 1.360002',
 "[1]\ttraining's binary_error: 0.20424\tvalid_1's binary_error: 0.19548",
 "[2]\ttraining's binary_error: 0.20424\tvalid_1's binary_error: 0.19548",
 "[3]\ttraining's binary_error: 0.20424\tvalid_1's binary_error: 0.19548",
 "[4]\ttraining's binary_error: 0.20424\tvalid_1's binary_error: 0.19548",
 "[5]\ttraining's binary_error: 0.20424\tvalid_1's binary_error: 0.19548",
 "[6]\ttraining's binary_error: 0.20424\tvalid_1's binary_error: 0.19548",
 "[7]\ttraining's binary_error: 0.20

In [105]:
kfold = KFold(n_splits=5)

In [31]:
fitter.search_k_fold(kfold, df_train)

The minimum is attained in round 58                    
The minimum is attained in round 102                   
The minimum is attained in round 72                    
The minimum is attained in round 244                   
The minimum is attained in round 318                   
The minimum is attained in round 93                                                  
The minimum is attained in round 40                                                  
The minimum is attained in round 49                                                  
The minimum is attained in round 237                                                 
The minimum is attained in round 192                                                 
The minimum is attained in round 58                                                  
The minimum is attained in round 34                                     
The minimum is attained in round 42                                     
The minimum is attained in round 41                       