In [1]:
import lightgbm as lgb
import pandas as pd
import numpy as np

# dataset

In [2]:
train = pd.read_csv('./train_final.csv', engine='python')
test  = pd.read_csv('./test_final.csv', engine='python')

In [3]:
x_train = train.drop(['loan_status'], axis=1)
y_train = train['loan_status']

x_test = test.drop(['loan_status'], axis=1)
y_test = test['loan_status']

In [4]:
train.shape

(50000, 146)

In [5]:
x = pd.concat([x_train, x_test], axis=0)
y = pd.concat([y_train, y_test], axis=0)

data = pd.concat([x, y], axis=1)
data.shape

(100000, 146)

In [6]:
TRAIN_IDX = x_train.shape[0]
TEST_IDX = TRAIN_IDX + x_test.shape[0]

In [7]:
data.columns.to_list()

['continuous_annual_inc',
 'continuous_annual_inc_joint',
 'continuous_delinq_2yrs',
 'continuous_dti',
 'continuous_dti_joint',
 'continuous_fico_range_high',
 'continuous_fico_range_low',
 'continuous_funded_amnt',
 'continuous_funded_amnt_inv',
 'continuous_inq_last_6mths',
 'continuous_installment',
 'continuous_int_rate',
 'continuous_last_fico_range_high',
 'continuous_last_fico_range_low',
 'continuous_loan_amnt',
 'continuous_mths_since_last_delinq',
 'continuous_mths_since_last_major_derog',
 'continuous_mths_since_last_record',
 'continuous_open_acc',
 'continuous_pub_rec',
 'discrete_addr_state_1_one_hot',
 'discrete_addr_state_2_one_hot',
 'discrete_addr_state_3_one_hot',
 'discrete_addr_state_4_one_hot',
 'discrete_addr_state_5_one_hot',
 'discrete_addr_state_6_one_hot',
 'discrete_addr_state_7_one_hot',
 'discrete_addr_state_8_one_hot',
 'discrete_addr_state_9_one_hot',
 'discrete_addr_state_10_one_hot',
 'discrete_addr_state_11_one_hot',
 'discrete_addr_state_12_one_hot'

## basic data manipulation

In [8]:
data['loan_status'].unique()

array([1, 0], dtype=int64)

In [9]:
data['loan_status'].value_counts()

1    80014
0    19986
Name: loan_status, dtype: int64

# Tree based models

In [10]:
train = data.iloc[:TRAIN_IDX, :]
test = data.iloc[TRAIN_IDX:TEST_IDX, :]

In [11]:
import lightgbm as lgb
train_dataset = lgb.Dataset(train.drop(columns='loan_status'), train['loan_status'])
test_dataset = lgb.Dataset(test.drop(columns='loan_status'), test['loan_status'])

In [12]:
param = {'num_leaves': 31, 'objective': 'binary', 'metric':'binary_error'}
num_round = 2000

In [13]:
# model = lgb.train(param, train_dataset, num_boost_round=num_round, valid_sets=[train_dataset, test_dataset])

## wrapper

In [14]:
import io
import multiprocessing
from contextlib import redirect_stdout
from copy import deepcopy
from dataclasses import dataclass, asdict
import hyperopt.pyll
from hyperopt import fmin, tpe, hp
import numpy as np
import lightgbm as lgb
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
import torch

import copy
cpu_count = 12
use_gpu = False
@dataclass
class LGBOpt:
    num_threads: any = hp.choice('num_threads', [cpu_count])
    num_leaves: any = hp.choice('num_leaves', [4, 6, 12, 20, 32])
    metric: any = hp.choice('metric', ['binary_error'])
    num_round: any = hp.choice('num_round', [2000, 4000])
    objective: any = hp.choice('objective', ['binary'])
    learning_rate: any = hp.uniform('learning_rate', 0.01, 0.1)
    feature_fraction: any = hp.uniform('feature_fraction', 0.5, 1.0)
    bagging_fraction: any = hp.uniform('bagging_fraction', 0.8, 1.0)
#     device_type: any = hp.choice('device_tpye', ['gpu']) if use_gpu else hp.choice('device_type',
#                                                                                    ['cpu'])
    boosting: any = hp.choice('boosting', ['gbdt'])
#     extra_trees: any = hp.choice('extra_tress', [False, True])
#     drop_rate: any = hp.uniform('drop_rate', 0, 0.2)
#     uniform_drop: any = hp.choice('uniform_drop', [True, False])
    lambda_l1: any = hp.uniform('lambda_l1', 0, 10)  # TODO: Check range
    lambda_l2: any = hp.uniform('lambda_l2', 0, 10)  # TODO: Check range
#     min_gain_to_split: any = hp.uniform('min_gain_to_split', 0, 1)  # TODO: Check range
    min_data_in_bin = hp.choice('min_data_in_bin', [3, 5, 10, 15, 20, 50])

    @staticmethod
    def get_common_params():
        return {'num_thread': 4, 'num_leaves': 12, 'metric': 'binary', 'objective': 'binary',
                'num_round': 1000, 'learning_rate': 0.01, 'feature_fraction': 0.8, 'bagging_fraction': 0.8}
    

In [15]:
class FitterBase(object):
    def __init__(self, label, metric, max_eval=100, opt=None):
        self.label = label
        self.metric = metric
        self.opt_params = dict()
        self.max_eval = max_eval
        self.opt = opt

    def get_loss(self, y, y_pred):
        if self.metric == 'error':
            return 1 - accuracy_score(y, y_pred)
        elif self.metric == 'precision':
            return 1 - precision_score(y, y_pred)
        elif self.metric == 'recall':
            return 1 - recall_score(y, y_pred)
        elif self.metric == 'macro_f1':
            return 1 - f1_score(y, y_pred, average='macro')
        elif self.metric == 'micro_f1':
            return 1 - f1_score(y, y_pred, average='micro')
        elif self.metric == 'auc':  # TODO: Add a warning checking if y_predict is all [0, 1], it should be probability
            return 1 - roc_auc_score(y, y_pred)
        else:
            raise Exception("Not implemented yet.")


In [16]:
class LGBFitter(FitterBase):
    def __init__(self, label='label', metric='error', opt: LGBOpt = None, max_eval=100):
        super(LGBFitter, self).__init__(label, metric, max_eval)
        if opt is not None:
            self.opt = opt
        else:
            self.opt = LGBOpt()
        self.best_round = None
        self.clf = None

    def train(self, train_df, eval_df, params=None, use_best_eval=True):
        self.best_round = None
        dtrain = lgb.Dataset(train_df.drop(columns=[self.label]), train_df[self.label])
        deval = lgb.Dataset(eval_df.drop(columns=[self.label]), eval_df[self.label])
        evallist = [dtrain, deval]
        if params is None:
            use_params = deepcopy(self.opt_params)
        else:
            use_params = deepcopy(params)

        num_round = use_params.pop('num_round')
        if use_best_eval:
            with io.StringIO() as buf, redirect_stdout(buf):
                self.clf = lgb.train(use_params, dtrain, num_round, valid_sets=evallist)
                output = buf.getvalue().split("\n")
            min_error = np.inf
            min_index = 0
            for idx in range(len(output) - 1):
                if len(output[idx].split("\t")) == 3:
                    temp = float(output[idx].split("\t")[2].split(":")[1])
                    if min_error > temp:
                        min_error = temp
                        min_index = int(output[idx].split("\t")[0][1:-1])
            print("The minimum is attained in round %d" % (min_index + 1))
            self.best_round = min_index + 1
            return output
        else:
            with io.StringIO() as buf, redirect_stdout(buf):
                self.clf = lgb.train(use_params, dtrain, num_round, valid_sets=evallist)
                output = buf.getvalue().split("\n")
            self.best_round = num_round
            return output

    def search(self, train_df, eval_df, use_best_eval=True):
        self.opt_params = dict()

        def train_impl(params):
            self.train(train_df, eval_df, params, use_best_eval)
            if self.metric == 'auc':
                y_pred = self.clf.predict(eval_df.drop(columns=[self.label]), num_iteration=self.best_round)
            else:
                y_pred = (self.clf.predict(eval_df.drop(columns=[self.label]),
                                           num_iteration=self.best_round) > 0.5).astype(int)
            return self.get_loss(eval_df[self.label], y_pred)

        self.opt_params = fmin(train_impl, asdict(self.opt), algo=tpe.suggest, max_evals=self.max_eval)

    def search_k_fold(self, k_fold, data, use_best_eval=True):
        self.opt_params = dict()

        def train_impl_nfold(params):
            loss = list()
             
            print(params)
            for train_id, eval_id in k_fold.split(data):
                train_df = data.loc[train_id]
                eval_df = data.loc[eval_id]
                self.train(train_df, eval_df, params, use_best_eval)
                if self.metric == 'auc':
                    y_pred = self.clf.predict(eval_df.drop(columns=[self.label]), num_iteration=self.best_round)
                else:
                    y_pred = (self.clf.predict(eval_df.drop(columns=[self.label]),
                                               num_iteration=self.best_round) > 0.5).astype(int)
                loss.append(self.get_loss(eval_df[self.label], y_pred))
            print(100*np.mean(loss), '%')
            return np.mean(loss)

        self.opt_params = fmin(train_impl_nfold, asdict(self.opt), algo=tpe.suggest, max_evals=self.max_eval)

    def train_k_fold(self, k_fold, train_data, test_data, params=None, drop_test_y=True, use_best_eval=True):
        acc_result = list()
        test_acc_result = list()
        train_pred = np.empty(train_data.shape[0])
        test_pred = np.empty(test_data.shape[0])
        test_preds = None
        
        if drop_test_y:
            dtest = test_data.drop(columns=self.label)
        else:
            dtest = test_data

        models = list()
        for train_id, eval_id in k_fold.split(train_data):
            train_df = train_data.loc[train_id]
            eval_df = train_data.loc[eval_id]
            self.train(train_df, eval_df, params, use_best_eval)
            models.append(copy.deepcopy(self.clf))
            train_pred[eval_id] = self.clf.predict(eval_df.drop(columns=self.label), num_iteration=self.best_round)
            if self.metric == 'auc':
                y_pred = self.clf.predict(eval_df.drop(columns=[self.label]), num_iteration=self.best_round)
            else:
                y_pred = (self.clf.predict(eval_df.drop(columns=[self.label]),
                                           num_iteration=self.best_round) > 0.5).astype(int)
            acc_result.append(self.get_loss(eval_df[self.label], y_pred))
            test_pred = self.clf.predict(dtest, num_iteration=self.best_round)
        
            # save k-fold test_preds
            test_pred = np.expand_dims(test_pred, axis=0)
            if test_preds is None:
                test_preds = test_pred
            else:
                test_preds = np.concatenate((test_preds, test_pred), axis=0)
        return train_pred, test_preds, acc_result, models

## train

In [18]:
fitter = LGBFitter(label='loan_status')

from sklearn.model_selection import KFold
kfold = KFold(n_splits=5)

In [88]:
import numpy as np
num_leaves = [6, 12, 20, 32, 48, 64, 80, 96, 127]
# num_leaves = [i for i in range(6, 14, 2)]
for num_leave in num_leaves:
# lrs = [0.005, 0.01, 0.02]
# for lr in lrs:
# fractions = [0.6, 0.7, 0.8, 0.9, 1.0]
# for fraction in fractions:
# l1s = np.linspace(0, 2, num=20)
# for l1 in l1s:
    params = {'num_thread': 12, 'num_leaves': num_leave, 'boosting': 'dart', 'metric': 'binary', 'objective': 'binary', 
          'num_round': 1000, 'learning_rate': 0.1, 'feature_fraction': 0.8, 'bagging_fraction': 0.8}
    print(params)
    _, _, error_rate, _ = fitter.train_k_fold(kfold, train, test, params = params)
    print(100 * np.mean(error_rate), '%')

{'num_thread': 12, 'num_leaves': 6, 'boosting': 'dart', 'metric': 'binary', 'objective': 'binary', 'num_round': 1000, 'learning_rate': 0.1, 'feature_fraction': 0.8, 'bagging_fraction': 0.8}
The minimum is attained in round 504
Finished loading model, total used 1000 iterations
The minimum is attained in round 489
Finished loading model, total used 1000 iterations
The minimum is attained in round 844
Finished loading model, total used 1000 iterations
The minimum is attained in round 620
Finished loading model, total used 1000 iterations
The minimum is attained in round 513
Finished loading model, total used 1000 iterations
7.990000000000001 %
{'num_thread': 12, 'num_leaves': 12, 'boosting': 'dart', 'metric': 'binary', 'objective': 'binary', 'num_round': 1000, 'learning_rate': 0.1, 'feature_fraction': 0.8, 'bagging_fraction': 0.8}
The minimum is attained in round 434
Finished loading model, total used 1000 iterations
The minimum is attained in round 540
Finished loading model, total used

KeyboardInterrupt: 

# 当前最优参数，做test

In [102]:
#########################################################################################################
#                                         boosting = 'gbdt'                                             #
#########################################################################################################
# 7.983999999999999 %  from self
# Vote           8.204%
# mean, Predict, 8.206000000000003%
params1 = {'num_thread': 12, 'num_leaves': 12, 'metric': 'binary', 'objective': 'binary', 
          'num_round': 4000, 'learning_rate': 0.004, 'feature_fraction': 0.8, 'bagging_fraction': 0.8}

In [103]:
# 7.847999999999999 % from hyperopt
# Vote           8.177999999999997%
# mean, Predict, 8.189999999999998%
params2 = {'bagging_fraction': 0.936883991179611, 'boosting': 'goss', 'device_type': 'cpu', 
          'drop_rate': 0.11852930241041167, 'extra_trees': True, 'feature_fraction': 0.5227806161720856, 
          'lambda_l1': 4.3254509530619, 'lambda_l2': 2.5119994536346746, 'learning_rate': 0.0608188253086666, 
          'metric': 'binary_error', 'min_gain_to_split': 0.2888168506543208, 'num_leaves': 6, 
          'num_round': 2000, 'num_threads': 12, 'objective': 'binary', 'uniform_drop': False}

In [104]:
# 7.962 % plus regularization
# Vote           8.225999999999999%
# mean, Predict, 8.238000000000001%
params3 = {'num_thread': 12, 'num_leaves': 12, 'metric': 'binary', 'objective': 'binary', 'num_round': 200, 
          'learning_rate': 0.1, 'feature_fraction': 0.8, 'bagging_fraction': 0.8, 
          'lambda_l1': 0.6063157894736843, 'lambda_l2': 0.26}

In [105]:
#########################################################################################################
#                                         boosting = 'goss'                                             #
#########################################################################################################
# 7.928000000000002 %
# Vote           8.187999999999995
# mean, Predict, 8.206000000000003
params4 = {'num_thread': 12, 'num_leaves': 8, 'boosting': 'goss', 'metric': 'binary', 'objective': 'binary', 
          'num_round': 4000, 'learning_rate': 0.005, 'feature_fraction': 0.8, 'bagging_fraction': 0.8}

In [106]:
#########################################################################################################
#                                         boosting = 'dart'                                             #
#########################################################################################################
# 7.990000000000001 %
# Vote           
# mean, Predict, 

params5 = {'num_thread': 12, 'num_leaves': 6, 'boosting': 'dart', 'metric': 'binary', 
           'objective': 'binary', 'num_round': 1000, 'learning_rate': 0.1, 'feature_fraction': 0.8, 
           'bagging_fraction': 0.8}

In [107]:
#########################################################################################################
#                                         merge                                                         #
#########################################################################################################
test_preds = None
params = [params1, params4, params5]
for i in range(len(params)):
    print(params[i])
    _, test_pred, error_rate, _ = fitter.train_k_fold(kfold, train, test, params = params[i])
    print(100 * np.mean(error_rate), '%')
    
    if test_preds is None:
        test_preds = test_pred
    else:
        test_preds = np.concatenate((test_preds, test_pred), axis=0)

print(test_preds.shape)

{'num_thread': 12, 'num_leaves': 12, 'metric': 'binary', 'objective': 'binary', 'num_round': 4000, 'learning_rate': 0.004, 'feature_fraction': 0.8, 'bagging_fraction': 0.8}
The minimum is attained in round 2143
Finished loading model, total used 4000 iterations
The minimum is attained in round 2194
Finished loading model, total used 4000 iterations
The minimum is attained in round 2226
Finished loading model, total used 4000 iterations
The minimum is attained in round 3538
Finished loading model, total used 4000 iterations
The minimum is attained in round 1695
Finished loading model, total used 4000 iterations
7.983999999999999 %
{'num_thread': 12, 'num_leaves': 8, 'boosting': 'goss', 'metric': 'binary', 'objective': 'binary', 'num_round': 4000, 'learning_rate': 0.005, 'feature_fraction': 0.8, 'bagging_fraction': 0.8}
The minimum is attained in round 3183
Finished loading model, total used 4000 iterations
The minimum is attained in round 2095
Finished loading model, total used 4000 ite

In [108]:
# test
def test_error(test, test_preds, label='loan_status', flag=True):
    """
    flag: True,      , Vote
        : False, mean, Predict
    """
    loss = FitterBase(label=label, metric='error')
    test_label = test[label]
    threshold = np.ceil(test_preds.shape[0] / 2).astype('int')
#     print(threshold)
    if flag:
        print('Vote')
        preds = (test_preds > 0.5).astype(int)
        pred = (np.sum(preds, axis=0) > threshold).astype(int) 
    else:
        print('mean, Predict')
        pred = (np.mean(test_preds, axis=0) > 0.5).astype(int)
    return 100 * loss.get_loss(test_label, pred)

print(test_error(test, test_preds))    
print(test_error(test, test_preds, flag=False))

Vote
8.179999999999998
mean, Predict
8.153999999999995


# hyperopt

In [208]:
from sklearn.model_selection import KFold
kfold = KFold(n_splits=5)

opt = LGBOpt()
fitter = LGBFitter(label='loan_status', opt=opt)

fitter.search_k_fold(kfold, train, use_best_eval=True)
# _, _, error_rate, _ = fitter.train_k_fold(kfold, train, test, params=None)
# print(params)
# print(100 * np.mean(error_rate), '%')

{'bagging_fraction': 0.9019749724390823, 'boosting': 'gbdt', 'feature_fraction': 0.571313880484396, 'lambda_l1': 2.9227308946837747, 'lambda_l2': 1.8184290733490016, 'learning_rate': 0.017191306108113986, 'metric': 'binary_error', 'num_leaves': 6, 'num_round': 4000, 'num_threads': 12, 'objective': 'binary'}
The minimum is attained in round 1083                                                                                  
The minimum is attained in round 3056                                                                                  
The minimum is attained in round 554                                                                                   
The minimum is attained in round 2626                                                                                  
The minimum is attained in round 2555                                                                                  
7.956                                                                                      

The minimum is attained in round 228                                                                                   
The minimum is attained in round 55                                                                                    
The minimum is attained in round 143                                                                                   
The minimum is attained in round 585                                                                                   
The minimum is attained in round 77                                                                                    
7.962                                                                                                                  
%                                                                                                                      
{'bagging_fraction': 0.9102855993699241, 'boosting': 'gbdt', 'feature_fraction': 0.808693155937112, 'lambda_l1': 8.443214957672417, 'lambda_l2': 6.4859269475458

The minimum is attained in round 153                                                                                   
The minimum is attained in round 645                                                                                   
The minimum is attained in round 636                                                                                   
7.932000000000001                                                                                                      
%                                                                                                                      
{'bagging_fraction': 0.8109532933792166, 'boosting': 'gbdt', 'feature_fraction': 0.6239789204380648, 'lambda_l1': 1.3028217050572921, 'lambda_l2': 0.1369455259306145, 'learning_rate': 0.09941644123998278, 'metric': 'binary_error', 'num_leaves': 4, 'num_round': 2000, 'num_threads': 12, 'objective': 'binary'}
The minimum is attained in round 352                                                       

The minimum is attained in round 114                                                                                   
7.986                                                                                                                  
%                                                                                                                      
{'bagging_fraction': 0.8625488910868988, 'boosting': 'gbdt', 'feature_fraction': 0.6366447548272688, 'lambda_l1': 0.5474914359652187, 'lambda_l2': 3.4460691142886475, 'learning_rate': 0.09566202725484102, 'metric': 'binary_error', 'num_leaves': 4, 'num_round': 2000, 'num_threads': 12, 'objective': 'binary'}
The minimum is attained in round 104                                                                                   
The minimum is attained in round 945                                                                                   
The minimum is attained in round 148                                                       

%                                                                                                                      
{'bagging_fraction': 0.8830114733386507, 'boosting': 'gbdt', 'feature_fraction': 0.5292249665356663, 'lambda_l1': 9.086509319773187, 'lambda_l2': 3.7537327874852355, 'learning_rate': 0.02165703995225189, 'metric': 'binary_error', 'num_leaves': 12, 'num_round': 4000, 'num_threads': 12, 'objective': 'binary'}
The minimum is attained in round 572                                                                                   
The minimum is attained in round 154                                                                                   
The minimum is attained in round 711                                                                                   
The minimum is attained in round 712                                                                                   
The minimum is attained in round 275                                                       

The minimum is attained in round 423                                                                                   
The minimum is attained in round 891                                                                                   
The minimum is attained in round 195                                                                                   
The minimum is attained in round 1364                                                                                  
The minimum is attained in round 1195                                                                                  
7.904                                                                                                                  
%                                                                                                                      
{'bagging_fraction': 0.962094471857831, 'boosting': 'gbdt', 'feature_fraction': 0.735887663339937, 'lambda_l1': 2.3082544458934735, 'lambda_l2': 0.3458250133282

The minimum is attained in round 244                                                                                   
The minimum is attained in round 269                                                                                   
The minimum is attained in round 311                                                                                   
7.9399999999999995                                                                                                     
%                                                                                                                      
{'bagging_fraction': 0.9533589534113839, 'boosting': 'gbdt', 'feature_fraction': 0.661781186318499, 'lambda_l1': 0.010547106460079059, 'lambda_l2': 2.977101169303717, 'learning_rate': 0.0929037736548816, 'metric': 'binary_error', 'num_leaves': 4, 'num_round': 4000, 'num_threads': 12, 'objective': 'binary'}
The minimum is attained in round 774                                                        

The minimum is attained in round 2928                                                                                  
7.909999999999999                                                                                                      
%                                                                                                                      
{'bagging_fraction': 0.9651409139986046, 'boosting': 'gbdt', 'feature_fraction': 0.8409241783241013, 'lambda_l1': 1.5972255047064463, 'lambda_l2': 4.138004266819378, 'learning_rate': 0.023405262314720315, 'metric': 'binary_error', 'num_leaves': 12, 'num_round': 2000, 'num_threads': 12, 'objective': 'binary'}
The minimum is attained in round 188                                                                                   
The minimum is attained in round 124                                                                                   
The minimum is attained in round 648                                                      