In [1]:
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

# 0.Base Line

#### Load data and data preprocessing

In [2]:
seed = 42 # for the same data division

kf = KFold(n_splits=5, random_state=seed,shuffle=True)
df_train = pd.read_csv('./data/train_final.csv')
df_test = pd.read_csv('./data/test_final.csv')

X_train = df_train.drop(columns=['loan_status']).values
Y_train = df_train['loan_status'].values.astype(int)
X_test = df_test.drop(columns=['loan_status']).values
Y_test = df_test['loan_status'].values.astype(int)

In [3]:
X_train.shape, Y_train.shape

((50000, 145), (50000,))

In [10]:
X_test.shape, Y_test.shape

((50000, 145), (50000,))

In [11]:
# split data for five fold

five_fold_data = []

for train_index, eval_index in kf.split(X_train):
    x_train, x_eval = X_train[train_index], X_train[eval_index]
    y_train, y_eval = Y_train[train_index], Y_train[eval_index]
    
    five_fold_data.append([(x_train, y_train), (x_eval, y_eval)])

#### Algorithm

In [12]:
def get_model(param):
    model_list = []
    for idx, [(x_train, y_train), (x_eval, y_eval)] in enumerate(five_fold_data):
        print('{}-th model is training:'.format(idx))
        train_data = lgb.Dataset(x_train, label=y_train)
        validation_data = lgb.Dataset(x_eval, label=y_eval)
        bst = lgb.train(param, train_data, valid_sets=[validation_data])
        model_list.append(bst)
    return model_list

#### Train

In [13]:
param_base = {'num_leaves': 31, 'objective': 'binary', 'metric': 'binary', 'num_round':1000}

param_fine_tuning = {'num_thread': 8,'num_leaves': 128, 'metric': 'binary', 'objective': 'binary', 'num_round': 1000, 
                     'learning_rate': 3e-3, 'feature_fraction': 0.6, 'bagging_fraction': 0.8}

In [14]:
# base param train
param_base_model = get_model(param_base)

# param fine tuning
param_fine_tuning_model = get_model(param_fine_tuning)

0-th model is training:
[LightGBM] [Info] Number of positive: 31851, number of negative: 8149
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2583
[LightGBM] [Info] Number of data points in the train set: 40000, number of used features: 141
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.796275 -> initscore=1.363174
[LightGBM] [Info] Start training from score 1.363174
[1]	valid_0's binary_logloss: 0.448173
[2]	valid_0's binary_logloss: 0.405734
[3]	valid_0's binary_logloss: 0.372961
[4]	valid_0's binary_logloss: 0.34683
[5]	valid_0's binary_logloss: 0.325461
[6]	valid_0's binary_logloss: 0.307594
[7]	valid_0's binary_logloss: 0.292607
[8]	valid_0's binary_logloss: 0.279725
[9]	valid_0's binary_logloss: 0.268803
[10]	valid_0's binary_logloss: 0.259461
[11]	valid_0's binary_logloss: 0.251464
[12]	valid_0's binary_logloss: 0.244436
[13]	valid_0's binary_logloss: 0.23844
[14]	valid_0's 



[41]	valid_0's binary_logloss: 0.199177
[42]	valid_0's binary_logloss: 0.199119
[43]	valid_0's binary_logloss: 0.19894
[44]	valid_0's binary_logloss: 0.198862
[45]	valid_0's binary_logloss: 0.198855
[46]	valid_0's binary_logloss: 0.19867
[47]	valid_0's binary_logloss: 0.198656
[48]	valid_0's binary_logloss: 0.198655
[49]	valid_0's binary_logloss: 0.198536
[50]	valid_0's binary_logloss: 0.198496
[51]	valid_0's binary_logloss: 0.198509
[52]	valid_0's binary_logloss: 0.198643
[53]	valid_0's binary_logloss: 0.198483
[54]	valid_0's binary_logloss: 0.198511
[55]	valid_0's binary_logloss: 0.198458
[56]	valid_0's binary_logloss: 0.198439
[57]	valid_0's binary_logloss: 0.198413
[58]	valid_0's binary_logloss: 0.198394
[59]	valid_0's binary_logloss: 0.198315
[60]	valid_0's binary_logloss: 0.198313
[61]	valid_0's binary_logloss: 0.198352
[62]	valid_0's binary_logloss: 0.198458
[63]	valid_0's binary_logloss: 0.198521
[64]	valid_0's binary_logloss: 0.198466
[65]	valid_0's binary_logloss: 0.198542
[6

#### Test

In [15]:
def test_model(model_list):
    data = X_test
    five_fold_pred = np.zeros((5, len(X_test)))
    for i, bst in enumerate(model_list):
        ypred = bst.predict(data, num_iteration=bst.best_iteration)
        five_fold_pred[i] = ypred
    ypred_mean = (five_fold_pred.mean(axis=-2)>0.5).astype(int)
    return accuracy_score(ypred_mean, Y_test)

In [16]:
base_score = test_model(param_base_model)
fine_tuning_score = test_model(param_fine_tuning_model)

print('base: {}, fine tuning: {}'.format(base_score, fine_tuning_score))

base: 0.91552, fine tuning: 0.91756


# 1. Adding derived variables

#### Load data

In [17]:
df_train = pd.read_csv('./data/train_final.csv')
df_test = pd.read_csv('./data/test_final.csv')

#### Adding derived variable ---- the ratio of "annual income" + "Interest Rate on the loan" and "intallment" .

In [18]:
df_train['ratio_inc_installment'] = round(df_train['continuous_annual_inc'] / (df_train['continuous_int_rate'] + df_train['continuous_installment'])).astype(int)
df_test['ratio_inc_installment'] = round(df_test['continuous_annual_inc'] / (df_test['continuous_int_rate'] + df_test['continuous_installment'])).astype(int)

#### Adding derived variable ---- never delinquent

In [19]:
df_train['never_delinq'] = df_train['continuous_mths_since_last_delinq'].isna()
df_train['never_delinq'] = df_train['never_delinq'].map(lambda x: 1 if x else 0)

df_test['never_delinq'] = df_test['continuous_mths_since_last_delinq'].isna()
df_test['never_delinq'] = df_test['never_delinq'].map(lambda x: 1 if x else 0)

#### Adding derived variable ---- the grade of months since last delinquent.

In [20]:
mean_train = df_train['continuous_mths_since_last_delinq'].mean()
df_train['grade_mhts_since_last_delinq'] = df_train['continuous_mths_since_last_delinq'].map(lambda x: 0 if np.isnan(x) or x < mean_train * 0.25 else 1 if x < mean_train * 0.5 else 2 if x < mean_train * 0.75 else 3)
mean_test = df_test['continuous_mths_since_last_delinq'].mean()
df_test['grade_mhts_since_last_delinq'] = df_test['continuous_mths_since_last_delinq'].map(lambda x: 0 if np.isnan(x) or x < mean_test * 0.25 else 1 if x < mean_test * 0.5 else 2 if x < mean_test * 0.75 else 3)

#### Data preprocessing

In [21]:
X_train = df_train.drop(columns=['loan_status']).values
Y_train = df_train['loan_status'].values.astype(int)
X_test = df_test.drop(columns=['loan_status']).values
Y_test = df_test['loan_status'].values.astype(int)

#### Split data for five fold

In [22]:
seed = 42 # for the same data division

kf = KFold(n_splits=5, random_state=seed,shuffle=True)

five_fold_data = []

for train_index, eval_index in kf.split(X_train):
    x_train, x_eval = X_train[train_index], X_train[eval_index]
    y_train, y_eval = Y_train[train_index], Y_train[eval_index]
    
    five_fold_data.append([(x_train, y_train), (x_eval, y_eval)])

#### Algorithm

In [23]:
def get_model(param):
    model_list = []
    for idx, [(x_train, y_train), (x_eval, y_eval)] in enumerate(five_fold_data):
        print('{}-th model is training:'.format(idx))
        train_data = lgb.Dataset(x_train, label=y_train)
        validation_data = lgb.Dataset(x_eval, label=y_eval)
        bst = lgb.train(param, train_data, valid_sets=[validation_data])
        model_list.append(bst)
    return model_list

#### Train

In [24]:
param_base = {'num_leaves': 31, 'objective': 'binary', 'metric': 'binary', 'num_round':1000}

param_fine_tuning = {'num_thread': 8,'num_leaves': 128, 'metric': 'binary', 'objective': 'binary', 'num_round': 1000, 
                     'learning_rate': 3e-3, 'feature_fraction': 0.6, 'bagging_fraction': 0.8}

In [25]:
# base param train
param_base_model = get_model(param_base)

# param fine tuning
param_fine_tuning_model = get_model(param_fine_tuning)

0-th model is training:
[LightGBM] [Info] Number of positive: 31851, number of negative: 8149
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2844
[LightGBM] [Info] Number of data points in the train set: 40000, number of used features: 144
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.796275 -> initscore=1.363174
[LightGBM] [Info] Start training from score 1.363174
[1]	valid_0's binary_logloss: 0.448263
[2]	valid_0's binary_logloss: 0.405778
[3]	valid_0's binary_logloss: 0.373058
[4]	valid_0's binary_logloss: 0.346731
[5]	valid_0's binary_logloss: 0.32527
[6]	valid_0's binary_logloss: 0.307357
[7]	valid_0's binary_logloss: 0.292431
[8]	valid_0's binary_logloss: 0.279772
[9]	valid_0's binary_logloss: 0.268756
[10]	valid_0's binary_logloss: 0.259293
[11]	valid_0's binary_logloss: 0.251245
[12]	valid_0's binary_logloss: 0.244273
[13]	valid_0's binary_logloss: 0.238266
[14]	valid_0's

#### Test

In [26]:
def test_model(model_list):
    data = X_test
    five_fold_pred = np.zeros((5, len(X_test)))
    for i, bst in enumerate(model_list):
        ypred = bst.predict(data, num_iteration=bst.best_iteration)
        five_fold_pred[i] = ypred
    ypred_mean = (five_fold_pred.mean(axis=-2)>0.5).astype(int)
    return accuracy_score(ypred_mean, Y_test)

In [27]:
new_base_score = test_model(param_base_model)
new_fine_tuning_score = test_model(param_fine_tuning_model)

print(f'New base: {new_base_score}, new fine tuning: {new_fine_tuning_score}.')
print(f'Promoting of base:{new_base_score - base_score}, promoting of fine tuning:{new_fine_tuning_score - fine_tuning_score}.')

New base: 0.91624, new fine tuning: 0.91768.
Promoting of base:0.0007200000000000539, promoting of fine tuning:0.00012000000000000899.


# 2.Model with three non-deep-learning models ,and integrate them.

#### Split data for five fold

In [28]:
kf = KFold(n_splits=5, shuffle=True)

five_fold_data = []

for train_index, eval_index in kf.split(X_train):
    x_train, x_eval = X_train[train_index], X_train[eval_index]
    y_train, y_eval = Y_train[train_index], Y_train[eval_index]
    
    five_fold_data.append([(x_train, y_train), (x_eval, y_eval)])

#### XGBoost

In [29]:
from xgboost import XGBClassifier

In [30]:
# Algorithm
def get_xgboost_model():
    model_list = []
    for idx, [(x_train, y_train), (x_test, y_test)] in enumerate(five_fold_data):
        print('{}-th model is training:'.format(idx))
        xgbc = XGBClassifier()
        xgbc.fit(x_train, y_train, eval_set=[(x_test, y_test)])
        model_list.append(xgbc)
    return model_list

In [31]:
# Train
xgboost_modles = get_xgboost_model()

0-th model is training:




[0]	validation_0-logloss:0.50056
[1]	validation_0-logloss:0.39533
[2]	validation_0-logloss:0.33009
[3]	validation_0-logloss:0.28828
[4]	validation_0-logloss:0.26042
[5]	validation_0-logloss:0.24171
[6]	validation_0-logloss:0.22882
[7]	validation_0-logloss:0.22014
[8]	validation_0-logloss:0.21446
[9]	validation_0-logloss:0.21046
[10]	validation_0-logloss:0.20780
[11]	validation_0-logloss:0.20593
[12]	validation_0-logloss:0.20471
[13]	validation_0-logloss:0.20402
[14]	validation_0-logloss:0.20378
[15]	validation_0-logloss:0.20313
[16]	validation_0-logloss:0.20292
[17]	validation_0-logloss:0.20298
[18]	validation_0-logloss:0.20315
[19]	validation_0-logloss:0.20345
[20]	validation_0-logloss:0.20338
[21]	validation_0-logloss:0.20345
[22]	validation_0-logloss:0.20349
[23]	validation_0-logloss:0.20375
[24]	validation_0-logloss:0.20377
[25]	validation_0-logloss:0.20371
[26]	validation_0-logloss:0.20363
[27]	validation_0-logloss:0.20373
[28]	validation_0-logloss:0.20405
[29]	validation_0-loglos

In [32]:
# get mean_model
def ypred_mean(model_list):
    data = X_test
    five_fold_pred = np.zeros((5, len(X_test)))
    for i, bst in enumerate(model_list):
        ypred = bst.predict(data)
        five_fold_pred[i] = ypred
    ypred_mean = (five_fold_pred.mean(axis=-2)>0.5).astype(int)
    return ypred_mean

In [33]:
xgboost_ypred_mean = ypred_mean(xgboost_modles)
print(f'Accuracy score of XGBoost:{accuracy_score(xgboost_ypred_mean, Y_test)}')

Accuracy score of XGBoost:0.91734


#### LightGBM

In [34]:
# Algorithm
def get_lgb_model(param):
    model_list = []
    for idx, [(x_train, y_train), (x_eval, y_eval)] in enumerate(five_fold_data):
        print('{}-th model is training:'.format(idx))
        train_data = lgb.Dataset(x_train, label=y_train)
        validation_data = lgb.Dataset(x_eval, label=y_eval)
        bst = lgb.train(param, train_data, valid_sets=[validation_data])
        model_list.append(bst)
    return model_list

In [35]:
# Train
param = {'num_thread': 8,'num_leaves': 128, 'metric': 'binary', 'objective': 'binary', 'num_round': 1000, 
                     'learning_rate': 3e-3, 'feature_fraction': 0.6, 'bagging_fraction': 0.8}

lgb_models = get_lgb_model(param)

0-th model is training:
[LightGBM] [Info] Number of positive: 31797, number of negative: 8203
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2844
[LightGBM] [Info] Number of data points in the train set: 40000, number of used features: 144
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.794925 -> initscore=1.354872
[LightGBM] [Info] Start training from score 1.354872
[1]	valid_0's binary_logloss: 0.499853
[2]	valid_0's binary_logloss: 0.498029
[3]	valid_0's binary_logloss: 0.496219
[4]	valid_0's binary_logloss: 0.494433
[5]	valid_0's binary_logloss: 0.492685
[6]	valid_0's binary_logloss: 0.490931
[7]	valid_0's binary_logloss: 0.489192
[8]	valid_0's binary_logloss: 0.488869
[9]	valid_0's binary_logloss: 0.487147
[10]	valid_0's binary_logloss: 0.48684
[11]	valid_0's binary_logloss: 0.485178
[12]	valid_0's binary_logloss: 0.484891
[13]	valid_0's binary_logloss: 0.484582
[14]	valid_0's

In [36]:
# get mean_model
def ypred_mean(model_list):
    data = X_test
    five_fold_pred = np.zeros((5, len(X_test)))
    for i, bst in enumerate(model_list):
        ypred = bst.predict(data, num_iteration=bst.best_iteration)
        five_fold_pred[i] = ypred
    ypred_mean = (five_fold_pred.mean(axis=-2)>0.5).astype(int)
    return ypred_mean

In [38]:
lgb_ypred_mean = ypred_mean(lgb_models)
print(f'Accuracy score of LightGBM:{accuracy_score(lgb_ypred_mean, Y_test)}')

Accuracy score of LightGBM:0.9177


#### catBoost

In [39]:
from catboost import CatBoostClassifier

In [40]:
# Algorithm
def get_catboost_model():
    model_list = []
    for idx, [(x_train, y_train), (x_test, y_test)] in enumerate(five_fold_data):
        print('{}-th model is training:'.format(idx))
        model = CatBoostClassifier(iterations=1000, 
                                   learning_rate=0.01, 
                                   l2_leaf_reg=3.5, 
                                   depth=8, 
                                   rsm=0.98, 
                                   loss_function= 'Logloss',
                                   eval_metric='AUC',
                                   use_best_model=True)
        model.fit(x_train, y_train, eval_set=(x_test, y_test))
        model_list.append(model)
    return model_list

In [41]:
# Train
catboost_modles = get_catboost_model()

0-th model is training:
0:	test: 0.9455788	best: 0.9455788 (0)	total: 95.2ms	remaining: 1m 35s
1:	test: 0.9518902	best: 0.9518902 (1)	total: 119ms	remaining: 59.2s
2:	test: 0.9537520	best: 0.9537520 (2)	total: 138ms	remaining: 45.9s
3:	test: 0.9535702	best: 0.9537520 (2)	total: 159ms	remaining: 39.6s
4:	test: 0.9547504	best: 0.9547504 (4)	total: 174ms	remaining: 34.6s
5:	test: 0.9563300	best: 0.9563300 (5)	total: 192ms	remaining: 31.8s
6:	test: 0.9567863	best: 0.9567863 (6)	total: 212ms	remaining: 30.1s
7:	test: 0.9567951	best: 0.9567951 (7)	total: 231ms	remaining: 28.7s
8:	test: 0.9566352	best: 0.9567951 (7)	total: 250ms	remaining: 27.5s
9:	test: 0.9568633	best: 0.9568633 (9)	total: 267ms	remaining: 26.5s
10:	test: 0.9568009	best: 0.9568633 (9)	total: 283ms	remaining: 25.4s
11:	test: 0.9567887	best: 0.9568633 (9)	total: 303ms	remaining: 25s
12:	test: 0.9566693	best: 0.9568633 (9)	total: 320ms	remaining: 24.3s
13:	test: 0.9567155	best: 0.9568633 (9)	total: 339ms	remaining: 23.8s
14:	te

In [42]:
# get mean_model
def ypred_mean(model_list):
    data = X_test
    five_fold_pred = np.zeros((5, len(X_test)))
    for i, bst in enumerate(model_list):
        ypred = bst.predict(data)
        five_fold_pred[i] = ypred
    ypred_mean = (five_fold_pred.mean(axis=-2)>0.5).astype(int)
    return ypred_mean

In [43]:
catboost_ypred_mean = ypred_mean(catboost_modles)
print(f'Accuracy score of catBoost:{accuracy_score(catboost_ypred_mean, Y_test)}')

Accuracy score of catBoost:0.91788


#### Integrate

In [44]:
def modle_integrate(data, model_list):
    n = len(model_list)
    n_fold_pred = np.zeros((n, len(data)))
    for i, bst in enumerate(model_list):
        ypred = bst.predict(data)
        n_fold_pred[i] = ypred
    ypred_mean = (n_fold_pred.mean(axis=-2)>0.5).astype(int)
    return ypred_mean

In [45]:
model_list = []
model_list.extend(xgboost_modles)
model_list.extend(lgb_models)
model_list.extend(catboost_modles)
integrate_ypred_mean = modle_integrate(X_test, model_list)
print(f'Accuracy score of integrating models:{accuracy_score(integrate_ypred_mean, Y_test)}')

Accuracy score of integrating models:0.91832


# 3.Modeling with TabNet

In [30]:
import torch
from pytorch_tabnet.tab_model import TabNetClassifier

In [31]:
# nan -> 0
X_train[np.isnan(X_train)] = 0
X_test[np.isnan(X_test)] = 0

In [32]:
model_tabnet = TabNetClassifier()

Device used : cpu


In [33]:
model_tabnet.fit(X_train=X_train, y_train=Y_train,
               patience=5,max_epochs=100,
               eval_metric=['auc'])

No early stopping will be performed, last training weights will be used.
epoch 0  | loss: 0.38115 |  0:00:06s
epoch 1  | loss: 0.22671 |  0:00:13s
epoch 2  | loss: 0.21494 |  0:00:20s
epoch 3  | loss: 0.21144 |  0:00:28s
epoch 4  | loss: 0.21193 |  0:00:37s
epoch 5  | loss: 0.20911 |  0:00:45s
epoch 6  | loss: 0.2091  |  0:00:54s
epoch 7  | loss: 0.20992 |  0:01:03s
epoch 8  | loss: 0.20882 |  0:01:13s
epoch 9  | loss: 0.21037 |  0:01:23s
epoch 10 | loss: 0.20943 |  0:01:32s
epoch 11 | loss: 0.20905 |  0:01:42s
epoch 12 | loss: 0.2089  |  0:01:52s
epoch 13 | loss: 0.20781 |  0:02:02s
epoch 14 | loss: 0.20787 |  0:02:12s
epoch 15 | loss: 0.20799 |  0:02:22s
epoch 16 | loss: 0.20769 |  0:02:32s
epoch 17 | loss: 0.20771 |  0:02:43s
epoch 18 | loss: 0.20943 |  0:02:53s
epoch 19 | loss: 0.20735 |  0:03:03s
epoch 20 | loss: 0.20814 |  0:03:13s
epoch 21 | loss: 0.20807 |  0:03:23s
epoch 22 | loss: 0.20779 |  0:03:33s
epoch 23 | loss: 0.20752 |  0:03:43s
epoch 24 | loss: 0.20735 |  0:03:53s
ep

In [35]:
tabnet_ypred = model_tabnet.predict(X_test)
print(f'Accuracy score of TabNet model:{accuracy_score(tabnet_ypred, Y_test)}')

Accuracy score of TabNet model:0.91556


# 4.Tuning to the TabNet

#### Network parameters

In [52]:
model_tabnet_2 = TabNetClassifier(
    n_d=64, n_a=64, n_steps=5,
    gamma=1.5, n_independent=2, n_shared=2,
    lambda_sparse=1e-4, momentum=0.3, clip_value=2.,
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    scheduler_params = {"gamma": 0.95,
                     "step_size": 20},
    scheduler_fn=torch.optim.lr_scheduler.StepLR, epsilon=1e-15
)

Device used : cpu


In [53]:
max_epochs = 10
model_tabnet_2.fit(
    X_train=X_train, y_train=Y_train,
    max_epochs=max_epochs, patience=100,
    batch_size=16384, virtual_batch_size=256
)

No early stopping will be performed, last training weights will be used.
epoch 0  | loss: 1.24497 |  0:00:26s
epoch 1  | loss: 0.63663 |  0:00:58s
epoch 2  | loss: 0.4716  |  0:01:38s
epoch 3  | loss: 0.38133 |  0:02:14s
epoch 4  | loss: 0.3179  |  0:02:52s
epoch 5  | loss: 0.28192 |  0:03:31s
epoch 6  | loss: 0.25444 |  0:04:11s
epoch 7  | loss: 0.25694 |  0:04:51s
epoch 8  | loss: 0.26041 |  0:05:28s
epoch 9  | loss: 0.25774 |  0:06:06s


In [54]:
tabnet_ypred = model_tabnet_2.predict(X_test)
print(f'Accuracy score of TabNet model:{accuracy_score(tabnet_ypred, Y_test)}')

Accuracy score of TabNet model:0.80476
