In [1]:
import pandas as pd
import lightgbm as lgb
from sklearn import metrics
import math

In [2]:
# Load the training and evaluation tables from the local data directory
# (training file first, then the test file).
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/train_final.csv', './data/test_final.csv')
)

### Baseline

In [3]:
# Baseline split: `loan_status` is the label, every other column is a feature.
train_x = train_data.drop(columns=['loan_status'])
train_y = train_data['loan_status']
test_x = test_data.drop(columns=['loan_status'])
test_y = test_data['loan_status']

In [4]:
# Wrap the frames in LightGBM datasets; the evaluation dataset is created
# with the training dataset passed as its `reference`.
train_set = lgb.Dataset(data=train_x, label=train_y)
test_set = lgb.Dataset(data=test_x, label=test_y, reference=train_set)

In [5]:
# LightGBM training configuration for the baseline run.
# NOTE(review): the objective is 'regression' while AUC is tracked as a metric;
# if loan_status is a 0/1 label, 'binary' may be the intended objective -- confirm.
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    # A list (not a set) keeps the metric order identical between kernel
    # sessions; set iteration order for strings is not stable across runs.
    'metric': ['l2', 'l1', 'auc'],
    'num_leaves': 7,
    'learning_rate': 0.1,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0,
}

In [6]:
# Train for at most 100 boosting rounds, scoring `test_set` every round and
# stopping once the validation scores have not improved for 10 rounds.
# Early stopping is requested through the callback API because the
# `early_stopping_rounds` keyword of `lgb.train` is rejected by LightGBM >= 4.0.
# NOTE(review): `test_set` is also the data the error metrics are reported on
# later, so those numbers are optimistic -- consider a separate validation split.
model = lgb.train(
    params,
    train_set,
    num_boost_round=100,
    valid_sets=[test_set],
    callbacks=[lgb.early_stopping(stopping_rounds=10)],
)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[1]	valid_0's l2: 0.139183	valid_0's l1: 0.300462	valid_0's auc: 0.938528
Training until validation scores don't improve for 10 rounds
[2]	valid_0's l2: 0.124523	valid_0's l1: 0.28303	valid_0's auc: 0.939113
[3]	valid_0's l2: 0.112617	valid_0's l1: 0.267277	valid_0's auc: 0.944291
[4]	valid_0's l2: 0.10302	valid_0's l1: 0.25312	valid_0's auc: 0.944913
[5]	valid_0's l2: 0.0951464	valid_0's l1: 0.240316	valid_0's auc: 0.945884
[6]	valid_0's l2: 0.0886622	valid_0's l1: 0.228639	valid_0's auc: 0.946207
[7]	valid_0's l2: 0.0834292	valid_0's l1: 0.218204	valid_0's auc: 0.949131
[8]	valid_0's l2: 0.0792008	valid_0's l1: 0.208852	valid_0's auc: 0.949273
[9]	valid_0's l2: 0.0757594	valid_0's l1: 0.200396	valid_0's auc: 0.949435
[10]	valid_0's l2: 0.0729348	valid_0's l1: 0.192669	valid_0's auc: 0.949527
[11]	valid_0's l2: 0.0706672	valid_0's l1: 0.185717	valid_0's auc: 0.9496

In [7]:
# Score the evaluation features, limiting the booster to the iteration
# recorded as best during training.
predictions = model.predict(
    data=test_x,
    num_iteration=model.best_iteration,
)

In [8]:
# Report the errors of the baseline predictions on the evaluation labels.
# scikit-learn's signature is (y_true, y_pred), so the labels are passed first;
# both metrics are symmetric, so the reported values do not change.
mean_absolute_error = metrics.mean_absolute_error(test_y, predictions)
mean_squared_error = metrics.mean_squared_error(test_y, predictions)
print("Mean Absolute Error: ", mean_absolute_error)
print("Mean Squared Eror: ", mean_squared_error)

Mean Absolute Error:  0.11806305805788035
Mean Squared Eror:  0.05971781778676541


### 添加衍生变量 -- 年收入与每月还款额的比值

In [9]:
# Start this experiment from a fresh feature/label split of the loaded frames.
train_x = train_data.drop(columns=['loan_status'])
train_y = train_data['loan_status']
test_x = test_data.drop(columns=['loan_status'])
test_y = test_data['loan_status']

In [10]:
# Derived feature: annual income divided by the installment amount, rounded
# to a whole number. Computed identically for both splits.
# NOTE(review): a zero installment would yield inf here -- confirm none exist.
train_x['ratio_inc_installment'] = (
    train_x['continuous_annual_inc']
    .div(train_x['continuous_installment'])
    .round()
)
test_x['ratio_inc_installment'] = (
    test_x['continuous_annual_inc']
    .div(test_x['continuous_installment'])
    .round()
)

In [11]:
# Rebuild the LightGBM datasets so they include the income/installment ratio;
# the evaluation dataset again takes the training dataset as its `reference`.
train_set = lgb.Dataset(data=train_x, label=train_y)
test_set = lgb.Dataset(data=test_x, label=test_y, reference=train_set)

In [12]:
# LightGBM training configuration for the income/installment-ratio run
# (same values as the baseline so the runs stay comparable).
# NOTE(review): the objective is 'regression' while AUC is tracked as a metric;
# if loan_status is a 0/1 label, 'binary' may be the intended objective -- confirm.
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    # A list (not a set) keeps the metric order identical between kernel
    # sessions; set iteration order for strings is not stable across runs.
    'metric': ['l2', 'l1', 'auc'],
    'num_leaves': 7,
    'learning_rate': 0.1,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0,
}

In [13]:
# Train for at most 100 boosting rounds, scoring `test_set` every round and
# stopping once the validation scores have not improved for 10 rounds.
# Early stopping is requested through the callback API because the
# `early_stopping_rounds` keyword of `lgb.train` is rejected by LightGBM >= 4.0.
# NOTE(review): `test_set` is also the data the error metrics are reported on
# later, so those numbers are optimistic -- consider a separate validation split.
model = lgb.train(
    params,
    train_set,
    num_boost_round=100,
    valid_sets=[test_set],
    callbacks=[lgb.early_stopping(stopping_rounds=10)],
)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[1]	valid_0's l2: 0.139183	valid_0's l1: 0.300462	valid_0's auc: 0.938528
Training until validation scores don't improve for 10 rounds
[2]	valid_0's l2: 0.124523	valid_0's l1: 0.28303	valid_0's auc: 0.939113
[3]	valid_0's l2: 0.112617	valid_0's l1: 0.267277	valid_0's auc: 0.944291
[4]	valid_0's l2: 0.10302	valid_0's l1: 0.25312	valid_0's auc: 0.944913
[5]	valid_0's l2: 0.0951422	valid_0's l1: 0.240322	valid_0's auc: 0.945826
[6]	valid_0's l2: 0.0886607	valid_0's l1: 0.228641	valid_0's auc: 0.945983
[7]	valid_0's l2: 0.083429	valid_0's l1: 0.218195	valid_0's auc: 0.949033
[8]	valid_0's l2: 0.0792037	valid_0's l1: 0.208845	valid_0's auc: 0.949253
[9]	valid_0's l2: 0.0757625	valid_0's l1: 0.20039	valid_0's auc: 0.949445
[10]	valid_0's l2: 0.0729376	valid_0's l1: 0.192674	valid_0's auc: 0.949542
[11]	valid_0's l2: 0.0706369	valid_0's l1: 0.185672	valid_0's auc: 0.949698

In [14]:
# Score the evaluation features (now including the ratio column), limiting
# the booster to the iteration recorded as best during training.
predictions = model.predict(
    data=test_x,
    num_iteration=model.best_iteration,
)

In [15]:
# Report the errors of this run's predictions on the evaluation labels.
# scikit-learn's signature is (y_true, y_pred), so the labels are passed first;
# both metrics are symmetric, so the reported values do not change.
mean_absolute_error = metrics.mean_absolute_error(test_y, predictions)
mean_squared_error = metrics.mean_squared_error(test_y, predictions)
print("Mean Absolute Error: ", mean_absolute_error)
print("Mean Squared Eror: ", mean_squared_error)

Mean Absolute Error:  0.11838744038792609
Mean Squared Eror:  0.059653877876836896


### 对比：
* auc 提升了0.0004

### 添加衍生变量--"从没逾期"("never_delinq")

In [16]:
# Derived flag `never_delinq`: 1 when `continuous_mths_since_last_major_derog`
# is missing, 0 when a value is present. A missing value is taken to mean that
# no major derogatory event was ever recorded -- TODO confirm against the data
# dictionary. Mapping missing -> 1 makes the column agree with its name; the
# opposite encoding (missing -> 0) would flag exactly the borrowers who *do*
# have such a record.
# NOTE(review): train_x/test_x still carry `ratio_inc_installment` from the
# previous experiment, so this run measures both derived features together.
train_x['never_delinq'] = (
    train_x['continuous_mths_since_last_major_derog'].isna().astype(int)
)

test_x['never_delinq'] = (
    test_x['continuous_mths_since_last_major_derog'].isna().astype(int)
)

In [18]:
# Rebuild the LightGBM datasets so they include the `never_delinq` column;
# the evaluation dataset again takes the training dataset as its `reference`.
train_set = lgb.Dataset(data=train_x, label=train_y)
test_set = lgb.Dataset(data=test_x, label=test_y, reference=train_set)

In [19]:
# LightGBM training configuration for the `never_delinq` run
# (same values as the earlier runs so the results stay comparable).
# NOTE(review): the objective is 'regression' while AUC is tracked as a metric;
# if loan_status is a 0/1 label, 'binary' may be the intended objective -- confirm.
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    # A list (not a set) keeps the metric order identical between kernel
    # sessions; set iteration order for strings is not stable across runs.
    'metric': ['l2', 'l1', 'auc'],
    'num_leaves': 7,
    'learning_rate': 0.1,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0,
}

In [20]:
# Train for at most 100 boosting rounds, scoring `test_set` every round and
# stopping once the validation scores have not improved for 10 rounds.
# Early stopping is requested through the callback API because the
# `early_stopping_rounds` keyword of `lgb.train` is rejected by LightGBM >= 4.0.
# NOTE(review): `test_set` is also the data the error metrics are reported on
# later, so those numbers are optimistic -- consider a separate validation split.
model = lgb.train(
    params,
    train_set,
    num_boost_round=100,
    valid_sets=[test_set],
    callbacks=[lgb.early_stopping(stopping_rounds=10)],
)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[1]	valid_0's l2: 0.139183	valid_0's l1: 0.300462	valid_0's auc: 0.938528
Training until validation scores don't improve for 10 rounds
[2]	valid_0's l2: 0.124523	valid_0's l1: 0.28303	valid_0's auc: 0.939113
[3]	valid_0's l2: 0.112617	valid_0's l1: 0.267277	valid_0's auc: 0.944291
[4]	valid_0's l2: 0.10302	valid_0's l1: 0.25312	valid_0's auc: 0.944913
[5]	valid_0's l2: 0.0951464	valid_0's l1: 0.240316	valid_0's auc: 0.945884
[6]	valid_0's l2: 0.0886622	valid_0's l1: 0.228639	valid_0's auc: 0.946207
[7]	valid_0's l2: 0.0834292	valid_0's l1: 0.218204	valid_0's auc: 0.949131
[8]	valid_0's l2: 0.0792008	valid_0's l1: 0.208852	valid_0's auc: 0.949273
[9]	valid_0's l2: 0.0757594	valid_0's l1: 0.200396	valid_0's auc: 0.949435
[10]	valid_0's l2: 0.0729348	valid_0's l1: 0.192669	valid_0's auc: 0.949527
[11]	valid_0's l2: 0.070628	valid_0's l1: 0.185662	valid_0's auc: 0.94969

In [21]:
# Score the evaluation features (now including `never_delinq`), limiting
# the booster to the iteration recorded as best during training.
predictions = model.predict(
    data=test_x,
    num_iteration=model.best_iteration,
)

In [22]:
# Report the errors of this run's predictions on the evaluation labels.
# scikit-learn's signature is (y_true, y_pred), so the labels are passed first;
# both metrics are symmetric, so the reported values do not change.
mean_absolute_error = metrics.mean_absolute_error(test_y, predictions)
mean_squared_error = metrics.mean_squared_error(test_y, predictions)
print("Mean Absolute Error: ", mean_absolute_error)
print("Mean Squared Eror: ", mean_squared_error)

Mean Absolute Error:  0.1175800518059869
Mean Squared Eror:  0.05963361584700895


### 对比：
* auc 没有提升，反而降低了