In [2]:
import lightgbm as lgb

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn import model_selection
from sklearn import ensemble
from sklearn import metrics
from sklearn import decomposition
import math

import re
%matplotlib inline

In [4]:
def read_data(filename):
    data = pd.read_csv(filename, header=0, sep=',', encoding='pt154')
    data = data.rename(index=str, columns={'client_id': '_ID_', 'open_acount_flg': '_VAL_'})
    return data

In [5]:
train = read_data('../data/dummy_encoding_all_city_train.csv')
test = read_data('../data/dummy_encoding_all_city_test.csv')

In [6]:
train.head()

Unnamed: 0,_ID_,age,credit_sum,score_shk,monthly_income,credit_count,overdue_credit_count,open_account_flg,gender=0,gender=1,...,living_region=78,living_region=79,living_region=80,living_region=81,living_region=82,living_region=83,living_region=84,living_region=85,living_region=86,living_region=87
0,1,48,60.0,0.770249,30000.0,1.0,1.0,0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,28,11.0,0.248514,43000.0,2.0,0.0,0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,32,11.0,0.459589,23000.0,5.0,0.0,0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,27,12.0,0.362536,17000.0,2.0,0.0,0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,45,17.0,0.421385,25000.0,1.0,0.0,0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
y = train.open_account_flg
del train['_ID_']
del train['open_account_flg']

In [8]:
client_ids = test._ID_
del test['_ID_']

In [9]:
X_train, X_valid, y_train, y_valid = model_selection.train_test_split(train, y, test_size=0.3)

In [10]:
X_test = test

In [11]:
print(X_train.shape, X_valid.shape, X_test.shape)

(119522, 187) (51224, 187) (91940, 187)


# Lightgbm

### parameter information: https://github.com/dmlc/xgboost/blob/master/doc/parameter.md

In [12]:
lgb_train = lgb.Dataset(X_train, y_train)
lgb_valid = lgb.Dataset(X_valid, y_valid)

In [13]:
params = {
    'task': 'predict',
    'num_trees': 1000,
    'num_leaves': 150,
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'auc',
    'max_depth': 7,
    'learning_rate': 0.05,
    'sub_feature': 0.8,
    'sub_row': 0.8,
#     'lambda_l1': 2,
#     'lambda_l2': 2,
    'verbose': 0
}

In [14]:
gbm = lgb.train(params,
                lgb_train,
                valid_sets=lgb_valid,
                early_stopping_rounds=25)

[1]	valid_0's auc: 0.721814
Train until valid scores didn't improve in 25 rounds.
[2]	valid_0's auc: 0.735931
[3]	valid_0's auc: 0.738094
[4]	valid_0's auc: 0.74046
[5]	valid_0's auc: 0.741793
[6]	valid_0's auc: 0.743901
[7]	valid_0's auc: 0.744653
[8]	valid_0's auc: 0.745459
[9]	valid_0's auc: 0.746398
[10]	valid_0's auc: 0.746482
[11]	valid_0's auc: 0.746898
[12]	valid_0's auc: 0.747315
[13]	valid_0's auc: 0.747861
[14]	valid_0's auc: 0.748428
[15]	valid_0's auc: 0.748999
[16]	valid_0's auc: 0.749398
[17]	valid_0's auc: 0.750053
[18]	valid_0's auc: 0.75035
[19]	valid_0's auc: 0.751041
[20]	valid_0's auc: 0.751371
[21]	valid_0's auc: 0.751483
[22]	valid_0's auc: 0.752051
[23]	valid_0's auc: 0.752299
[24]	valid_0's auc: 0.752575
[25]	valid_0's auc: 0.752911
[26]	valid_0's auc: 0.753156
[27]	valid_0's auc: 0.75376
[28]	valid_0's auc: 0.753957
[29]	valid_0's auc: 0.754129
[30]	valid_0's auc: 0.754255
[31]	valid_0's auc: 0.754354
[32]	valid_0's auc: 0.75466
[33]	valid_0's auc: 0.75507
[34

In [99]:
pred_test = xgb1.predict_proba(X_test)[:, 1]
prediction = pd.DataFrame(data={'_ID_': client_ids, '_VAL_': pred_test}, index=test.index)
prediction.to_csv('../data/prediction.csv', index=False)
print(prediction.head())

     _ID_     _VAL_
0  170747  0.058535
1  170748  0.137427
2  170749  0.280323
3  170750  0.158912
4  170751  0.067224
