In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import roc_auc_score, log_loss

In [2]:
data = pd.read_csv('../data/sample.csv')[:20000]

NUMERIC_COLS = ['I' + str(i) for i in range(1, 14)]
SPARSE_COLS = ['C' + str(i) for i in range(1, 27)]

data[NUMERIC_COLS] = data[NUMERIC_COLS].fillna(0)
data[SPARSE_COLS] = data[SPARSE_COLS].fillna('-1')

sparse_index = data[SPARSE_COLS]
dense_value = data[NUMERIC_COLS]
label = data['label'].values

In [3]:
ohe = OneHotEncoder()
sparse_onehot = ohe.fit_transform(sparse_index).toarray()

lr_input1 = np.concatenate((sparse_onehot, dense_value.values), axis=1)

In [4]:
lgb_train = lgb.Dataset(dense_value[:18000], label[:18000])
lgb_val = lgb.Dataset(dense_value[18000:], label[18000:], reference=lgb_train)

params	=	{
				'task':	'train',
				'boosting_type':	'gbdt',
				'objective':	'binary',
				'metric':	{'binary_logloss'},
				'num_leaves':	64,
				'num_trees':	100,
				'learning_rate':	0.01,
				'feature_fraction':	0.9,
				'bagging_fraction':	0.8,
				'bagging_freq':	5,
				'verbose':	0
}

gbm = lgb.train(params, lgb_train, num_boost_round=100, valid_sets=lgb_val)
pred = gbm.predict(dense_value, pred_leaf=True)

dense_onehot = ohe.fit_transform(pred).toarray()
lr_input2 = np.concatenate((sparse_onehot, dense_onehot), axis=1)



[1]	valid_0's binary_logloss: 0.531359
[2]	valid_0's binary_logloss: 0.530244
[3]	valid_0's binary_logloss: 0.529175
[4]	valid_0's binary_logloss: 0.52815
[5]	valid_0's binary_logloss: 0.52715
[6]	valid_0's binary_logloss: 0.52617
[7]	valid_0's binary_logloss: 0.525168
[8]	valid_0's binary_logloss: 0.524278
[9]	valid_0's binary_logloss: 0.523323
[10]	valid_0's binary_logloss: 0.522426
[11]	valid_0's binary_logloss: 0.521587
[12]	valid_0's binary_logloss: 0.52071
[13]	valid_0's binary_logloss: 0.519857
[14]	valid_0's binary_logloss: 0.519042
[15]	valid_0's binary_logloss: 0.518301
[16]	valid_0's binary_logloss: 0.517454
[17]	valid_0's binary_logloss: 0.516689
[18]	valid_0's binary_logloss: 0.51598
[19]	valid_0's binary_logloss: 0.515206
[20]	valid_0's binary_logloss: 0.514447
[21]	valid_0's binary_logloss: 0.513706
[22]	valid_0's binary_logloss: 0.513076
[23]	valid_0's binary_logloss: 0.512378
[24]	valid_0's binary_logloss: 0.51174
[25]	valid_0's binary_logloss: 0.511026
[26]	valid_0's 

In [5]:
lr = LogisticRegression()
lr.fit(lr_input1[:18000], label[:18000])
res1 = lr.predict(lr_input1[18000:])
roc_auc_score(label[18000:], res1), log_loss(label[18000:], res1)



(0.5158242616660852, 7.753963296631749)

In [6]:
lr = LogisticRegression()
lr.fit(lr_input2[:18000], label[:18000])
res2 = lr.predict(lr_input2[18000:])
roc_auc_score(label[18000:], res2), log_loss(label[18000:], res2)



(0.6084241936016566, 8.60023768086807)