[Porto: xgb+lgb kfold LB 0.282](https://www.kaggle.com/rshally/porto-xgb-lgb-kfold-lb-0-282)

In [2]:
import pandas as pd
import numpy as np
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
import gc

print('loading files...')
train = pd.read_csv('./data/train.csv', na_values=-1, nrows=10000)
test = pd.read_csv('./data/test.csv', na_values=-1, nrows=10000)
col_to_drop = train.columns[train.columns.str.startswith('ps_calc_')]
train = train.drop(col_to_drop, axis=1)  
test = test.drop(col_to_drop, axis=1)  

for c in train.select_dtypes(include=['float64']).columns:
    train[c]=train[c].astype(np.float32)
    test[c]=test[c].astype(np.float32)
for c in train.select_dtypes(include=['int64']).columns[2:]:
    train[c]=train[c].astype(np.int8)
    test[c]=test[c].astype(np.int8)    

print(train.shape, test.shape)

loading files...
(10000, 39) (10000, 38)


# xgboost params
xgboost document
# lightgbm params
lightgbm document

[boosting](https://www.slideshare.net/freepsw/boosting-bagging-vs-boosting)

In [52]:
def gini(y, pred):
    g = np.asarray(np.c_[y, pred, np.arange(len(y)) ], dtype=float)
    g = g[np.lexsort((g[:,2], -1*g[:,1]))]
    gs = g[:,0].cumsum().sum() / g[:,0].sum()
    gs -= (len(y)+1)/2.
    return gs / len(y)

def gini_xgb(pred, y):
    y = y.get_label()
    return 'gini', gini(y, pred) / gini(y, y)

def gini_lgb(preds, dtrain):
    y = list(dtrain.get_label())
    score = gini(y, preds) / gini(y, y)
    return 'gini', score, True

In [4]:
# xgb
params = {'eta': 0.02,
         'max_depth': 4,
         'subsample': 0.9,
         'colsample_bytree': 0.8,
         'objective': 'binary:logistic',
         'eval_metric': 'auc',
         'silent': True}



In [5]:
X = train.drop(['id', 'target'], axis=1)
features = X.columns
X = X.values
y = train['target'].values
sub = test['id'].to_frame()
sub['target'] = 0

In [9]:
nrounds = 200
kfold = 2
skf = StratifiedKFold(n_splits=kfold, random_state=0, shuffle=True)

In [54]:
for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    print(' xgb kfold: {} of {} :'.format(i+1, kfold))
    X_train, X_valid = X[train_index], X[test_index]
    y_train, y_valid = y[train_index], y[test_index]
    d_train = xgb.DMatrix(X_train, y_train)
    d_valid = xgb.DMatrix(X_valid, y_valid)
    watchlist = [(d_train, 'train'), (d_valid, 'valid')]
    xgb_model = xgb.train(params, 
                          d_train, 
                          nrounds, 
                          watchlist, 
                          early_stopping_rounds=100, 
                          feval=gini_xgb,
                          maximize=True,
                          verbose_eval=100)
    sub['target'] += xgb_model.predict(xgb.DMatrix(test[features].values),
                                     ntree_limit = xgb_model.best_ntree_iimit+50) / (2*kfold)

 xgb kfold: 1 of 2 :
Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-auc:0.58281	train-gini:0.14142	valid-auc:0.53698	valid-gini:0.05862
[100]	train-auc:0.78292	train-gini:0.56583	valid-auc:0.61953	valid-gini:0.23908
[199]	train-auc:0.88093	train-gini:0.76186	valid-auc:0.61246	valid-gini:0.22492


AttributeError: 'Booster' object has no attribute 'best_ntree_iimit'

In [13]:
gc.collect()

81

In [14]:
sub.head(2)

Unnamed: 0,id,target
0,0,0
1,1,0


In [30]:
for i, (train_index, valid_index) in enumerate(skf.split(X, y)):
    break

In [31]:
train_index

array([   4,    5,    7, ..., 9995, 9996, 9997])

In [32]:
valid_index

array([   0,    1,    2, ..., 9994, 9998, 9999])

In [20]:
i

0

In [21]:
y

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [22]:
X

array([[2.        , 2.        , 5.        , ..., 0.8836789 , 0.3708099 ,
        3.6055512 ],
       [1.        , 1.        , 7.        , ..., 0.6188165 , 0.38871583,
        2.4494898 ],
       [5.        , 4.        , 9.        , ..., 0.6415857 , 0.3472751 ,
        3.3166249 ],
       ...,
       [1.        , 1.        , 5.        , ..., 0.77493   , 0.41964272,
        3.6055512 ],
       [1.        , 1.        , 6.        , ..., 0.62357277, 0.3472751 ,
        3.1622777 ],
       [0.        , 2.        , 4.        , ..., 1.0681142 , 0.38729835,
        3.7416575 ]], dtype=float32)

In [23]:
y.sum()

379

In [25]:
y.mean()

0.0379

In [28]:
y[train_index].mean()

0.038

In [29]:
y[test_index].mean()

0.0378

In [33]:
X_train, X_valid = X[train_index], X[valid_index]

In [34]:
y_train, y_valid = y[train_index], y[valid_index]

In [35]:
d_train = xgb.DMatrix(X_train, y_train)
d_valid = xgb.DMatrix(X_valid, y_valid)

In [36]:
d_train

<xgboost.core.DMatrix at 0x1d86a1cf580>

In [43]:
# 학습이 얼마나 잘 되고 있는지
watchlist = [(d_train, 'train'), (d_valid, 'valid')]

In [55]:
xbg_model = xgb.train(params, d_train, nrounds, watchlist, early_stopping_rounds=100, 
                      feval=gini_xgb, maximize=True, verbose_eval=100)

Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-auc:0.58281	train-gini:0.14142	valid-auc:0.53698	valid-gini:0.05862
[100]	train-auc:0.78292	train-gini:0.56583	valid-auc:0.61953	valid-gini:0.23908
[199]	train-auc:0.88093	train-gini:0.76186	valid-auc:0.61246	valid-gini:0.22492


In [56]:
xgb_model.best_ntree_limit

120

In [58]:
sub['target'] += xgb_model.predict(xgb.DMatrix(test[features].values),
                                     ntree_limit = xgb_model.best_ntree_limit+50) / (2*kfold)



+ xgboost 모델 두개로 0.5를 채운 것
  + 두개의 fold로 나눠서 0.5 채움
+ lightgbm도 똑같이 두 fold로 나눠서 0.5 채울 것

In [60]:
# lgb
params = {'metric': 'auc', 'learning_rate' : 0.01, 'max_depth':10, 'max_bin':10,  'objective': 'binary', 
          'feature_fraction': 0.8,'bagging_fraction':0.9,'bagging_freq':10,  'min_data': 500}

skf = StratifiedKFold(n_splits=kfold, random_state=1, shuffle=True)
for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    print(' lgb kfold: {}  of  {} : '.format(i+1, kfold))
    X_train, X_eval = X[train_index], X[test_index]
    y_train, y_eval = y[train_index], y[test_index]
    lgb_model = lgb.train(params, lgb.Dataset(X_train, label=y_train), nrounds, 
                  lgb.Dataset(X_eval, label=y_eval), verbose_eval=100, 
                  feval=gini_lgb, early_stopping_rounds=100)
    sub['target'] += lgb_model.predict(test[features].values, 
                        num_iteration=lgb_model.best_iteration) / (2*kfold)

 lgb kfold: 1  of  2 : 
[LightGBM] [Info] Number of positive: 190, number of negative: 4810
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 184
[LightGBM] [Info] Number of data points in the train set: 5000, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.038000 -> initscore=-3.231428
[LightGBM] [Info] Start training from score -3.231428
Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.596875	valid_0's gini: 0.193748
Early stopping, best iteration is:
[39]	valid_0's auc: 0.602078	valid_0's gini: 0.204339
 lgb kfold: 2  of  2 : 
[LightGBM] [Info] Number of positive: 189, number of negative: 4811
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 181
[LightGBM] [Info] Number of data points in the train set: 5000, number of 

In [61]:
gc.collect()
sub.head(2)

Unnamed: 0,id,target
0,0,0.028747
1,1,0.029357


Task was destroyed but it is pending!
task: <Task pending name='Task-1' coro=<Kernel.poll_control_queue() running at C:\Users\Attagungho\anaconda3\envs\kaggle\lib\site-packages\ipykernel\kernelbase.py:226> wait_for=<Future finished result=[<zmq.sugar.fr...001D86613DA90>, <zmq.sugar.fr...001D86A03DEB0>, <zmq.sugar.fr...001D86A022670>, <zmq.sugar.fr...001D86A022300>, <zmq.sugar.fr...001D86F044460>, <zmq.sugar.fr...001D86F0443B0>, ...]> cb=[_chain_future.<locals>._call_set_state() at C:\Users\Attagungho\anaconda3\envs\kaggle\lib\asyncio\futures.py:391]>


+ 스태킹은 싱글 모델보다 성적이 좋다. 기본으로 쓰세요.