# 10 - Multiple LGBM models for stacking

#### Imports

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style="white")

#### Constants

In [2]:
n_components = 1000

In [3]:
models_folder = "models/"
lgbm_folder = "lgbm_models/"
train_data_fn = models_folder+'train_data.pkl'
target_fn = models_folder+'target.pkl'
test_data_fn = models_folder+'test_data.pkl'

weight_multiplier_fn = models_folder+"weight_multiplier.pkl"

#### Functions

In [4]:
import os.path
from sklearn.externals import joblib

def Load(filename):
    if os.path.isfile(filename):
        return joblib.load(filename)
    
def Save(obj, filename):
    joblib.dump(obj, filename)

# Loading data

In [5]:
import scipy

data = scipy.sparse.load_npz("train_sparse_matrix_after_scale.npz")

kmeans100 = Load(models_folder+'kmeans_n100.pkl')
kmeans2 = Load(models_folder+'kmeans_n2.pkl')
target = Load(target_fn)

In [6]:
from scipy import sparse

traink100 = sparse.csr_matrix(kmeans100[:427994])
traink2 = sparse.csr_matrix(kmeans2[:427994])

from scipy.sparse import hstack

data = hstack([data,traink100.transpose(),traink2.transpose()]).tocsr()  

In [7]:
weight_multiplier = Load(weight_multiplier_fn)

In [8]:
target = target.ravel()

In [9]:
test_data = scipy.sparse.load_npz("test_sparse_matrix_after_scale.npz")

In [10]:
from scipy import sparse

testk100 = sparse.csr_matrix(kmeans100[427994:])
testk2 = sparse.csr_matrix(kmeans2[427994:])

from scipy.sparse import hstack

test_data = hstack([test_data,testk100.transpose(),testk2.transpose()]).tocsr()  

In [11]:
from sklearn.model_selection import train_test_split
import lightgbm as lgbm
import re
from sklearn.metrics import roc_auc_score

def train_lgbm_model(i):
    global data
    global target
    global weight_multiplier
    
    
    X_train, X_validation, Y_train, Y_validation = train_test_split(data, target, train_size=0.8)
    
    d_train = lgbm.Dataset(X_train, label=Y_train)
    d_valid = lgbm.Dataset(X_validation, label=Y_validation)
    
    params = {
        'subsample_freq': 2,
        'subsample_for_bin': 80+i,
        'subsample': 0.7,
        'scale_pos_weight': weight_multiplier,
        'reg_lambda': 0.2,
        'reg_alpha': 7,
        'objective': 'binary',
        'num_leaves': 30+i,
        'min_split_gain': 2.0,
        'min_child_weight': 3,
        'min_child_samples': 80+i,
        'metric': 'auc',
        'max_depth': 20,
        'max_bin': 80+i,
        'learning_rate': 0.1,
        'colsample_bytree': 0.7
    }
    evals_results = {}
    num_boost_round=3000
    early_stopping_rounds=200
    feval=None

    model = lgbm.train(params, 
                         d_train, 
                         valid_sets=[d_train, d_valid], 
                         valid_names=['train','valid'], 
                         evals_result=evals_results, 
                         num_boost_round=num_boost_round,
                         early_stopping_rounds=early_stopping_rounds,
                         verbose_eval=10, 
                         feval=feval)
    n_estimators = model.best_iteration
    print("\nModel Report iteration:",i)
    print("n_estimators : ", n_estimators)
    print("AUC"+":", evals_results['valid']['auc'][n_estimators-1])


    predicted = model.predict(X_validation)
    print("Iteration: ",i," ROC AUC score: ",roc_auc_score(Y_validation, predicted))
    
    Save(model,lgbm_folder+"lgbm{!s}.pkl".format(i))
    
    Y_test = model.predict(test_data)
    
    predictions = pd.DataFrame(Y_test)
    predictions.to_csv("lgbm_predictions/pred{!s}.pkl".format(i),header=None, index=None)

In [12]:
%%time
for i in range(0,40):
    train_lgbm_model(i)



Training until validation scores don't improve for 200 rounds.
[10]	train's auc: 0.653826	valid's auc: 0.632417
[20]	train's auc: 0.668151	valid's auc: 0.639604
[30]	train's auc: 0.68065	valid's auc: 0.644416
[40]	train's auc: 0.690282	valid's auc: 0.647383
[50]	train's auc: 0.699269	valid's auc: 0.648047
[60]	train's auc: 0.707074	valid's auc: 0.648589
[70]	train's auc: 0.714426	valid's auc: 0.649096
[80]	train's auc: 0.720695	valid's auc: 0.650104
[90]	train's auc: 0.727037	valid's auc: 0.649781
[100]	train's auc: 0.733239	valid's auc: 0.650057
[110]	train's auc: 0.738741	valid's auc: 0.650122
[120]	train's auc: 0.744266	valid's auc: 0.649374
[130]	train's auc: 0.749017	valid's auc: 0.649378
[140]	train's auc: 0.75384	valid's auc: 0.648563
[150]	train's auc: 0.758544	valid's auc: 0.64898
[160]	train's auc: 0.762888	valid's auc: 0.648582
[170]	train's auc: 0.76703	valid's auc: 0.648374
[180]	train's auc: 0.771193	valid's auc: 0.647379
[190]	train's auc: 0.775176	valid's auc: 0.646722


[270]	train's auc: 0.8133	valid's auc: 0.635305
[280]	train's auc: 0.816369	valid's auc: 0.634381
[290]	train's auc: 0.819323	valid's auc: 0.63402
[300]	train's auc: 0.821809	valid's auc: 0.633506
[310]	train's auc: 0.824656	valid's auc: 0.633254
[320]	train's auc: 0.82718	valid's auc: 0.632344
Early stopping, best iteration is:
[121]	train's auc: 0.753276	valid's auc: 0.640988

Model Report iteration: 4
n_estimators :  121
AUC: 0.6409883574626616
Iteration:  4  ROC AUC score:  0.6409883574626616
Training until validation scores don't improve for 200 rounds.
[10]	train's auc: 0.656639	valid's auc: 0.630776
[20]	train's auc: 0.674306	valid's auc: 0.639079
[30]	train's auc: 0.687281	valid's auc: 0.643272
[40]	train's auc: 0.697711	valid's auc: 0.645762
[50]	train's auc: 0.706926	valid's auc: 0.64714
[60]	train's auc: 0.715348	valid's auc: 0.648417
[70]	train's auc: 0.723379	valid's auc: 0.648645
[80]	train's auc: 0.730307	valid's auc: 0.649219
[90]	train's auc: 0.736883	valid's auc: 0.64

[200]	train's auc: 0.803321	valid's auc: 0.637636
[210]	train's auc: 0.807247	valid's auc: 0.636994
[220]	train's auc: 0.811194	valid's auc: 0.636785
[230]	train's auc: 0.81537	valid's auc: 0.636639
[240]	train's auc: 0.818606	valid's auc: 0.635815
[250]	train's auc: 0.821858	valid's auc: 0.635316
[260]	train's auc: 0.825369	valid's auc: 0.634895
[270]	train's auc: 0.828706	valid's auc: 0.63497
[280]	train's auc: 0.832029	valid's auc: 0.634664
[290]	train's auc: 0.83505	valid's auc: 0.634165
Early stopping, best iteration is:
[94]	train's auc: 0.749507	valid's auc: 0.641622

Model Report iteration: 9
n_estimators :  94
AUC: 0.6416222940604185
Iteration:  9  ROC AUC score:  0.6416222940604184
Training until validation scores don't improve for 200 rounds.
[10]	train's auc: 0.659307	valid's auc: 0.630535
[20]	train's auc: 0.677466	valid's auc: 0.639906
[30]	train's auc: 0.691212	valid's auc: 0.644349
[40]	train's auc: 0.70299	valid's auc: 0.648617
[50]	train's auc: 0.712815	valid's auc: 0

[150]	train's auc: 0.790237	valid's auc: 0.649573
[160]	train's auc: 0.795086	valid's auc: 0.649331
[170]	train's auc: 0.800054	valid's auc: 0.649505
[180]	train's auc: 0.804753	valid's auc: 0.649472
[190]	train's auc: 0.809033	valid's auc: 0.648965
[200]	train's auc: 0.812875	valid's auc: 0.649182
[210]	train's auc: 0.81685	valid's auc: 0.648101
[220]	train's auc: 0.821192	valid's auc: 0.646851
[230]	train's auc: 0.824692	valid's auc: 0.646509
[240]	train's auc: 0.828782	valid's auc: 0.646359
[250]	train's auc: 0.832583	valid's auc: 0.645832
[260]	train's auc: 0.835923	valid's auc: 0.645943
[270]	train's auc: 0.839228	valid's auc: 0.645671
[280]	train's auc: 0.842209	valid's auc: 0.645113
Early stopping, best iteration is:
[83]	train's auc: 0.748348	valid's auc: 0.651159

Model Report iteration: 14
n_estimators :  83
AUC: 0.6511594728419148
Iteration:  14  ROC AUC score:  0.6511594728419148
Training until validation scores don't improve for 200 rounds.
[10]	train's auc: 0.665284	valid

[150]	train's auc: 0.801872	valid's auc: 0.639638
[160]	train's auc: 0.807027	valid's auc: 0.639439
[170]	train's auc: 0.812106	valid's auc: 0.638754
[180]	train's auc: 0.816506	valid's auc: 0.637341
[190]	train's auc: 0.821348	valid's auc: 0.636908
[200]	train's auc: 0.825721	valid's auc: 0.636313
[210]	train's auc: 0.829585	valid's auc: 0.635618
[220]	train's auc: 0.833327	valid's auc: 0.634863
[230]	train's auc: 0.837171	valid's auc: 0.634334
[240]	train's auc: 0.841037	valid's auc: 0.633684
[250]	train's auc: 0.844426	valid's auc: 0.632726
[260]	train's auc: 0.847738	valid's auc: 0.632597
[270]	train's auc: 0.850881	valid's auc: 0.631621
Early stopping, best iteration is:
[71]	train's auc: 0.74901	valid's auc: 0.643328

Model Report iteration: 19
n_estimators :  71
AUC: 0.6433277501665006
Iteration:  19  ROC AUC score:  0.6433277501665007
Training until validation scores don't improve for 200 rounds.
[10]	train's auc: 0.668069	valid's auc: 0.629588
[20]	train's auc: 0.688381	valid'

[140]	train's auc: 0.806399	valid's auc: 0.645808
[150]	train's auc: 0.811966	valid's auc: 0.644786
[160]	train's auc: 0.817468	valid's auc: 0.644505
[170]	train's auc: 0.822187	valid's auc: 0.644023
[180]	train's auc: 0.82673	valid's auc: 0.643605
[190]	train's auc: 0.831672	valid's auc: 0.64302
[200]	train's auc: 0.836115	valid's auc: 0.642338
[210]	train's auc: 0.840369	valid's auc: 0.642042
[220]	train's auc: 0.844427	valid's auc: 0.640977
[230]	train's auc: 0.848343	valid's auc: 0.640528
[240]	train's auc: 0.851714	valid's auc: 0.640386
[250]	train's auc: 0.855529	valid's auc: 0.639264
[260]	train's auc: 0.859376	valid's auc: 0.638839
[270]	train's auc: 0.86235	valid's auc: 0.638136
[280]	train's auc: 0.865489	valid's auc: 0.637427
[290]	train's auc: 0.868831	valid's auc: 0.636933
Early stopping, best iteration is:
[91]	train's auc: 0.773434	valid's auc: 0.648842

Model Report iteration: 24
n_estimators :  91
AUC: 0.6488417274676725
Iteration:  24  ROC AUC score:  0.64884172746767

[130]	train's auc: 0.809745	valid's auc: 0.647181
[140]	train's auc: 0.815799	valid's auc: 0.646626
[150]	train's auc: 0.821761	valid's auc: 0.646058
[160]	train's auc: 0.827119	valid's auc: 0.645715
[170]	train's auc: 0.832331	valid's auc: 0.64527
[180]	train's auc: 0.837278	valid's auc: 0.64454
[190]	train's auc: 0.842343	valid's auc: 0.644025
[200]	train's auc: 0.847214	valid's auc: 0.643383
[210]	train's auc: 0.851239	valid's auc: 0.642246
[220]	train's auc: 0.855436	valid's auc: 0.642106
[230]	train's auc: 0.859558	valid's auc: 0.641258
[240]	train's auc: 0.863558	valid's auc: 0.639951
[250]	train's auc: 0.867338	valid's auc: 0.639223
[260]	train's auc: 0.870535	valid's auc: 0.638996
Early stopping, best iteration is:
[64]	train's auc: 0.756255	valid's auc: 0.652068

Model Report iteration: 29
n_estimators :  64
AUC: 0.6520676372369492
Iteration:  29  ROC AUC score:  0.6520676372369492
Training until validation scores don't improve for 200 rounds.
[10]	train's auc: 0.674635	valid'

[210]	train's auc: 0.860121	valid's auc: 0.636822
[220]	train's auc: 0.864505	valid's auc: 0.636087
[230]	train's auc: 0.868748	valid's auc: 0.635234
[240]	train's auc: 0.872518	valid's auc: 0.634626
[250]	train's auc: 0.875973	valid's auc: 0.633273
[260]	train's auc: 0.879166	valid's auc: 0.633262
[270]	train's auc: 0.883094	valid's auc: 0.632508
[280]	train's auc: 0.886194	valid's auc: 0.63176
Early stopping, best iteration is:
[86]	train's auc: 0.786969	valid's auc: 0.645024

Model Report iteration: 34
n_estimators :  86
AUC: 0.6450235581817573
Iteration:  34  ROC AUC score:  0.6450235581817572
Training until validation scores don't improve for 200 rounds.
[10]	train's auc: 0.676899	valid's auc: 0.627717
[20]	train's auc: 0.700043	valid's auc: 0.636774
[30]	train's auc: 0.718531	valid's auc: 0.640077
[40]	train's auc: 0.733328	valid's auc: 0.641797
[50]	train's auc: 0.746277	valid's auc: 0.643131
[60]	train's auc: 0.758502	valid's auc: 0.641672
[70]	train's auc: 0.76907	valid's auc:


Model Report iteration: 39
n_estimators :  66
AUC: 0.6432325683626051
Iteration:  39  ROC AUC score:  0.6432325683626052
CPU times: user 12h 3min 15s, sys: 47.4 s, total: 12h 4min 3s
Wall time: 1h 35min 35s


In [33]:
%%time
lgbm_model1 = Load(lgbm_folder+"lgbm{!s}.pkl".format(0))
pred = lgbm_model1.predict(data)
df_train = pd.DataFrame(pred)
df_train.columns = ['lgbm0']

pred_test = lgbm_model1.predict(test_data)
df_test = pd.DataFrame(pred_test)
df_test.columns = ['lgbm0']

CPU times: user 1min 1s, sys: 15.8 ms, total: 1min 1s
Wall time: 9.04 s


In [36]:
%%time
for i in range(1,40):
    model = Load(lgbm_folder+"lgbm{!s}.pkl".format(i))
    df_train["lgbm{!s}".format(i)] = model.predict(data)
    df_test["lgbm{!s}".format(i)] = model.predict(test_data)

CPU times: user 46min 45s, sys: 490 ms, total: 46min 46s
Wall time: 6min 28s


In [40]:
df_train.shape

(427994, 40)

In [44]:
Save(df_train, "train_meta_data.pkl")
Save(df_test, "test_meta_data.pkl")

# GridSearch LGBM on meta data

In [45]:
from sklearn.model_selection import train_test_split

X_train, X_validation, Y_train, Y_validation = train_test_split(df_train, target, train_size=0.9)



In [46]:
tuned_parameters = {
     'num_leaves': [50,1000,10000,10000],
     'max_depth':[10,20,30,40,50],
     'min_child_samples':[30,50,70,100,120],
     'max_bin':[50,100,150,200],
     'subsample':[0.1,0.4,0.7,0.9],
     'subsample_freq':[2,30,60,100],
     'colsample_bytree':[0.2,0.3,0.7],
     'min_child_weight':[2,3,6],
     'subsample_for_bin':[10,60,100,150,200],
     'min_split_gain':[1.1,2.0,10.0],
     'reg_alpha':[2,3,5,7,8],
     'reg_lambda':[0,0.2,0.8],
     'metric':['auc'],
    'learning_rate':[0.05,0.1,0.005,0.2],
    'objective':['binary'],
    'scale_pos_weight':[1,weight_multiplier,1/weight_multiplier],
}

In [49]:
%%time
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
import lightgbm as lgbm
import re

clf = RandomizedSearchCV(lgbm.LGBMClassifier(nthread=8, verbose_eval=32),
                   tuned_parameters,
                   cv=4,
                   n_iter=20,
                   n_jobs=4,
                   scoring='roc_auc',
                   verbose=2)

CPU times: user 92 µs, sys: 0 ns, total: 92 µs
Wall time: 101 µs


In [50]:
%%time
clf.fit(X_train, Y_train)

Fitting 4 folds for each of 20 candidates, totalling 80 fits
[CV] subsample_freq=100, subsample_for_bin=200, subsample=0.1, scale_pos_weight=1, reg_lambda=0, reg_alpha=5, objective=binary, num_leaves=50, min_split_gain=2.0, min_child_weight=2, min_child_samples=30, metric=auc, max_depth=40, max_bin=50, learning_rate=0.2, colsample_bytree=0.3 
[CV]  subsample_freq=100, subsample_for_bin=200, subsample=0.1, scale_pos_weight=1, reg_lambda=0, reg_alpha=5, objective=binary, num_leaves=50, min_split_gain=2.0, min_child_weight=2, min_child_samples=30, metric=auc, max_depth=40, max_bin=50, learning_rate=0.2, colsample_bytree=0.3, total=   0.8s
[CV] subsample_freq=100, subsample_for_bin=200, subsample=0.1, scale_pos_weight=1, reg_lambda=0, reg_alpha=5, objective=binary, num_leaves=50, min_split_gain=2.0, min_child_weight=2, min_child_samples=30, metric=auc, max_depth=40, max_bin=50, learning_rate=0.2, colsample_bytree=0.3 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.1s remaining:    0.0s


[CV]  subsample_freq=100, subsample_for_bin=200, subsample=0.1, scale_pos_weight=1, reg_lambda=0, reg_alpha=5, objective=binary, num_leaves=50, min_split_gain=2.0, min_child_weight=2, min_child_samples=30, metric=auc, max_depth=40, max_bin=50, learning_rate=0.2, colsample_bytree=0.3, total=   0.8s
[CV] subsample_freq=100, subsample_for_bin=200, subsample=0.1, scale_pos_weight=1, reg_lambda=0, reg_alpha=5, objective=binary, num_leaves=50, min_split_gain=2.0, min_child_weight=2, min_child_samples=30, metric=auc, max_depth=40, max_bin=50, learning_rate=0.2, colsample_bytree=0.3 
[CV]  subsample_freq=100, subsample_for_bin=200, subsample=0.1, scale_pos_weight=1, reg_lambda=0, reg_alpha=5, objective=binary, num_leaves=50, min_split_gain=2.0, min_child_weight=2, min_child_samples=30, metric=auc, max_depth=40, max_bin=50, learning_rate=0.2, colsample_bytree=0.3, total=   0.8s
[CV] subsample_freq=100, subsample_for_bin=200, subsample=0.1, scale_pos_weight=1, reg_lambda=0, reg_alpha=5, objectiv

[CV]  subsample_freq=2, subsample_for_bin=10, subsample=0.1, scale_pos_weight=0.05276699578395344, reg_lambda=0, reg_alpha=7, objective=binary, num_leaves=50, min_split_gain=10.0, min_child_weight=3, min_child_samples=30, metric=auc, max_depth=50, max_bin=50, learning_rate=0.05, colsample_bytree=0.7, total=   0.6s
[CV] subsample_freq=2, subsample_for_bin=60, subsample=0.1, scale_pos_weight=1, reg_lambda=0.2, reg_alpha=8, objective=binary, num_leaves=10000, min_split_gain=1.1, min_child_weight=2, min_child_samples=30, metric=auc, max_depth=50, max_bin=50, learning_rate=0.005, colsample_bytree=0.3 
[CV]  subsample_freq=2, subsample_for_bin=60, subsample=0.1, scale_pos_weight=1, reg_lambda=0.2, reg_alpha=8, objective=binary, num_leaves=10000, min_split_gain=1.1, min_child_weight=2, min_child_samples=30, metric=auc, max_depth=50, max_bin=50, learning_rate=0.005, colsample_bytree=0.3, total=   5.1s
[CV] subsample_freq=2, subsample_for_bin=60, subsample=0.1, scale_pos_weight=1, reg_lambda=0.

[CV]  subsample_freq=30, subsample_for_bin=10, subsample=0.1, scale_pos_weight=1, reg_lambda=0.2, reg_alpha=3, objective=binary, num_leaves=50, min_split_gain=1.1, min_child_weight=2, min_child_samples=30, metric=auc, max_depth=20, max_bin=200, learning_rate=0.005, colsample_bytree=0.7, total=   1.3s
[CV] subsample_freq=30, subsample_for_bin=10, subsample=0.1, scale_pos_weight=1, reg_lambda=0.2, reg_alpha=3, objective=binary, num_leaves=50, min_split_gain=1.1, min_child_weight=2, min_child_samples=30, metric=auc, max_depth=20, max_bin=200, learning_rate=0.005, colsample_bytree=0.7 
[CV]  subsample_freq=30, subsample_for_bin=10, subsample=0.1, scale_pos_weight=1, reg_lambda=0.2, reg_alpha=3, objective=binary, num_leaves=50, min_split_gain=1.1, min_child_weight=2, min_child_samples=30, metric=auc, max_depth=20, max_bin=200, learning_rate=0.005, colsample_bytree=0.7, total=   1.2s
[CV] subsample_freq=30, subsample_for_bin=10, subsample=0.1, scale_pos_weight=1, reg_lambda=0.2, reg_alpha=3,

[CV]  subsample_freq=30, subsample_for_bin=60, subsample=0.9, scale_pos_weight=1, reg_lambda=0.2, reg_alpha=2, objective=binary, num_leaves=10000, min_split_gain=2.0, min_child_weight=2, min_child_samples=70, metric=auc, max_depth=10, max_bin=50, learning_rate=0.2, colsample_bytree=0.2, total=   1.6s
[CV] subsample_freq=100, subsample_for_bin=150, subsample=0.1, scale_pos_weight=0.05276699578395344, reg_lambda=0.2, reg_alpha=7, objective=binary, num_leaves=50, min_split_gain=1.1, min_child_weight=2, min_child_samples=120, metric=auc, max_depth=40, max_bin=50, learning_rate=0.05, colsample_bytree=0.2 
[CV]  subsample_freq=100, subsample_for_bin=150, subsample=0.1, scale_pos_weight=0.05276699578395344, reg_lambda=0.2, reg_alpha=7, objective=binary, num_leaves=50, min_split_gain=1.1, min_child_weight=2, min_child_samples=120, metric=auc, max_depth=40, max_bin=50, learning_rate=0.05, colsample_bytree=0.2, total=   0.7s
[CV] subsample_freq=100, subsample_for_bin=150, subsample=0.1, scale_po

[CV]  subsample_freq=2, subsample_for_bin=10, subsample=0.4, scale_pos_weight=18.951239977624464, reg_lambda=0.8, reg_alpha=7, objective=binary, num_leaves=1000, min_split_gain=1.1, min_child_weight=3, min_child_samples=70, metric=auc, max_depth=50, max_bin=200, learning_rate=0.05, colsample_bytree=0.7, total=   2.8s
[CV] subsample_freq=2, subsample_for_bin=10, subsample=0.4, scale_pos_weight=18.951239977624464, reg_lambda=0.8, reg_alpha=7, objective=binary, num_leaves=1000, min_split_gain=1.1, min_child_weight=3, min_child_samples=70, metric=auc, max_depth=50, max_bin=200, learning_rate=0.05, colsample_bytree=0.7 
[CV]  subsample_freq=2, subsample_for_bin=10, subsample=0.4, scale_pos_weight=18.951239977624464, reg_lambda=0.8, reg_alpha=7, objective=binary, num_leaves=1000, min_split_gain=1.1, min_child_weight=3, min_child_samples=70, metric=auc, max_depth=50, max_bin=200, learning_rate=0.05, colsample_bytree=0.7, total=   2.9s
[CV] subsample_freq=2, subsample_for_bin=10, subsample=0.4

[CV]  subsample_freq=100, subsample_for_bin=60, subsample=0.4, scale_pos_weight=1, reg_lambda=0.2, reg_alpha=5, objective=binary, num_leaves=10000, min_split_gain=2.0, min_child_weight=2, min_child_samples=50, metric=auc, max_depth=40, max_bin=50, learning_rate=0.1, colsample_bytree=0.3, total=   1.5s
[CV] subsample_freq=60, subsample_for_bin=60, subsample=0.1, scale_pos_weight=0.05276699578395344, reg_lambda=0, reg_alpha=5, objective=binary, num_leaves=50, min_split_gain=1.1, min_child_weight=3, min_child_samples=50, metric=auc, max_depth=50, max_bin=100, learning_rate=0.05, colsample_bytree=0.3 
[CV]  subsample_freq=60, subsample_for_bin=60, subsample=0.1, scale_pos_weight=0.05276699578395344, reg_lambda=0, reg_alpha=5, objective=binary, num_leaves=50, min_split_gain=1.1, min_child_weight=3, min_child_samples=50, metric=auc, max_depth=50, max_bin=100, learning_rate=0.05, colsample_bytree=0.3, total=   0.7s
[CV] subsample_freq=60, subsample_for_bin=60, subsample=0.1, scale_pos_weight=

[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed:  3.9min finished


CPU times: user 25min 13s, sys: 21.5 s, total: 25min 35s
Wall time: 3min 54s


RandomizedSearchCV(cv=4, error_score='raise',
          estimator=LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        learning_rate=0.1, max_depth=-1, min_child_samples=20,
        min_child_weight=0.001, min_split_gain=0.0, n_estimators=100,
        n_jobs=-1, nthread=8, num_leaves=31, objective=None,
        random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=0,
        verbose_eval=32),
          fit_params=None, iid=True, n_iter=20, n_jobs=1,
          param_distributions={'num_leaves': [50, 1000, 10000, 10000], 'max_depth': [10, 20, 30, 40, 50], 'min_child_samples': [30, 50, 70, 100, 120], 'max_bin': [50, 100, 150, 200], 'subsample': [0.1, 0.4, 0.7, 0.9], 'subsample_freq': [2, 30, 60, 100], 'colsample_bytree': [0.2, 0.3, 0.7], 'min_child_weight':...05, 0.2], 'objective': ['binary'], 'scale_pos_weight': [1, 18.951239977624464, 0.05276699578395344]},
          pre_dispatch='2

In [51]:
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

In [52]:
print("RandomizedSearchCV")
report(clf.cv_results_)

RandomizedSearchCV
Model with rank: 1
Mean validation score: 0.800 (std: 0.002)
Parameters: {'subsample_freq': 100, 'subsample_for_bin': 100, 'subsample': 0.7, 'scale_pos_weight': 1, 'reg_lambda': 0.8, 'reg_alpha': 7, 'objective': 'binary', 'num_leaves': 50, 'min_split_gain': 1.1, 'min_child_weight': 2, 'min_child_samples': 30, 'metric': 'auc', 'max_depth': 20, 'max_bin': 200, 'learning_rate': 0.2, 'colsample_bytree': 0.7}

Model with rank: 2
Mean validation score: 0.800 (std: 0.002)
Parameters: {'subsample_freq': 2, 'subsample_for_bin': 200, 'subsample': 0.9, 'scale_pos_weight': 1, 'reg_lambda': 0.8, 'reg_alpha': 8, 'objective': 'binary', 'num_leaves': 1000, 'min_split_gain': 1.1, 'min_child_weight': 2, 'min_child_samples': 30, 'metric': 'auc', 'max_depth': 50, 'max_bin': 100, 'learning_rate': 0.05, 'colsample_bytree': 0.7}

Model with rank: 3
Mean validation score: 0.800 (std: 0.002)
Parameters: {'subsample_freq': 30, 'subsample_for_bin': 60, 'subsample': 0.9, 'scale_pos_weight': 1, 

In [53]:
params = clf.best_params_

In [54]:
d_train = lgbm.Dataset(X_train, label=Y_train)
d_valid = lgbm.Dataset(X_validation, label=Y_validation)

In [55]:
evals_results = {}
num_boost_round=3000
early_stopping_rounds=200
feval=None

model = lgbm.train(params, 
                     d_train, 
                     valid_sets=[d_train, d_valid], 
                     valid_names=['train','valid'], 
                     evals_result=evals_results, 
                     num_boost_round=num_boost_round,
                     early_stopping_rounds=early_stopping_rounds,
                     verbose_eval=10, 
                     feval=feval)

Training until validation scores don't improve for 200 rounds.
[10]	train's auc: 0.796475	valid's auc: 0.792936
[20]	train's auc: 0.811903	valid's auc: 0.802435
[30]	train's auc: 0.821123	valid's auc: 0.806369
[40]	train's auc: 0.827489	valid's auc: 0.809908
[50]	train's auc: 0.831699	valid's auc: 0.811194
[60]	train's auc: 0.833907	valid's auc: 0.81185
[70]	train's auc: 0.834302	valid's auc: 0.812263
[80]	train's auc: 0.834473	valid's auc: 0.812354
[90]	train's auc: 0.834473	valid's auc: 0.812354
[100]	train's auc: 0.834473	valid's auc: 0.812354
[110]	train's auc: 0.834473	valid's auc: 0.812354
[120]	train's auc: 0.834473	valid's auc: 0.812354
[130]	train's auc: 0.834473	valid's auc: 0.812354
[140]	train's auc: 0.834473	valid's auc: 0.812354
[150]	train's auc: 0.834473	valid's auc: 0.812354
[160]	train's auc: 0.834473	valid's auc: 0.812354
[170]	train's auc: 0.834473	valid's auc: 0.812354
[180]	train's auc: 0.834473	valid's auc: 0.812354
[190]	train's auc: 0.834473	valid's auc: 0.8123

In [56]:
n_estimators = model.best_iteration
print("\nModel Report")
print("n_estimators : ", n_estimators)
print("AUC"+":", evals_results['valid']['auc'][n_estimators-1])


Model Report
n_estimators :  75
AUC: 0.8123569786483931


In [57]:
from sklearn.metrics import roc_auc_score

predicted = model.predict(X_validation)
print("Meta LGBM model's ROC AUC score:",roc_auc_score(Y_validation, predicted))

Meta LGBM model's ROC AUC score: 0.8123569786483931


In [58]:
Save(model,"lgbm_meta_model.pkl")

# Test data

In [62]:
Y_test = model.predict(df_test)

In [63]:
predictions = pd.DataFrame(Y_test)
predictions.to_csv("solution_lgbm_meta.csv",header=None, index=None)