# 03 - GridSearchCV - LGBM

#### Imports

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style="white")

#### Constants

In [2]:
n_components = 1000

In [3]:
models_folder = "models/"
train_data_fn = models_folder+'train_data.pkl'
target_fn = models_folder+'target.pkl'
test_data_fn = models_folder+'test_data.pkl'

weight_multiplier_fn = models_folder+"weight_multiplier.pkl"

#### Functions

In [4]:
import os.path
from sklearn.externals import joblib

def Load(filename):
    if os.path.isfile(filename):
        return joblib.load(filename)
    
def Save(obj, filename):
    joblib.dump(obj, filename)

# Loading data

In [5]:
import scipy

data = scipy.sparse.load_npz("train_sparse_matrix_after_scale.npz")

target = Load(target_fn)

In [6]:
weight_multiplier = Load(weight_multiplier_fn)

## Splitting dataset

In [8]:
from sklearn.model_selection import train_test_split

X_train, X_validation, Y_train, Y_validation = train_test_split(data, target.ravel(), train_size=0.8, random_state=42)



# CatBoost Classifier

In [9]:
import lightgbm as lgbm
import re

In [42]:
tuned_parameters = {
     'num_leaves': [50,1000,10000,10000],
     'max_depth':[10,20,30,40],
     'min_child_samples':[30,50,100],
     'max_bin':[50,100,200],
     'subsample':[0.1,0.4,0.7],
     'subsample_freq':[2,30,100],
     'colsample_bytree':[0.2,0.3,0.7],
     'min_child_weight':[2,3,6],
     'subsample_for_bin':[10,100,200],
     'min_split_gain':[1.1,2.0,10.0],
     'reg_alpha':[2,3,5,7,8],
     'reg_lambda':[0,0.2,0.8],
     'metric':['auc'],
    'learning_rate':[0.05,0.1,0.005],
    'objective':['binary'],
    'scale_pos_weight':[1,weight_multiplier,1/weight_multiplier],
}

In [43]:
%%time
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV

clf = RandomizedSearchCV(lgbm.LGBMClassifier(nthread=8, verbose_eval=32),
                   tuned_parameters,
                   cv=4,
                   n_iter=100,
                   scoring='roc_auc',
                   random_state=42,
                   verbose=2)

CPU times: user 29 µs, sys: 0 ns, total: 29 µs
Wall time: 32.9 µs


In [44]:
%%time
clf.fit(X_train, Y_train)

Fitting 4 folds for each of 100 candidates, totalling 400 fits
[CV] subsample_freq=30, subsample_for_bin=200, subsample=0.1, scale_pos_weight=1, reg_lambda=0.8, reg_alpha=7, objective=binary, num_leaves=10000, min_split_gain=2.0, min_child_weight=6, min_child_samples=100, metric=auc, max_depth=10, max_bin=50, learning_rate=0.1, colsample_bytree=0.3 
[CV]  subsample_freq=30, subsample_for_bin=200, subsample=0.1, scale_pos_weight=1, reg_lambda=0.8, reg_alpha=7, objective=binary, num_leaves=10000, min_split_gain=2.0, min_child_weight=6, min_child_samples=100, metric=auc, max_depth=10, max_bin=50, learning_rate=0.1, colsample_bytree=0.3, total=  20.3s
[CV] subsample_freq=30, subsample_for_bin=200, subsample=0.1, scale_pos_weight=1, reg_lambda=0.8, reg_alpha=7, objective=binary, num_leaves=10000, min_split_gain=2.0, min_child_weight=6, min_child_samples=100, metric=auc, max_depth=10, max_bin=50, learning_rate=0.1, colsample_bytree=0.3 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   23.1s remaining:    0.0s


[CV]  subsample_freq=30, subsample_for_bin=200, subsample=0.1, scale_pos_weight=1, reg_lambda=0.8, reg_alpha=7, objective=binary, num_leaves=10000, min_split_gain=2.0, min_child_weight=6, min_child_samples=100, metric=auc, max_depth=10, max_bin=50, learning_rate=0.1, colsample_bytree=0.3, total=  17.7s
[CV] subsample_freq=30, subsample_for_bin=200, subsample=0.1, scale_pos_weight=1, reg_lambda=0.8, reg_alpha=7, objective=binary, num_leaves=10000, min_split_gain=2.0, min_child_weight=6, min_child_samples=100, metric=auc, max_depth=10, max_bin=50, learning_rate=0.1, colsample_bytree=0.3 
[CV]  subsample_freq=30, subsample_for_bin=200, subsample=0.1, scale_pos_weight=1, reg_lambda=0.8, reg_alpha=7, objective=binary, num_leaves=10000, min_split_gain=2.0, min_child_weight=6, min_child_samples=100, metric=auc, max_depth=10, max_bin=50, learning_rate=0.1, colsample_bytree=0.3, total=  17.6s
[CV] subsample_freq=30, subsample_for_bin=200, subsample=0.1, scale_pos_weight=1, reg_lambda=0.8, reg_a

[CV]  subsample_freq=30, subsample_for_bin=200, subsample=0.4, scale_pos_weight=0.05276699578395344, reg_lambda=0, reg_alpha=8, objective=binary, num_leaves=10000, min_split_gain=1.1, min_child_weight=3, min_child_samples=100, metric=auc, max_depth=10, max_bin=200, learning_rate=0.005, colsample_bytree=0.2, total=  21.1s
[CV] subsample_freq=2, subsample_for_bin=200, subsample=0.4, scale_pos_weight=0.05276699578395344, reg_lambda=0.2, reg_alpha=8, objective=binary, num_leaves=10000, min_split_gain=1.1, min_child_weight=3, min_child_samples=50, metric=auc, max_depth=20, max_bin=100, learning_rate=0.005, colsample_bytree=0.7 
[CV]  subsample_freq=2, subsample_for_bin=200, subsample=0.4, scale_pos_weight=0.05276699578395344, reg_lambda=0.2, reg_alpha=8, objective=binary, num_leaves=10000, min_split_gain=1.1, min_child_weight=3, min_child_samples=50, metric=auc, max_depth=20, max_bin=100, learning_rate=0.005, colsample_bytree=0.7, total=  26.8s
[CV] subsample_freq=2, subsample_for_bin=200, 

[CV]  subsample_freq=2, subsample_for_bin=10, subsample=0.1, scale_pos_weight=0.05276699578395344, reg_lambda=0, reg_alpha=3, objective=binary, num_leaves=10000, min_split_gain=1.1, min_child_weight=6, min_child_samples=50, metric=auc, max_depth=10, max_bin=50, learning_rate=0.05, colsample_bytree=0.7, total=   7.9s
[CV] subsample_freq=2, subsample_for_bin=10, subsample=0.1, scale_pos_weight=0.05276699578395344, reg_lambda=0, reg_alpha=3, objective=binary, num_leaves=10000, min_split_gain=1.1, min_child_weight=6, min_child_samples=50, metric=auc, max_depth=10, max_bin=50, learning_rate=0.05, colsample_bytree=0.7 
[CV]  subsample_freq=2, subsample_for_bin=10, subsample=0.1, scale_pos_weight=0.05276699578395344, reg_lambda=0, reg_alpha=3, objective=binary, num_leaves=10000, min_split_gain=1.1, min_child_weight=6, min_child_samples=50, metric=auc, max_depth=10, max_bin=50, learning_rate=0.05, colsample_bytree=0.7, total=   6.8s
[CV] subsample_freq=2, subsample_for_bin=10, subsample=0.1, s

[CV]  subsample_freq=30, subsample_for_bin=100, subsample=0.7, scale_pos_weight=18.951239977624464, reg_lambda=0.2, reg_alpha=7, objective=binary, num_leaves=50, min_split_gain=2.0, min_child_weight=2, min_child_samples=50, metric=auc, max_depth=20, max_bin=50, learning_rate=0.05, colsample_bytree=0.7, total=  39.9s
[CV] subsample_freq=100, subsample_for_bin=10, subsample=0.1, scale_pos_weight=0.05276699578395344, reg_lambda=0, reg_alpha=2, objective=binary, num_leaves=1000, min_split_gain=10.0, min_child_weight=2, min_child_samples=30, metric=auc, max_depth=30, max_bin=200, learning_rate=0.005, colsample_bytree=0.2 
[CV]  subsample_freq=100, subsample_for_bin=10, subsample=0.1, scale_pos_weight=0.05276699578395344, reg_lambda=0, reg_alpha=2, objective=binary, num_leaves=1000, min_split_gain=10.0, min_child_weight=2, min_child_samples=30, metric=auc, max_depth=30, max_bin=200, learning_rate=0.005, colsample_bytree=0.2, total=   5.2s
[CV] subsample_freq=100, subsample_for_bin=10, subsam

[CV]  subsample_freq=2, subsample_for_bin=100, subsample=0.7, scale_pos_weight=0.05276699578395344, reg_lambda=0.2, reg_alpha=5, objective=binary, num_leaves=10000, min_split_gain=10.0, min_child_weight=2, min_child_samples=50, metric=auc, max_depth=10, max_bin=200, learning_rate=0.1, colsample_bytree=0.3, total=  17.7s
[CV] subsample_freq=2, subsample_for_bin=100, subsample=0.7, scale_pos_weight=0.05276699578395344, reg_lambda=0.2, reg_alpha=5, objective=binary, num_leaves=10000, min_split_gain=10.0, min_child_weight=2, min_child_samples=50, metric=auc, max_depth=10, max_bin=200, learning_rate=0.1, colsample_bytree=0.3 
[CV]  subsample_freq=2, subsample_for_bin=100, subsample=0.7, scale_pos_weight=0.05276699578395344, reg_lambda=0.2, reg_alpha=5, objective=binary, num_leaves=10000, min_split_gain=10.0, min_child_weight=2, min_child_samples=50, metric=auc, max_depth=10, max_bin=200, learning_rate=0.1, colsample_bytree=0.3, total=  17.7s
[CV] subsample_freq=2, subsample_for_bin=100, sub

[CV]  subsample_freq=100, subsample_for_bin=10, subsample=0.4, scale_pos_weight=18.951239977624464, reg_lambda=0, reg_alpha=3, objective=binary, num_leaves=10000, min_split_gain=1.1, min_child_weight=3, min_child_samples=50, metric=auc, max_depth=20, max_bin=50, learning_rate=0.1, colsample_bytree=0.2, total=  17.6s
[CV] subsample_freq=30, subsample_for_bin=200, subsample=0.1, scale_pos_weight=18.951239977624464, reg_lambda=0.2, reg_alpha=3, objective=binary, num_leaves=1000, min_split_gain=1.1, min_child_weight=3, min_child_samples=50, metric=auc, max_depth=30, max_bin=200, learning_rate=0.005, colsample_bytree=0.2 
[CV]  subsample_freq=30, subsample_for_bin=200, subsample=0.1, scale_pos_weight=18.951239977624464, reg_lambda=0.2, reg_alpha=3, objective=binary, num_leaves=1000, min_split_gain=1.1, min_child_weight=3, min_child_samples=50, metric=auc, max_depth=30, max_bin=200, learning_rate=0.005, colsample_bytree=0.2, total=  22.8s
[CV] subsample_freq=30, subsample_for_bin=200, subsam

[CV]  subsample_freq=2, subsample_for_bin=10, subsample=0.1, scale_pos_weight=0.05276699578395344, reg_lambda=0, reg_alpha=8, objective=binary, num_leaves=10000, min_split_gain=2.0, min_child_weight=6, min_child_samples=100, metric=auc, max_depth=20, max_bin=200, learning_rate=0.005, colsample_bytree=0.3, total=   6.4s
[CV] subsample_freq=2, subsample_for_bin=10, subsample=0.1, scale_pos_weight=0.05276699578395344, reg_lambda=0, reg_alpha=8, objective=binary, num_leaves=10000, min_split_gain=2.0, min_child_weight=6, min_child_samples=100, metric=auc, max_depth=20, max_bin=200, learning_rate=0.005, colsample_bytree=0.3 
[CV]  subsample_freq=2, subsample_for_bin=10, subsample=0.1, scale_pos_weight=0.05276699578395344, reg_lambda=0, reg_alpha=8, objective=binary, num_leaves=10000, min_split_gain=2.0, min_child_weight=6, min_child_samples=100, metric=auc, max_depth=20, max_bin=200, learning_rate=0.005, colsample_bytree=0.3, total=   5.9s
[CV] subsample_freq=2, subsample_for_bin=10, subsamp

[CV]  subsample_freq=30, subsample_for_bin=100, subsample=0.7, scale_pos_weight=0.05276699578395344, reg_lambda=0.8, reg_alpha=2, objective=binary, num_leaves=1000, min_split_gain=1.1, min_child_weight=6, min_child_samples=50, metric=auc, max_depth=30, max_bin=50, learning_rate=0.05, colsample_bytree=0.7, total=  19.6s
[CV] subsample_freq=2, subsample_for_bin=10, subsample=0.1, scale_pos_weight=0.05276699578395344, reg_lambda=0.8, reg_alpha=3, objective=binary, num_leaves=10000, min_split_gain=2.0, min_child_weight=3, min_child_samples=30, metric=auc, max_depth=40, max_bin=100, learning_rate=0.05, colsample_bytree=0.3 
[CV]  subsample_freq=2, subsample_for_bin=10, subsample=0.1, scale_pos_weight=0.05276699578395344, reg_lambda=0.8, reg_alpha=3, objective=binary, num_leaves=10000, min_split_gain=2.0, min_child_weight=3, min_child_samples=30, metric=auc, max_depth=40, max_bin=100, learning_rate=0.05, colsample_bytree=0.3, total=   6.5s
[CV] subsample_freq=2, subsample_for_bin=10, subsamp

[CV]  subsample_freq=100, subsample_for_bin=200, subsample=0.4, scale_pos_weight=18.951239977624464, reg_lambda=0.8, reg_alpha=8, objective=binary, num_leaves=10000, min_split_gain=1.1, min_child_weight=6, min_child_samples=100, metric=auc, max_depth=30, max_bin=200, learning_rate=0.005, colsample_bytree=0.3, total=  42.7s
[CV] subsample_freq=100, subsample_for_bin=200, subsample=0.4, scale_pos_weight=18.951239977624464, reg_lambda=0.8, reg_alpha=8, objective=binary, num_leaves=10000, min_split_gain=1.1, min_child_weight=6, min_child_samples=100, metric=auc, max_depth=30, max_bin=200, learning_rate=0.005, colsample_bytree=0.3 
[CV]  subsample_freq=100, subsample_for_bin=200, subsample=0.4, scale_pos_weight=18.951239977624464, reg_lambda=0.8, reg_alpha=8, objective=binary, num_leaves=10000, min_split_gain=1.1, min_child_weight=6, min_child_samples=100, metric=auc, max_depth=30, max_bin=200, learning_rate=0.005, colsample_bytree=0.3, total=  41.5s
[CV] subsample_freq=100, subsample_for_b

[CV]  subsample_freq=2, subsample_for_bin=10, subsample=0.7, scale_pos_weight=0.05276699578395344, reg_lambda=0.2, reg_alpha=8, objective=binary, num_leaves=10000, min_split_gain=1.1, min_child_weight=6, min_child_samples=50, metric=auc, max_depth=30, max_bin=200, learning_rate=0.05, colsample_bytree=0.3, total=  12.1s
[CV] subsample_freq=100, subsample_for_bin=10, subsample=0.1, scale_pos_weight=18.951239977624464, reg_lambda=0.2, reg_alpha=8, objective=binary, num_leaves=1000, min_split_gain=2.0, min_child_weight=3, min_child_samples=100, metric=auc, max_depth=30, max_bin=100, learning_rate=0.1, colsample_bytree=0.7 
[CV]  subsample_freq=100, subsample_for_bin=10, subsample=0.1, scale_pos_weight=18.951239977624464, reg_lambda=0.2, reg_alpha=8, objective=binary, num_leaves=1000, min_split_gain=2.0, min_child_weight=3, min_child_samples=100, metric=auc, max_depth=30, max_bin=100, learning_rate=0.1, colsample_bytree=0.7, total=   8.2s
[CV] subsample_freq=100, subsample_for_bin=10, subsa

[CV]  subsample_freq=100, subsample_for_bin=10, subsample=0.1, scale_pos_weight=18.951239977624464, reg_lambda=0.8, reg_alpha=7, objective=binary, num_leaves=1000, min_split_gain=2.0, min_child_weight=3, min_child_samples=100, metric=auc, max_depth=10, max_bin=100, learning_rate=0.005, colsample_bytree=0.7, total=   7.9s
[CV] subsample_freq=100, subsample_for_bin=10, subsample=0.1, scale_pos_weight=18.951239977624464, reg_lambda=0.8, reg_alpha=7, objective=binary, num_leaves=1000, min_split_gain=2.0, min_child_weight=3, min_child_samples=100, metric=auc, max_depth=10, max_bin=100, learning_rate=0.005, colsample_bytree=0.7 
[CV]  subsample_freq=100, subsample_for_bin=10, subsample=0.1, scale_pos_weight=18.951239977624464, reg_lambda=0.8, reg_alpha=7, objective=binary, num_leaves=1000, min_split_gain=2.0, min_child_weight=3, min_child_samples=100, metric=auc, max_depth=10, max_bin=100, learning_rate=0.005, colsample_bytree=0.7, total=   6.2s
[CV] subsample_freq=100, subsample_for_bin=10,

[CV]  subsample_freq=2, subsample_for_bin=200, subsample=0.4, scale_pos_weight=0.05276699578395344, reg_lambda=0, reg_alpha=2, objective=binary, num_leaves=50, min_split_gain=10.0, min_child_weight=6, min_child_samples=100, metric=auc, max_depth=10, max_bin=50, learning_rate=0.005, colsample_bytree=0.2, total=  12.5s
[CV] subsample_freq=2, subsample_for_bin=10, subsample=0.4, scale_pos_weight=1, reg_lambda=0.2, reg_alpha=7, objective=binary, num_leaves=1000, min_split_gain=10.0, min_child_weight=6, min_child_samples=50, metric=auc, max_depth=10, max_bin=50, learning_rate=0.05, colsample_bytree=0.2 
[CV]  subsample_freq=2, subsample_for_bin=10, subsample=0.4, scale_pos_weight=1, reg_lambda=0.2, reg_alpha=7, objective=binary, num_leaves=1000, min_split_gain=10.0, min_child_weight=6, min_child_samples=50, metric=auc, max_depth=10, max_bin=50, learning_rate=0.05, colsample_bytree=0.2, total=   8.5s
[CV] subsample_freq=2, subsample_for_bin=10, subsample=0.4, scale_pos_weight=1, reg_lambda=0

[CV]  subsample_freq=100, subsample_for_bin=10, subsample=0.4, scale_pos_weight=18.951239977624464, reg_lambda=0, reg_alpha=8, objective=binary, num_leaves=50, min_split_gain=1.1, min_child_weight=2, min_child_samples=50, metric=auc, max_depth=30, max_bin=200, learning_rate=0.05, colsample_bytree=0.7, total=  14.6s
[CV] subsample_freq=100, subsample_for_bin=10, subsample=0.4, scale_pos_weight=18.951239977624464, reg_lambda=0, reg_alpha=8, objective=binary, num_leaves=50, min_split_gain=1.1, min_child_weight=2, min_child_samples=50, metric=auc, max_depth=30, max_bin=200, learning_rate=0.05, colsample_bytree=0.7 
[CV]  subsample_freq=100, subsample_for_bin=10, subsample=0.4, scale_pos_weight=18.951239977624464, reg_lambda=0, reg_alpha=8, objective=binary, num_leaves=50, min_split_gain=1.1, min_child_weight=2, min_child_samples=50, metric=auc, max_depth=30, max_bin=200, learning_rate=0.05, colsample_bytree=0.7, total=   9.8s
[CV] subsample_freq=100, subsample_for_bin=10, subsample=0.4, sc

[CV]  subsample_freq=2, subsample_for_bin=100, subsample=0.4, scale_pos_weight=18.951239977624464, reg_lambda=0.2, reg_alpha=2, objective=binary, num_leaves=50, min_split_gain=2.0, min_child_weight=3, min_child_samples=100, metric=auc, max_depth=30, max_bin=50, learning_rate=0.005, colsample_bytree=0.3, total=  26.8s
[CV] subsample_freq=100, subsample_for_bin=200, subsample=0.1, scale_pos_weight=1, reg_lambda=0.2, reg_alpha=7, objective=binary, num_leaves=10000, min_split_gain=2.0, min_child_weight=3, min_child_samples=50, metric=auc, max_depth=30, max_bin=100, learning_rate=0.1, colsample_bytree=0.2 
[CV]  subsample_freq=100, subsample_for_bin=200, subsample=0.1, scale_pos_weight=1, reg_lambda=0.2, reg_alpha=7, objective=binary, num_leaves=10000, min_split_gain=2.0, min_child_weight=3, min_child_samples=50, metric=auc, max_depth=30, max_bin=100, learning_rate=0.1, colsample_bytree=0.2, total=  17.4s
[CV] subsample_freq=100, subsample_for_bin=200, subsample=0.1, scale_pos_weight=1, reg

[CV]  subsample_freq=30, subsample_for_bin=100, subsample=0.7, scale_pos_weight=0.05276699578395344, reg_lambda=0.2, reg_alpha=2, objective=binary, num_leaves=1000, min_split_gain=10.0, min_child_weight=6, min_child_samples=50, metric=auc, max_depth=40, max_bin=200, learning_rate=0.05, colsample_bytree=0.7, total=  15.3s
[CV] subsample_freq=30, subsample_for_bin=100, subsample=0.7, scale_pos_weight=0.05276699578395344, reg_lambda=0.2, reg_alpha=2, objective=binary, num_leaves=1000, min_split_gain=10.0, min_child_weight=6, min_child_samples=50, metric=auc, max_depth=40, max_bin=200, learning_rate=0.05, colsample_bytree=0.7 
[CV]  subsample_freq=30, subsample_for_bin=100, subsample=0.7, scale_pos_weight=0.05276699578395344, reg_lambda=0.2, reg_alpha=2, objective=binary, num_leaves=1000, min_split_gain=10.0, min_child_weight=6, min_child_samples=50, metric=auc, max_depth=40, max_bin=200, learning_rate=0.05, colsample_bytree=0.7, total=  15.2s
[CV] subsample_freq=30, subsample_for_bin=100,

[CV]  subsample_freq=100, subsample_for_bin=10, subsample=0.7, scale_pos_weight=18.951239977624464, reg_lambda=0.8, reg_alpha=7, objective=binary, num_leaves=1000, min_split_gain=2.0, min_child_weight=6, min_child_samples=50, metric=auc, max_depth=30, max_bin=50, learning_rate=0.1, colsample_bytree=0.3, total=  21.4s
[CV] subsample_freq=2, subsample_for_bin=100, subsample=0.1, scale_pos_weight=1, reg_lambda=0.2, reg_alpha=2, objective=binary, num_leaves=50, min_split_gain=2.0, min_child_weight=2, min_child_samples=30, metric=auc, max_depth=30, max_bin=50, learning_rate=0.005, colsample_bytree=0.2 
[CV]  subsample_freq=2, subsample_for_bin=100, subsample=0.1, scale_pos_weight=1, reg_lambda=0.2, reg_alpha=2, objective=binary, num_leaves=50, min_split_gain=2.0, min_child_weight=2, min_child_samples=30, metric=auc, max_depth=30, max_bin=50, learning_rate=0.005, colsample_bytree=0.2, total=  12.9s
[CV] subsample_freq=2, subsample_for_bin=100, subsample=0.1, scale_pos_weight=1, reg_lambda=0.

[CV]  subsample_freq=30, subsample_for_bin=10, subsample=0.7, scale_pos_weight=18.951239977624464, reg_lambda=0, reg_alpha=2, objective=binary, num_leaves=1000, min_split_gain=1.1, min_child_weight=6, min_child_samples=30, metric=auc, max_depth=20, max_bin=50, learning_rate=0.1, colsample_bytree=0.7, total=  28.5s
[CV] subsample_freq=30, subsample_for_bin=10, subsample=0.7, scale_pos_weight=18.951239977624464, reg_lambda=0, reg_alpha=2, objective=binary, num_leaves=1000, min_split_gain=1.1, min_child_weight=6, min_child_samples=30, metric=auc, max_depth=20, max_bin=50, learning_rate=0.1, colsample_bytree=0.7 
[CV]  subsample_freq=30, subsample_for_bin=10, subsample=0.7, scale_pos_weight=18.951239977624464, reg_lambda=0, reg_alpha=2, objective=binary, num_leaves=1000, min_split_gain=1.1, min_child_weight=6, min_child_samples=30, metric=auc, max_depth=20, max_bin=50, learning_rate=0.1, colsample_bytree=0.7, total=  18.0s
[CV] subsample_freq=30, subsample_for_bin=10, subsample=0.7, scale_

[CV]  subsample_freq=2, subsample_for_bin=200, subsample=0.1, scale_pos_weight=1, reg_lambda=0, reg_alpha=7, objective=binary, num_leaves=10000, min_split_gain=1.1, min_child_weight=6, min_child_samples=100, metric=auc, max_depth=20, max_bin=50, learning_rate=0.1, colsample_bytree=0.2, total=  16.8s
[CV] subsample_freq=100, subsample_for_bin=200, subsample=0.7, scale_pos_weight=1, reg_lambda=0, reg_alpha=8, objective=binary, num_leaves=10000, min_split_gain=1.1, min_child_weight=6, min_child_samples=100, metric=auc, max_depth=10, max_bin=200, learning_rate=0.1, colsample_bytree=0.2 
[CV]  subsample_freq=100, subsample_for_bin=200, subsample=0.7, scale_pos_weight=1, reg_lambda=0, reg_alpha=8, objective=binary, num_leaves=10000, min_split_gain=1.1, min_child_weight=6, min_child_samples=100, metric=auc, max_depth=10, max_bin=200, learning_rate=0.1, colsample_bytree=0.2, total=  43.1s
[CV] subsample_freq=100, subsample_for_bin=200, subsample=0.7, scale_pos_weight=1, reg_lambda=0, reg_alpha

[CV]  subsample_freq=100, subsample_for_bin=10, subsample=0.4, scale_pos_weight=18.951239977624464, reg_lambda=0.2, reg_alpha=7, objective=binary, num_leaves=10000, min_split_gain=1.1, min_child_weight=6, min_child_samples=100, metric=auc, max_depth=30, max_bin=100, learning_rate=0.005, colsample_bytree=0.2, total=  13.2s
[CV] subsample_freq=100, subsample_for_bin=10, subsample=0.4, scale_pos_weight=18.951239977624464, reg_lambda=0.2, reg_alpha=7, objective=binary, num_leaves=10000, min_split_gain=1.1, min_child_weight=6, min_child_samples=100, metric=auc, max_depth=30, max_bin=100, learning_rate=0.005, colsample_bytree=0.2 
[CV]  subsample_freq=100, subsample_for_bin=10, subsample=0.4, scale_pos_weight=18.951239977624464, reg_lambda=0.2, reg_alpha=7, objective=binary, num_leaves=10000, min_split_gain=1.1, min_child_weight=6, min_child_samples=100, metric=auc, max_depth=30, max_bin=100, learning_rate=0.005, colsample_bytree=0.2, total=   8.7s
[CV] subsample_freq=100, subsample_for_bin=

[CV]  subsample_freq=2, subsample_for_bin=200, subsample=0.1, scale_pos_weight=18.951239977624464, reg_lambda=0.2, reg_alpha=5, objective=binary, num_leaves=1000, min_split_gain=10.0, min_child_weight=2, min_child_samples=30, metric=auc, max_depth=30, max_bin=50, learning_rate=0.05, colsample_bytree=0.3, total=  18.2s
[CV] subsample_freq=100, subsample_for_bin=100, subsample=0.7, scale_pos_weight=0.05276699578395344, reg_lambda=0.2, reg_alpha=5, objective=binary, num_leaves=10000, min_split_gain=2.0, min_child_weight=2, min_child_samples=100, metric=auc, max_depth=30, max_bin=100, learning_rate=0.005, colsample_bytree=0.7 
[CV]  subsample_freq=100, subsample_for_bin=100, subsample=0.7, scale_pos_weight=0.05276699578395344, reg_lambda=0.2, reg_alpha=5, objective=binary, num_leaves=10000, min_split_gain=2.0, min_child_weight=2, min_child_samples=100, metric=auc, max_depth=30, max_bin=100, learning_rate=0.005, colsample_bytree=0.7, total=  26.9s
[CV] subsample_freq=100, subsample_for_bin=

[CV]  subsample_freq=100, subsample_for_bin=200, subsample=0.4, scale_pos_weight=0.05276699578395344, reg_lambda=0, reg_alpha=8, objective=binary, num_leaves=10000, min_split_gain=1.1, min_child_weight=6, min_child_samples=30, metric=auc, max_depth=30, max_bin=50, learning_rate=0.1, colsample_bytree=0.2, total=  18.1s
[CV] subsample_freq=100, subsample_for_bin=200, subsample=0.4, scale_pos_weight=0.05276699578395344, reg_lambda=0, reg_alpha=8, objective=binary, num_leaves=10000, min_split_gain=1.1, min_child_weight=6, min_child_samples=30, metric=auc, max_depth=30, max_bin=50, learning_rate=0.1, colsample_bytree=0.2 
[CV]  subsample_freq=100, subsample_for_bin=200, subsample=0.4, scale_pos_weight=0.05276699578395344, reg_lambda=0, reg_alpha=8, objective=binary, num_leaves=10000, min_split_gain=1.1, min_child_weight=6, min_child_samples=30, metric=auc, max_depth=30, max_bin=50, learning_rate=0.1, colsample_bytree=0.2, total=  17.1s
[CV] subsample_freq=100, subsample_for_bin=200, subsamp

[CV]  subsample_freq=2, subsample_for_bin=200, subsample=0.1, scale_pos_weight=1, reg_lambda=0, reg_alpha=8, objective=binary, num_leaves=10000, min_split_gain=10.0, min_child_weight=2, min_child_samples=50, metric=auc, max_depth=20, max_bin=100, learning_rate=0.05, colsample_bytree=0.7, total=  14.7s
[CV] subsample_freq=30, subsample_for_bin=200, subsample=0.1, scale_pos_weight=0.05276699578395344, reg_lambda=0.2, reg_alpha=8, objective=binary, num_leaves=10000, min_split_gain=2.0, min_child_weight=6, min_child_samples=50, metric=auc, max_depth=40, max_bin=200, learning_rate=0.05, colsample_bytree=0.3 
[CV]  subsample_freq=30, subsample_for_bin=200, subsample=0.1, scale_pos_weight=0.05276699578395344, reg_lambda=0.2, reg_alpha=8, objective=binary, num_leaves=10000, min_split_gain=2.0, min_child_weight=6, min_child_samples=50, metric=auc, max_depth=40, max_bin=200, learning_rate=0.05, colsample_bytree=0.3, total=  16.2s
[CV] subsample_freq=30, subsample_for_bin=200, subsample=0.1, scal

[CV]  subsample_freq=2, subsample_for_bin=10, subsample=0.7, scale_pos_weight=0.05276699578395344, reg_lambda=0, reg_alpha=5, objective=binary, num_leaves=1000, min_split_gain=1.1, min_child_weight=6, min_child_samples=50, metric=auc, max_depth=20, max_bin=100, learning_rate=0.1, colsample_bytree=0.2, total=   8.2s
[CV] subsample_freq=2, subsample_for_bin=10, subsample=0.7, scale_pos_weight=0.05276699578395344, reg_lambda=0, reg_alpha=5, objective=binary, num_leaves=1000, min_split_gain=1.1, min_child_weight=6, min_child_samples=50, metric=auc, max_depth=20, max_bin=100, learning_rate=0.1, colsample_bytree=0.2 
[CV]  subsample_freq=2, subsample_for_bin=10, subsample=0.7, scale_pos_weight=0.05276699578395344, reg_lambda=0, reg_alpha=5, objective=binary, num_leaves=1000, min_split_gain=1.1, min_child_weight=6, min_child_samples=50, metric=auc, max_depth=20, max_bin=100, learning_rate=0.1, colsample_bytree=0.2, total=   9.1s
[CV] subsample_freq=2, subsample_for_bin=10, subsample=0.7, scal

[CV]  subsample_freq=2, subsample_for_bin=100, subsample=0.7, scale_pos_weight=1, reg_lambda=0.8, reg_alpha=5, objective=binary, num_leaves=50, min_split_gain=2.0, min_child_weight=6, min_child_samples=30, metric=auc, max_depth=20, max_bin=100, learning_rate=0.05, colsample_bytree=0.3, total=  32.3s
[CV] subsample_freq=2, subsample_for_bin=10, subsample=0.1, scale_pos_weight=1, reg_lambda=0.8, reg_alpha=7, objective=binary, num_leaves=10000, min_split_gain=2.0, min_child_weight=2, min_child_samples=30, metric=auc, max_depth=30, max_bin=50, learning_rate=0.05, colsample_bytree=0.2 
[CV]  subsample_freq=2, subsample_for_bin=10, subsample=0.1, scale_pos_weight=1, reg_lambda=0.8, reg_alpha=7, objective=binary, num_leaves=10000, min_split_gain=2.0, min_child_weight=2, min_child_samples=30, metric=auc, max_depth=30, max_bin=50, learning_rate=0.05, colsample_bytree=0.2, total=   7.2s
[CV] subsample_freq=2, subsample_for_bin=10, subsample=0.1, scale_pos_weight=1, reg_lambda=0.8, reg_alpha=7, o

[CV]  subsample_freq=2, subsample_for_bin=10, subsample=0.4, scale_pos_weight=18.951239977624464, reg_lambda=0.2, reg_alpha=8, objective=binary, num_leaves=50, min_split_gain=2.0, min_child_weight=3, min_child_samples=50, metric=auc, max_depth=40, max_bin=100, learning_rate=0.1, colsample_bytree=0.3, total=  12.6s
[CV] subsample_freq=2, subsample_for_bin=10, subsample=0.4, scale_pos_weight=18.951239977624464, reg_lambda=0.2, reg_alpha=8, objective=binary, num_leaves=50, min_split_gain=2.0, min_child_weight=3, min_child_samples=50, metric=auc, max_depth=40, max_bin=100, learning_rate=0.1, colsample_bytree=0.3 
[CV]  subsample_freq=2, subsample_for_bin=10, subsample=0.4, scale_pos_weight=18.951239977624464, reg_lambda=0.2, reg_alpha=8, objective=binary, num_leaves=50, min_split_gain=2.0, min_child_weight=3, min_child_samples=50, metric=auc, max_depth=40, max_bin=100, learning_rate=0.1, colsample_bytree=0.3, total=   9.1s
[CV] subsample_freq=2, subsample_for_bin=10, subsample=0.4, scale_p

[CV]  subsample_freq=30, subsample_for_bin=100, subsample=0.1, scale_pos_weight=1, reg_lambda=0.2, reg_alpha=2, objective=binary, num_leaves=50, min_split_gain=2.0, min_child_weight=6, min_child_samples=30, metric=auc, max_depth=10, max_bin=200, learning_rate=0.1, colsample_bytree=0.2, total=  11.0s
[CV] subsample_freq=100, subsample_for_bin=100, subsample=0.4, scale_pos_weight=18.951239977624464, reg_lambda=0.2, reg_alpha=7, objective=binary, num_leaves=10000, min_split_gain=1.1, min_child_weight=2, min_child_samples=100, metric=auc, max_depth=20, max_bin=200, learning_rate=0.05, colsample_bytree=0.2 
[CV]  subsample_freq=100, subsample_for_bin=100, subsample=0.4, scale_pos_weight=18.951239977624464, reg_lambda=0.2, reg_alpha=7, objective=binary, num_leaves=10000, min_split_gain=1.1, min_child_weight=2, min_child_samples=100, metric=auc, max_depth=20, max_bin=200, learning_rate=0.05, colsample_bytree=0.2, total=  39.6s
[CV] subsample_freq=100, subsample_for_bin=100, subsample=0.4, sca

[CV]  subsample_freq=2, subsample_for_bin=200, subsample=0.4, scale_pos_weight=18.951239977624464, reg_lambda=0, reg_alpha=2, objective=binary, num_leaves=50, min_split_gain=1.1, min_child_weight=6, min_child_samples=30, metric=auc, max_depth=40, max_bin=200, learning_rate=0.1, colsample_bytree=0.2, total=  36.7s
[CV] subsample_freq=2, subsample_for_bin=200, subsample=0.4, scale_pos_weight=18.951239977624464, reg_lambda=0, reg_alpha=2, objective=binary, num_leaves=50, min_split_gain=1.1, min_child_weight=6, min_child_samples=30, metric=auc, max_depth=40, max_bin=200, learning_rate=0.1, colsample_bytree=0.2 
[CV]  subsample_freq=2, subsample_for_bin=200, subsample=0.4, scale_pos_weight=18.951239977624464, reg_lambda=0, reg_alpha=2, objective=binary, num_leaves=50, min_split_gain=1.1, min_child_weight=6, min_child_samples=30, metric=auc, max_depth=40, max_bin=200, learning_rate=0.1, colsample_bytree=0.2, total=  29.8s
[CV] subsample_freq=2, subsample_for_bin=200, subsample=0.4, scale_pos

[CV]  subsample_freq=30, subsample_for_bin=200, subsample=0.1, scale_pos_weight=18.951239977624464, reg_lambda=0.8, reg_alpha=7, objective=binary, num_leaves=50, min_split_gain=10.0, min_child_weight=6, min_child_samples=30, metric=auc, max_depth=10, max_bin=100, learning_rate=0.05, colsample_bytree=0.2, total=  12.1s
[CV] subsample_freq=30, subsample_for_bin=200, subsample=0.1, scale_pos_weight=18.951239977624464, reg_lambda=0.8, reg_alpha=7, objective=binary, num_leaves=50, min_split_gain=10.0, min_child_weight=6, min_child_samples=30, metric=auc, max_depth=10, max_bin=100, learning_rate=0.05, colsample_bytree=0.2 
[CV]  subsample_freq=30, subsample_for_bin=200, subsample=0.1, scale_pos_weight=18.951239977624464, reg_lambda=0.8, reg_alpha=7, objective=binary, num_leaves=50, min_split_gain=10.0, min_child_weight=6, min_child_samples=30, metric=auc, max_depth=10, max_bin=100, learning_rate=0.05, colsample_bytree=0.2, total=  12.0s
[CV] subsample_freq=2, subsample_for_bin=10, subsample=

[CV]  subsample_freq=30, subsample_for_bin=100, subsample=0.1, scale_pos_weight=0.05276699578395344, reg_lambda=0, reg_alpha=2, objective=binary, num_leaves=1000, min_split_gain=1.1, min_child_weight=2, min_child_samples=100, metric=auc, max_depth=40, max_bin=100, learning_rate=0.05, colsample_bytree=0.2, total=   8.9s
[CV] subsample_freq=30, subsample_for_bin=100, subsample=0.1, scale_pos_weight=0.05276699578395344, reg_lambda=0, reg_alpha=2, objective=binary, num_leaves=1000, min_split_gain=1.1, min_child_weight=2, min_child_samples=100, metric=auc, max_depth=40, max_bin=100, learning_rate=0.05, colsample_bytree=0.2 
[CV]  subsample_freq=30, subsample_for_bin=100, subsample=0.1, scale_pos_weight=0.05276699578395344, reg_lambda=0, reg_alpha=2, objective=binary, num_leaves=1000, min_split_gain=1.1, min_child_weight=2, min_child_samples=100, metric=auc, max_depth=40, max_bin=100, learning_rate=0.05, colsample_bytree=0.2, total=   7.9s
[CV] subsample_freq=30, subsample_for_bin=100, subsa

[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed: 146.2min finished


CPU times: user 15h 59min 39s, sys: 1min 22s, total: 16h 1min 2s
Wall time: 2h 27min 3s


RandomizedSearchCV(cv=4, error_score='raise',
          estimator=LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        learning_rate=0.1, max_depth=-1, min_child_samples=20,
        min_child_weight=0.001, min_split_gain=0.0, n_estimators=100,
        n_jobs=-1, nthread=8, num_leaves=31, objective=None,
        random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=0,
        verbose_eval=32),
          fit_params=None, iid=True, n_iter=100, n_jobs=1,
          param_distributions={'num_leaves': [50, 1000, 10000, 10000], 'max_depth': [10, 20, 30, 40], 'min_child_samples': [30, 50, 100], 'max_bin': [50, 100, 200], 'subsample': [0.1, 0.4, 0.7], 'subsample_freq': [2, 30, 100], 'colsample_bytree': [0.2, 0.3, 0.7], 'min_child_weight': [2, 3, 6], 'subsample_for_..., 0.005], 'objective': ['binary'], 'scale_pos_weight': [1, 18.951239977624464, 0.05276699578395344]},
          pre_dispatch='

In [45]:
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

In [46]:
print("RandomizedSearchCV")
report(clf.cv_results_)

RandomizedSearchCV
Model with rank: 1
Mean validation score: 0.653 (std: 0.006)
Parameters: {'subsample_freq': 2, 'subsample_for_bin': 100, 'subsample': 0.7, 'scale_pos_weight': 1, 'reg_lambda': 0.2, 'reg_alpha': 7, 'objective': 'binary', 'num_leaves': 50, 'min_split_gain': 2.0, 'min_child_weight': 3, 'min_child_samples': 100, 'metric': 'auc', 'max_depth': 20, 'max_bin': 100, 'learning_rate': 0.1, 'colsample_bytree': 0.7}

Model with rank: 2
Mean validation score: 0.653 (std: 0.006)
Parameters: {'subsample_freq': 2, 'subsample_for_bin': 100, 'subsample': 0.7, 'scale_pos_weight': 1, 'reg_lambda': 0.8, 'reg_alpha': 5, 'objective': 'binary', 'num_leaves': 50, 'min_split_gain': 2.0, 'min_child_weight': 6, 'min_child_samples': 30, 'metric': 'auc', 'max_depth': 20, 'max_bin': 100, 'learning_rate': 0.05, 'colsample_bytree': 0.3}

Model with rank: 3
Mean validation score: 0.650 (std: 0.007)
Parameters: {'subsample_freq': 100, 'subsample_for_bin': 200, 'subsample': 0.7, 'scale_pos_weight': 1, '

In [47]:
params = clf.best_params_
# params = {'subsample_freq': 2, 'subsample_for_bin': 100, 'subsample': 0.7, 'scale_pos_weight': 1, 'reg_lambda': 0.2, 'reg_alpha': 7, 'objective': 'binary', 'num_leaves': 50, 'min_split_gain': 2.0, 'min_child_weight': 3, 'min_child_samples': 100, 'metric': 'auc', 'max_depth': 20, 'max_bin': 100, 'learning_rate': 0.1, 'colsample_bytree': 0.7}

In [50]:
evals_results = {}
num_boost_round=3000
early_stopping_rounds=200
feval=None

model = lgbm.train(params, 
                     d_train, 
                     valid_sets=[d_train, d_valid], 
                     valid_names=['train','valid'], 
                     evals_result=evals_results, 
                     num_boost_round=num_boost_round,
                     early_stopping_rounds=early_stopping_rounds,
                     verbose_eval=10, 
                     feval=feval)

Training until validation scores don't improve for 200 rounds.
[10]	train's auc: 0.650859	valid's auc: 0.6216
[20]	train's auc: 0.66754	valid's auc: 0.629954
[30]	train's auc: 0.680867	valid's auc: 0.637238
[40]	train's auc: 0.690948	valid's auc: 0.640846
[50]	train's auc: 0.698107	valid's auc: 0.643292
[60]	train's auc: 0.704833	valid's auc: 0.644888
[70]	train's auc: 0.709977	valid's auc: 0.645596
[80]	train's auc: 0.71439	valid's auc: 0.646295
[90]	train's auc: 0.71801	valid's auc: 0.64717
[100]	train's auc: 0.720487	valid's auc: 0.647642
[110]	train's auc: 0.723143	valid's auc: 0.647897
[120]	train's auc: 0.724961	valid's auc: 0.648275
[130]	train's auc: 0.726812	valid's auc: 0.648083
[140]	train's auc: 0.72847	valid's auc: 0.648556
[150]	train's auc: 0.730257	valid's auc: 0.648846
[160]	train's auc: 0.732237	valid's auc: 0.648759
[170]	train's auc: 0.733286	valid's auc: 0.649013
[180]	train's auc: 0.734456	valid's auc: 0.64854
[190]	train's auc: 0.735655	valid's auc: 0.64807
[200]

In [51]:
n_estimators = model.best_iteration
print("\nModel Report")
print("n_estimators : ", n_estimators)
print("AUC"+":", evals_results['valid']['auc'][n_estimators-1])


Model Report
n_estimators :  169
AUC: 0.6490449913737819


In [52]:
from sklearn.metrics import roc_auc_score

predicted = model.predict(X_validation)
print("ROC AUC score:",roc_auc_score(Y_validation, predicted))

ROC AUC score: 0.6490449913737819


In [53]:
Save(model,"lgbm_model.pkl")

# Test Data

In [54]:
test_data = scipy.sparse.load_npz("test_sparse_matrix_after_scale.npz")

In [55]:
Y_test = model.predict(test_data, num_iteration=model.best_iteration)
print(Y_test.max())
print(Y_test.mean())

## Saving test predictions

In [58]:
predictions = pd.DataFrame(Y_test)
predictions.to_csv("solution_lgbm.csv",header=None, index=None)