In [1]:
from __future__ import print_function

### Caching

This function runs the benchmark for a given set of parameters. Its results are cached on disk.

In [2]:
from datetime import datetime

import joblib

import numpy as np
import lightgbm as lgb

# On-disk cache (under ../cache) used to memoize the benchmark function below.
# NOTE(review): newer joblib releases name this argument `location` instead of
# `cachedir` -- confirm against the installed joblib version.
memory = joblib.Memory(cachedir='../cache', verbose=10)

@memory.cache
def bench_lgbm(X, y, T, valid, **params):
    """Train a LightGBM booster and report its accuracy and timings.

    Parameters
    ----------
    X, y
        Training samples and their labels.
    T, valid
        Held-out samples and their labels.
    **params
        LightGBM training parameters. ``max_bin``, ``n_estimators`` and
        ``max_depth`` are required: ``max_bin`` is given to the Dataset,
        ``n_estimators`` becomes the number of boosting rounds, and
        ``max_depth`` is converted into ``num_leaves`` before being reset
        to ``-1``.

    Returns
    -------
    dict
        ``score_training`` / ``score_testing``: fraction of predictions
        (thresholded at 0.5 into 0/1) equal to the labels.
        ``time_data`` / ``time_fit``: ``datetime.timedelta`` spent creating
        the Dataset object and calling ``lgb.train`` respectively.
    """

    def _accuracy(booster, samples, labels):
        # Turn the raw predictions into hard 0/1 values (cut at 0.5) and
        # return the fraction that agrees with ``labels``.
        scores = booster.predict(samples)
        scores[scores >= 0.5] = 1
        scores[scores < 0.5] = 0
        return np.mean(scores == labels)

    # ``max_bin`` is handed to the Dataset, so take it out of the booster
    # parameters first.
    max_bin = params.pop('max_bin')

    # Time the creation of the training Dataset object.
    data_tic = datetime.now()
    train_set = lgb.Dataset(X, label=y, max_bin=max_bin)
    data_elapsed = datetime.now() - data_tic

    # The number of trees is passed to ``lgb.train`` as ``num_boost_round``.
    n_rounds = params.pop('n_estimators')

    # Express the requested depth as a number of leaves, then remove the
    # explicit depth limit.
    depth = params['max_depth']
    params['num_leaves'] = np.power(2, depth - 1)
    params['max_depth'] = -1

    # Time the fit itself.
    fit_tic = datetime.now()
    booster = lgb.train(params, train_set, num_boost_round=n_rounds)
    fit_elapsed = datetime.now() - fit_tic

    return {'score_training': _accuracy(booster, X, y),
            'score_testing': _accuracy(booster, T, valid),
            'time_data': data_elapsed,
            'time_fit': fit_elapsed}

## LightGBM on the Higgs dataset

Load the configuration file with the parameters to use the fast histogram method

In [3]:
import yaml

# Location of the benchmark configuration and the section to read from it.
configuration_path = "../params_benchmark/parameters_higgs.conf"
config_name = 'lightgbm'
with open(configuration_path, 'r') as stream:
    # Use safe_load rather than yaml.load: it only builds plain Python
    # objects from the file and does not need an explicit Loader argument
    # (calling yaml.load without one is deprecated and rejected by recent
    # PyYAML releases).
    params = yaml.safe_load(stream)[config_name]

# Wrap every scalar value in a one-element list so that each parameter maps
# to a list of candidate values.
params = {key: (value if isinstance(value, list) else [value])
          for key, value in params.items()}

print(params)

{'application': ['binary'], 'bagging_fraction': [1.0], 'bagging_freq': [0], 'bagging_seed': [42], 'boosting': ['gbdt'], 'data_random_seed': [42], 'feature_fraction': [1.0], 'feature_fraction_seed': [42], 'is_sparse': [False], 'learning_rate': [0.1], 'max_bin': [255], 'max_depth': [3, 5, 8], 'metric': ['binary_logloss'], 'min_data_in_leaf': [1], 'min_gain_tol_split': [1e-07], 'n_estimators': [1], 'num_threads': [1], 'tree_learner': ['serial'], 'verbosity': [1]}


Create a parameter grid to try different depths

In [4]:
from sklearn.model_selection import ParameterGrid
# Materialize the grid built from `params` (a dict mapping each parameter to
# a list of candidate values) as a list, so it can be iterated over below.
params_grid = list(ParameterGrid(params))

Open the dataset with a given number of samples

In [5]:
import sys
# Put ../datasets first on the import path so that `misc` can be imported.
sys.path.insert(0, '../datasets')
from misc import load_higgs

# Number of samples requested from the loader; written as a float literal,
# hence the int() conversion in the call below.
N_SAMPLES = 1e7
# `data` is later unpacked positionally into bench_lgbm(X, y, T, valid).
# NOTE(review): assumes load_higgs returns exactly those four items in that
# order -- confirm against ../datasets/misc.py.
data = load_higgs(random_state=42, n_samples=int(N_SAMPLES))

[Memory]    0.0s, 0.0min: Loading load_higgs from /home/glemaitre/scikit_learn_data/higgs_benchmark_data/joblib/misc/load_higgs/3034b65fbc56ad5acf012d3c20d7f04a
__________________________________________load_higgs cache loaded - 0.0s, 0.0min


In [6]:
# Run the (cached) benchmark once per parameter combination and keep one
# flat record per run: scores and timings, the training-set size, and the
# parameters that produced them.
results = []
for grid_point in params_grid:
    record = bench_lgbm(*data, **grid_point)
    record['n_samples'] = data[0].shape[0]
    record.update(grid_point)
    results.append(record)

________________________________________________________________________________
[Memory] Calling __main__--home-glemaitre-Documents-work-code-gbrt-benchmarks-benchmark-__ipython-input__.bench_lgbm...
bench_lgbm(memmap([[ 1.757253, ...,  1.467264],
       ..., 
       [ 1.880784, ...,  0.950771]], dtype=float32), 
memmap([1, ..., 0]), memmap([[ 2.089598, ...,  1.037894],
       ..., 
       [ 0.464477, ...,  0.51742 ]], dtype=float32), 
memmap([1, ..., 1]), application='binary', bagging_fraction=1.0, bagging_freq=0, bagging_seed=42, boosting='gbdt', data_random_seed=42, feature_fraction=1.0, feature_fraction_seed=42, is_sparse=False, learning_rate=0.1, max_bin=255, max_depth=3, metric='binary_logloss', min_data_in_leaf=1, min_gain_tol_split=1e-07, n_estimators=1, num_threads=1, tree_learner='serial', verbosity=1)
______________________________________________________bench_lgbm - 39.5s, 0.7min
________________________________________________________________________________
[Memory] Call

We can collect the results into a DataFrame and save it to disk

In [7]:
import pandas as pd
# Build a table from `results`, the list of per-run dicts collected above.
df = pd.DataFrame(results)
# Save the table as a pickle file.
# NOTE(review): assumes the ../results directory already exists -- confirm.
df.to_pickle('../results/lightgbm.pkl')