In [1]:
from __future__ import print_function

### Caching

This function runs the benchmark for a given set of parameters. Its results are cached on disk.

In [2]:
from datetime import datetime, timedelta
from timeit import default_timer

import joblib

import numpy as np
import xgboost as xgb

# On-disk cache for the benchmark results. The directory is passed positionally
# because the first parameter is named `cachedir` in old joblib releases and
# `location` in current ones (where the `cachedir` keyword no longer exists).
memory = joblib.Memory('../cache', verbose=10)

def _binary_accuracy(booster, dmatrix, labels):
    """Return the fraction of samples whose thresholded prediction equals its label.

    The output of ``booster.predict(dmatrix)`` is binarised in place
    (values >= 0.5 become 1, values < 0.5 become 0) and compared
    element-wise with ``labels``.
    """
    pred = booster.predict(dmatrix)
    pred[pred >= 0.5] = 1
    pred[pred < 0.5] = 0
    return np.mean(pred == labels)


@memory.cache
def bench_xgb(X, y, T, valid, **params):
    """Execute the gradient boosting pipeline and report scores and timings.

    The call is wrapped by ``memory.cache``, so joblib stores the returned
    dictionary on disk and reuses it for identical arguments.

    Parameters
    ----------
    X, y : training samples and their labels.
    T, valid : testing samples and their labels.
    **params : keyword arguments forwarded to ``xgb.train`` as its parameter
        dictionary. ``'n_estimators'`` is required; it is removed from the
        dictionary and passed to ``xgb.train`` as the number of boosting
        rounds.

    Returns
    -------
    dict with the keys
        ``'score_training'`` / ``'score_testing'`` : accuracy after
            thresholding the predictions at 0.5,
        ``'time_data'`` : ``timedelta`` spent building the training DMatrix,
        ``'time_fit'`` : ``timedelta`` spent inside ``xgb.train``.

    Raises
    ------
    KeyError
        If ``'n_estimators'`` is missing from ``params``.
    """

    # Create the training data matrix; only this construction is timed as
    # 'time_data'. default_timer is the benchmark clock recommended by the
    # standard library, unlike datetime.now() which follows the wall clock.
    start_data_t = default_timer()
    xgb_training = xgb.DMatrix(
        X,
        label=y,
        missing=None,
        weight=None,
        silent=False,
        feature_names=None,
        feature_types=None)
    end_data_t = timedelta(seconds=default_timer() - start_data_t)

    # The testing matrix is built outside of any timed section.
    xgb_testing = xgb.DMatrix(
        T,
        label=valid,
        missing=None,
        weight=None,
        silent=False,
        feature_names=None,
        feature_types=None)

    # 'n_estimators' is not part of the parameter dictionary given to
    # xgb.train; it is handed over separately as the third argument.
    n_est = params.pop('n_estimators')
    start_fit_t = default_timer()
    bst = xgb.train(params, xgb_training, n_est)
    end_fit_t = timedelta(seconds=default_timer() - start_fit_t)

    score_training = _binary_accuracy(bst, xgb_training, y)
    score_testing = _binary_accuracy(bst, xgb_testing, valid)

    return {'score_training': score_training,
            'score_testing': score_testing,
            'time_data': end_data_t,
            'time_fit': end_fit_t}

## XGBoost on Higgs dataset

### Exact method

Load the configuration file with the parameters to use the exact method

In [3]:
import yaml
configuration_path = "../params_benchmark/parameters_higgs.conf"
config_name = 'xgboost-exact'
with open(configuration_path, 'r') as stream:
    # safe_load only builds plain Python objects. A bare yaml.load() without an
    # explicit Loader is deprecated since PyYAML 5.1 and fails on PyYAML 6.
    params = yaml.safe_load(stream)[config_name]

# Wrap scalar values in a list so that every key maps to a list of candidates,
# which is the shape the parameter grid built from `params` works with.
params = {key: (value if isinstance(value, list) else [value])
          for key, value in params.items()}

print(params)

{'alpha': [0.0], 'booster': ['gbtree'], 'colsample_bylevel': [1.0], 'colsample_bytree': [1.0], 'eta': [0.1], 'gamma': [1e-07], 'lambda': [0.0], 'max_depth': [3, 5, 8], 'min_child_weight': [1], 'n_estimators': [1], 'nthread': [1], 'objective': ['binary:logistic'], 'seed': [42], 'subsample': [1.0], 'tree_method': ['exact'], 'cache_opt': [False]}


Create a parameter grid to try different depths

In [4]:
from sklearn.model_selection import ParameterGrid
# Materialise every combination of the candidate values in `params` up front,
# so the grid can be iterated (and inspected) as a plain list.
params_grid = list(ParameterGrid(params))

Open the dataset with a given number of samples

In [5]:
import sys
sys.path.insert(0, '../datasets')
from misc import load_higgs

# Requested number of samples; written in scientific notation (a float) and
# converted to an integer where it is handed to the loader.
N_SAMPLES = 1e7
data = load_higgs(n_samples=int(N_SAMPLES), random_state=42)

[Memory]    0.0s, 0.0min: Loading load_higgs from /home/glemaitre/scikit_learn_data/higgs_benchmark_data/joblib/misc/load_higgs/3034b65fbc56ad5acf012d3c20d7f04a
__________________________________________load_higgs cache loaded - 0.0s, 0.0min


In [6]:
# Run the benchmark once per parameter combination and collect one record
# (scores, timings, sample count and the parameters used) per run.
results = []
for p in params_grid:
    bench_results = bench_xgb(*data, **p)
    # Number of rows of the first array in `data`, i.e. the array passed as X.
    bench_results['n_samples'] = data[0].shape[0]
    # Store the parameters next to the measurements they produced.
    bench_results.update(p)
    results.append(bench_results)

________________________________________________________________________________
[Memory] Calling __main__--home-glemaitre-Documents-work-code-gbrt-benchmarks-benchmark-__ipython-input__.bench_xgb...
bench_xgb(memmap([[ 1.757253, ...,  1.467264],
       ..., 
       [ 1.880784, ...,  0.950771]], dtype=float32), 
memmap([1, ..., 0]), memmap([[ 2.089598, ...,  1.037894],
       ..., 
       [ 0.464477, ...,  0.51742 ]], dtype=float32), 
memmap([1, ..., 1]), alpha=0.0, booster='gbtree', cache_opt=False, colsample_bylevel=1.0, colsample_bytree=1.0, eta=0.1, gamma=1e-07, lambda=0.0, max_depth=3, min_child_weight=1, n_estimators=1, nthread=1, objective='binary:logistic', seed=42, subsample=1.0, tree_method='exact')
_______________________________________________________bench_xgb - 71.2s, 1.2min
________________________________________________________________________________
[Memory] Calling __main__--home-glemaitre-Documents-work-code-gbrt-benchmarks-benchmark-__ipython-input__.bench_xgb...


We can drop the results into a dataframe

In [7]:
import pandas as pd
# Build one row per entry of `results` and persist the table as a pickle file.
df = pd.DataFrame(results)
df.to_pickle('../results/xgboost_exact.pkl')

### Fast histogram method

Load the configuration file with the parameters to use the fast histogram method

In [8]:
import yaml
configuration_path = "../params_benchmark/parameters_higgs.conf"
config_name = 'xgboost-fast-hist'
with open(configuration_path, 'r') as stream:
    # safe_load only builds plain Python objects. A bare yaml.load() without an
    # explicit Loader is deprecated since PyYAML 5.1 and fails on PyYAML 6.
    params = yaml.safe_load(stream)[config_name]

# Wrap scalar values in a list so that every key maps to a list of candidates,
# which is the shape the parameter grid built from `params` works with.
params = {key: (value if isinstance(value, list) else [value])
          for key, value in params.items()}

print(params)

{'alpha': [0.0], 'booster': ['gbtree'], 'colsample_bylevel': [1.0], 'colsample_bytree': [1.0], 'eta': [0.1], 'gamma': [1e-07], 'lambda': [0.0], 'max_depth': [3, 5, 8], 'min_child_weight': [1], 'n_estimators': [1], 'nthread': [1], 'objective': ['binary:logistic'], 'seed': [42], 'sketch_eps': [0.003952569169960474], 'subsample': [1.0], 'tree_method': ['hist'], 'cache_opt': [False]}


Create a parameter grid to try different depths

In [9]:
from sklearn.model_selection import ParameterGrid
# Materialise every combination of the candidate values in `params` up front,
# so the grid can be iterated (and inspected) as a plain list.
params_grid = list(ParameterGrid(params))

Open the dataset with a given number of samples

In [10]:
import sys
sys.path.insert(0, '../datasets')
from misc import load_higgs

# Requested number of samples; written in scientific notation (a float) and
# converted to an integer where it is handed to the loader.
N_SAMPLES = 1e7
data = load_higgs(n_samples=int(N_SAMPLES), random_state=42)

[Memory]  301.6s, 5.0min: Loading load_higgs from /home/glemaitre/scikit_learn_data/higgs_benchmark_data/joblib/misc/load_higgs/3034b65fbc56ad5acf012d3c20d7f04a
__________________________________________load_higgs cache loaded - 0.0s, 0.0min


In [11]:
# Run the benchmark once per parameter combination and collect one record
# (scores, timings, sample count and the parameters used) per run.
results = []
for p in params_grid:
    bench_results = bench_xgb(*data, **p)
    # Number of rows of the first array in `data`, i.e. the array passed as X.
    bench_results['n_samples'] = data[0].shape[0]
    # Store the parameters next to the measurements they produced.
    bench_results.update(p)
    results.append(bench_results)

________________________________________________________________________________
[Memory] Calling __main__--home-glemaitre-Documents-work-code-gbrt-benchmarks-benchmark-__ipython-input__.bench_xgb...
bench_xgb(memmap([[ 1.757253, ...,  1.467264],
       ..., 
       [ 1.880784, ...,  0.950771]], dtype=float32), 
memmap([1, ..., 0]), memmap([[ 2.089598, ...,  1.037894],
       ..., 
       [ 0.464477, ...,  0.51742 ]], dtype=float32), 
memmap([1, ..., 1]), alpha=0.0, booster='gbtree', cache_opt=False, colsample_bylevel=1.0, colsample_bytree=1.0, eta=0.1, gamma=1e-07, lambda=0.0, max_depth=3, min_child_weight=1, n_estimators=1, nthread=1, objective='binary:logistic', seed=42, sketch_eps=0.003952569169960474, subsample=1.0, tree_method='hist')
_______________________________________________________bench_xgb - 57.9s, 1.0min
________________________________________________________________________________
[Memory] Calling __main__--home-glemaitre-Documents-work-code-gbrt-benchmarks-benchmark

We can drop the results into a dataframe

In [12]:
import pandas as pd
# Build one row per entry of `results` and persist the table as a pickle file.
df = pd.DataFrame(results)
df.to_pickle('../results/xgboost_fast_hist.pkl')