In [1]:
from __future__ import print_function

### Caching

This function runs the benchmark for a given set of parameters. Its results are cached on disk.

In [12]:
from datetime import datetime

import joblib

import numpy as np
from sklearn.ensemble import GradientBoostingClassifier

# Disk-backed memoizer used to cache benchmark runs under ../cache.
# The directory is passed positionally on purpose: its keyword was renamed
# from `cachedir` to `location` in joblib 0.12 (and `cachedir` was dropped in
# later releases), while the first positional argument is the cache directory
# under both spellings.
memory = joblib.Memory('../cache', verbose=10)

@memory.cache
def bench_sklearn(X, y, T, valid, **params):
    """Fit a gradient boosting classifier and report accuracy and timings.

    Parameters
    ----------
    X, y
        Training samples and labels, passed to ``fit``.
    T, valid
        Held-out samples and the labels their predictions are compared to.
    **params
        Forwarded to ``GradientBoostingClassifier.set_params``.

    Returns
    -------
    dict
        ``score_training`` / ``score_testing``: mean of ``predict(...) == labels``
        on ``(X, y)`` and ``(T, valid)`` respectively;
        ``time_data``: timedelta spent creating and configuring the estimator;
        ``time_fit``: timedelta spent inside ``fit``.
    """
    # Estimator construction and configuration (reported as 'time_data').
    tic = datetime.now()
    model = GradientBoostingClassifier()
    model.set_params(**params)
    setup_elapsed = datetime.now() - tic

    # Training (reported as 'time_fit').
    tic = datetime.now()
    model.fit(X, y)
    fit_elapsed = datetime.now() - tic

    # Accuracy = fraction of predictions that match the reference labels.
    train_accuracy = np.mean(model.predict(X) == y)
    test_accuracy = np.mean(model.predict(T) == valid)

    return dict(score_training=train_accuracy,
                score_testing=test_accuracy,
                time_data=setup_elapsed,
                time_fit=fit_elapsed)

## Scikit-learn on Higgs dataset

### No presorting

In [13]:
import yaml
# The benchmark settings live in a YAML file; the entry stored under
# `config_name` is the parameter mapping used by this notebook.
configuration_path = "../params_benchmark/parameters_higgs.conf"
config_name = 'sklearn-master-nopresort'
with open(configuration_path, 'r') as stream:
    # safe_load only builds plain Python objects (no arbitrary object
    # construction) and, unlike a bare yaml.load(stream), does not rely on
    # the `Loader` argument that recent PyYAML releases require.
    params = yaml.safe_load(stream)[config_name]

# Wrap scalar values in a one-element list so every key maps to a list of
# candidate values, ready to be expanded as a parameter grid.
params = {key: (value if isinstance(value, list) else [value])
          for key, value in params.items()}

print(params)

{'n_estimators': [1], 'max_depth': [3, 5, 8], 'max_features': [1.0], 'presort': [False], 'random_state': [42]}


In [14]:
from sklearn.model_selection import ParameterGrid
# Materialise every combination of the per-parameter candidate lists.
params_grid = list(ParameterGrid(param_grid=params))

In [15]:
import sys
sys.path.insert(0, '../datasets')
from misc import load_higgs

# Requested number of samples; written in scientific notation (a float),
# hence the int() conversion at the call site.
N_SAMPLES = 1e7
# `data` is later unpacked positionally into bench_sklearn(X, y, T, valid).
data = load_higgs(n_samples=int(N_SAMPLES), random_state=42)

[Memory]  474.6s, 7.9min: Loading load_higgs from /home/glemaitre/scikit_learn_data/higgs_benchmark_data/joblib/misc/load_higgs/3034b65fbc56ad5acf012d3c20d7f04a
__________________________________________load_higgs cache loaded - 0.0s, 0.0min


In [16]:
# Collect one record per parameter combination.
results = []
for p in params_grid:
    # Run (or fetch from the cache) the benchmark for this combination.
    bench_results = bench_sklearn(*data, **p)
    # Tag the measurements with the training-set size and the parameters
    # that produced them.
    bench_results['n_samples'] = data[0].shape[0]
    bench_results.update(p)
    results.append(bench_results)



________________________________________________________________________________
[Memory] Calling __main__--home-glemaitre-Documents-work-code-gbrt-benchmarks-benchmark-__ipython-input__.bench_sklearn...
bench_sklearn(memmap([[ 1.757253, ...,  1.467264],
        ..., 
        [ 1.880784, ...,  0.950771]], dtype=float32), 
memmap([1, ..., 0]), memmap([[ 2.089598, ...,  1.037894],
        ..., 
        [ 0.464477, ...,  0.51742 ]], dtype=float32), 
memmap([1, ..., 1]), max_depth=3, max_features=1.0, n_estimators=1, presort=False, random_state=42)
__________________________________________________bench_sklearn - 163.8s, 2.7min
________________________________________________________________________________
[Memory] Calling __main__--home-glemaitre-Documents-work-code-gbrt-benchmarks-benchmark-__ipython-input__.bench_sklearn...
bench_sklearn(memmap([[ 1.757253, ...,  1.467264],
        ..., 
        [ 1.880784, ...,  0.950771]], dtype=float32), 
memmap([1, ..., 0]), memmap([[ 2.089598, ...

In [17]:
import pandas as pd
import os

# One row per benchmarked parameter combination.
df = pd.DataFrame(results)

# Create the output directory if needed so the save cannot fail on a missing
# folder after the (long) benchmark loop has completed.
results_path = '../results/sklearn_master_nopresort.pkl'
results_dir = os.path.dirname(results_path)
if not os.path.isdir(results_dir):
    os.makedirs(results_dir)
df.to_pickle(results_path)