In [None]:
from __future__ import print_function

### Caching

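This function runs the benchmark for a given set of parameters. Its result is cached on disk, so calling it again with the same arguments reuses the stored result instead of refitting the model.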

In [None]:
from datetime import datetime

import joblib

import numpy as np
from sklearn.ensemble import GradientBoostingClassifier

# Disk-backed memoization store used to cache benchmark runs.
# The cache directory is passed positionally: the `cachedir` keyword was
# deprecated in joblib 0.12 in favour of `location`, and the first positional
# argument is the cache directory under both spellings.
memory = joblib.Memory('../cache', verbose=10)

@memory.cache
def bench_xgb(X, y, T, valid, **params):
    """Fit a GradientBoostingClassifier and report its accuracy and timings.

    The call is memoized on disk through ``memory.cache``.

    Parameters
    ----------
    X, y
        Training samples and their labels, passed to ``clf.fit``.
    T, valid
        Held-out samples and their labels; used only for scoring.
    **params
        Keyword arguments forwarded to ``clf.set_params``.

    Returns
    -------
    dict
        ``score_training`` and ``score_testing`` are the fraction of
        predictions equal to ``y`` and ``valid`` respectively.
        ``time_data`` is the ``timedelta`` spent building and configuring
        the estimator; ``time_fit`` is the ``timedelta`` spent in
        ``clf.fit``.
    """
    # Setup phase: build the estimator and apply the benchmark parameters.
    start_data_t = datetime.now()
    clf = GradientBoostingClassifier()
    clf.set_params(**params)
    # Previously `end_data_t` was returned without ever being assigned.
    end_data_t = datetime.now() - start_data_t

    # Fit phase, timed separately from the setup above.
    start_fit_t = datetime.now()
    clf.fit(X, y)
    end_fit_t = datetime.now() - start_fit_t

    # Accuracy = mean of element-wise prediction/label equality.
    score_training = np.mean(clf.predict(X) == y)
    score_testing = np.mean(clf.predict(T) == valid)

    return {'score_training': score_training,
            'score_testing': score_testing,
            'time_data': end_data_t,
            'time_fit': end_fit_t}

## Scikit-learn on Higgs dataset

### No presorting

In [2]:
import yaml
configuration_path = "../params_benchmark/parameters_higgs.conf"
config_name = 'sklearn-master-nopresort'
with open(configuration_path, 'r') as stream:
    # Use safe_load: calling yaml.load without an explicit Loader is
    # deprecated since PyYAML 5.1 and raises TypeError from PyYAML 6.0.
    # NOTE(review): assumes the config holds only plain scalars, lists and
    # mappings (as the printed parameters suggest) - confirm no custom tags.
    params = yaml.safe_load(stream)[config_name]

# Wrap scalar settings in a one-element list so that every parameter maps to
# a list of candidate values.
params = {key: (value if isinstance(value, list) else [value])
          for key, value in params.items()}

print(params)

{'n_estimators': [1], 'max_depth': [3, 5, 8], 'max_features': [1.0], 'presort': [False], 'random_state': [42]}


In [3]:
from sklearn.model_selection import ParameterGrid
# Materialize every parameter combination produced by the grid so it can be
# iterated over by the benchmark loop.
params_grid = [combination for combination in ParameterGrid(params)]

In [None]:
import sys

# Put the sibling `datasets` directory first on the import path so that the
# local `misc` module can be imported.
sys.path.insert(0, '../datasets')
from misc import load_higgs

# Requested sample count; written as a float literal, hence the int() cast
# at the call site.
N_SAMPLES = 1e7
data = load_higgs(n_samples=int(N_SAMPLES), random_state=42)

In [None]:
# Run the benchmark once per parameter combination and collect one record
# per run: the benchmark output, the sample count, and the parameters used.
results = []
for p in params_grid:
    bench_results = bench_xgb(*data, **p)
    bench_results['n_samples'] = data[0].shape[0]
    bench_results.update(p)
    results.append(bench_results)

In [None]:
import pandas as pd
# One row per benchmarked parameter combination; persist the table to disk.
df = pd.DataFrame(data=results)
df.to_pickle(path='../results/sklearn_master_nopresort.pkl')