# Данные

Загрузим данные, необходимые для работы примера.

In [1]:
%%time

! git clone --quiet https://github.com/guolinke/boosting_tree_benchmarks.git
! cd boosting_tree_benchmarks/data
! wget -q "https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz"
! gunzip -f HIGGS.csv.gz
! python boosting_tree_benchmarks/data/higgs2libsvm.py
! cd ../..

CPU times: user 5.45 s, sys: 536 ms, total: 5.98 s
Wall time: 9min 19s


In [2]:
%%time

from sklearn.datasets import load_svmlight_files

X_train, y_train, X_test, y_test = load_svmlight_files(
    ('higgs.train', 
     'higgs.test'))

CPU times: user 3min 37s, sys: 2.22 s, total: 3min 39s
Wall time: 3min 39s


Посмотрим на размеры данных и на точность константного предсказателя.

In [25]:
# Show the dimensions of the training feature matrix.
X_train.shape

(10500000, 28)

In [33]:
import numpy as np
from scipy.stats import mode

# Test accuracy of a constant predictor that always answers with the most
# frequent training label.
# Depending on the SciPy version, `mode(...)[0]` is either a length-1 array or
# a scalar for 1-D input, so it is flattened before indexing instead of
# relying on `[0][0]`.
majority_label = np.ravel(mode(y_train)[0])[0]
np.mean(y_test == majority_label)

0.529014

# LightGBM

Сравним скорость на CPU и GPU для реализации LightGBM.

In [34]:
import lightgbm as lgb

In [35]:
%%time

lgb_gbm = lgb.LGBMClassifier(objective='binary',
                             boosting_type='gbdt',
                             device='cpu',
                             n_estimators=25)

lgb_gbm.fit(X_train, y_train,
            eval_set=[(X_test, y_test)],
            eval_metric='binary_error')

[1]	valid_0's binary_error: 0.32842
[2]	valid_0's binary_error: 0.324932
[3]	valid_0's binary_error: 0.319218
[4]	valid_0's binary_error: 0.317836
[5]	valid_0's binary_error: 0.313634
[6]	valid_0's binary_error: 0.311608
[7]	valid_0's binary_error: 0.309196
[8]	valid_0's binary_error: 0.306998
[9]	valid_0's binary_error: 0.305922
[10]	valid_0's binary_error: 0.30437
[11]	valid_0's binary_error: 0.302572
[12]	valid_0's binary_error: 0.301128
[13]	valid_0's binary_error: 0.299338
[14]	valid_0's binary_error: 0.297698
[15]	valid_0's binary_error: 0.29602
[16]	valid_0's binary_error: 0.295076
[17]	valid_0's binary_error: 0.29398
[18]	valid_0's binary_error: 0.2929
[19]	valid_0's binary_error: 0.291798
[20]	valid_0's binary_error: 0.29095
[21]	valid_0's binary_error: 0.289862
[22]	valid_0's binary_error: 0.28934
[23]	valid_0's binary_error: 0.288564
[24]	valid_0's binary_error: 0.287786
[25]	valid_0's binary_error: 0.287052
CPU times: user 1min 59s, sys: 1.94 s, total: 2min 1s
Wall time: 23

In [36]:
%%time

lgb_gbm = lgb.LGBMClassifier(objective='binary',
                             boosting_type='gbdt',
                             device='gpu',
                             n_estimators=25)

lgb_gbm.fit(X_train, y_train,
            eval_set=[(X_test, y_test)],
            eval_metric='binary_error')

[1]	valid_0's binary_error: 0.32842
[2]	valid_0's binary_error: 0.324932
[3]	valid_0's binary_error: 0.319218
[4]	valid_0's binary_error: 0.317836
[5]	valid_0's binary_error: 0.313634
[6]	valid_0's binary_error: 0.311608
[7]	valid_0's binary_error: 0.309196
[8]	valid_0's binary_error: 0.306998
[9]	valid_0's binary_error: 0.305922
[10]	valid_0's binary_error: 0.30437
[11]	valid_0's binary_error: 0.302572
[12]	valid_0's binary_error: 0.301128
[13]	valid_0's binary_error: 0.299338
[14]	valid_0's binary_error: 0.297698
[15]	valid_0's binary_error: 0.29602
[16]	valid_0's binary_error: 0.295076
[17]	valid_0's binary_error: 0.29398
[18]	valid_0's binary_error: 0.2929
[19]	valid_0's binary_error: 0.291798
[20]	valid_0's binary_error: 0.29095
[21]	valid_0's binary_error: 0.289862
[22]	valid_0's binary_error: 0.28934
[23]	valid_0's binary_error: 0.288564
[24]	valid_0's binary_error: 0.287786
[25]	valid_0's binary_error: 0.287054
CPU times: user 1min 49s, sys: 3.3 s, total: 1min 52s
Wall time: 22

In [37]:
from sklearn.metrics import accuracy_score, roc_auc_score

# Test-set accuracy and ROC AUC of the model currently bound to `lgb_gbm`.
# Each line prints one pre-formatted string, which is valid on Python 3 and
# prints the same text as the Python 2 `print "label:", value` statement.
print("Accuracy: %s" % (accuracy_score(y_test, lgb_gbm.predict(X_test)),))
# Column 1 of predict_proba is passed as the score for ROC AUC.
print("AUC: %s" % (roc_auc_score(y_test, lgb_gbm.predict_proba(X_test)[:, 1]),))

Accuracy:

  if diff:


 0.712946
AUC: 0.7895449896061647


# Catboost

Проделаем аналогичные действия с реализацией Catboost.

In [38]:
%%time

import catboost

train_pool = catboost.Pool(X_train.toarray(), y_train)
test_pool = catboost.Pool(X_test.toarray(), y_test)

clf = catboost.CatBoostClassifier(
    loss_function='Logloss',
    iterations=25,
    task_type='CPU'
)

CPU times: user 30.2 s, sys: 2.9 s, total: 33.1 s
Wall time: 30.1 s


In [39]:
%%time

clf.fit(
    train_pool,
    eval_set=test_pool
)

0:	learn: 0.6880803	test: 0.6880801	best: 0.6880801 (0)	total: 1.76s	remaining: 42.2s
1:	learn: 0.6833516	test: 0.6833495	best: 0.6833495 (1)	total: 3.5s	remaining: 40.2s
2:	learn: 0.6787728	test: 0.6787733	best: 0.6787733 (2)	total: 5.28s	remaining: 38.7s
3:	learn: 0.6746698	test: 0.6746652	best: 0.6746652 (3)	total: 7.23s	remaining: 38s
4:	learn: 0.6705258	test: 0.6705230	best: 0.6705230 (4)	total: 8.96s	remaining: 35.8s
5:	learn: 0.6668154	test: 0.6668140	best: 0.6668140 (5)	total: 10.7s	remaining: 33.8s
6:	learn: 0.6634037	test: 0.6633938	best: 0.6633938 (6)	total: 12.4s	remaining: 31.9s
7:	learn: 0.6599678	test: 0.6599591	best: 0.6599591 (7)	total: 14.2s	remaining: 30.1s
8:	learn: 0.6569559	test: 0.6569357	best: 0.6569357 (8)	total: 15.9s	remaining: 28.2s
9:	learn: 0.6538344	test: 0.6538132	best: 0.6538132 (9)	total: 17.8s	remaining: 26.8s
10:	learn: 0.6508834	test: 0.6508601	best: 0.6508601 (10)	total: 19.6s	remaining: 24.9s
11:	learn: 0.6483609	test: 0.6483348	best: 0.6483348 (1

<catboost.core._CatBoostBase at 0x7f3ebcc11c90>

In [40]:
# Same configuration as the CPU classifier, but with task_type='GPU'.
# Rebinds `clf`, so the earlier classifier is no longer reachable by that name.
clf = catboost.CatBoostClassifier(task_type='GPU', iterations=25, loss_function='Logloss')

In [41]:
%%time

clf.fit(
    train_pool,
    eval_set=test_pool
)

0:	learn: 0.6880601905	test: 0.68806	bestTest:	0.68806 (1)	total: 351ms	remaining: 8.43s
1:	learn: 0.6832901429	test: 0.6832920625	bestTest:	0.6832920625 (2)	total: 663ms	remaining: 7.63s
2:	learn: 0.6787926667	test: 0.67879325	bestTest:	0.67879325 (3)	total: 952ms	remaining: 6.98s
3:	learn: 0.6746924762	test: 0.6746914375	bestTest:	0.6746914375 (4)	total: 1.2s	remaining: 6.3s
4:	learn: 0.6705629524	test: 0.67056375	bestTest:	0.67056375 (5)	total: 1.45s	remaining: 5.78s
5:	learn: 0.6669227143	test: 0.6669128125	bestTest:	0.6669128125 (6)	total: 1.7s	remaining: 5.38s
6:	learn: 0.6633342381	test: 0.6633244375	bestTest:	0.6633244375 (7)	total: 1.95s	remaining: 5.02s
7:	learn: 0.6599367143	test: 0.659933125	bestTest:	0.659933125 (8)	total: 2.2s	remaining: 4.68s
8:	learn: 0.6569189524	test: 0.6569039375	bestTest:	0.6569039375 (9)	total: 2.47s	remaining: 4.39s
9:	learn: 0.6537900476	test: 0.6537714375	bestTest:	0.6537714375 (10)	total: 2.71s	remaining: 4.07s
10:	learn: 0.6509410476	test: 0.6

<catboost.core._CatBoostBase at 0x7f3ebcbe0b50>

In [42]:
# Test-set accuracy and ROC AUC of the model currently bound to `clf`.
# Each line prints one pre-formatted string, which is valid on Python 3 and
# prints the same text as the Python 2 `print "label:", value` statement.
print("Accuracy: %s" % (accuracy_score(y_test, clf.predict(X_test.toarray())),))
# Column 1 of predict_proba is passed as the score for ROC AUC.
print("AUC: %s" % (roc_auc_score(y_test, clf.predict_proba(X_test.toarray())[:, 1]),))

Accuracy: 0.679938
AUC: 0.7467504764417499


На что стоит обратить внимание:

* Скорость работы на GPU выше, даже если сравнивать с параллельной версией, использующей несколько CPU.
* Для GPU много времени занимает трансляция модели в код для GPU, поэтому для большого количества итераций прирост производительности выше.
* При аналогичных параметрах обучения на GPU и CPU могут получаться разные результаты. Это связано с тем, что на CPU и на GPU используется немного разная арифметика с плавающей точкой.

**Упражнения:** 

* Попробуйте получить максимально возможную точность на этих данных с помощью любой из реализаций.
* Сравните скорость работы этих реализаций и реализаций в sklearn и Spark MLlib.