# Бустинг

На этом уроке вы продолжете работать с датасетом, с котором вы уже работали. Однако сегодня вы запустите на нем более крутые алгоритмы машинного обучения. Удачи!

In [1]:
import pandas as pd

In [2]:
# Данные библиотеки могут быть не установленны. 
# Установите их через pip install/ conda install
import xgboost
import lightgbm as lgb
from catboost import CatBoostClassifier

In [4]:
from sklearn.metrics import roc_auc_score, confusion_matrix, accuracy_score
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.datasets import make_regression
from sklearn.tree import DecisionTreeClassifier

In [5]:
data = pd.read_csv('dota2_skill_train.csv', index_col='id')
data.shape

(99871, 57)

In [6]:
data.head()

Unnamed: 0_level_0,skilled,player_team,winner_team,duration,pre_game_duration,first_blood_time,first_blood_claimed,hero_id,hero_pick_order,leaver_status,...,avg_deaths_x16,avg_assists_x16,avg_gpm_x16,avg_xpm_x16,best_kills_x16,best_assists_x16,best_gpm_x16,best_xpm_x16,win_streak,best_win_streak
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7,1,dire,dire,2140,90,129,0,90,9,0,...,8,15,352,430,10,30,551,745,2,5
8,0,radiant,radiant,2138,90,174,0,5,5,0,...,9,19,294,425,13,37,445,717,2,12
12,0,radiant,radiant,3547,90,360,0,81,7,0,...,7,9,493,543,23,18,691,762,3,3
13,1,dire,radiant,1878,90,28,0,74,9,0,...,7,12,515,583,25,34,869,935,0,6
14,1,dire,radiant,2232,90,129,0,14,6,0,...,10,16,337,452,34,43,672,797,0,9


In [7]:
# Кодируем категориальные фичи 
data['player_team'] = data['player_team'].map({'dire':0, 'radiant':1})
data['winner_team'] = data['winner_team'].map({'dire':0, 'radiant':1})

## Вспомнить всё 

In [8]:
X_train, X_test, y_train, y_test = train_test_split(data.drop('skilled', axis=1), data.skilled, random_state=42)

In [9]:
tree_6 = DecisionTreeClassifier(max_depth=6)
tree_6.fit(X_train, y_train)

pred = tree_6.predict(X_test)
accuracy_score(y_test, pred)

0.662528035885934

In [10]:
print('ROC-AUC: ', roc_auc_score(y_test, pred))
print(confusion_matrix(y_test, pred))

ROC-AUC:  0.6117802842521022
[[12883  2239]
 [ 6187  3659]]


## Построим бустинговые модели и сравним результаты 

## Xgboost 

In [11]:
'''
Мы можем добавлять параметры модели с помощью словаря параметров. 
Это удобно т.к. в бустинговых моделях параметров намного больше, чем в линейных

'''


params = { 
    'eta': 0.07, #то же самое, что и learning rate
    'max_depth': 8, 
    'objective': 'binary:logistic',
    'eval_metric': 'Logloss',
    'seed': 7}


model = xgboost.XGBClassifier(num_round = 100)
model.fit(X_train, y_train,  eval_set=[(X_test, y_test)])

[0]	validation_0-error:0.33743
[1]	validation_0-error:0.32121
[2]	validation_0-error:0.31512
[3]	validation_0-error:0.31300
[4]	validation_0-error:0.30940
[5]	validation_0-error:0.30263
[6]	validation_0-error:0.30147
[7]	validation_0-error:0.29866
[8]	validation_0-error:0.29346
[9]	validation_0-error:0.29257
[10]	validation_0-error:0.29105
[11]	validation_0-error:0.28953
[12]	validation_0-error:0.28797
[13]	validation_0-error:0.28701
[14]	validation_0-error:0.28452
[15]	validation_0-error:0.28352
[16]	validation_0-error:0.28068
[17]	validation_0-error:0.27896
[18]	validation_0-error:0.27872
[19]	validation_0-error:0.27759
[20]	validation_0-error:0.27840
[21]	validation_0-error:0.27840
[22]	validation_0-error:0.27727
[23]	validation_0-error:0.27539
[24]	validation_0-error:0.27563
[25]	validation_0-error:0.27467
[26]	validation_0-error:0.27431
[27]	validation_0-error:0.27319
[28]	validation_0-error:0.27343
[29]	validation_0-error:0.27343
[30]	validation_0-error:0.27163
[31]	validation_0-

XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints=None,
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=0, num_parallel_tree=1, num_round=100,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,
              validate_parameters=False, verbosity=None)

#### Построим confusion matrix и roc-auc 
Важно: бустинговые модели, как правило, выдают вероятность принадлежности к классу.

In [12]:
# В данном случае достаточно слова predict
y_predict_proba = model.predict(X_test)

In [13]:
print('ROC-AUC: ', roc_auc_score(y_test, y_predict_proba))
print(confusion_matrix(y_test, y_predict_proba))

ROC-AUC:  0.7294444214746535
[[12559  2563]
 [ 3659  6187]]


## Catboost 

Все параметры смотреть здесь:
https://tech.yandex.com/catboost/doc/dg/concepts/parameter-tuning-docpage/

Мы настроим только несколько параметров:
 - количество итераций
 - глубину деревьев
 - метрику качества
 - функцию потерь
 - random seed, чтобы результаты моделей были сравнимы
 
В каждой настроенной модели мы будем использовать одни и те же параметры, опять же, чтобы результаты были похожи и объективны

In [14]:
model = CatBoostClassifier(iterations=100,
                           learning_rate=0.07, 
                           depth=8, 
                           eval_metric = 'AUC',
                           loss_function='Logloss',
                          random_state=7)
model.fit(X_train, y_train, eval_set=(X_test, y_test), use_best_model=True)

0:	test: 0.6860601	best: 0.6860601 (0)	total: 297ms	remaining: 29.4s
1:	test: 0.6973464	best: 0.6973464 (1)	total: 361ms	remaining: 17.7s
2:	test: 0.7022035	best: 0.7022035 (2)	total: 426ms	remaining: 13.8s
3:	test: 0.7079006	best: 0.7079006 (3)	total: 496ms	remaining: 11.9s
4:	test: 0.7112800	best: 0.7112800 (4)	total: 581ms	remaining: 11s
5:	test: 0.7169482	best: 0.7169482 (5)	total: 648ms	remaining: 10.1s
6:	test: 0.7168093	best: 0.7169482 (5)	total: 723ms	remaining: 9.6s
7:	test: 0.7185901	best: 0.7185901 (7)	total: 794ms	remaining: 9.13s
8:	test: 0.7199292	best: 0.7199292 (8)	total: 854ms	remaining: 8.64s
9:	test: 0.7229102	best: 0.7229102 (9)	total: 914ms	remaining: 8.23s
10:	test: 0.7249060	best: 0.7249060 (10)	total: 988ms	remaining: 8s
11:	test: 0.7264539	best: 0.7264539 (11)	total: 1.05s	remaining: 7.72s
12:	test: 0.7280701	best: 0.7280701 (12)	total: 1.12s	remaining: 7.47s
13:	test: 0.7298126	best: 0.7298126 (13)	total: 1.2s	remaining: 7.36s
14:	test: 0.7324075	best: 0.73240

<catboost.core.CatBoostClassifier at 0x167ee732908>

#### Построим confusion matrix и roc-auc 

In [15]:
y_predict_proba = model.predict(X_test)

In [16]:
print('ROC-AUC: ', roc_auc_score(y_test, y_predict_proba))
print(confusion_matrix(y_test, y_predict_proba))

ROC-AUC:  0.6854641629218521
[[12777  2345]
 [ 4667  5179]]


## LightGBM 

Полезно посмотреть документацию:https://lightgbm.readthedocs.io/en/latest/Parameters.html.
Это позволит вам поиграться с параметрами.

In [17]:
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)


params = {'max_depth':8, #глубина деревьев
    'boosting_type': 'gbdt', #модель для бустинга, в данном случае - gradient bosting decision tree
    'objective': 'binary', #задача
    'metric': ['auc'], #метрика качества
    'learning_rate': 0.07, #
     'seed': 7}

In [18]:
gbm = lgb.train(params, lgb_train,
            num_boost_round=100, #количество построений-улучшений алгоритма
            valid_sets=[lgb_train, lgb_eval],
            early_stopping_rounds=10 #после этого количества одинаковых результатов, алгоритм досрочно закончит обучение
               )

[1]	training's auc: 0.701242	valid_1's auc: 0.693456
Training until validation scores don't improve for 10 rounds
[2]	training's auc: 0.713214	valid_1's auc: 0.703888
[3]	training's auc: 0.720579	valid_1's auc: 0.708844
[4]	training's auc: 0.725099	valid_1's auc: 0.714191
[5]	training's auc: 0.727701	valid_1's auc: 0.715816
[6]	training's auc: 0.731399	valid_1's auc: 0.718912
[7]	training's auc: 0.736399	valid_1's auc: 0.722671
[8]	training's auc: 0.740426	valid_1's auc: 0.726231
[9]	training's auc: 0.741339	valid_1's auc: 0.727577
[10]	training's auc: 0.745113	valid_1's auc: 0.730943
[11]	training's auc: 0.747749	valid_1's auc: 0.733173
[12]	training's auc: 0.750337	valid_1's auc: 0.735724
[13]	training's auc: 0.753091	valid_1's auc: 0.738293
[14]	training's auc: 0.75446	valid_1's auc: 0.739456
[15]	training's auc: 0.756416	valid_1's auc: 0.741463
[16]	training's auc: 0.758003	valid_1's auc: 0.743015
[17]	training's auc: 0.759438	valid_1's auc: 0.744366
[18]	training's auc: 0.761237	v

#### Оценим модель 

In [19]:
y_predict_proba = gbm.predict(X_test)

In [20]:
print('ROC-AUC: ', roc_auc_score(y_test, y_predict_proba))

ROC-AUC:  0.7929191012294265


In [21]:
for i in range(len(y_predict_proba)):
    if y_predict_proba[i]<0.5:
        y_predict_proba[i]=0
    else:
        y_predict_proba[i] = 1

In [22]:
confusion_matrix(y_test,y_predict_proba)

array([[12702,  2420],
       [ 4287,  5559]], dtype=int64)