### Дипломная работа Александра Соколова

#### Градиентный бустинг (CatBoost + GPU)
Кернел 4 из 5 в разделе ML (отредактирован 21.04.2021)
---

# 1. Импорт библиотек, инициализация глобальных констант
## 1.1. Импорт библиотек

In [1]:
import pandas as pd
import numpy as np
import os
import tqdm
import pickle


from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import roc_auc_score

from catboost import CatBoostClassifier, Pool
# Restrict CUDA to device index 0 for this process.
os.environ["CUDA_VISIBLE_DEVICES"] = '0'

# Silence all Python warnings for the rest of the notebook.
# `np.warnings` was only an accidental re-export of the stdlib module and was
# removed in NumPy 1.24, so the stdlib `warnings` module is used directly.
import warnings
warnings.filterwarnings('ignore')

## 1.2. Глобальные константы

In [2]:
# Base directory of the run. Switch the two lines below depending on where
# the notebook is executed:
#   './'  - local machine
#   '../' - Kaggle kernel
# CURRENT_DIR = './'
CURRENT_DIR = '../'

# Directory that receives every artifact written by this notebook.
PATH_TO_WORKDIR = f'{CURRENT_DIR}working/'

In [3]:
# Write the versions of all installed packages to requirements.txt
# (in the current working directory).
!pip freeze > requirements.txt

## 1.3. Проверка подключения и параметров GPU

In [4]:
# Print the NVIDIA driver / CUDA versions and the state of the attached GPU(s).
!nvidia-smi

Fri Apr 30 07:10:16 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.51.06    Driver Version: 450.51.06    CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   39C    P0    28W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

# 2. Импорт предобработанных данных
---
Предобработка данных осуществлялась в [первом кернеле](https://www.kaggle.com/sokolovaleks/sf-dst-10-diplom-1-ml-sokolov).

In [5]:
# Locations of the preprocessed train / test tables.
train_csv_path = '../input/alfabattle2-sandbox/preproc_data_for_boosting/preproc_data_for_boosting/merged_data.csv'
test_csv_path = '../input/alfabattle2-sandbox/preproc_data_for_boosting/preproc_data_for_boosting/merged_test_data.csv'

merged_train_data = pd.read_csv(train_csv_path)
merged_test_data = pd.read_csv(test_csv_path)

In [6]:
# Every column except the application id and the target is a model feature;
# the original column order is preserved.
non_feature_columns = ['app_id', 'flag']
features = [
    column
    for column in merged_train_data.columns
    if column not in non_feature_columns
]

# 3. Разбиваем тренировочную выборку
---
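Важно выделять валидационную выборку, чтобы контролировать обучение и не переобучаться. Вместо одного отложенного разбиения здесь используется 5-фолдовая кросс-валидация (`KFold` с перемешиванием и фиксированным `random_state`): каждый объект ровно один раз попадает в валидационную часть.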

In [7]:
# Binary target as a NumPy array, aligned row-by-row with merged_train_data.
targets = merged_train_data['flag'].values

# 5-fold splitter; shuffling with a fixed seed keeps the folds reproducible.
cv = KFold(
    n_splits=5,
    shuffle=True,
    random_state=100,
)

# 4. CatBoost + CV

In [8]:
# Model tag embedded in the names of the saved model, feature-list and submission files.
name_model = "CatBoost"

In [9]:
# Create the output directory for the fold models and the feature list.
# It is created under PATH_TO_WORKDIR (where the later cells actually write),
# not in the current working directory, and exist_ok=True makes the cell
# safe to re-run.
os.makedirs(PATH_TO_WORKDIR + 'model_CatBoost', exist_ok=True)

In [10]:
%%time

oof = np.zeros(len(merged_train_data))
train_preds = np.zeros(len(merged_train_data))

models = []

tree_params = {
    'max_depth': 5,
    'eval_metric': 'AUC',
    'loss_function': 'Logloss',
    'random_state': 100,
    'l2_leaf_reg': 1,
    'task_type': 'GPU' # если имеется GPU с CUDA, то можно ее задействовать, чтобы на порядок ускорить обучение
}


for fold_, (train_idx, val_idx) in enumerate(cv.split(merged_train_data, targets), 1):
    print(f'Началось обучение на фолде номер:= {fold_}.')
    model = CatBoostClassifier(**tree_params)    
    
    train, val = merged_train_data.iloc[train_idx], merged_train_data.iloc[val_idx]
    
    train_pool = Pool(train[features], train.flag.values)
    val_pool = Pool(val[features], val.flag.values)
    

    model.fit(train_pool, eval_set=[val_pool], early_stopping_rounds=100, verbose_eval=50, use_best_model=True, plot=False)
    
    oof[val_idx] = model.predict_proba(val_pool)[:, 1]
    
    train_preds[train_idx] += model.predict_proba(train_pool)[:, 1] / (cv.n_splits-1)
    models.append(model)
    
    file_name_model = f'model_CatBoost/model_{name_model}_{fold_}'
    model.save_model(PATH_TO_WORKDIR + file_name_model)
    print(f'Обучение на фолде номер:= {fold_} завершилось.')

Началось обучение на фолде номер:= 1.
0:	learn: 0.6291874	test: 0.6279340	best: 0.6279340 (0)	total: 36.4ms	remaining: 36.4s
50:	learn: 0.7400744	test: 0.7352458	best: 0.7352458 (50)	total: 1.18s	remaining: 22s
100:	learn: 0.7464372	test: 0.7412741	best: 0.7412741 (100)	total: 2.33s	remaining: 20.7s
150:	learn: 0.7514476	test: 0.7451583	best: 0.7451642 (148)	total: 3.45s	remaining: 19.4s
200:	learn: 0.7556333	test: 0.7484902	best: 0.7484902 (200)	total: 4.54s	remaining: 18s
250:	learn: 0.7584953	test: 0.7507088	best: 0.7507088 (250)	total: 5.63s	remaining: 16.8s
300:	learn: 0.7608958	test: 0.7526601	best: 0.7526601 (300)	total: 7.16s	remaining: 16.6s
350:	learn: 0.7627018	test: 0.7540476	best: 0.7540476 (350)	total: 8.23s	remaining: 15.2s
400:	learn: 0.7642716	test: 0.7552214	best: 0.7552214 (400)	total: 9.33s	remaining: 13.9s
450:	learn: 0.7657118	test: 0.7562774	best: 0.7562774 (450)	total: 10.4s	remaining: 12.7s
500:	learn: 0.7669233	test: 0.7570581	best: 0.7570581 (500)	total: 11.5

In [11]:
# Save the list of feature column names alongside the fold models.
file_name_pickle = f'model_CatBoost/feats_model{name_model}.pickle'
feats_path = PATH_TO_WORKDIR + file_name_pickle
with open(feats_path, 'wb') as feats_file:
    pickle.dump(features, feats_file)

In [12]:
# ROC AUC of the averaged in-fold (training) predictions.
train_auc = roc_auc_score(targets, train_preds)
'Train roc-auc model CatBoost', train_auc

('Train roc-auc model CatBoost', 0.7759388373262527)

In [13]:
# ROC AUC of the out-of-fold predictions (cross-validation estimate).
cv_auc = roc_auc_score(targets, oof)
'CV roc-auc  model CatBoost', cv_auc

('CV roc-auc  model CatBoost', 0.7643527553015116)

In [14]:
# Score the test set with every fold model and average the positive-class
# probabilities, then write the submission file.

# `tqdm.tqdm_notebook` is deprecated (scheduled for removal in tqdm 5);
# `tqdm.auto.tqdm` picks the notebook widget when available and falls back
# to the console bar otherwise.
from tqdm.auto import tqdm as progress_bar

score = np.zeros(len(merged_test_data))

# Build the Pool once; it is reused by every fold model.
test_pool = Pool(merged_test_data[features])

for model in progress_bar(models):
    score += model.predict_proba(test_pool)[:, 1] / len(models)

submission = pd.DataFrame({
    'app_id': merged_test_data.app_id.values,
    'score': score
})
# index=False: do not write the DataFrame index as an extra column.
submission.to_csv(PATH_TO_WORKDIR + f'sub_model{name_model}.csv', index=False)  # ~ 0.732 on the public test set

  0%|          | 0/5 [00:00<?, ?it/s]