<a href="https://colab.research.google.com/github/Yanina-Kutovaya/GNN/blob/main/notebooks/5_1_CatBoost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Реализация CatBoostClassifier с использованием GPU (CUDA 12.5) для классификации узлов на датасете Bitcoin-OTC

Задача — предсказать "уровень доверия" пользователей

Признаки узлов/рёбер - синтетические

Необходимо выбрать среду выполнения с GPU: Среда выполнения → Сменить среду выполнения → Графический процессор T4

## 1. Установка зависимостей

Требования:
- CatBoost ≥1.2.2
- PyTorch Geometric для загрузки данных


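* CatBoost задействует GPU при явном указании `task_type='GPU'`; pip-пакет уже собран с поддержкой CUDA, поэтому отдельная установка CUDA 12.5 не требуется (достаточно драйвера NVIDIA в среде выполнения)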

In [1]:
install = True
if install:
  !pip install -q catboost
  !pip install -q torch-geometric

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m60.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
# Colab-only setup: switch off the custom widget manager.
# Outside Colab the google.colab package is not importable, so the step is
# skipped instead of aborting a "Restart & Run All".
try:
    from google.colab import output
except ImportError:
    pass
else:
    output.disable_custom_widget_manager()

## 2. Импорт библиотек

In [3]:
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, Pool
from catboost.utils import get_gpu_device_count
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from torch_geometric.datasets import BitcoinOTC

## 3. Загрузка данных

In [4]:
# Load Bitcoin-OTC (downloaded into DATA_ROOT on the first run) and take the
# dataset's first element as the working graph.
# NOTE(review): only index 0 is used - confirm whether the dataset holds
# further graphs/snapshots that should be included as well.
DATA_ROOT = '/tmp/BitcoinOTC'
dataset = BitcoinOTC(root=DATA_ROOT)
data = dataset[0]

Downloading https://snap.stanford.edu/data/soc-sign-bitcoinotc.csv.gz
Extracting /tmp/BitcoinOTC/raw/soc-sign-bitcoinotc.csv.gz
Processing...
Done!


## 4. Создание признаков узлов с учётом рёбер

In [5]:
def create_node_features(data):
    """Build a per-node feature matrix from the outgoing edges of a graph.

    Every edge is attributed to its source node (row 0 of ``data.edge_index``).
    All aggregates are computed in a single vectorised pass over the edges.

    Args:
        data: Graph object exposing ``num_nodes`` (int), ``edge_index``
            (integer tensor of shape ``(2, num_edges)``) and ``edge_attr``
            (tensor holding one scalar per edge).

    Returns:
        ``numpy.ndarray`` of shape ``(num_nodes, 5)`` and dtype ``float64``
        with the columns:
            0. number of edges whose source is the node,
            1. mean attribute of those edges,
            2. maximum attribute of those edges,
            3. minimum attribute of those edges,
            4. edge count (equal to column 0 by construction).
        Nodes without outgoing edges get zeros in every column.
    """
    num_nodes = data.num_nodes

    # Convert the tensors to NumPy once instead of once per node / per edge.
    src = data.edge_index[0].detach().cpu().numpy()
    attr = data.edge_attr.detach().cpu().numpy().reshape(-1).astype(np.float64)

    # Edges per source node; minlength pads nodes that never appear as a source.
    counts = np.bincount(src, minlength=num_nodes)
    has_edges = counts > 0

    # Mean = per-node attribute sum / per-node count (only where count > 0).
    sums = np.bincount(src, weights=attr, minlength=num_nodes)
    means = np.zeros(num_nodes, dtype=np.float64)
    means[has_edges] = sums[has_edges] / counts[has_edges]

    # Unbuffered ufunc.at handles repeated source indices correctly; the +/-inf
    # placeholders of nodes without edges are replaced by 0 afterwards.
    maxima = np.full(num_nodes, -np.inf)
    np.maximum.at(maxima, src, attr)
    maxima[~has_edges] = 0.0

    minima = np.full(num_nodes, np.inf)
    np.minimum.at(minima, src, attr)
    minima[~has_edges] = 0.0

    degrees = counts.astype(np.float64)
    return np.column_stack([degrees, means, maxima, minima, degrees])

## 5. Подготовка данных

In [6]:
# Node feature matrix: one row per node, columns as produced by create_node_features.
X = create_node_features(data)
# Bare expression so the notebook renders the (truncated) array below the cell.
X

array([[2. , 2.5, 4. , 1. , 2. ],
       [3. , 5. , 5. , 5. , 3. ],
       [0. , 0. , 0. , 0. , 0. ],
       ...,
       [0. , 0. , 0. , 0. , 0. ],
       [0. , 0. , 0. , 0. , 0. ],
       [0. , 0. , 0. , 0. , 0. ]])

In [7]:
# Build 3-class labels from the first feature column X[:, 0]:
#   0 -> value <= median (q1)
#   1 -> q1 < value <= 99.9th percentile (q2)
#   2 -> value > q2
# NOTE(review): the target is a deterministic function of X[:, 0], and X is
# also the matrix the model is trained on, so the labels are recoverable from
# the features alone - the reported metrics should be read with that in mind.
degree = X[:, 0]
q1 = np.quantile(degree, 0.5)
q2 = np.quantile(degree, 0.999)

# Conditions are checked in order, so "degree > q1" only applies where
# "degree > q2" is false, i.e. q1 < degree <= q2.
y = np.select([degree > q2, degree > q1], [2.0, 1.0], default=0.0)

## 6. Разделение данных

In [8]:
# Stratified split
def stratified_split(labels, train_ratio=0.6, val_ratio=0.2, seed=42):
    """Split sample indices into train/val/test parts, class by class.

    The indices of each class are shuffled and cut separately, so every part
    receives roughly ``train_ratio`` / ``val_ratio`` / remainder of each class.
    Part sizes are truncated with ``int()``, so a very small class may
    contribute nothing to train and/or val and end up entirely in test.

    Shuffling uses a private ``RandomState(seed)``: the result equals what
    ``np.random.seed(seed)`` followed by ``np.random.shuffle`` would give,
    but the global NumPy random state is left untouched.

    Args:
        labels: 1-D array-like of class labels, one per sample.
        train_ratio: Fraction of each class assigned to the training part.
        val_ratio: Fraction of each class assigned to the validation part.
        seed: Seed of the private random generator.

    Returns:
        Tuple ``(train_indices, val_indices, test_indices)`` of integer
        ``numpy.ndarray`` objects; together they cover every index once.

    Raises:
        ValueError: If a ratio lies outside ``[0, 1]`` or the two ratios sum
            to more than 1.
    """
    if not (0.0 <= train_ratio <= 1.0 and 0.0 <= val_ratio <= 1.0):
        raise ValueError("train_ratio and val_ratio must lie in [0, 1]")
    # Small tolerance so sums such as 0.7 + 0.3 are not rejected by rounding.
    if train_ratio + val_ratio > 1.0 + 1e-9:
        raise ValueError("train_ratio + val_ratio must not exceed 1")

    labels = np.asarray(labels)
    rng = np.random.RandomState(seed)
    train_indices, val_indices, test_indices = [], [], []

    for cls in np.unique(labels):
        cls_indices = np.where(labels == cls)[0]
        rng.shuffle(cls_indices)

        train_end = int(train_ratio * len(cls_indices))
        val_end = train_end + int(val_ratio * len(cls_indices))

        train_indices.extend(cls_indices[:train_end])
        val_indices.extend(cls_indices[train_end:val_end])
        # Whatever is left after train and val goes to test.
        test_indices.extend(cls_indices[val_end:])

    return (
        np.array(train_indices, dtype=int),
        np.array(val_indices, dtype=int),
        np.array(test_indices, dtype=int)
    )

# Index arrays for the three parts, stratified by label (default 60/20/20 ratios).
train_idx, val_idx, test_idx = stratified_split(labels=y)

# Slice features and labels with the same indices so rows stay aligned.
X_train, y_train = X[train_idx], y[train_idx]
X_val, y_val = X[val_idx], y[val_idx]
X_test, y_test = X[test_idx], y[test_idx]

## 7. Создание CatBoost модели с GPU

In [9]:
# Расчет весов классов
class_counts = np.bincount(y_train.astype(int))
min_weight = 1e-2
class_weights = {
    i: max(1.0 / (count + 1e-5), min_weight)
    for i, count in enumerate(class_counts)
}

# Обучение модели с оптимизированными параметрами
model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.1,
    depth=8,
    #l2_leaf_reg=3.0,
    loss_function='MultiClass',
    #eval_metric='TotalF1',
    task_type='GPU',
    devices='0:1',
    class_weights=class_weights,
    early_stopping_rounds=50,
    verbose=50,
    random_seed=42,
    use_best_model=True,
    od_type='Iter',
)

## 8. Обучение

In [10]:
# Fit on the training split; the validation split is passed as eval_set and is
# what the "test"/"best" columns of the training log below refer to.
model.fit(X_train, y_train, eval_set=(X_val, y_val))

0:	learn: 0.9522579	test: 0.9509105	best: 0.9509105 (0)	total: 90.5ms	remaining: 1m 30s
50:	learn: 0.0567106	test: 0.0515933	best: 0.0515933 (50)	total: 1.72s	remaining: 32.1s
100:	learn: 0.0257996	test: 0.0227413	best: 0.0227413 (100)	total: 3.22s	remaining: 28.7s
150:	learn: 0.0157167	test: 0.0137540	best: 0.0137540 (150)	total: 4.64s	remaining: 26.1s
200:	learn: 0.0111077	test: 0.0096930	best: 0.0096930 (200)	total: 6.69s	remaining: 26.6s
250:	learn: 0.0085311	test: 0.0074353	best: 0.0074353 (250)	total: 8.79s	remaining: 26.2s
300:	learn: 0.0068666	test: 0.0059833	best: 0.0059833 (300)	total: 12s	remaining: 27.9s
350:	learn: 0.0057343	test: 0.0049959	best: 0.0049959 (350)	total: 14.3s	remaining: 26.4s
400:	learn: 0.0049131	test: 0.0042803	best: 0.0042803 (400)	total: 15.3s	remaining: 22.9s
450:	learn: 0.0043028	test: 0.0037487	best: 0.0037487 (450)	total: 15.7s	remaining: 19.2s
500:	learn: 0.0038156	test: 0.0033237	best: 0.0033237 (500)	total: 16.1s	remaining: 16.1s
550:	learn: 0.00

<catboost.core.CatBoostClassifier at 0x790df2451fd0>

## 9. Оценка

In [11]:
# Predict on the held-out test split and print per-class precision/recall/F1.
y_pred = model.predict(X_test)
test_report = classification_report(y_test, y_pred, labels=[0, 1, 2])
print(test_report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1199
           1       1.00      1.00      1.00         3
           2       1.00      1.00      1.00         1

    accuracy                           1.00      1203
   macro avg       1.00      1.00      1.00      1203
weighted avg       1.00      1.00      1.00      1203



## 10. Важность признаков

In [12]:
# Names follow the column order of the feature matrix X.
feature_names = ['Degree', 'EdgeMean', 'EdgeMax', 'EdgeMin', 'EdgeCount']
importance_scores = model.get_feature_importance()

# One "name: score" line per feature, scores rounded to two decimals.
print("\nFeature Importance:")
for feat, importance in zip(feature_names, importance_scores):
    print(f"{feat}: {importance:.2f}")


Feature Importance:
Degree: 69.03
EdgeMean: 3.33
EdgeMax: 0.39
EdgeMin: 0.95
EdgeCount: 26.31


## 11. Сохранение модели

In [13]:
# Persist the trained model to a file in the current working directory.
model.save_model('catboost_model.cbm')

## 12. Загрузка модели

In [14]:
# Restore the classifier from the file written by save_model in the previous section.
loaded_model = CatBoostClassifier()
# Last expression of the cell, so the notebook also shows the model's repr.
loaded_model.load_model('catboost_model.cbm')

<catboost.core.CatBoostClassifier at 0x790df3a2dc10>

In [15]:
# Re-run the test-set evaluation with the model restored from disk; the report
# should match the one printed for the in-memory model.
y_pred = loaded_model.predict(X_test)
loaded_report = classification_report(y_test, y_pred, labels=[0, 1, 2])
print(loaded_report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1199
           1       1.00      1.00      1.00         3
           2       1.00      1.00      1.00         1

    accuracy                           1.00      1203
   macro avg       1.00      1.00      1.00      1203
weighted avg       1.00      1.00      1.00      1203

