# 1️⃣ **Описание шаблона для решения задачи.**

**Задача**: обучить CatBoostа также залогировать основные компоненты

**Что залогировать? 🧐**:
- Конфиг, в котором хранится все служебные переменные
- После разбиения на train_test_split залогировать ТОЛЬКО test часть
- Сохранить обученные модели
- Сохранить два репорта - для классификации и для регрессии


✅ Будут выполнены:
- все дополнительные условия
- возможности фреймворков (загрузка датасетов с помощью соответствующих классов, правильная подготовка категориальных признаков, early_stopping, многопоточность)
- подбор гиперпараметров для каждой модели

👀 При желании, рекомендуется проделать следующее:
- Добавить теги для эксперимента
- Провести EDA и сохранить графики
- Добавить еще метрик и отслеживать их по мере обучения (главное в меру 😁)


❗️❗️❗️ **P.S.**
- Данный ноутбук - далеко не единственное верное решение, воспринимайте его как помощник для вашего собственного решения или чтобы побороть страх белого листа :)

# 2️⃣ Подключаем необходимые библиотеки

In [1]:
# Для чтения переменных среды
%pip install python-dotenv -q

In [3]:
from dotenv import dotenv_values
from pathlib import Path
from dataclasses import dataclass, asdict

import pandas as pd
import numpy as np
import torch

from sklearn.preprocessing import RobustScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, root_mean_squared_error, r2_score

from catboost import CatBoostClassifier, CatBoostRegressor, Pool

In [4]:
dotenv_path = Path('..')
dotenv = dotenv_values(dotenv_path / '.env')

<div class="alert alert-warning">

Необходимо получить access и secret токены

In [3]:
access_key = ...
secret_key = ...

In [12]:
%%capture
#  Не показывать свои api-ключи
%env CLEARML_WEB_HOST=https://app.clear.ml/
%env CLEARML_API_HOST=https://api.clear.ml
%env CLEARML_FILES_HOST=https://files.clear.ml

%env CLEARML_API_ACCESS_KEY=$access_key
%env CLEARML_API_SECRET_KEY=$secret_key

In [5]:
@dataclass
class CFG:
    project_name: str = "..."
    experiment_name: str = "..."

    data_path: str = "../data"
    train_name: str = "quickstart_train.csv"

    seed: int = 2024
cfg = CFG()

In [6]:
def seed_everything(seed=2024):
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True


seed_everything(cfg.seed)

# 3️⃣ Начинаем эксперимент

In [None]:
task = Task.init(
    project_name=cfg.project_name,
    task_name=cfg.experiment_name
);
task.add_tags([...]) # Добавьте тэги обучения

In [None]:
# Добавить конфиг запуска
task.connect(
    ...
)

# 4️⃣ Подгружаем данные

In [7]:
folder_path = Path(cfg.data_path)

In [8]:
rides_info = pd.read_csv(str(folder_path / cfg.train_name))

## Препроцессинг

In [9]:
rides_info.head()

Unnamed: 0,car_id,model,car_type,fuel_type,car_rating,year_to_start,riders,year_to_work,target_reg,target_class,mean_rating,distance_sum,rating_min,speed_max,user_ride_quality_median,deviation_normal_count,user_uniq
0,y13744087j,Kia Rio X-line,economy,petrol,3.78,2015,76163,2021,109.99,another_bug,4.737759,12141310.0,0.1,180.855726,0.023174,174,170
1,O41613818T,VW Polo VI,economy,petrol,3.9,2015,78218,2021,34.48,electro_bug,4.480517,18039090.0,0.0,187.862734,12.306011,174,174
2,d-2109686j,Renault Sandero,standart,petrol,6.3,2012,23340,2017,34.93,gear_stick,4.768391,15883660.0,0.1,102.382857,2.513319,174,173
3,u29695600e,Mercedes-Benz GLC,business,petrol,4.04,2011,1263,2020,32.22,engine_fuel,3.88092,16518830.0,0.1,172.793237,-5.029476,174,170
4,N-8915870N,Renault Sandero,standart,petrol,4.7,2012,26428,2017,27.51,engine_fuel,4.181149,13983170.0,0.1,203.462289,-14.260456,174,171


In [10]:
cat_features = ["model", "car_type", "fuel_type"]  # Выделяем категориальные признаки
targets = ["target_class", "target_reg"]
features2drop = ["car_id"]  # эти фичи будут удалены

# Отбираем итоговый набор признаков для использования моделью
filtered_features = [i for i in rides_info.columns if (i not in targets and i not in features2drop)]
num_features = [i for i in filtered_features if i not in cat_features]

print("cat_features", cat_features)
print("num_features", len(num_features))
print("targets", targets)

for c in cat_features:  # Избавлеямся от NaN'ов
    rides_info[c] = rides_info[c].astype(str)

cat_features ['model', 'car_type', 'fuel_type']
num_features 11
targets ['target_class', 'target_reg']


In [11]:
train, test = train_test_split(rides_info, test_size=0.2, random_state=cfg.seed)

In [None]:
# Залогируйте только валидационную выборку!

logger.report_table(
    ...
)

In [12]:
X_train = train[filtered_features].drop(targets, axis=1, errors="ignore")
y_train_cls = train["target_class"]
y_train_reg = train["target_reg"]

X_test = test[filtered_features].drop(targets, axis=1, errors="ignore")
y_test_cls = test["target_class"]
y_test_reg = test["target_reg"]

# 5️⃣ Обучаем модельку

In [13]:
catboost_hyp_cls = {
        "depth": 4,
        "iterations": 3500,
        "learning_rate": 0.06,
        "loss_function": "MultiClass",  # MultiLogloss
        # eval_metric = 'Precision',  F1:macro / AUC:hints=skip_train~false
        "custom_metric": ["Recall"],  # 'AUC / Accuracy,
        
        # Главная фишка катбуста - работа с категориальными признаками
        "cat_features": cat_features,
        # ignored_features = ignored_features,
        
        # Регуляризация и ускорение
        "colsample_bylevel": 0.098,
        "subsample": 0.95,
        "l2_leaf_reg": 9,
        "min_data_in_leaf": 243,
        "max_bin": 187,
        "random_strength": 1,
        
        # Параметры скорения
        "task_type": "CPU",    
        "thread_count": -1,
        "bootstrap_type": "Bernoulli", 
        
        # Важное!
        "random_seed": cfg.seed,
        "auto_class_weights": "SqrtBalanced",
        "early_stopping_rounds": 50
}

catboost_hyp_reg = {
        "random_seed": cfg.seed,
        "thread_count": -1,
        "cat_features": cat_features,
        "eval_metric": "RMSE"
}

залогируйте параметры CatBoost

Логирование CatBoost в ClearML - https://clear.ml/docs/latest/docs/guides/frameworks/catboost/

In [14]:
clf = CatBoostClassifier(**catboost_hyp_cls)
reg = CatBoostRegressor(**catboost_hyp_reg)

In [15]:
clf.fit(X_train, y_train_cls, eval_set=(X_test, y_test_cls))

0:	learn: 2.1695430	test: 2.1730530	best: 2.1730530 (0)	total: 51.1ms	remaining: 2m 58s
1:	learn: 2.1692456	test: 2.1728220	best: 2.1728220 (1)	total: 52.8ms	remaining: 1m 32s
2:	learn: 2.1643163	test: 2.1694042	best: 2.1694042 (2)	total: 54.5ms	remaining: 1m 3s
3:	learn: 2.1640750	test: 2.1692212	best: 2.1692212 (3)	total: 56.4ms	remaining: 49.3s
4:	learn: 2.1114799	test: 2.1187678	best: 2.1187678 (4)	total: 61ms	remaining: 42.6s
5:	learn: 2.1112717	test: 2.1186031	best: 2.1186031 (5)	total: 62.6ms	remaining: 36.4s
6:	learn: 2.0740845	test: 2.0824182	best: 2.0824182 (6)	total: 66ms	remaining: 32.9s
7:	learn: 2.0694989	test: 2.0789473	best: 2.0789473 (7)	total: 68.5ms	remaining: 29.9s
8:	learn: 2.0693512	test: 2.0788356	best: 2.0788356 (8)	total: 70ms	remaining: 27.2s
9:	learn: 1.9978023	test: 2.0039531	best: 2.0039531 (9)	total: 72.8ms	remaining: 25.4s
10:	learn: 1.9266825	test: 1.9296427	best: 1.9296427 (10)	total: 75.4ms	remaining: 23.9s
11:	learn: 1.9022030	test: 1.9064657	best: 1.

156:	learn: 0.8186087	test: 0.8500628	best: 0.8500628 (156)	total: 410ms	remaining: 8.73s
157:	learn: 0.8122412	test: 0.8439827	best: 0.8439827 (157)	total: 412ms	remaining: 8.71s
158:	learn: 0.8122392	test: 0.8439784	best: 0.8439784 (158)	total: 414ms	remaining: 8.7s
159:	learn: 0.8122375	test: 0.8439744	best: 0.8439744 (159)	total: 415ms	remaining: 8.67s
160:	learn: 0.8122359	test: 0.8439708	best: 0.8439708 (160)	total: 417ms	remaining: 8.64s
161:	learn: 0.8121821	test: 0.8440167	best: 0.8439708 (160)	total: 419ms	remaining: 8.62s
162:	learn: 0.8121808	test: 0.8440135	best: 0.8439708 (160)	total: 420ms	remaining: 8.61s
163:	learn: 0.8120535	test: 0.8439976	best: 0.8439708 (160)	total: 422ms	remaining: 8.59s
164:	learn: 0.8120524	test: 0.8439952	best: 0.8439708 (160)	total: 424ms	remaining: 8.58s
165:	learn: 0.8120515	test: 0.8439929	best: 0.8439708 (160)	total: 426ms	remaining: 8.56s
166:	learn: 0.8120506	test: 0.8439909	best: 0.8439708 (160)	total: 427ms	remaining: 8.53s
167:	learn:

<catboost.core.CatBoostClassifier at 0x7f37e8f02cc0>

In [16]:
reg.fit(X_train, y_train_reg, eval_set=(X_test, y_test_reg))

Learning rate set to 0.056174
0:	learn: 16.9453408	test: 18.3830397	best: 18.3830397 (0)	total: 3.22ms	remaining: 3.22s
1:	learn: 16.5703868	test: 18.0155586	best: 18.0155586 (1)	total: 5.13ms	remaining: 2.56s
2:	learn: 16.1460899	test: 17.5746783	best: 17.5746783 (2)	total: 7.07ms	remaining: 2.35s
3:	learn: 15.7605002	test: 17.1470033	best: 17.1470033 (3)	total: 10.1ms	remaining: 2.51s
4:	learn: 15.4749140	test: 16.8383650	best: 16.8383650 (4)	total: 11.7ms	remaining: 2.33s
5:	learn: 15.1555832	test: 16.5012391	best: 16.5012391 (5)	total: 14.2ms	remaining: 2.35s
6:	learn: 14.8746574	test: 16.1997367	best: 16.1997367 (6)	total: 16.3ms	remaining: 2.31s
7:	learn: 14.6302282	test: 15.9337793	best: 15.9337793 (7)	total: 18.4ms	remaining: 2.28s
8:	learn: 14.3535353	test: 15.6297373	best: 15.6297373 (8)	total: 20.1ms	remaining: 2.21s
9:	learn: 14.1458278	test: 15.4141911	best: 15.4141911 (9)	total: 21.9ms	remaining: 2.17s
10:	learn: 13.9047295	test: 15.1560997	best: 15.1560997 (10)	total: 23

<catboost.core.CatBoostRegressor at 0x7f37e8fa8f50>

# Метрики на тесте

In [17]:
y_pred_cls = clf.predict(X_test)
y_pred_reg = reg.predict(X_test)

## Классификация

In [18]:
cls_report = classification_report(y_test_cls, y_pred_cls, target_names=y_test_cls.unique(), output_dict=True)

In [19]:
cls_report = pd.DataFrame(cls_report).T
cls_report

Unnamed: 0,precision,recall,f1-score,support
electro_bug,0.948276,0.916667,0.932203,60.0
engine_ignition,1.0,1.0,1.0,55.0
another_bug,1.0,1.0,1.0,52.0
gear_stick,0.701493,0.903846,0.789916,52.0
engine_overheat,0.756098,0.62,0.681319,50.0
engine_fuel,0.565217,0.481481,0.52,54.0
break_bug,0.492537,0.611111,0.545455,54.0
engine_check,0.981132,0.912281,0.945455,57.0
wheel_shake,0.827586,0.705882,0.761905,34.0
accuracy,0.801282,0.801282,0.801282,0.801282


## Регрессия

In [26]:
def regression_report(y_true, y_pred):
    # Вычисление метрик
    r2 = r2_score(y_true, y_pred)
    rmse = root_mean_squared_error(y_true, y_pred)
    
    # Создание словаря для хранения метрик
    metrics = {
        'R2 score': [r2],
        'RMSE': [rmse]
    }
    
    # Преобразование словаря в DataFrame
    report_df = pd.DataFrame(metrics).T
    
    return report_df

In [27]:
regression_report(y_test_reg, y_pred_reg)

Unnamed: 0,0
R2 score,0.606864
RMSE,11.766568


# 6️⃣ Сохраняем результаты в ClearML 