# CatBoost HyperParameter Tuning

## Libraries

In [4]:
import os
os.chdir("../input")

import numpy as np
import pandas as pd
import optuna

from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import roc_auc_score, accuracy_score
from catboost import CatBoostClassifier, Pool

## Helper Functions

In [5]:
def objective(trial):
    params = {
        "iterations": trial.suggest_int("iterations", 100, 1000),
        "depth": trial.suggest_int("depth", 4, 10),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.3, log=True),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-3, 10, log=True),
        "border_count": trial.suggest_int("border_count", 32, 255),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0.0, 1.0),
        "random_strength": trial.suggest_float("random_strength", 1e-3, 10, log=True),
        "verbose": 0,
    }

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    scores = []
    for train_idx, valid_idx in skf.split(X, y):
        X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

        train_pool = Pool(X_train, y_train)
        valid_pool = Pool(X_valid, y_valid)

        model = CatBoostClassifier(**params)
        model.fit(train_pool, eval_set=valid_pool, early_stopping_rounds=50, verbose=0)

        y_pred = model.predict(X_valid)
        scores.append(accuracy_score(y_valid, y_pred))

    return sum(scores) / len(scores)

## Reading & Preparing Data

In [8]:
df = pd.read_csv("transformed_heart.csv")

In [9]:
X = df.drop("output", axis=1)
y = df["output"]

## Hyperparameter Tuning

In [10]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100)

[I 2024-12-06 15:57:38,538] A new study created in memory with name: no-name-7351c396-e5d5-4902-82dd-f507342b10d3


[I 2024-12-06 15:57:38,813] Trial 0 finished with value: 0.801967213114754 and parameters: {'iterations': 709, 'depth': 6, 'learning_rate': 0.22851697946497362, 'l2_leaf_reg': 0.07330450826392484, 'border_count': 212, 'bagging_temperature': 0.09850161673891267, 'random_strength': 0.0027769701255086613}. Best is trial 0 with value: 0.801967213114754.
[I 2024-12-06 15:57:39,345] Trial 1 finished with value: 0.8348087431693989 and parameters: {'iterations': 260, 'depth': 4, 'learning_rate': 0.0035178157792476153, 'l2_leaf_reg': 0.017744795178910915, 'border_count': 216, 'bagging_temperature': 0.22365732362209578, 'random_strength': 0.8069595924580716}. Best is trial 1 with value: 0.8348087431693989.
[I 2024-12-06 15:57:40,618] Trial 2 finished with value: 0.8051366120218579 and parameters: {'iterations': 381, 'depth': 7, 'learning_rate': 0.0029969349134170774, 'l2_leaf_reg': 2.792847320958256, 'border_count': 227, 'bagging_temperature': 0.37734276156055035, 'random_strength': 0.2336872887

In [11]:
print("Best parameters:", study.best_params)
print("Best CV accuracy:", study.best_value)

Best parameters: {'iterations': 717, 'depth': 4, 'learning_rate': 0.05183212157953212, 'l2_leaf_reg': 2.050905423160657, 'border_count': 255, 'bagging_temperature': 0.8289890486077406, 'random_strength': 5.4192370836425425}
Best CV accuracy: 0.8613661202185792


In [12]:
best_params = study.best_params
best_params["verbose"] = 0
final_model = CatBoostClassifier(**best_params)
final_model.fit(X, y)

<catboost.core.CatBoostClassifier at 0x150068d40>