# CatBoost Hyperparameter Tuning

## Libraries

In [2]:
import os
# Switch the process working directory to the sibling "input" folder. Every
# relative path used after this point (e.g. the CSV read further down) is
# resolved against it, so this cell must run before any data is loaded.
os.chdir("../input")

import numpy as np
import pandas as pd
import optuna

from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import roc_auc_score, accuracy_score
from catboost import CatBoostClassifier, Pool

## Helper Functions

In [None]:
def objective(trial, n_splits=5, early_stopping_rounds=50, cv_seed=42):
    """Optuna objective: mean stratified k-fold accuracy of a CatBoost classifier.

    Reads the notebook-level globals ``X`` (features, indexed with ``.iloc``)
    and ``y`` (target, indexed with ``.iloc``), so the data-preparation cell
    must have been executed before a study is optimised.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the CatBoost hyperparameters.
    n_splits : int, default 5
        Number of stratified folds.
    early_stopping_rounds : int, default 50
        Early-stopping patience passed to ``CatBoostClassifier.fit``.
    cv_seed : int, default 42
        ``random_state`` of the shuffled ``StratifiedKFold`` split, so every
        trial is scored on the same folds.

    Returns
    -------
    float
        Accuracy averaged over the validation folds (the study maximises it).

    Notes
    -----
    * Each validation fold serves both as the early-stopping ``eval_set`` and
      as the data the accuracy is computed on, so the returned score is
      likely optimistic.
      NOTE(review): consider a separate early-stopping split.
    * The mean number of trees in the fitted fold models is stored on the
      trial as the user attribute ``"mean_tree_count"``; with early stopping
      it can be lower than the suggested ``iterations``.
    """
    params = {
        "iterations": trial.suggest_int("iterations", 100, 1000),
        "depth": trial.suggest_int("depth", 4, 10),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.3, log=True),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-3, 10, log=True),
        "border_count": trial.suggest_int("border_count", 32, 255),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0.0, 1.0),
        "random_strength": trial.suggest_float("random_strength", 1e-3, 10, log=True),
        "verbose": 0,
    }

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=cv_seed)

    scores = []
    tree_counts = []
    for train_idx, valid_idx in skf.split(X, y):
        X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

        train_pool = Pool(X_train, y_train)
        valid_pool = Pool(X_valid, y_valid)

        model = CatBoostClassifier(**params)
        model.fit(
            train_pool,
            eval_set=valid_pool,
            early_stopping_rounds=early_stopping_rounds,
            verbose=0,
        )

        y_pred = model.predict(X_valid)
        scores.append(accuracy_score(y_valid, y_pred))
        # Size of the model that was actually scored on this fold.
        tree_counts.append(model.tree_count_)

    # Kept as a diagnostic only; it does not influence the optimised value.
    trial.set_user_attr("mean_tree_count", float(sum(tree_counts) / len(tree_counts)))

    return sum(scores) / len(scores)

## Reading & Preparing Data

In [4]:
# Relative path: resolved against the working directory selected by the
# os.chdir("../input") call in the imports cell.
df = pd.read_csv(filepath_or_buffer="heart.csv")

In [5]:
# "output" is the column being predicted: it becomes the target vector and
# all remaining columns form the feature matrix.
y = df["output"]
X = df.drop(columns=["output"])

## Hyperparameter Tuning

In [9]:
# Seed the sampler explicitly: without a seed the TPE search proposes a
# different sequence of hyperparameters on every run, so the reported best
# trial cannot be reproduced after a kernel restart.
sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(direction="maximize", sampler=sampler)
# Trials run sequentially (default n_jobs), which keeps the seeded search
# order deterministic.
study.optimize(objective, n_trials=100)

[I 2024-12-01 21:31:53,349] A new study created in memory with name: no-name-c5ea8e27-8ada-410d-a53f-d7a3ff7d28d3
[I 2024-12-01 21:31:53,555] Trial 0 finished with value: 0.8018579234972677 and parameters: {'iterations': 779, 'depth': 6, 'learning_rate': 0.052033494151578825, 'l2_leaf_reg': 0.02254016530329813, 'border_count': 116, 'bagging_temperature': 0.6327781136649759, 'random_strength': 0.1624572895216702}. Best is trial 0 with value: 0.8018579234972677.
[I 2024-12-01 21:31:53,936] Trial 1 finished with value: 0.8216939890710382 and parameters: {'iterations': 222, 'depth': 4, 'learning_rate': 0.011890359808222376, 'l2_leaf_reg': 0.0058418676890383886, 'border_count': 234, 'bagging_temperature': 0.8079029481311919, 'random_strength': 0.039918846445960714}. Best is trial 1 with value: 0.8216939890710382.
[I 2024-12-01 21:31:54,112] Trial 2 finished with value: 0.811639344262295 and parameters: {'iterations': 463, 'depth': 4, 'learning_rate': 0.08959605105067305, 'l2_leaf_reg': 0.01

In [10]:
# Fetch the winning trial once and report its hyperparameters and its
# cross-validated accuracy.
best_trial = study.best_trial
print("Best parameters:", best_trial.params)
print("Best CV accuracy:", best_trial.value)

Best parameters: {'iterations': 540, 'depth': 7, 'learning_rate': 0.04200623759811386, 'l2_leaf_reg': 0.028278249345751846, 'border_count': 90, 'bagging_temperature': 0.34945523906714304, 'random_strength': 7.170576274930001}
Best CV accuracy: 0.8581420765027323


In [11]:
# Refit on the full dataset using the tuned hyperparameters, with CatBoost's
# training log silenced.
# NOTE(review): this fit has no eval_set / early stopping, whereas the
# cross-validation fits in `objective` used early_stopping_rounds=50, so the
# final model may contain more trees than the models that produced the CV score.
best_params = {**study.best_params, "verbose": 0}
final_model = CatBoostClassifier(**best_params)
final_model.fit(X, y)

<catboost.core.CatBoostClassifier at 0x15690ac00>