# Optuna for CatBoostClassifier

Import libraries

In [None]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import pickle
import optuna
import catboost as cat
from catboost import Pool
from sklearn.metrics import mean_squared_error

# Optuna + CV + custom loss

In [None]:
class CFG:
    """Run-wide constants for the hyperparameter search."""

    # Cross-validation layout: catboost_opt repeats the K-fold split
    # `n_repeats` times with `n_folds` folds per repeat.
    n_folds = 2
    n_repeats = 2

    # Not referenced in the visible code — presumably consumed by other cells.
    seeds = [1, 42, 228, 265, 21, 8081988, 5062023, 666, 1488]
    num_boost_round = 10_000

def balanced_log_loss(y_true, y_pred):
    """Return the class-balanced binary log loss.

    Each class contributes the mean negative log-likelihood of its own
    observations, and the two per-class means are averaged, so both classes
    weigh equally regardless of how many samples each has.

    Args:
        y_true: Array-like of 0/1 labels supporting element-wise arithmetic
            (e.g. a NumPy array or pandas Series).
        y_pred: Array-like of predicted probabilities for class 1, aligned
            with ``y_true``.

    Returns:
        The balanced log loss. No guard is applied when a class is absent
        from ``y_true``: the corresponding ``1 / count`` divides by zero.
    """
    # Per-class observation counts.
    n_pos = np.sum(y_true == 1, axis=0)
    n_neg = np.sum(y_true == 0, axis=0)

    # Keep probabilities inside [1e-15, 1 - 1e-15] so log() never sees 0.
    probs = np.clip(y_pred, 1e-15, 1 - 1e-15)

    # Summed log-likelihood of each class under the predicted probabilities.
    neg_log_lik = np.sum((1 - y_true) * np.log(1 - probs))
    pos_log_lik = np.sum(y_true * np.log(probs))

    # Average within each class, then across the two classes.
    neg_part = (1 / n_neg) * neg_log_lik
    pos_part = (1 / n_pos) * pos_log_lik
    return (-neg_part - pos_part) / 2

def catboost_opt(features, n_trials, task_type='GPU'):
    """Tune CatBoostClassifier hyperparameters with Optuna and repeated CV.

    Every trial samples one hyperparameter set, trains a model on each fold of
    ``CFG.n_repeats`` repetitions of a ``CFG.n_folds``-fold split, and is
    scored by the mean ``balanced_log_loss`` over all folds (lower is better).
    After the search, the best trial is printed and all trials are written to
    ``optuna_catboost_fold_.csv``.

    Relies on names that are not defined in this function: the module-level
    DataFrames ``train_df`` (must contain ``features`` and a ``Class`` column)
    and ``greeks`` (columns 1-2 are used as stratification labels), plus
    ``CFG``, ``balanced_log_loss`` and ``MultilabelStratifiedKFold``.
    NOTE(review): ``MultilabelStratifiedKFold`` is not imported in the visible
    source — presumably ``iterstrat.ml_stratifiers``; confirm it is imported.

    Args:
        features: Column labels of ``train_df`` to use as model inputs.
        n_trials: Number of Optuna trials to run.
        task_type: CatBoost processing unit, ``'GPU'`` (default, as before)
            or ``'CPU'``.

    Returns:
        The finished ``optuna.Study`` (previously nothing was returned).
    """
    X, y = train_df[features], train_df.Class
    # 'EJ' is the only categorical column; declare it only when it is selected
    # so that feature subsets without it do not make Pool() fail.
    cat_features = [col for col in ['EJ'] if col in features]

    def objective(trial):
        """Return the mean balanced log loss over all repeats and folds."""
        bll_list = []

        grow_policy = trial.suggest_categorical(
            'grow_policy', ['SymmetricTree', 'Depthwise', 'Lossguide'])
        bootstrap_type = trial.suggest_categorical(
            'bootstrap_type', ['Bayesian', 'Bernoulli'])

        params = {
            'auto_class_weights': 'Balanced',
            'task_type': task_type,
            'eval_metric': 'Logloss',
            'loss_function': 'Logloss',
            'random_seed': 10062023,
            'od_type': 'Iter',      # Overfitting detector type: stop after k iterations
            'od_wait': 100,         # ... without improvement of the eval metric
            'metric_period': 100,   # Log the metric every k iterations
            'grow_policy': grow_policy,
            'bootstrap_type': bootstrap_type,
            # Hyperparameters (roughly in order of decreasing importance)
            'iterations': 3000,
            'learning_rate': trial.suggest_float('learning_rate', 1e-3, 3e-1, log=True),
            'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-8, 100, log=True),
            # Decrease to fight overfitting
            'depth': trial.suggest_int('depth', 4, 10),  # Max tree depth
            # Increase to fight overfitting: amount of randomness used when
            # scoring candidate splits
            'random_strength': trial.suggest_int('random_strength', 0, 100),
            # Number of splits for numerical features (alias: max_bin); larger
            # is more precise but slower. Can be raised per feature, e.g.
            # per_float_feature_quantization='0:border_count=1024'.
            # suggest_categorical needs a sequence of choices, not a bare int.
            'border_count': trial.suggest_categorical('border_count', [254]),
        }

        # The options below are sampled conditionally because CatBoost only
        # accepts them in specific configurations (see the CatBoost training
        # parameter reference).
        if grow_policy == 'SymmetricTree':
            params['boosting_type'] = trial.suggest_categorical(
                'boosting_type', ['Ordered', 'Plain'])
        else:
            params['boosting_type'] = 'Plain'
            # Minimal number of samples in a leaf (alias: min_child_samples);
            # only valid for the Depthwise and Lossguide policies.
            params['min_data_in_leaf'] = trial.suggest_int('min_data_in_leaf', 5, 100)

        if grow_policy == 'Lossguide':
            # Max number of leaves per tree; only valid for Lossguide. The
            # Optuna name 'num_leaves' is kept (it is CatBoost's alias).
            params['max_leaves'] = trial.suggest_int('num_leaves', 4, 128)

        if bootstrap_type == 'Bayesian':
            # Random object weights; 0 is a valid value, so the range cannot
            # be sampled on a log scale.
            params['bagging_temperature'] = trial.suggest_float(
                'bagging_temperature', 0, 100)
        else:
            # Fraction of rows sampled (without replacement) for each tree.
            params['subsample'] = trial.suggest_float('subsample', 0.3, 0.7)

        if task_type != 'GPU':
            # Fraction of features considered at each split (alias: rsm).
            # Skipped on GPU, where CatBoost supports it for pairwise modes
            # only. Uses its own Optuna name so it is sampled independently
            # of 'subsample'.
            params['colsample_bylevel'] = trial.suggest_float(
                'colsample_bylevel', 0.3, 0.7)

        for i in range(CFG.n_repeats):
            print(f'Repeat #{i+1}')

            kf = MultilabelStratifiedKFold(
                n_splits=CFG.n_folds, shuffle=True, random_state=8062023 + i)

            # Stratify on columns 1-2 of `greeks`.
            # NOTE(review): the original comment said "Class and Alpha" —
            # confirm which labels these two columns actually hold.
            for train_idx, val_idx in kf.split(X=X, y=greeks.iloc[:, 1:3]):
                # Split the dataset according to the fold indexes.
                X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
                y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

                train_pool = Pool(X_train, y_train, cat_features=cat_features)
                val_pool = Pool(X_val, y_val, cat_features=cat_features)

                # Fit with the validation fold as eval set (drives the
                # overfitting detector), then score its class-1 probabilities.
                model = cat.CatBoostClassifier(**params)
                model.fit(train_pool, eval_set=val_pool)
                preds = model.predict_proba(val_pool)[:, 1]
                bll_list.append(balanced_log_loss(y_val, preds))

        return np.mean(bll_list)

    # NOTE(review): the objective never calls trial.report(), so the median
    # pruner has no intermediate values to act on — confirm this is intended.
    study = optuna.create_study(
        pruner=optuna.pruners.MedianPruner(n_warmup_steps=10), direction="minimize")
    study.optimize(objective, n_trials=n_trials)

    print("Number of finished trials: {}".format(len(study.trials)))

    print("Best trial:")
    trial = study.best_trial

    print("  Value: {}".format(trial.value))

    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

    # Persist every trial; the returned study lets the caller inspect or sort
    # the results (a bare display expression does nothing inside a function).
    df = study.trials_dataframe()
    df.to_csv('optuna_catboost_fold_.csv')

    return study
            
# Run the hyperparameter search for 2000 trials.
# NOTE(review): `features` is not defined in the visible source — presumably
# the list of model input columns; confirm it is set before this cell runs.
catboost_opt(features, n_trials=2000)

Launch Optuna study

In [None]:
# NOTE(review): in the visible source `objective` exists only as a nested
# function inside catboost_opt(), so this cell needs a module-level
# `objective` to run — confirm where it is meant to come from.
study = optuna.create_study(direction='minimize')
# Time-boxed search; raise `timeout` to let the optimization run longer.
study.optimize(objective, timeout=3600)

print(f"Number of finished trials: {len(study.trials)}")

print("Best trial:")
trial = study.best_trial

print(f"  Value: {trial.value}")

print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

Create a dataframe from the study and select the columns with the necessary parameters

In [None]:
# One row per trial; as the last expression of the cell, the sorted view is
# displayed by the notebook. It keeps column 1 plus columns 5-13, best
# (lowest) objective value first.
df = study.trials_dataframe()
df.sort_values('value').iloc[:, [1, *range(5, 14)]]

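For Better Accuracy (general gradient-boosting tuning tips; the parameter names in the lists below follow LightGBM's naming, so map them to the CatBoost equivalents, e.g. max_bin → border_count, num_leaves → max_leaves):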

- Use large max_bin (may be slower)

- Use small learning_rate with large num_iterations

- Use large num_leaves (may cause over-fitting)

- Use bigger training data

- Try dart

Deal with Over-fitting:

- Use small max_bin

- Use small num_leaves

- Use min_data_in_leaf and min_sum_hessian_in_leaf

- Use bagging by setting bagging_fraction and bagging_freq

- Use feature sub-sampling by setting feature_fraction

- Use bigger training data

- Try lambda_l1, lambda_l2 and min_gain_to_split for regularization

- Try max_depth to avoid growing deep tree

- Try extra_trees

- Try increasing path_smooth