# Optuna for CatBoostRegressor

Import libraries

In [None]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import pickle
import optuna
import catboost as cat
from catboost import Pool
from sklearn.metrics import mean_squared_error

Prepare data

In [None]:
# Load the feature matrix checkpoint.
# NOTE: unpickling can execute arbitrary code - only load a checkpoint file
# that comes from a trusted source.
matrix = pd.read_pickle("checkpoint_final.pkl")

# Downcast the float32 columns to float16 to reduce RAM usage
floatcols = [c for c in matrix.columns if matrix[c].dtype == "float32"]
matrix[floatcols] = matrix[floatcols].astype("float16")

# Clip the target to the [0, 20] range
matrix['item_cnt_month'] = matrix['item_cnt_month'].clip(0, 20)

keep_from_month = 2  # The first couple of months are dropped because of distortions to their features (e.g. wrong item age)
val_month = 33
test_month = 34

dropcols = [
    "shop_id",
    "item_id",
    "new_item",
]  # The features are dropped to reduce overfitting

categoricals = [
    "item_category_id",
    "month",
]
matrix[categoricals] = matrix[categoricals].astype("category")

# Drop the unwanted columns once and reuse the result for every split,
# instead of copying the whole matrix separately for train, val and test.
matrix = matrix.drop(columns=dropcols)
months = matrix.date_block_num

# Train: months in [keep_from_month, val_month); val / test: one month each.
train = matrix.loc[(months >= keep_from_month) & (months < val_month), :]
val = matrix.loc[months == val_month, :]
test = matrix.loc[months == test_month, :]

X_train = train.drop(columns="item_cnt_month")
y_train = train.item_cnt_month
X_val = val.drop(columns="item_cnt_month")
y_val = val.item_cnt_month
X_test = test.drop(columns="item_cnt_month")

# Free the intermediate frames; only the X_* / y_* splits (and val) are kept.
del matrix, train, test, months

Set objective function

In [None]:
def objective(trial):
    """Train one CatBoostRegressor with trial-sampled parameters and score it.

    Reads the module-level ``X_train``, ``y_train``, ``X_val``, ``y_val`` and
    ``categoricals`` prepared earlier in the notebook.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.

    Returns:
        RMSE of the fitted model's predictions on the validation split.
    """
    train_pool = Pool(X_train, y_train, cat_features=categoricals)
    val_pool = Pool(X_val, y_val, cat_features=categoricals)

    # Parameters
    params = {
        'loss_function': 'RMSE',
        'eval_metric': 'RMSE',
        'task_type': 'GPU',
        'random_seed': 42,
        'od_type': 'Iter',  # Type of overfitting detector - stop after k iterations
        'od_wait': 30,  # Overfitting detector - stop training after k iterations without metric improvement
        'metric_period': 100,  # Show metric each k iterations
        'iterations': trial.suggest_int('iterations', 300, 1200),
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-8, 100, log=True),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 3e-1, log=True),
        'grow_policy': trial.suggest_categorical('grow_policy', ['SymmetricTree', 'Depthwise', 'Lossguide']),
        'depth': trial.suggest_int('depth', 4, 10),  # Max tree depth
        'random_strength': trial.suggest_int('random_strength', 0, 100),  # The amount of randomness to use
                                                                          # for scoring splits when the tree structure
                                                                          # is selected. Helps to avoid overfitting
        'max_bin': trial.suggest_categorical('max_bin', [2, 3, 4, 5, 10, 20, 32, 64]),  # The number of splits for
                                                                                        # numerical features

        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.01, 100.00, log=True),  # Assigns random
                                                                                                    # weights to objects
    }

    # The boosting type is only tuned for symmetric trees; the other grow
    # policies are always trained with 'Plain' boosting.
    if params['grow_policy'] == 'SymmetricTree':
        params['boosting_type'] = trial.suggest_categorical('boosting_type', ['Ordered', 'Plain'])
    else:
        params['boosting_type'] = 'Plain'

    # Learning
    model = cat.CatBoostRegressor(**params)
    # The overfitting detector configured above (od_type / od_wait) needs a
    # validation set to monitor, so the validation pool is passed as eval_set.
    model.fit(train_pool, eval_set=val_pool)

    # Predict
    preds = model.predict(val_pool)

    # Evaluation: RMSE computed as sqrt(MSE) so the code does not depend on the
    # `squared=False` argument, which newer scikit-learn releases dropped.
    rmse = float(np.sqrt(mean_squared_error(y_val, preds)))
    return rmse

Launch Optuna study

In [None]:
# Minimise the validation RMSE returned by `objective`.
study = optuna.create_study(direction='minimize')
# New trials are started until the time budget (in seconds) is used up;
# raise `timeout` to make the optimization process longer.
study.optimize(objective, timeout=3600)

# Summary of the search
print("Number of finished trials: {}".format(len(study.trials)))

# Details of the best-scoring trial
trial = study.best_trial
print("Best trial:")
print("  Value: {}".format(trial.value))
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

Create a dataframe from the study and select the columns with the necessary parameters

In [None]:
df = study.trials_dataframe()
# Show the objective value next to every sampled hyper-parameter, best trial
# first. The columns are picked by name ('value' and the 'params_*' columns)
# rather than by position, so the selection stays correct when the number of
# parameter columns changes (e.g. 'boosting_type' was never sampled).
df.sort_values('value').loc[:, ['value'] + [c for c in df.columns if c.startswith('params_')]]

# Optuna for LightGBMRegressor

Import libraries

In [None]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import pickle
import optuna
import lightgbm as lgb

from sklearn.metrics import mean_squared_error

Prepare data

In [None]:
# Load the feature matrix checkpoint.
# NOTE: unpickling can execute arbitrary code - only load a checkpoint file
# that comes from a trusted source.
matrix = pd.read_pickle("checkpoint_final.pkl")

# Downcast the float32 columns to float16 to reduce RAM usage
floatcols = [c for c in matrix.columns if matrix[c].dtype == "float32"]
matrix[floatcols] = matrix[floatcols].astype("float16")

# Clip the target to the [0, 20] range
matrix['item_cnt_month'] = matrix['item_cnt_month'].clip(0, 20)

keep_from_month = 2  # The first couple of months are dropped because of distortions to their features (e.g. wrong item age)
val_month = 33
test_month = 34

dropcols = [
    "shop_id",
    "item_id",
    "new_item",
]  # The features are dropped to reduce overfitting

categoricals = [
    "item_category_id",
    "month",
]
matrix[categoricals] = matrix[categoricals].astype("category")

# Drop the unwanted columns once and reuse the result for every split,
# instead of copying the whole matrix separately for train, val and test.
matrix = matrix.drop(columns=dropcols)
months = matrix.date_block_num

# Train: months in [keep_from_month, val_month); val / test: one month each.
train = matrix.loc[(months >= keep_from_month) & (months < val_month), :]
val = matrix.loc[months == val_month, :]
test = matrix.loc[months == test_month, :]

X_train = train.drop(columns="item_cnt_month")
y_train = train.item_cnt_month
X_val = val.drop(columns="item_cnt_month")
y_val = val.item_cnt_month
X_test = test.drop(columns="item_cnt_month")

# Free the intermediate frames; only the X_* / y_* splits (and val) are kept.
del matrix, train, test, months

For Better Accuracy:

- Use large max_bin (may be slower)

- Use small learning_rate with large num_iterations

- Use large num_leaves (may cause over-fitting)

- Use bigger training data

- Try dart

Deal with Over-fitting:

- Use small max_bin

- Use small num_leaves

- Use min_data_in_leaf and min_sum_hessian_in_leaf

- Use bagging by setting bagging_fraction and bagging_freq

- Use feature sub-sampling by setting feature_fraction

- Use bigger training data

- Try lambda_l1, lambda_l2 and min_gain_to_split for regularization

- Try max_depth to avoid growing deep tree

- Try extra_trees

- Try increasing path_smooth

In [None]:
import warnings
warnings.filterwarnings("ignore", module="lightgbm")
from sklearn.metrics import mean_squared_error

# FYI: Objective functions can take additional arguments
# (https://optuna.readthedocs.io/en/stable/faq.html#objective-func-additional-args).
def objective(trial):
    """Train one LightGBM model with trial-sampled parameters and score it.

    Reads the module-level ``X_train``, ``y_train``, ``X_val`` and ``y_val``
    prepared earlier in the notebook.

    Args:
        trial: Optuna trial used to sample the hyper-parameters and to report
            intermediate validation scores for pruning.

    Returns:
        RMSE of the fitted model's predictions on the validation split.
    """
    dtrain = lgb.Dataset(X_train, label=y_train)
    # Tie the validation set to the training set so both share the same binning.
    dvalid = lgb.Dataset(X_val, label=y_val, reference=dtrain)

    param = {
        'device': 'gpu',
        'gpu_platform_id': 0,
        'gpu_device_id': 0,
        'objective': 'regression',
        'metric': 'rmse',
        'verbosity': 0,
        'boosting_type': 'gbdt',  # other options: rf, dart, goss
        'force_col_wise': False,  # Use only with CPU devices

        'subsample_for_bin': 300000,  # Number of data that sampled to construct feature discrete bins; setting this
                                      # to larger value will give better training result but may increase train time
        'n_estimators': trial.suggest_int('n_estimators', 500, 1200),
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 3e-1, log=True),
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),  # Max number of leaves in one tree
        'max_bin': trial.suggest_int('max_bin', 32, 255),  # Max number of bins that feature values will be
                                                           # bucketed in. small number of bins may reduce training
                                                           # accuracy but may deal with overfitting
        'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),  # Randomly select a subset of features
                                                                                # if feature_fraction < 1.0
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),  # Randomly select part of data without
                                                                                # resampling if bagging_fraction < 1.0
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),  # Perform bagging at every k iteration
        # The trial records this value under the alias name 'min_child_samples'.
        'min_data_in_leaf': trial.suggest_int('min_child_samples', 5, 100),  # Minimal number of data in one leaf
        'min_sum_hessian_in_leaf': trial.suggest_float('min_sum_hessian_in_leaf', 1e-4, 1e-1),  # Stop trying to split
                                                                                                # a leaf if the sum of
                                                                                                # its hessian is less
                                                                                                # than k
        'cat_smooth': trial.suggest_float('cat_smooth', 10.0, 100.0),  # this can reduce the effect of noises in
                                                                       # categorical features, especially for
                                                                       # categories with few data
    }

    # Callback that reports the validation 'rmse' to Optuna so that
    # unpromising trials can be pruned.
    pruning_callback = optuna.integration.LightGBMPruningCallback(trial, 'rmse')
    # Log the evaluation metric every 100 iterations through a callback; the
    # `verbose_eval` argument of lgb.train is not accepted by LightGBM >= 4.
    gbm = lgb.train(
        param,
        dtrain,
        valid_sets=[dvalid],
        callbacks=[pruning_callback, lgb.log_evaluation(period=100)],
    )

    preds = gbm.predict(X_val)
    # RMSE computed as sqrt(MSE) so the code does not depend on the
    # `squared=False` argument, which newer scikit-learn releases dropped.
    rmse = float(np.sqrt(mean_squared_error(y_val, preds)))
    return rmse


if __name__ == "__main__":
    # Minimise the validation RMSE; the median pruner is created with
    # n_warmup_steps=10.
    study = optuna.create_study(
        direction="minimize",
        pruner=optuna.pruners.MedianPruner(n_warmup_steps=10),
    )
    study.optimize(objective, n_trials=500)

    # Summary of the search
    print("Number of finished trials: {}".format(len(study.trials)))

    # Details of the best-scoring trial
    trial = study.best_trial
    print("Best trial:")
    print("  Value: {}".format(trial.value))
    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

Create a dataframe from the study and select the columns with the necessary parameters

In [None]:
# Collect every trial of the study into a dataframe and show it ordered by
# objective value, best (lowest) first.
df = study.trials_dataframe()
df.sort_values(by='value')