# Optuna for CatBoostRegressor

Prepare data

In [None]:
# Load the feature matrix and split it into train / validation / test sets by month.
# NOTE(review): unpickling can execute arbitrary code; only load checkpoints produced by this project.
matrix = pd.read_pickle("checkpoint_final.pkl")
# Downcast the float columns to reduce RAM usage
floatcols = [c for c in matrix.columns if matrix[c].dtype == "float32"]
matrix[floatcols] = matrix[floatcols].astype("float16")
# Clip the target to the [0, 20] range
matrix["item_cnt_month"] = matrix["item_cnt_month"].clip(0, 20)
keep_from_month = 2  # The first couple of months are dropped because of distortions to their features (e.g. wrong item age)
val_month = 33
test_month = 34

dropcols = [
    "shop_id",
    "item_id",
    "new_item",
]  # The features are dropped to reduce overfitting

categoricals = [
    "item_category_id",
    "month",
]
matrix[categoricals] = matrix[categoricals].astype("category")

# Drop the unused columns once, instead of building a full copy of the matrix for every split.
matrix = matrix.drop(columns=dropcols)
months = matrix["date_block_num"]

# Train on months in [keep_from_month, val_month); validate and test on single months.
train = matrix.loc[(months >= keep_from_month) & (months < val_month), :]
val = matrix.loc[months == val_month, :]
test = matrix.loc[months == test_month, :]

X_train = train.drop(columns="item_cnt_month")
y_train = train.item_cnt_month
X_val = val.drop(columns="item_cnt_month")
y_val = val.item_cnt_month
X_test = test.drop(columns="item_cnt_month")

# Release the large intermediates; `months` is removed too so it does not keep matrix data alive.
del matrix, train, test, months

Define the objective function

In [None]:
def objective(trial):
    """Train a CatBoostRegressor with trial-sampled hyperparameters and score it.

    Reads the module-level ``X_train``, ``y_train``, ``X_val``, ``y_val`` and
    ``categoricals`` prepared earlier in the notebook.

    Args:
        trial: The Optuna trial used to sample the hyperparameters.

    Returns:
        The RMSE of the fitted model's predictions on the validation set.
    """
    train_pool = Pool(X_train, y_train, cat_features=categoricals)
    val_pool = Pool(X_val, y_val, cat_features=categoricals)

    # Parameters
    # suggest_float(..., log=True) replaces the deprecated suggest_loguniform
    # and samples from the same log-uniform distribution.
    params = {
        'iterations': trial.suggest_int('iterations', 300, 1200, step=100),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-8, 100, log=True),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 3e-1, log=True),
        'grow_policy': trial.suggest_categorical(
            'grow_policy', ['SymmetricTree', 'Depthwise', 'Lossguide']
        ),
        # Max tree depth
        'depth': trial.suggest_int('depth', 4, 10),
        # The amount of randomness to use for scoring splits when the tree
        # structure is selected. Helps to avoid overfitting
        'random_strength': trial.suggest_int('random_strength', 0, 100),
        # The number of splits for numerical features
        'max_bin': trial.suggest_categorical('max_bin', [2, 3, 4, 5, 10, 20, 32, 64]),
        # Assigns random weights to objects
        'bagging_temperature': trial.suggest_float(
            'bagging_temperature', 0.01, 100.00, log=True
        ),
    }

    # The boosting type is only tuned for symmetric trees; the other grow
    # policies are always run with 'Plain'.
    if params['grow_policy'] == 'SymmetricTree':
        params['boosting_type'] = trial.suggest_categorical(
            'boosting_type', ['Ordered', 'Plain']
        )
    else:
        params['boosting_type'] = 'Plain'

    # Learning
    model = cat.CatBoostRegressor(
        loss_function='RMSE',
        eval_metric='RMSE',
        task_type='GPU',
        random_seed=42,
        od_type='Iter',
        od_wait=30,
        metric_period=100,
        **params
    )
    # The overfitting detector (od_type / od_wait) needs an evaluation set to
    # monitor; without eval_set it never triggers and every trial runs all
    # of its iterations.
    model.fit(train_pool, eval_set=val_pool)

    # Predict
    preds = model.predict(val_pool)

    # Evaluation: take the square root of the MSE instead of passing
    # squared=False, which newer scikit-learn releases no longer accept.
    rmse = float(mean_squared_error(y_val, preds) ** 0.5)
    return rmse

Launch Optuna study

In [None]:
# Run the hyperparameter search, minimising the value returned by `objective`.
study = optuna.create_study(direction='minimize')
study.optimize(objective, timeout=3600)  # change timeout if you want to make optimization process longer

# Summarise the search: number of trials, then the best trial's score and parameters.
print("Number of finished trials: {}".format(len(study.trials)))

trial = study.best_trial
summary_lines = [
    "Best trial:",
    "  Value: {}".format(trial.value),
    "  Params: ",
]
for key, value in trial.params.items():
    summary_lines.append("    {}: {}".format(key, value))
print("\n".join(summary_lines))

Create a dataframe from the study

In [None]:
# Export the study's trials as a DataFrame so they can be inspected and compared.
df = study.trials_dataframe()