# CatBoost (Regressor)

## CatBoost Installation

In [None]:
!pip install catboost

## Hyperparameter Tuning using GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV
from catboost import CatBoostRegressor

# Untuned base estimator handed to the grid search.
CBR = CatBoostRegressor()

# Search space: every combination of the listed values is tried
# (3 * 2 * 4 * 2 * 1 * 1 = 48 candidates). Edit the lists to change
# what gets tested; single-element lists pin a parameter to one value.
parameters = dict(
    learning_rate=[0.03, 0.05, 0.07],
    depth=[3, 4],
    l2_leaf_reg=[1, 3, 5, 7],
    iterations=[900, 1100],
    loss_function=["MAE"],
    random_seed=[42],
)

# Exhaustive search with 2-fold cross-validation on all available cores.
# NOTE(review): no `scoring` is passed, so candidates are ranked by the
# estimator's default score rather than by MAE -- confirm this is intended.
Grid_CBR = GridSearchCV(CBR, parameters, cv=2, n_jobs=-1)
Grid_CBR.fit(x_train, y_train)

In [None]:
# Report the winning estimator, its cross-validated score and its parameters.
for _label, _value in (
    ("\n The best estimator across ALL searched params:\n", Grid_CBR.best_estimator_),
    ("\n The best score across ALL searched params:\n", Grid_CBR.best_score_),
    ("\n The best parameters across ALL searched params:\n", Grid_CBR.best_params_),
):
    print(_label, _value)

## CatBoost Predict

In [None]:
# Train one model with a fixed parameter set and predict on x_test.
# The values below are typed in by hand (they are not read from Grid_CBR),
# so update them whenever the search above is re-run.
_tuned_params = {
    "iterations": 1100,
    "depth": 4,
    "learning_rate": 0.05,
    "loss_function": "MAE",
    "l2_leaf_reg": 7,
    "random_seed": 42,
}
CatBoost = CatBoostRegressor(**_tuned_params)
CatBoost.fit(x_train, y_train)
# Last expression of the cell: the predictions are shown as the cell output.
CatBoost.predict(x_test)

## Hyperparameter Tuning using Optuna

In [None]:
!pip install optuna

In [None]:
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import optuna

In [None]:
# Setting
# Not referenced in the cells visible here -- presumably the sampling rate
# used to build `train_sample` (TODO confirm).
SAMPLE_RATE = 0.4
# Seed shared by both train_test_split calls, the CatBoost models
# (random_state) and the Optuna study name.
RANDOM_SEED = 1
# Passed to CatBoost fit() as `early_stopping_rounds`.
EARLY_STOPPING_ROUND = 100

In [None]:
# Build the modelling tables (data preprocessing must already have produced
# `train_sample`, `test` and `features`).
X, y = train_sample[features], train_sample.target
X_test = test[features]

# Stage 1: hold out 20% of the rows as the validation set (X_valid / y_valid).
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_SEED
)

# Stage 2: carve 10% of the remaining rows off as the evaluation set
# (X_eval / y_eval), leaving roughly 72% train / 8% eval / 20% valid overall.
X_train, X_eval, y_train, y_eval = train_test_split(
    X_train, y_train, test_size=0.1, random_state=RANDOM_SEED
)

In [None]:
# Objective for the Optuna study (edit the suggest_* calls to change which
# parameters, and which ranges, are searched)
def objective(trial):
    """Train one CatBoost candidate and return its MSE on the validation split.

    The trial proposes `learning_rate`, `depth`, `l2_leaf_reg` and
    `min_child_samples`; all other CatBoost settings are fixed. The model is
    fitted on (X_train, y_train) with (X_eval, y_eval) as the evaluation set,
    then scored on (X_valid, y_valid), which takes no part in fitting.

    Args:
        trial: the Optuna trial object used to sample hyperparameters.

    Returns:
        Mean squared error of the fitted model on the validation split
        (lower is better).
    """
    param = {
        # --- searched parameters -------------------------------------------
        # suggest_float(..., step=...) replaces the deprecated
        # suggest_discrete_uniform and samples the same discrete grid.
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.02, step=0.001),
        'depth': trial.suggest_int('depth', 7, 9),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1.0, 5.5, step=0.5),
        'min_child_samples': trial.suggest_categorical('min_child_samples', [1, 4, 8]),
        # --- fixed parameters ----------------------------------------------
        'grow_policy': 'Depthwise',
        'iterations': 1000,
        'use_best_model': True,
        'eval_metric': 'RMSE',
        'od_type': 'iter',
        # NOTE(review): fit() below also receives early_stopping_rounds
        # (EARLY_STOPPING_ROUND); confirm which of the two patience values
        # CatBoost actually applies.
        'od_wait': 20,
        'random_state': RANDOM_SEED,
        'logging_level': 'Silent',
    }

    regressor = CatBoostRegressor(**param)

    # Copies keep the shared module-level frames untouched by the fit.
    regressor.fit(
        X_train.copy(), y_train.copy(),
        eval_set=[(X_eval.copy(), y_eval.copy())],
        early_stopping_rounds=EARLY_STOPPING_ROUND,
    )
    loss = mean_squared_error(y_valid, regressor.predict(X_valid.copy()))
    return loss

In [None]:
# Study using Optuna
%%time
study = optuna.create_study(study_name=f'catboost-seed{RANDOM_SEED}')
study.optimize(objective, n_trials=500, n_jobs=-1, timeout=24000)

In [None]:
# See the best parameter
# (the hyperparameter values of the study's best trial; as the last
# expression of the cell it is shown as the cell output)
study.best_params

In [None]:
# Enter the best parameter and run the model
%%time
optimized_regressor = CatBoostRegressor(learning_rate=study.best_params['learning_rate'],
                                        depth=study.best_params['depth'],
                                        l2_leaf_reg=study.best_params['l2_leaf_reg'],
                                        min_child_samples=study.best_params['min_child_samples'],
                                        grow_policy='Depthwise',
                                        iterations=10000,
                                        use_best_model=True,
                                        eval_metric='RMSE',
                                        od_type='iter',
                                        od_wait=20,
                                        random_state=RANDOM_SEED,
                                        logging_level='Silent')
optimized_regressor.fit(X_train.copy(), y_train.copy(),
                        eval_set=[(X_eval.copy(), y_eval.copy())],
                        early_stopping_rounds=EARLY_STOPPING_ROUND)
pred_train = optimized_regressor.predict(X_train.copy())
pred_valid = optimized_regressor.predict(X_valid.copy())

In [None]:
# Results
# Print both errors explicitly: a notebook cell only displays its last bare
# expression, so the training MSE would otherwise be computed and discarded.
print("Train MSE:", mean_squared_error(y_train, pred_train))
print("Valid MSE:", mean_squared_error(y_valid, pred_valid))