![Photo by Stephen Phillips - Hostreviews.co.uk on UnSplash](https://cf.bstatic.com/xdata/images/hotel/max1024x768/408003083.jpg?k=c49b5c4a2346b3ab002b9d1b22dbfb596cee523b53abef2550d0c92d0faf2d8b&o=&hp=1){fig-align="center" width=50%}


# Import data

In [1]:
import gc
import os
from pathlib import Path
from typing import List, Optional, Tuple

import catboost
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from data import pre_process, utils
from features import feature_engineering
from IPython.display import clear_output
from lets_plot import *
from lets_plot.mapping import as_discrete
from models import train_model
from sklearn import (
    cluster,
    compose,
    ensemble,
    impute,
    metrics,
    model_selection,
    neighbors,
    pipeline,
    preprocessing,
)
from tqdm.notebook import tqdm

LetsPlot.setup_html()
import pickle

import optuna

Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)


**Objective**:
* Examine the necessary sample pre-processing steps before modeling
* Create the required pipeline
* Evaluate multiple algorithms
* Choose a suitable baseline model.





# Prepare dataframe before modelling


In [2]:
# Location of the processed dataset inside the project's interim data folder.
processed_dataset_path = utils.Configuration.INTERIM_DATA_PATH.joinpath(
    "2023-10-01_Processed_dataset_for_NB_use.parquet.gzip"
)
df = pd.read_parquet(processed_dataset_path)

# Split the dataframe into the feature matrix and the target used for modelling.
X, y = pre_process.prepare_data_for_modelling(df)

Shape of X and y with outliers: (3660, 14), (3660,)
Shape of X and y without outliers: (3427, 14), (3427,)


In [3]:
def dumper(file, name):
    """
    Pickle an object to ``<name>.pickle``.

    Parameters:
    - file: The Python object to serialise (despite the name, this is the
      payload, not a file handle).
    - name: Output path without extension; ``.pickle`` is appended.
    """
    # Use a context manager so the handle is flushed and closed even if
    # pickling raises part-way through.
    with open(f"{name}.pickle", "wb") as handle:
        pickle.dump(file, handle)

In [4]:
def objective(trial: optuna.Trial) -> float:
    """
    Optuna objective function for tuning CatBoost hyperparameters.

    This function takes an Optuna trial and explores hyperparameters for a CatBoost
    model to minimize the Root Mean Squared Error (RMSE) using K-Fold cross-validation.
    It reads the module-level ``X`` (features) and ``y`` (target); columns of ``X``
    with ``object`` dtype are passed to CatBoost as categorical features.

    Parameters:
    - trial (optuna.Trial): Optuna trial object for hyperparameter optimization.

    Returns:
    - float: Mean RMSE across K-Fold cross-validation iterations.

    Example use case:
    ```python
    # Create an Optuna study and optimize hyperparameters
    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=100)

    # Get the best hyperparameters
    best_params = study.best_params
    ```
    """
    catboost_params = {
        "iterations": trial.suggest_int("iterations", 10, 1000),
        "depth": trial.suggest_int("depth", 1, 8),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 1),
        "random_strength": trial.suggest_float("random_strength", 1e-9, 10),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0, 1),
        "l2_leaf_reg": trial.suggest_int("l2_leaf_reg", 2, 30),
        "border_count": trial.suggest_int("border_count", 1, 255),
        "thread_count": os.cpu_count(),
    }

    results = []
    optuna.logging.set_verbosity(optuna.logging.WARNING)

    # CatBoost needs to be told which columns are categorical
    categorical_features = X.select_dtypes("object").columns.to_list()

    # Create a K-Fold cross-validator
    CV = model_selection.RepeatedKFold(
        n_splits=10, n_repeats=1, random_state=utils.Configuration.seed
    )

    for train_fold_index, val_fold_index in CV.split(X):
        # The splitter yields *positional* indices, so select rows with
        # ``.iloc``. Label-based ``.loc`` picks the wrong rows (or raises
        # KeyError) whenever the index is not a clean 0..n-1 range, e.g.
        # after rows have been dropped.
        X_train_fold, X_val_fold = X.iloc[train_fold_index], X.iloc[val_fold_index]
        y_train_fold, y_val_fold = y.iloc[train_fold_index], y.iloc[val_fold_index]

        # Create CatBoost datasets
        catboost_train = catboost.Pool(
            X_train_fold,
            y_train_fold,
            cat_features=categorical_features,
        )
        catboost_valid = catboost.Pool(
            X_val_fold,
            y_val_fold,
            cat_features=categorical_features,
        )

        # Initialize and train the CatBoost model
        model = catboost.CatBoostRegressor(**catboost_params)
        model.fit(
            catboost_train,
            eval_set=[catboost_valid],
            early_stopping_rounds=utils.Configuration.early_stopping_round,
            verbose=utils.Configuration.verbose,
            use_best_model=True,
        )

        # Calculate OOF validation predictions
        valid_pred = model.predict(X_val_fold)

        # Take the square root explicitly: the ``squared=False`` keyword of
        # ``mean_squared_error`` is deprecated in recent scikit-learn releases.
        RMSE_score = np.sqrt(metrics.mean_squared_error(y_val_fold, valid_pred))

        # Free per-fold objects before the next fold allocates its own
        del (
            X_train_fold,
            y_train_fold,
            X_val_fold,
            y_val_fold,
            catboost_train,
            catboost_valid,
            model,
            valid_pred,
        )
        gc.collect()

        results.append(RMSE_score)
    return np.mean(results)

In [5]:
# Build the tuning objective from the prepared data, then run a minimising study.
tuning_objective = train_model.Optuna_Objective(X, y)
number_of_trials = 50

study = optuna.create_study(direction="minimize")
study.optimize(
    tuning_objective,
    n_trials=number_of_trials,
    show_progress_bar=True,
)

[I 2023-10-22 19:28:49,161] A new study created in memory with name: no-name-d0bf6825-08f2-44b8-b7ab-454e29b57ead


  0%|          | 0/50 [00:00<?, ?it/s]

In [6]:
# Show the winning hyperparameters and their score, then persist both to disk.
best_params = study.best_params
best_value = study.best_value

print(best_params)
print(best_value)

dumper(best_params, "CatBoost_params")
dumper(best_value, "CatBoost_value")

{'iterations': 800, 'depth': 7, 'learning_rate': 0.1250305192970083, 'random_strength': 7.84360123799234, 'bagging_temperature': 0.6210737504978057, 'l2_leaf_reg': 30, 'border_count': 141}
0.10727907686803564
