![Photo by Stephen Phillips - Hostreviews.co.uk on UnSplash](https://cf.bstatic.com/xdata/images/hotel/max1024x768/408003083.jpg?k=c49b5c4a2346b3ab002b9d1b22dbfb596cee523b53abef2550d0c92d0faf2d8b&o=&hp=1){fig-align="center" width=50%}


# Import data

In [33]:
import gc
import os
from pathlib import Path
from typing import List, Optional, Tuple

import catboost
import pandas as pd
from data import pre_process, utils
from features import feature_engineering
from lets_plot import *
from lets_plot.mapping import as_discrete
from models import train_model
from sklearn import metrics, model_selection
from tqdm.notebook import tqdm

LetsPlot.setup_html()
import pickle

import optuna

**Objective**:
* Optimize hyperparameters by Optuna
* Evaluate final model based on the test portion.





# Prepare dataframe before modelling


In [2]:
df = pd.read_parquet(
    utils.Configuration.INTERIM_DATA_PATH.joinpath(
        "2023-10-01_Processed_dataset_for_NB_use.parquet.gzip"
    )
)

X, y = pre_process.prepare_data_for_modelling(df)

Shape of X and y with outliers: (3660, 14), (3660,)
Shape of X and y without outliers: (3427, 14), (3427,)


In [17]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=utils.Configuration.seed,
)

print(f"Shape of X_train: {X_train.shape}, Shape of X_test: {X_test.shape}")

Shape of X_train: (2741, 14), Shape of X_test: (686, 14)


In [3]:
def dumper(file, name):
    pickle.dump(file, open(f"{name}.pickle", "wb"))

In [4]:
def objective(trial: optuna.Trial) -> float:
    """
    Optuna objective function for tuning CatBoost hyperparameters.

    This function takes an Optuna trial and explores hyperparameters for a CatBoost
    model to minimize the Root Mean Squared Error (RMSE) using K-Fold cross-validation.

    Parameters:
    - trial (optuna.Trial): Optuna trial object for hyperparameter optimization.

    Returns:
    - float: Mean RMSE across K-Fold cross-validation iterations.

    Example use case:
    ```python
    # Create an Optuna study and optimize hyperparameters
    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=100)

    # Get the best hyperparameters
    best_params = study.best_params
    ```
    """
    catboost_params = {
        "iterations": trial.suggest_int("iterations", 10, 1000),
        "depth": trial.suggest_int("depth", 1, 8),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 1),
        "random_strength": trial.suggest_float("random_strength", 1e-9, 10),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0, 1),
        "l2_leaf_reg": trial.suggest_int("l2_leaf_reg", 2, 30),
        "border_count": trial.suggest_int("border_count", 1, 255),
        "thread_count": os.cpu_count(),
    }

    results = []
    optuna.logging.set_verbosity(optuna.logging.WARNING)

    # Extract feature names and data types
    # features = X.columns[~X.columns.str.contains("price")]
    # numerical_features = X.select_dtypes("number").columns.to_list()
    categorical_features = X.select_dtypes("object").columns.to_list()

    # Create a K-Fold cross-validator
    CV = model_selection.RepeatedKFold(
        n_splits=10, n_repeats=1, random_state=utils.Configuration.seed
    )

    for train_fold_index, val_fold_index in CV.split(X):
        X_train_fold, X_val_fold = X.loc[train_fold_index], X.loc[val_fold_index]
        y_train_fold, y_val_fold = y.loc[train_fold_index], y.loc[val_fold_index]

        # Create CatBoost datasets
        catboost_train = catboost.Pool(
            X_train_fold,
            y_train_fold,
            cat_features=categorical_features,
        )
        catboost_valid = catboost.Pool(
            X_val_fold,
            y_val_fold,
            cat_features=categorical_features,
        )

        # Initialize and train the CatBoost model
        model = catboost.CatBoostRegressor(**catboost_params)
        model.fit(
            catboost_train,
            eval_set=[catboost_valid],
            early_stopping_rounds=utils.Configuration.early_stopping_round,
            verbose=utils.Configuration.verbose,
            use_best_model=True,
        )

        # Calculate OOF validation predictions
        valid_pred = model.predict(X_val_fold)

        RMSE_score = metrics.mean_squared_error(y_val_fold, valid_pred, squared=False)

        del (
            X_train_fold,
            y_train_fold,
            X_val_fold,
            y_val_fold,
            catboost_train,
            catboost_valid,
            model,
            valid_pred,
        )
        gc.collect()

        results.append(RMSE_score)
    return np.mean(results)

In [5]:
%%script echo skipping

study = optuna.create_study(direction="minimize")
study.optimize(train_model.Optuna_Objective(X_train, y_train), n_trials=100, show_progress_bar=True)

dumper(study.best_params, "CatBoost_params")
dumper(study.best_value, "CatBoost_value")

Couldn't find program: 'echo'


In [24]:
catboost_params_optuna = pd.read_pickle("CatBoost_params.pickle")

print(
    f'The best OOF RMSE score of the hyperparameter tuning is {pd.read_pickle("CatBoost_value.pickle"):.4f}.'
)
print(f"The corresponding values: {catboost_params_optuna}")

The best OOF RMSE score of the hyperparameter tuning is 0.1060.
The corresponding values: {'iterations': 956, 'depth': 7, 'learning_rate': 0.050050595110243595, 'random_strength': 7.110744896133362, 'bagging_temperature': 0.024119607385698107, 'l2_leaf_reg': 3, 'border_count': 205}


# Retrain using the best parameters and predict

In [29]:
categorical_features = X_train.select_dtypes("object").columns.to_list()

catboost_train = catboost.Pool(
    X_train,
    y_train,
    cat_features=categorical_features,
)

model = catboost.CatBoostRegressor(**catboost_params_optuna)
model.fit(
    catboost_train,
    verbose=utils.Configuration.verbose,
)

<catboost.core.CatBoostRegressor at 0x1d146456110>

In [32]:
test_prediction = model.predict(X_test, thread_count=-1, verbose=None)

In [45]:
print(metrics.mean_squared_error(test_prediction, y_test, squared=False))
print(metrics.r2_score(test_prediction, y_test))

0.1101569172024175
0.86914579850464


In [80]:
results = (
    pd.concat(
        [y_test.reset_index(drop=True), pd.Series(test_prediction)], axis="columns"
    )
    .rename(columns={"price": "original_values", 0: "predicted_values"})
    .apply(lambda x: 10**x)
    .assign(residuals=lambda df: df.original_values - df.predicted_values)
)
results

Unnamed: 0,original_values,predicted_values,residuals
0,159000.0,1.577675e+05,1232.485602
1,279000.0,2.253084e+05,53691.571930
2,1099000.0,1.055867e+06,43133.142544
3,1050000.0,1.373137e+06,-323137.040841
4,815000.0,7.278441e+05,87155.904463
...,...,...,...
681,650000.0,5.981813e+05,51818.732665
682,570000.0,4.770458e+05,92954.151024
683,195000.0,1.561886e+05,38811.427194
684,349000.0,3.509563e+05,-1956.328876


In [76]:
(
    results.pipe(
        lambda df: ggplot(df, aes("original_values", "predicted_values"))
        + geom_point()
        + geom_smooth()
    )
)

In [86]:
(results.pipe(lambda df: ggplot(df, aes("residuals")) + geom_histogram(stat="bin", bins = 100)))

In [7]:
import boto3
import creds

In [8]:
dynamodb = boto3.resource(
    "dynamodb",
    aws_access_key_id=creds.AWS.AWS_ACCESS_KEY_ID,
    aws_secret_access_key=creds.AWS.AWS_SECRET_ACCESS_KEY,
    region_name=creds.AWS.AWS_DEFAULT_REGION,
)