In [66]:
import mlflow
import pandas as pd

from catboost import CatBoostRegressor

import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_absolute_percentage_error, root_mean_squared_error

In [75]:
# Read the three CSV inputs in one pass; the order of the paths matches the
# order of the names on the left-hand side.
df_train, df_test, df_full = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/train_data.csv',
        '../data/test_data.csv',
        '../data/ml_data.csv',
    )
)

In [76]:
# Quick sanity check: (rows, columns) of the training frame as loaded.
df_train.shape

(32555, 51)

In [77]:
# Columns used as model inputs; the order here is the column order of the
# feature matrices built from this list.
feature_selected = [
    'price_class_pred',
    'power_horse',
    'year',
    'engine',
    'brand_Россия',
    'region_Южный',
    'brand_Япония',
    'mileage',
    'brand_Китай',
    'brand_Южная Корея',
    'wheel_drive_полный',
    'brand_США',
    'wheel_drive_передний',
]

In [78]:
# Feature matrices: drop the target and the raw text column, then keep only
# the selected features (in `feature_selected` order).
X_train = df_train.drop(columns=['price', 'text']).loc[:, feature_selected]
X_test = df_test.drop(columns=['price', 'text']).loc[:, feature_selected]

# Targets: the `price` column of each frame.
y_train = df_train['price']
y_test = df_test['price']

In [79]:
# Split straight from df_train rather than from the current X_train/y_train:
# this cell assigns to X_train/y_train itself, so splitting its own outputs
# would shrink the training set by another 20% every time it is re-run.
# On a first run the result is the same rows as before (same data, same seed).
X_train, X_val, y_train, y_val = train_test_split(
    df_train[feature_selected],
    df_train['price'],
    test_size=0.2,
    random_state=42,
)

In [80]:
# (rows, columns) of the training features after the validation hold-out.
X_train.shape

(26044, 13)

In [62]:
def objective(trial):
    """Optuna objective: validation MAPE of a CatBoost model for one trial.

    Draws one hyperparameter set from ``trial``, fits a ``CatBoostRegressor``
    on the notebook-level ``X_train`` / ``y_train`` and scores its predictions
    on ``X_val`` / ``y_val``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean absolute percentage error on the validation split, as returned
        by ``mean_absolute_percentage_error``.
    """
    params = {
        "iterations": trial.suggest_int("iterations", 500, 2000),
        # Log-uniform so small learning rates are explored as densely as large ones.
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        "depth": trial.suggest_int("depth", 1, 10),
        "subsample": trial.suggest_float("subsample", 0.05, 1.0),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.05, 1.0),
        # NOTE(review): CatBoost documents min_data_in_leaf as applying to the
        # Depthwise / Lossguide grow policies; confirm it has any effect with
        # the default grow policy used here.
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100),
    }

    model = CatBoostRegressor(**params, silent=True)
    model.fit(X_train, y_train)
    predictions = model.predict(X_val)
    # The optimised metric is MAPE, not RMSE; name it accordingly.
    mape = mean_absolute_percentage_error(y_val, predictions)
    return mape

In [64]:
# Seed the sampler explicitly so the hyperparameter search draws the same
# sequence of trials after a kernel restart. TPESampler is Optuna's default
# sampler, so only the seed changes, not the search algorithm.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=500)

[I 2024-11-21 03:54:06,898] A new study created in memory with name: no-name-48ffeb88-1c0c-4aa4-a237-9e87feda27d4
[I 2024-11-21 03:54:09,157] Trial 0 finished with value: 0.23578368818546852 and parameters: {'iterations': 1017, 'learning_rate': 0.08347286957945295, 'depth': 3, 'subsample': 0.2733599266989948, 'colsample_bylevel': 0.38457031611371223, 'min_data_in_leaf': 14}. Best is trial 0 with value: 0.23578368818546852.
[I 2024-11-21 03:54:17,638] Trial 1 finished with value: 0.2684069950596959 and parameters: {'iterations': 1172, 'learning_rate': 0.003956475451333895, 'depth': 7, 'subsample': 0.6046399232062919, 'colsample_bylevel': 0.9194065802335246, 'min_data_in_leaf': 5}. Best is trial 0 with value: 0.23578368818546852.
[I 2024-11-21 03:54:19,770] Trial 2 finished with value: 0.2410244667390507 and parameters: {'iterations': 704, 'learning_rate': 0.06994690275012178, 'depth': 3, 'subsample': 0.35490072257812777, 'colsample_bylevel': 0.44437946631362585, 'min_data_in_leaf': 40}.

In [65]:
print('Best hyperparameters:', study.best_params)
# objective() returns mean_absolute_percentage_error, so the study's best
# value is a MAPE, not an RMSE.
print('Best MAPE:', study.best_value)

Best hyperparameters: {'iterations': 1862, 'learning_rate': 0.07126926225582741, 'depth': 8, 'subsample': 0.9534581504986257, 'colsample_bylevel': 0.9955728241823156, 'min_data_in_leaf': 4}
Best RMSE: 0.2114689514788336


In [81]:
def eval_metrics(y_test, y_pred):
    """Score predictions against ground truth with four regression metrics.

    Args:
        y_test: True target values.
        y_pred: Predicted target values.

    Returns:
        Tuple ``(rmse, mae, mape, r2)``.
    """
    scorers = (
        root_mean_squared_error,
        mean_absolute_error,
        r2_score,
        mean_absolute_percentage_error,
    )
    # Computed in the order above, returned in the (rmse, mae, mape, r2) order.
    rmse, mae, r2, mape = (scorer(y_test, y_pred) for scorer in scorers)
    return rmse, mae, mape, r2

In [82]:
# Keep the best hyperparameter set found by the Optuna study for later use.
params = study.best_params

In [83]:
# Display the selected hyperparameters.
params

{'iterations': 1862,
 'learning_rate': 0.07126926225582741,
 'depth': 8,
 'subsample': 0.9534581504986257,
 'colsample_bylevel': 0.9955728241823156,
 'min_data_in_leaf': 4}