In [35]:
import mlflow
import numpy as np
import pandas as pd

from catboost import CatBoostRegressor
from sklearn.linear_model import LinearRegression
from keras.layers import Dense
from keras.models import Sequential
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

from sklearn.metrics import r2_score, mean_absolute_error, mean_absolute_percentage_error, root_mean_squared_error


In [None]:
# Read the three project CSVs in a single unpacking; the generator yields the
# frames in the same order as the names on the left-hand side.
df_train, df_test, df_full = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/train_data.csv',
        '../data/test_data.csv',
        '../data/ml_data.csv',
    )
)

In [40]:
# Quick sanity check: display the (rows, columns) shape of the training frame.
df_train.shape

(32555, 51)

In [44]:
# Columns used as model inputs. X_train / X_test are built by indexing the
# data frames with this list, so the order here is also the column order of
# the feature matrices -- do not reorder casually.
# NOTE(review): the `brand_*`, `region_*` and `wheel_drive_*` names look like
# one-hot indicator columns, and `price_class_pred` is presumably the output of
# an upstream price-class model -- confirm against the data-prep notebook.
feature_selected = [
    'price_class_pred',
    'power_horse',
    'year',
    'engine',
    'brand_Россия',
    'region_Южный',
    'brand_Япония',
    'mileage',
    'brand_Китай',
    'brand_Южная Корея',
    'wheel_drive_полный',
    'brand_США',
    'wheel_drive_передний'
]

In [None]:
# Targets: the 'price' column of each split.
y_train = df_train['price']
y_test = df_test['price']

# Inputs: drop the target and the 'text' column first (this raises if either is
# missing), then keep only the selected features, in feature_selected order.
X_train = df_train.drop(columns=['price', 'text'])[feature_selected]
X_test = df_test.drop(columns=['price', 'text'])[feature_selected]

In [50]:
def eval_metrics(y_test, y_pred):
    """Score predictions against ground truth with four regression metrics.

    Args:
        y_test: True target values.
        y_pred: Predicted target values, aligned with ``y_test``.

    Returns:
        A tuple ``(rmse, mae, mape, r2)``. Note the order: MAPE comes before
        R2, so callers unpacking the result must use that same order.
    """
    return (
        root_mean_squared_error(y_test, y_pred),
        mean_absolute_error(y_test, y_pred),
        mean_absolute_percentage_error(y_test, y_pred),
        r2_score(y_test, y_pred),
    )

In [32]:
# CatBoost hyperparameters. This mapping is unpacked into CatBoostRegressor(...)
# and logged as-is to MLflow, so the key names must match CatBoost's argument
# names exactly.
params = dict(
    iterations=1862,
    learning_rate=0.07126926225582741,
    depth=8,
    subsample=0.9534581504986257,
    colsample_bylevel=0.9955728241823156,
    min_data_in_leaf=4,
)

In [None]:
# Track runs in the local '../mlruns' directory (the path is resolved relative
# to the kernel's working directory).
mlflow.set_tracking_uri('../mlruns')

mlflow.set_experiment('With text 10 classes feature selected, params tuned')

# One MLflow run = one train / evaluate / log cycle for the CatBoost model.
with mlflow.start_run():

    # verbose=500 makes CatBoost report training progress every 500 iterations.
    model = CatBoostRegressor(**params, verbose=500)

    # Log the hyperparameters before fitting so they are attached to the run
    # even if training fails part-way.
    mlflow.log_params(params)

    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    # eval_metrics returns (rmse, mae, mape, r2) -- unpack in that exact order.
    rmse, mae, mape, r2 = eval_metrics(y_test, y_pred)

    print('Catboost model')
    print(f'  RMSE: {rmse}')
    print(f'  MAE: {mae}')
    print(f'  R2: {r2}')
    print(f'  mape: {mape}')

    # Log all four metrics in a single call instead of four separate ones.
    mlflow.log_metrics({'rmse': rmse, 'mae': mae, 'r2': r2, 'mape': mape})

    # Example input rows stored alongside the model. The sample is seeded so
    # that re-running the notebook logs the same example every time.
    input_example = X_test.sample(5, random_state=42)

    # Log the fitted model together with the input example.
    mlflow.catboost.log_model(model, 'catboost_base', input_example=input_example)

Catboost model
  RMSE: 524042.4280352981
  MAE: 262543.74683669204
  R2: 0.660628721740682
  mape: 0.566329906230659


  from .autonotebook import tqdm as notebook_tqdm
Downloading artifacts: 100%|██████████| 7/7 [00:00<?, ?it/s]
