In [None]:
import mlflow
import numpy as np
import pandas as pd

from catboost import CatBoostRegressor

from keras.layers import Dense
from keras.models import Sequential
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import r2_score, mean_absolute_error, mean_absolute_percentage_error, root_mean_squared_error


In [16]:
df_ml = pd.read_csv('../data/ml_data.csv')


In [17]:
df_ml.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40451 entries, 0 to 40450
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   price         40451 non-null  int64  
 1   text          40416 non-null  object 
 2   is_dealer     40451 non-null  bool   
 3   year          40451 non-null  int64  
 4   engine        40428 non-null  float64
 5   transmission  40451 non-null  object 
 6   mileage       40451 non-null  int64  
 7   power_horse   40451 non-null  int64  
 8   car_body      40451 non-null  object 
 9   wheel_drive   40451 non-null  object 
 10  fuel_type     40451 non-null  object 
 11  brand         40451 non-null  object 
 12  region        40451 non-null  object 
dtypes: bool(1), float64(1), int64(4), object(7)
memory usage: 3.7+ MB


In [18]:
df_ml.columns

Index(['price', 'text', 'is_dealer', 'year', 'engine', 'transmission',
       'mileage', 'power_horse', 'car_body', 'wheel_drive', 'fuel_type',
       'brand', 'region'],
      dtype='object')

In [19]:
X = df_ml.drop(['price', 'text'], axis=1)
y = df_ml['price']

In [20]:
def eval_metrics(y_test, y_pred):
    rmse = root_mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    mape = mean_absolute_percentage_error(y_test, y_pred)
    return rmse, mae, mape, r2

In [22]:
mlflow.set_tracking_uri('../mlruns')

In [23]:
df_ml.columns

Index(['price', 'text', 'is_dealer', 'year', 'engine', 'transmission',
       'mileage', 'power_horse', 'car_body', 'wheel_drive', 'fuel_type',
       'brand', 'region'],
      dtype='object')

Прогноз для текста

In [25]:
loaded_model = mlflow.pyfunc.load_model('runs:/798e3dacc2794a3fa23cc7e9cc2ee33f/bert-finetuning')

In [26]:
df_ml.columns = ['price', 'text', 'is_dealer', 'year', 'engine',
       'transmission', 'mileage', 'power_horse', 'car_body', 'wheel_drive',
       'fuel_type', 'brand', 'region']

In [33]:
prob_preds = loaded_model.predict(list(df_ml['text'].astype(str).values))

In [39]:
prob_preds

Unnamed: 0,label,score
0,LABEL_9,0.966847
1,LABEL_4,0.406988
2,LABEL_0,0.960189
3,LABEL_2,0.429774
4,LABEL_9,0.972477
...,...,...
40446,LABEL_8,0.285746
40447,LABEL_1,0.729497
40448,LABEL_5,0.387094
40449,LABEL_5,0.181845


In [55]:
y_pred = prob_preds['label'].apply(lambda x: x[-1])

In [56]:
y_pred

0        9
1        4
2        0
3        2
4        9
        ..
40446    8
40447    1
40448    5
40449    5
40450    5
Name: label, Length: 40451, dtype: object

In [57]:
df_ml['price_class_pred'] = np.array(y_pred, dtype=int)


In [58]:
df_ml.corr(numeric_only=True)

Unnamed: 0,price,is_dealer,year,engine,mileage,power_horse,price_class_pred
price,1.0,0.061096,0.54695,0.439743,-0.300054,0.642709,0.71279
is_dealer,0.061096,1.0,0.074097,0.039316,-0.046497,0.071447,0.100309
year,0.54695,0.074097,1.0,-0.02542,-0.446733,0.170798,0.565833
engine,0.439743,0.039316,-0.02542,1.0,0.125205,0.826577,0.315861
mileage,-0.300054,-0.046497,-0.446733,0.125205,1.0,0.026489,-0.250933
power_horse,0.642709,0.071447,0.170798,0.826577,0.026489,1.0,0.514252
price_class_pred,0.71279,0.100309,0.565833,0.315861,-0.250933,0.514252,1.0


In [59]:
X = df_ml.drop(['price', 'text'], axis=1)
X_numeric = pd.get_dummies(X, columns=[
        'is_dealer', 'transmission', 'car_body',
        'wheel_drive','fuel_type', 'brand', 
        'region'
        ])
y = df_ml['price']
X_train, X_test, y_train, y_test = train_test_split(X_numeric, y, test_size=0.25)


In [62]:
X_train, X_test, y_train, y_test = train_test_split(X_test, y_test, test_size=0.25)

In [None]:
mlflow.set_experiment('With text 10 classes')

with mlflow.start_run():

    iterations = 1500
    learning_rate=0.1
    depth=10
    cat_features=[
        'is_dealer', 'transmission', 'car_body',
        'wheel_drive','fuel_type', 'brand', 
        'region',
        ]
    verbose=200
    random_state=42

    model = CatBoostRegressor(iterations=iterations,
                           learning_rate=learning_rate,
                           depth=depth,
                           verbose=verbose, 
                           random_state=random_state)
    
    mlflow.log_param('iterations', iterations)
    mlflow.log_param('learning_rate', learning_rate)
    mlflow.log_param('depth', depth)
    mlflow.log_param('cat_features', ', '.join(cat_features))  # Преобразуем список в строку для логирования
    mlflow.log_param('verbose', verbose)
    mlflow.log_param('random_state', random_state)

    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    rmse, mae, mape, r2 = eval_metrics(y_test, y_pred)
    
    print('Catboost model')
    print(f'  RMSE: {rmse}')
    print(f'  MAE: {mae}')
    print(f'  R2: {r2}')
    print(f'  mape: {mape}')

    mlflow.log_metric('rmse', rmse)
    mlflow.log_metric('mae', mae)
    mlflow.log_metric('r2', r2)
    mlflow.log_metric('mape', mape)

    # Пример входных данных для логирования
    input_example = X_test.sample(5)

    # Логирование модели с примером входных данных
    mlflow.catboost.log_model(model, 'catboost_base', input_example=input_example)

0:	learn: 724265.4955762	total: 43.9ms	remaining: 1m 5s
200:	learn: 142437.5420684	total: 2.99s	remaining: 19.3s
400:	learn: 101605.8976257	total: 6.16s	remaining: 16.9s
600:	learn: 79896.2328309	total: 9.26s	remaining: 13.8s
800:	learn: 64966.6770703	total: 12.7s	remaining: 11.1s
1000:	learn: 54708.0657079	total: 16.3s	remaining: 8.12s
1200:	learn: 47384.3419287	total: 19.6s	remaining: 4.88s
1400:	learn: 41509.5004242	total: 22.9s	remaining: 1.62s
1499:	learn: 39214.2987607	total: 24.6s	remaining: 0us
Catboost model
  RMSE: 268731.78106582677
  MAE: 140969.91529568276
  R2: 0.8864446386754643
  mape: 0.2259287363070237


Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 234.90it/s] 


In [65]:
feature_selected = [
    'power_horse', 'year', 'engine', 'brand_Япония',
    'region_Южный', 'mileage', 'wheel_drive_полный',
    'brand_Россия', 'brand_Китай', 'brand_Южная Корея',
    'brand_США', 'brand_Франция', 'transmission_AT',
    'car_body_внедорожник', 'wheel_drive_задний',
    'brand_Чехия', 'fuel_type_электро', 'car_body_хетчбэк',
    'region_Приволжский', 'car_body_лифтбек',
    'region_Северо-Западный', 'is_dealer_True',
    'brand_Италия', 'fuel_type_газ', 'car_body_купе',
    'region_Сибирский', 'brand_Украина', 'brand_Испания',
    'car_body_микроавтобус', 'price_class_pred'
    ]

In [66]:
X_train = X_train[feature_selected]
X_test = X_test[feature_selected]

In [None]:
mlflow.set_experiment('With text 10 classes, feature selected')

with mlflow.start_run():

    iterations = 1500
    learning_rate=0.1
    depth=10
    cat_features=[
        'is_dealer', 'transmission', 'car_body',
        'wheel_drive','fuel_type', 'brand', 
        'region',
        ]
    verbose=200
    random_state=42

    model = CatBoostRegressor(iterations=iterations,
                           learning_rate=learning_rate,
                           depth=depth,
                           verbose=verbose, 
                           random_state=random_state)
    
    mlflow.log_param('iterations', iterations)
    mlflow.log_param('learning_rate', learning_rate)
    mlflow.log_param('depth', depth)
    mlflow.log_param('cat_features', ', '.join(cat_features))  # Преобразуем список в строку для логирования
    mlflow.log_param('verbose', verbose)
    mlflow.log_param('random_state', random_state)

    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    rmse, mae, mape, r2 = eval_metrics(y_test, y_pred)
    
    print('Catboost model')
    print(f'  RMSE: {rmse}')
    print(f'  MAE: {mae}')
    print(f'  R2: {r2}')
    print(f'  mape: {mape}')

    mlflow.log_metric('rmse', rmse)
    mlflow.log_metric('mae', mae)
    mlflow.log_metric('r2', r2)
    mlflow.log_metric('mape', mape)

    # Пример входных данных для логирования
    input_example = X_test.sample(5)

    # Логирование модели с примером входных данных
    mlflow.catboost.log_model(model, 'catboost_base', input_example=input_example)

2024/11/14 17:42:43 INFO mlflow.tracking.fluent: Experiment with name 'With text 10 classes, feature selected' does not exist. Creating a new experiment.


0:	learn: 723573.3583143	total: 13ms	remaining: 19.5s
200:	learn: 149203.5315271	total: 2.94s	remaining: 19s
400:	learn: 109738.9473087	total: 6.18s	remaining: 17s
600:	learn: 89451.4526259	total: 9.2s	remaining: 13.8s
800:	learn: 75886.2087722	total: 12.4s	remaining: 10.8s
1000:	learn: 65326.5838177	total: 15.6s	remaining: 7.76s
1200:	learn: 57812.1743187	total: 18.7s	remaining: 4.65s
1400:	learn: 51943.5750450	total: 22s	remaining: 1.55s
1499:	learn: 49541.6363720	total: 23.7s	remaining: 0us
Catboost model
  RMSE: 276301.4864292953
  MAE: 143242.2004885609
  R2: 0.8799572253340813
  mape: 0.22539072027295326


Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 784.95it/s] 


In [70]:
X_train.shape

(7584, 30)

In [None]:
mlflow.set_experiment('With text 10 classes, feature selected')

with mlflow.start_run():

    model_params = dict()

    model_params['n_estimators'] = 200
    model_params['max_depth'] = 20
    model_params['max_features'] = 30
    model_params['random_state'] = 42

    rf = RandomForestRegressor(**model_params)

    mlflow.log_params(model_params)

    rf.fit(X_train, y_train)

    y_pred = rf.predict(X_test)

    rmse, mae, mape, r2 = eval_metrics(y_test, y_pred)
    
    print('Random Forest model')
    print(f'  RMSE: {rmse}')
    print(f'  MAE: {mae}')
    print(f'  R2: {r2}')
    print(f'  mape: {mape}')

    mlflow.log_metric('rmse', rmse)
    mlflow.log_metric('mae', mae)
    mlflow.log_metric('r2', r2)
    mlflow.log_metric('mape', mape)

    # Пример входных данных для логирования
    input_example = X_test.sample(5) 

    # Логирование модели с примером входных данных
    mlflow.sklearn.log_model(rf, 'Random Forest Regressor', input_example=input_example)

Random Forest model
  RMSE: 284224.17554280773
  MAE: 153550.1335641417
  R2: 0.872974295192652
  mape: 0.23699669444147342


Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 12.36it/s] 


In [None]:
mlflow.set_experiment('All without text')

with mlflow.start_run():
    # Параметры модели
    model_params = {
        'input_dim': X_train.shape[1],
        'hidden_units': 64,
        'epochs': 100,
        'batch_size': 32
    }
    
    mlflow.log_params(model_params)

    # Стандартизация данных
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Создание модели нейронной сети
    model = Sequential()
    model.add(Dense(model_params['hidden_units'], input_dim=model_params['input_dim'], activation='relu'))
    model.add(Dense(1))

    model.compile(optimizer='adam', loss='mean_squared_error')

    # Обучение модели
    model.fit(X_train_scaled, y_train, epochs=model_params['epochs'], batch_size=model_params['batch_size'], verbose=0)

    # Прогнозирование
    y_pred = model.predict(X_test_scaled)

    # Оценка метрик
    rmse, mae, mape, r2 = eval_metrics(y_test, y_pred)
    
    print('Neural Network model')
    print(f'  RMSE: {rmse}')
    print(f'  MAE: {mae}')
    print(f'  R2: {r2}')
    print(f'  MAPE: {mape}')

    mlflow.log_metric('rmse', rmse)
    mlflow.log_metric('mae', mae)
    mlflow.log_metric('r2', r2)
    mlflow.log_metric('mape', mape)

    # Логирование модели без примера входных данных
    mlflow.keras.log_model(model, 'Neural Network Regressor')