In [19]:
import os
import mlflow

from sklearn.model_selection import train_test_split
import pandas as pd
import numpy

from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_absolute_percentage_error, mean_absolute_error, mean_squared_error

In [20]:
import pandas as pd

In [21]:
# Load the cleaned car dataset and preview its first rows.
# NOTE: unpickling can execute arbitrary code, so only load pickle files from a trusted source.
clean_data_path = '../data/CleanCarData.pkl'
df = pd.read_pickle(clean_data_path)
display(df.head(10))

  has_large_values = (abs_vals > 1e6).any()


Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Driven_kms,Fuel_Type,Selling_type,Transmission,Owner
0,ritz,2014,3.349609,5.589844,27000,Petrol,Dealer,Manual,0
1,sx4,2013,4.75,9.539062,43000,Diesel,Dealer,Manual,0
2,ciaz,2017,7.25,9.851562,6900,Petrol,Dealer,Manual,0
3,wagon r,2011,2.849609,4.148438,5200,Petrol,Dealer,Manual,0
4,swift,2014,4.601562,6.871094,42450,Diesel,Dealer,Manual,0
5,vitara brezza,2018,9.25,9.828125,2071,Diesel,Dealer,Manual,0
6,ciaz,2015,6.75,8.117188,18796,Petrol,Dealer,Manual,0
7,s cross,2015,6.5,8.609375,33429,Diesel,Dealer,Manual,0
8,ciaz,2016,8.75,8.890625,20273,Diesel,Dealer,Manual,0
9,ciaz,2015,7.449219,8.921875,42367,Diesel,Dealer,Manual,0


In [22]:
# The column to be predicted is exposed under the generic name 'target'.
df = df.rename({'Present_Price': 'target'}, axis='columns')

In [23]:
# Separate features from the target, then hold out 25% of the rows for evaluation (fixed seed).
features = df.drop(columns='target')
target = df['target']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.25,
    random_state=2,
)

In [24]:
# Names of the training columns whose dtype is 'category' or 'object'.
categorical_columns = X_train.select_dtypes(include=['category', 'object']).columns
cat_features = categorical_columns.to_list()
cat_features

['Car_Name', 'Fuel_Type', 'Selling_type', 'Transmission']

In [25]:
# Names of the training columns with a numeric dtype.
numeric_columns = X_train.select_dtypes(include=['number']).columns
num_features = numeric_columns.to_list()
num_features

['Year', 'Selling_Price', 'Driven_kms', 'Owner']

In [26]:
# Building blocks of the modelling pipeline.
s_scaler = StandardScaler()
# Categories not seen during fit are encoded as unknown_value instead of raising;
# unknown_value must be chosen carefully so it cannot collide with a real category code.
l_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=99999999)
# Fixed seed (same value as the train/test split) so the fitted forest, and the
# metrics logged from it, are reproducible across kernel restarts.
regressor = RandomForestRegressor(random_state=2)

In [27]:
# ColumnTransformer makes it convenient to apply different preprocessing per column group.
column_transformers = [
    ('num', s_scaler, num_features),   # transformation for numeric features
    ('cat', l_encoder, cat_features),  # transformation for categorical features
]
preprocessor = ColumnTransformer(
    transformers=column_transformers,
    remainder='drop',  # drop columns that are not covered by any transformer
)

In [28]:
# Chain preprocessing and the regressor into a single estimator, then fit it on the training split.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('model', regressor),
]
pipeline = Pipeline(steps=pipeline_steps)

pipeline.fit(X_train, y_train)

In [29]:
# Score the fitted pipeline on the held-out split.
predictions = pipeline.predict(X_test)

# Error metrics keyed by the names under which they are later logged.
metrics = {
    "mae": mean_absolute_error(y_test, predictions),
    "mape": mean_absolute_percentage_error(y_test, predictions),
    "mse": mean_squared_error(y_test, predictions),
}

metrics

{'mae': np.float64(1.314117720754523),
 'mape': np.float64(0.9772362643562917),
 'mse': np.float64(3.0013884047909003)}

In [32]:
# Work with a locally running MLflow server.
TRACKING_SERVER_HOST = "127.0.0.1"
TRACKING_SERVER_PORT = 5000

# Tracking and model registry are both served from the same host and port.
tracking_uri = f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}"
registry_uri = f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}"

mlflow.set_tracking_uri(tracking_uri)
mlflow.set_registry_uri(registry_uri)

In [68]:
# Names used when logging to MLflow.
# Experiment that groups the runs:
EXPERIMENT_NAME = "car_project_2"
# Run created inside that experiment:
RUN_NAME = "baseline model"
# Name under which the model is to be registered:
REGISTRY_MODEL_NAME = "car_model_rf"

In [69]:
# Always log the model signature and an input example; prepare both here.
from mlflow.models import infer_signature

# A few training rows serve as the logged input example and as the sample for schema inference.
input_example = X_train.head(5)
# Infer the input schema from the sample and the output schema from the fitted
# pipeline's predictions on it, so the signature describes both inputs and outputs.
signature = infer_signature(
    model_input=input_example,
    model_output=pipeline.predict(input_example),
)
req_file = '../requirements.txt'
# Parameters to log can be listed by hand or taken wholesale from the model.
#params_dict = {'n_estimators': 10, 'max_depth': 10}
params_dict = pipeline.get_params()



In [71]:
# Get-or-create the experiment: create_experiment raises when the name is already
# taken, which would make this cell fail on every re-run of the notebook.
existing_experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if existing_experiment is None:
    experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
else:
    experiment_id = existing_experiment.experiment_id

In [72]:
# Log the fitted pipeline, its metrics and its parameters within a single MLflow run.
with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    # unique identifier of this run
    run_id = run.info.run_id
    mlflow.sklearn.log_model(
        pipeline,
        artifact_path="models",
        signature=signature,
        input_example=input_example,
        pip_requirements=req_file,
    )
    mlflow.log_metrics(metrics)
    mlflow.log_params(params_dict)

# Re-fetch the run after the context manager has exited and check its final status.
run = mlflow.get_run(run_id)
assert run.info.status == 'FINISHED'

2024/10/22 18:24:56 INFO mlflow.tracking._tracking_service.client: 🏃 View run baseline model at: http://127.0.0.1:5000/#/experiments/2/runs/93b34d1299f9441f9a7168bad2f10018.
2024/10/22 18:24:56 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/2.


In [66]:
# NOTE(review): bare attribute reference -- the function is not called here, so nothing is deleted.
mlflow.delete_experiment

2024/10/22 18:23:27 INFO mlflow.tracking._tracking_service.client: 🏃 View run useful-horse-637 at: http://127.0.0.1:5000/#/experiments/0/runs/3eacfbd3b93f473384fef9a17931502d.
2024/10/22 18:23:27 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/0.


Exception: Could not get run status corresponding to string 3eacfbd3b93f473384fef9a17931502d. Valid run status strings: ['RUNNING', 'SCHEDULED', 'FINISHED', 'FAILED', 'KILLED']