In [29]:
import warnings
warnings.filterwarnings('ignore')

In [30]:
import os
import numpy as np
import pandas as pd

In [31]:
import mlflow
from mlflow.models import infer_signature

In [32]:
from datetime import datetime, timedelta
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, median_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
from typing import Any, Dict, Literal, NoReturn

In [33]:
from sqlalchemy import create_engine

In [34]:
from dotenv import load_dotenv
load_dotenv()

True

In [35]:
PASSWORD_DB = os.getenv('PASSWORD_DB', None)
USER_DB = os.getenv('USER_DB', None)

In [36]:
os.getenv("MLFLOW_S3_ENDPOINT_URL", None)

'https://storage.yandexcloud.net'

In [37]:
os.getenv("MLFLOW_TRACKING_URI", None)

'postgresql://mlflow:mlflow@localhost:5432/mlflow_db'

In [38]:
FEATURES = [
    "MedInc", "HouseAge", "AveRooms", "AveBedrms", "Population", "AveOccup",
    "Latitude", "Longitude"
]
TARGET = "MedHouseVal"

In [41]:
engine = create_engine(f"postgresql://{USER_DB}:{PASSWORD_DB}@localhost:5432/postgres")

In [42]:
data = pd.read_sql_query("SELECT * FROM california_housing", engine)
data.head(5)

Unnamed: 0,index,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [43]:
X, y = data[FEATURES], data[TARGET]

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=42)
scaler = StandardScaler()
X_train_fitted = scaler.fit_transform(X_train)
X_test_fitted = scaler.transform(X_test)

model = RandomForestRegressor()

In [56]:
def train_model(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    prediction = model.predict(X_test)
    r2 = r2_score(y_test, prediction)
    rmse = mean_squared_error(y_test, prediction)**0.5
    mae = median_absolute_error(y_test, prediction)

    # сохранение метрик
    mlflow.log_metric("r2_score", r2)
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("mae", mae)

    # сохранение модели
    signature = infer_signature(X_test, prediction)
    mlflow.sklearn.log_model(model, "model_lr", signature=signature)
    mlflow.sklearn.save_model(model, "model_lr")

In [57]:
exp_name = "rnd_forest_4"
mlflow.create_experiment(exp_name, artifact_location=f"s3://alxvmr-mlflow-artifacts/{exp_name}")
mlflow.set_experiment(exp_name)

<Experiment: artifact_location='s3://alxvmr-mlflow-artifacts/rnd_forest_4', creation_time=1710242938504, experiment_id='7', last_update_time=1710242938504, lifecycle_stage='active', name='rnd_forest_4', tags={}>

In [58]:
mlflow.end_run()

In [59]:
with mlflow.start_run(run_name="run_4"):
    train_model(model, X_train_fitted, X_test_fitted, y_train, y_test)

In [None]:
mlflow.get_experiment_by_name("rnd_forest_4")

In [None]:
#client = mlflow.client.MlflowClient()

In [None]:
#client.tracking_uri