In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

X = pd.read_csv('data/X_autofeat.csv')
y = pd.read_csv('data/y_autofeat.csv')

features = ["monthly_charges", "total_charges", "senior_citizen_0",	"senior_citizen_1"]
target = "target"
test_size = 0.2  # Размер тестовой выборки

X_train, X_test, y_train, y_test = train_test_split(X[features], y, test_size=test_size, shuffle=False)


In [5]:
import os
import mlflow.pyfunc
import optuna
from optuna.integration.mlflow import MLflowCallback
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix, roc_auc_score, precision_score, recall_score, f1_score, log_loss
from collections import defaultdict
import mlflow
from dotenv import load_dotenv
load_dotenv()


# Настройка окружения для MLflow и S3
os.environ["MLFLOW_S3_ENDPOINT_URL"] = "https://storage.yandexcloud.net"
os.environ["AWS_ACCESS_KEY_ID"] = os.getenv("AWS_ACCESS_KEY_ID")
os.environ["AWS_SECRET_ACCESS_KEY"] = os.getenv("AWS_SECRET_ACCESS_KEY")

# Настройка MLflow
TRACKING_SERVER_HOST = "127.0.0.1"
TRACKING_SERVER_PORT = 5000
mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")
mlflow.set_registry_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")

TABLE_NAME = 'users_churn'

# Настройка эксперимента
EXPERIMENT_NAME = "catboost_hyperparameter_tuning"
RUN_NAME = "model_bayesian_search"

# Настройка study
STUDY_DB_NAME = "sqlite:///local.study.db"  # Локальная база данных для хранения study
STUDY_NAME = "churn_model"

# Функция objective для Optuna
def objective(trial: optuna.Trial) -> float:
    param = {
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.1, log=True),
        "depth": trial.suggest_int("depth", 1, 12),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 0.1, 5),
        "random_strength": trial.suggest_float("random_strength", 0.1, 5),
        "loss_function": "Logloss",
        "task_type": "CPU",
        "random_seed": 0,
        "iterations": 300,
        "verbose": False,
    }
    model = CatBoostClassifier(**param)

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    metrics = defaultdict(list)

    for train_index, val_index in skf.split(X_train, y_train):
        # Преобразуем индексы в numpy.ndarray, если это необходимо
        train_X, val_X = X_train.iloc[train_index], X_train.iloc[val_index]
        train_y, val_y = y_train.iloc[train_index], y_train.iloc[val_index]

        model.fit(train_X, train_y, eval_set=(val_X, val_y), verbose=False)
        prediction = model.predict(val_X)
        probas = model.predict_proba(val_X)[:, 1]

        _, err1, _, err2 = confusion_matrix(val_y, prediction, normalize='all').ravel()
        auc = roc_auc_score(val_y, probas)
        precision = precision_score(val_y, prediction)
        recall = recall_score(val_y, prediction)
        f1 = f1_score(val_y, prediction)
        logloss = log_loss(val_y, probas)

        metrics["err1"].append(err1)
        metrics["err2"].append(err2)
        metrics["auc"].append(auc)
        metrics["precision"].append(precision)
        metrics["recall"].append(recall)
        metrics["f1"].append(f1)
        metrics["logloss"].append(logloss)

    # Агрегирование метрик
    err1 = sum(metrics["err1"]) / len(metrics["err1"])
    err2 = sum(metrics["err2"]) / len(metrics["err2"])
    auc = sum(metrics["auc"]) / len(metrics["auc"])
    precision = sum(metrics["precision"]) / len(metrics["precision"])
    recall = sum(metrics["recall"]) / len(metrics["recall"])
    f1 = sum(metrics["f1"]) / len(metrics["f1"])
    logloss = sum(metrics["logloss"]) / len(metrics["logloss"])

    # Логирование метрик в MLflow
    mlflow.log_metrics({
        "err1": err1,
        "err2": err2,
        "auc": auc,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "logloss": logloss,
    })

    return auc

# Создание эксперимента в MLflow
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if not experiment:
    experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
else:
    experiment_id = experiment.experiment_id

# Запуск эксперимента
with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id

    # Настройка MLflow Callback с mlflow_kwargs
    mlflc = MLflowCallback(
        tracking_uri=f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}",
        metric_name="auc",
        mlflow_kwargs={
            "run_name": RUN_NAME,
            "nested": True,  # Вложенные запуски для логирования каждого trial
        }
    )

    # Создание и оптимизация study
    study = optuna.create_study(
        study_name=STUDY_NAME,
        storage=STUDY_DB_NAME,  # Использование локальной базы данных
        sampler=optuna.samplers.TPESampler(),
        direction="maximize",
        load_if_exists=True
    )
    study.optimize(objective, n_trials=10, callbacks=[mlflc])

    # Логирование лучших параметров
    best_params = study.best_params
    mlflow.log_params(best_params)

    # Логирование модели
    best_model = CatBoostClassifier(**best_params)
    best_model.fit(X_train, y_train)
    mlflow.catboost.log_model(best_model, "cv")
    # Сохраняем модель в формате CatBoost
    best_model.save_model("models/model.pkl")
    # Логируем модель в MLflow
    mlflow.log_artifact("models/model.pkl", "cv")

    # Вывод результатов
    print(f"Number of finished trials: {len(study.trials)}")
    print(f"Best params: {best_params}")

  mlflc = MLflowCallback(
[I 2025-03-08 18:50:26,563] Using an existing study with name 'churn_model' instead of creating a new one.
[I 2025-03-08 18:50:29,367] Trial 65 finished with value: 0.82133283007552 and parameters: {'learning_rate': 0.06899115462802699, 'depth': 4, 'l2_leaf_reg': 0.7319881133030135, 'random_strength': 1.8805491407222639}. Best is trial 9 with value: 0.8420247049008844.
[I 2025-03-08 18:50:39,794] Trial 66 finished with value: 0.8177450759361943 and parameters: {'learning_rate': 0.022360689682981244, 'depth': 11, 'l2_leaf_reg': 0.3991541118232797, 'random_strength': 0.8062247135753791}. Best is trial 9 with value: 0.8420247049008844.
[I 2025-03-08 18:50:55,000] Trial 67 finished with value: 0.8168745890903111 and parameters: {'learning_rate': 0.0081622156271034, 'depth': 11, 'l2_leaf_reg': 0.3598320961664071, 'random_strength': 0.30502191368070325}. Best is trial 9 with value: 0.8420247049008844.
[I 2025-03-08 18:51:10,719] Trial 68 finished with value: 0.81631

0:	learn: 0.6725155	total: 1.53ms	remaining: 1.53s
1:	learn: 0.6538097	total: 3.08ms	remaining: 1.54s
2:	learn: 0.6374266	total: 4.5ms	remaining: 1.5s
3:	learn: 0.6225163	total: 5.9ms	remaining: 1.47s
4:	learn: 0.6082345	total: 7.26ms	remaining: 1.44s
5:	learn: 0.5952573	total: 8.82ms	remaining: 1.46s
6:	learn: 0.5823016	total: 10.3ms	remaining: 1.46s
7:	learn: 0.5699113	total: 11.7ms	remaining: 1.45s
8:	learn: 0.5621099	total: 13.2ms	remaining: 1.46s
9:	learn: 0.5526369	total: 14.7ms	remaining: 1.45s
10:	learn: 0.5426870	total: 16.3ms	remaining: 1.46s
11:	learn: 0.5343609	total: 17.8ms	remaining: 1.47s
12:	learn: 0.5261867	total: 19.3ms	remaining: 1.46s
13:	learn: 0.5197585	total: 20.7ms	remaining: 1.46s
14:	learn: 0.5130942	total: 22.3ms	remaining: 1.46s
15:	learn: 0.5068843	total: 23.7ms	remaining: 1.46s
16:	learn: 0.5017180	total: 25.3ms	remaining: 1.46s
17:	learn: 0.4949499	total: 26.8ms	remaining: 1.46s
18:	learn: 0.4899359	total: 28.2ms	remaining: 1.46s
19:	learn: 0.4838801	tota