In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import ndcg_score
import joblib
from catboost import CatBoostRanker, Pool

In [2]:
DATA_DIR = Path("range_features")
MODEL_DIR = Path("models")
MODEL_DIR.mkdir(exist_ok=True)

In [3]:
def build_group_sizes(df):
    return df.groupby(["visitorid", "anchor_session_id"]).size().tolist()


def split_train_valid(df, valid_frac=0.2):
    split_point = df["anchor_ts"].quantile(1.0 - valid_frac)
    return df[df["anchor_ts"] < split_point].copy(), df[
        df["anchor_ts"] >= split_point
    ].copy()


def eval_ndcg_per_group(df, preds, ks=(5, 10, 20)):
    metrics = {}
    sizes = build_group_sizes(df)
    labels = df["gain"].values
    start = 0
    for k in ks:
        vals = []
        start = 0
        for sz in sizes:
            if sz < 2:
                start += sz
                continue
            y = labels[start : start + sz]
            p = preds[start : start + sz]
            vals.append(ndcg_score([y], [p], k=k))
            start += sz
        metrics[f"ndcg_{k}"] = float(np.mean(vals)) if vals else np.nan
    return metrics

In [4]:
train_valid = pd.read_parquet(DATA_DIR / "train_X.parquet")
test = pd.read_parquet(DATA_DIR / "test_X.parquet")

train_valid слишком здоровенный получился не могу обучить на всем датасете

In [5]:
pos = train_valid[train_valid["gain"] > 0]
neg = train_valid[train_valid["gain"] == 0].sample(
    n=len(pos) * 8, random_state=42
)  # ratio 1:x
train_valid = pd.concat([pos, neg])

In [6]:
BASE_FEATURES = [
    "als_score",
    "sim_max",
    "item_pop_w",
    "sess_n_events",
    "sess_n_items",
    "sess_duration",
    "sess_cnt_view",
    "sess_cnt_addtocart",
    "sess_cnt_transaction",
    "available",
    "categoryid",
    "root_category",
    "level_0",
    "level_1",
    "level_2",
    "level_3",
    "level_4",
    "level_5",
    "value_count",
    "value_mean",
    "value_std",
    "value_min",
    "value_max",
]
DROP = ["visitorid", "anchor_session_id", "itemid", "anchor_ts", "gain", "timestamp"]

features = [c for c in BASE_FEATURES if c in train_valid.columns]


cat_features = [
    c
    for c in [
        "available",
        "categoryid",
        "root_category",
        "level_0",
        "level_1",
        "level_2",
        "level_3",
        "level_4",
        "level_5",
    ]
    if c in features
]
num_features = [c for c in features if c not in cat_features]

In [7]:
def convert_categorical_to_str(df: pd.DataFrame, cat_features: list) -> pd.DataFrame:
    df_copy = df.copy()

    for feature in cat_features:
        if feature in df_copy.columns:
            df_copy[feature] = df_copy[feature].astype(str)

    return df_copy

In [8]:
train_valid = convert_categorical_to_str(train_valid, cat_features)
test = convert_categorical_to_str(test, cat_features)

In [9]:
train_part, valid_part = split_train_valid(train_valid)

del train_valid


# Создаем group_id колонки
train_part["group_id"] = (
    train_part["visitorid"].astype(str)
    + "_"
    + train_part["anchor_session_id"].astype(str)
)
valid_part["group_id"] = (
    valid_part["visitorid"].astype(str)
    + "_"
    + valid_part["anchor_session_id"].astype(str)
)

# Сортируем по group_id
train_part = train_part.sort_values("group_id").reset_index(drop=True)
valid_part = valid_part.sort_values("group_id").reset_index(drop=True)

# Теперь создаем Pool
train_pool = Pool(
    train_part[features],
    label=train_part["gain"],
    group_id=train_part["group_id"],
    cat_features=cat_features,
)

valid_pool = Pool(
    valid_part[features],
    label=valid_part["gain"],
    group_id=valid_part["group_id"],
    cat_features=cat_features,
)

# модель CatBoostRanker
model = CatBoostRanker(
    iterations=1500,
    learning_rate=0.05,
    depth=8,
    l2_leaf_reg=3.0,
    loss_function="YetiRank",
    eval_metric="NDCG:top=10",
    random_seed=42,
    od_type="Iter",
    od_wait=50,
    use_best_model=True,
)

In [10]:
# обучение
model.fit(train_pool, eval_set=valid_pool, verbose=100)

Groupwise loss function. OneHotMaxSize set to 10
0:	test: 0.8049447	best: 0.8049447 (0)	total: 1.36s	remaining: 33m 58s
100:	test: 0.9055616	best: 0.9055616 (100)	total: 1m 49s	remaining: 25m 16s
200:	test: 0.9128306	best: 0.9128306 (200)	total: 3m 39s	remaining: 23m 40s
300:	test: 0.9164355	best: 0.9164633 (298)	total: 5m 29s	remaining: 21m 53s
400:	test: 0.9179489	best: 0.9179588 (399)	total: 7m 20s	remaining: 20m 7s
500:	test: 0.9190752	best: 0.9191437 (493)	total: 9m 11s	remaining: 18m 19s
600:	test: 0.9195284	best: 0.9195559 (588)	total: 11m	remaining: 16m 28s
700:	test: 0.9203721	best: 0.9203721 (700)	total: 12m 50s	remaining: 14m 38s
800:	test: 0.9205645	best: 0.9205645 (800)	total: 14m 36s	remaining: 12m 44s
900:	test: 0.9209521	best: 0.9209591 (899)	total: 16m 21s	remaining: 10m 52s
1000:	test: 0.9213550	best: 0.9213950 (999)	total: 18m 8s	remaining: 9m 2s
1100:	test: 0.9216654	best: 0.9216672 (1092)	total: 19m 55s	remaining: 7m 13s
1200:	test: 0.9220625	best: 0.9220671 (1197)

<catboost.core.CatBoostRanker at 0x7bf5e03140d0>

In [11]:
# валидация
pv = model.predict(valid_pool)


valid_metrics = eval_ndcg_per_group(valid_part, pv)

print("Valid metrics:", valid_metrics)

test["group_id"] = (
    test["visitorid"].astype(str) + "_" + test["anchor_session_id"].astype(str)
)

test = test.sort_values("group_id").reset_index(drop=True)

# тест
test_pool = Pool(
    test[features],
    label=test["gain"],
    group_id=test["group_id"],
    cat_features=cat_features,
)

Valid metrics: {'ndcg_5': 0.7545993596910043, 'ndcg_10': 0.7682340738536737, 'ndcg_20': 0.7719175556646417}


In [12]:
pt = model.predict(test_pool)

test_metrics = eval_ndcg_per_group(test, pt)
print("Test metrics:", test_metrics)

Test metrics: {'ndcg_5': 0.6588970793415756, 'ndcg_10': 0.6633743605540435, 'ndcg_20': 0.6664979058737888}


In [13]:
# сохраняем модель
model.save_model(str(MODEL_DIR / "catboost_ranker_8.cbm"))
joblib.dump(cat_features, MODEL_DIR / "cat_features_8.pkl")

['models/cat_features_8.pkl']

In [14]:
import mlflow

from utils_mlflow import setup_mlflow_client, setup_env

setup_env()

client = setup_mlflow_client()

EXPERIMENT_NAME = "online_recommendations_pr_final"
RUN_NAME = "range_model_8"


if client.get_experiment_by_name(EXPERIMENT_NAME) is None:
    experiment_id = client.create_experiment(EXPERIMENT_NAME)
else:
    experiment_id = client.get_experiment_by_name(EXPERIMENT_NAME).experiment_id


mlflow.set_tracking_uri(client.tracking_uri)
mlflow.set_registry_uri(client.tracking_uri)

In [15]:
# Начинаем MLflow run
with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id):
    # Логируем параметры
    params = {
        "iterations": 1500,
        "learning_rate": 0.05,
        "depth": 8,
        "l2_leaf_reg": 3.0,
        "loss_function": "YetiRank",
        "eval_metric": "NDCG:top=10",
        "random_seed": 42,
        "od_type": "Iter",
        "od_wait": 50,
        "pos_neg_ratio": "1:5",
        "n_features": len(features),
        "n_cat_features": len(cat_features),
        "train_size": len(train_part),
        "valid_size": len(valid_part),
        "test_size": len(test),
    }

    mlflow.log_params(params)

    # Логируем списки фичей
    mlflow.log_text("\n".join(features), "features.txt")
    mlflow.log_text("\n".join(cat_features), "categorical_features.txt")

    for metric_name, metric_value in valid_metrics.items():
        mlflow.log_metric(f"valid_{metric_name}", metric_value)

    for metric_name, metric_value in test_metrics.items():
        mlflow.log_metric(f"test_{metric_name}", metric_value)

    mlflow.catboost.log_model(
        model, "model", registered_model_name="catboost_ranker_ratio5"
    )

    mlflow.log_artifacts(str(MODEL_DIR), "model_files")
    try:
        feature_importance = model.get_feature_importance(test_pool)
        importance_df = pd.DataFrame(
            {"feature": features, "importance": feature_importance}
        ).sort_values("importance", ascending=False)

        importance_path = MODEL_DIR / "feature_importance.csv"
        importance_df.to_csv(importance_path, index=False)
        mlflow.log_artifact(str(importance_path), "feature_importance")

        # Логируем топ-10 важных фичей как параметры
        for i, (feat, imp) in enumerate(importance_df.head(10).values):
            mlflow.log_param(f"top_feature_{i + 1}", f"{feat}:{imp:.4f}")

    except Exception as e:
        print(f"Не удалось получить feature importance: {e}")

Registered model 'catboost_ranker_ratio5' already exists. Creating a new version of this model...
2025/09/29 08:05:19 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: catboost_ranker_ratio5, version 3
Created version '3' of model 'catboost_ranker_ratio5'.


🏃 View run range_model_8 at: http://127.0.0.1:5000/#/experiments/17/runs/ae9b5c42705744bb9e20b541379157e2
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/17
