Собираем метрики и делаем выводы

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.metrics import ndcg_score
import joblib
from catboost import CatBoostRanker, Pool
import mlflow
from utils_mlflow import setup_mlflow_client, setup_env
import pickle
import json

In [2]:
ALS_pth = Path("ALS_assets")

range_feature_pth = Path("range_features")

models_pth = Path("models")

num_recs = 10

In [3]:
test_range_model = pd.read_parquet(range_feature_pth / "test_X.parquet")
model_5 = CatBoostRanker().load_model(models_pth / "catboost_ranker.cbm")
model_8 = CatBoostRanker().load_model(models_pth / "catboost_ranker_8.cbm")

with open(models_pth / "cat_features.pkl", "rb") as f:
    cat_features_model = pickle.load(f)

In [4]:
def convert_categorical_to_str(df: pd.DataFrame, cat_features: list) -> pd.DataFrame:
    df_copy = df.copy()

    for feature in cat_features:
        if feature in df_copy.columns:
            df_copy[feature] = df_copy[feature].astype(str)

    return df_copy


def generate_ranking_recommendations(
    model, test_data: pd.DataFrame, features: list, cat_features: list, k: int = 15
) -> dict:
    """
    Генерация рекомендаций из ranking модели
    """
    # Подготавливаем данные для предсказания
    test_data = test_data.copy()

    test_data = convert_categorical_to_str(test_data, cat_features)

    test_data["group_id"] = (
        test_data["visitorid"].astype(str)
        + "_"
        + test_data["anchor_session_id"].astype(str)
    )
    test_data = test_data.sort_values("group_id").reset_index(drop=True)

    # Делаем предсказания
    test_pool = Pool(
        test_data[features],
        group_id=test_data["group_id"],
        cat_features=[
            c
            for c in features
            if c
            in [
                "available",
                "categoryid",
                "root_category",
                "level_0",
                "level_1",
                "level_2",
                "level_3",
                "level_4",
                "level_5",
            ]
        ],
    )

    predictions = model.predict(test_pool)
    test_data["prediction"] = predictions

    # Создаем топ-k рекомендации для каждого пользователя
    recommendations = {}
    for (visitor_id, session_id), group in test_data.groupby(
        ["visitorid", "anchor_session_id"]
    ):
        top_items = group.nlargest(k, "prediction")["itemid"].tolist()
        recommendations[visitor_id] = top_items

    return recommendations

In [5]:
def load_mappings(pth: Path) -> tuple[dict, dict]:
    with open(pth / "hash_visitoridx_train.json") as f:
        idx2user = {int(float(k)): int(float(v)) for k, v in json.load(f).items()}
    with open(pth / "hash_itemidx_train.json") as f:
        idx2item = {int(float(k)): int(float(v)) for k, v in json.load(f).items()}
    return idx2user, idx2item


def load_als_and_sim(idx2user: dict, idx2item: dict, pth_in: Path) -> tuple[dict, dict]:
    als_recs = pd.read_parquet(pth_in / "als_recommendations.parquet")
    als_recs["visitorid"] = als_recs["visitoridx"].map(idx2user)
    als_recs["itemid"] = als_recs["itemidx"].map(idx2item)
    als_recs = als_recs.dropna(subset=["visitorid", "itemid"])
    als_recs.rename(columns={"rating": "als_score"}, inplace=True)

    als_user_lookup = {
        uid: dict(zip(df_u["itemid"], df_u["als_score"]))
        for uid, df_u in als_recs.groupby("visitorid")
    }

    sim_df = pd.read_parquet(pth_in / "similar_items_df.parquet")

    sim_df["itemid"] = sim_df["items_idx"].map(idx2item)
    sim_df["sim_itemid"] = sim_df["sim_item_id_idx"].map(idx2item)
    sim_df = sim_df.dropna(subset=["itemid", "sim_itemid"])
    sim_df = sim_df[sim_df["itemid"] != sim_df["sim_itemid"]]

    sim_index = {
        iid: dict(zip(g["sim_itemid"], g["score"]))
        for iid, g in sim_df.groupby("itemid")
    }

    return als_user_lookup, sim_index

In [6]:
idx2user, idx2item = load_mappings(ALS_pth)

als_user_lookup, sim_index = load_als_and_sim(idx2user, idx2item, ALS_pth)

In [7]:
als_user_lookup.get(1)

{384302: 0.0066687241196632385,
 393028: 0.005869865417480469,
 161623: 0.005481332074850798,
 91105: 0.005304347723722458,
 206880: 0.0050936173647642136,
 219512: 0.005062459036707878,
 11279: 0.004846337251365185,
 274798: 0.004784093238413334,
 289103: 0.004687211941927671,
 254477: 0.004485338926315308,
 321850: 0.004393714014440775,
 350103: 0.004338754341006279,
 110529: 0.004218859598040581,
 89688: 0.004209653940051794,
 197642: 0.004183892160654068,
 190536: 0.004177401773631573,
 301843: 0.004097780212759972,
 359251: 0.004060692153871059,
 118446: 0.004054447636008263,
 115426: 0.0040053860284388065,
 260093: 0.004004208371043205,
 204070: 0.003922739997506142,
 368488: 0.00391325494274497,
 187132: 0.0039127361960709095,
 45543: 0.003854788839817047,
 120763: 0.003849760629236698,
 230616: 0.003848025808110833,
 331311: 0.003744009416550398,
 190000: 0.0037105244118720293,
 149253: 0.0035858992487192154,
 45323: 0.003490645904093981,
 216225: 0.00343985459767282,
 126555: 

In [8]:
def convert_base_line_to_recommendations(
    recs_from_base_line: dict, k: int = 10
) -> dict:
    """
    Конвертирует  рекомендации из формата {user: {item: score}}
    в формат {user: [top_k_items]}
    """
    recommendations = {}

    for user_id, item_scores in recs_from_base_line.items():
        # Сортируем товары по убыванию скора и берем топ-k
        sorted_items = sorted(item_scores.items(), key=lambda x: x[1], reverse=True)
        top_k_items = [item_id for item_id, score in sorted_items[:k]]
        recommendations[user_id] = top_k_items

    return recommendations

In [9]:
als_recs = convert_base_line_to_recommendations(als_user_lookup, num_recs)

sim_recs = convert_base_line_to_recommendations(sim_index, num_recs)

In [10]:
range_model_recs5 = generate_ranking_recommendations(
    model_5, test_range_model, model_5.feature_names_, cat_features_model, k=num_recs
)

range_model_recs8 = generate_ranking_recommendations(
    model_8, test_range_model, model_8.feature_names_, cat_features_model, k=num_recs
)

In [11]:
def calculate_recommendation_metrics(
    recommendations: dict,
    test_range_model: pd.DataFrame,
    k: int = 15,
    model_name: str = "model",
) -> dict:
    """
    Расчет метрик для рекомендательной системы

    Args:
        recommendations: словарь {user_id: [recommended_items]}
        test_range_model: тестовые данные с истинными взаимодействиями
        k: количество рекомендаций для оценки
        model_name: название модели для логирования

    Returns:
        dict: словарь с метриками
    """

    # Подготовка истинных взаимодействий из тестовых данных
    true_interactions = {}
    for _, row in test_range_model.iterrows():
        user_id = row["visitorid"]
        item_id = row["itemid"]

        if user_id not in true_interactions:
            true_interactions[user_id] = set()
        true_interactions[user_id].add(item_id)

    # Инициализация метрик
    precisions = []
    recalls = []
    f1_scores = []

    # Для каждого пользователя рассчитываем метрики
    for user_id in recommendations.keys():
        if user_id not in true_interactions:
            continue

        # Рекомендованные товары (топ-k)
        recommended_items = set(recommendations[user_id][:k])

        # Истинные взаимодействия пользователя
        true_items = true_interactions[user_id]

        # Пересечение рекомендаций и истинных взаимодействий
        relevant_recommended = recommended_items.intersection(true_items)

        # Precision@k = |relevant ∩ recommended| / |recommended|
        precision = (
            len(relevant_recommended) / len(recommended_items)
            if len(recommended_items) > 0
            else 0
        )
        precisions.append(precision)

        # Recall@k = |relevant ∩ recommended| / |relevant|
        recall = (
            len(relevant_recommended) / len(true_items) if len(true_items) > 0 else 0
        )
        recalls.append(recall)

        # F1@k
        f1 = (
            2 * precision * recall / (precision + recall)
            if (precision + recall) > 0
            else 0
        )
        f1_scores.append(f1)

    # Средние значения метрик
    avg_precision = np.mean(precisions) if precisions else 0
    avg_recall = np.mean(recalls) if recalls else 0
    avg_f1 = np.mean(f1_scores) if f1_scores else 0

    # Coverage - доля уникальных товаров в рекомендациях
    all_recommended_items = set()
    for recs in recommendations.values():
        all_recommended_items.update(recs[:k])

    all_items = set(test_range_model["itemid"].unique())
    coverage = len(all_recommended_items) / len(all_items) if len(all_items) > 0 else 0

    # Novelty - средняя популярность рекомендованных товаров (чем меньше, тем новее)
    item_popularity = test_range_model["itemid"].value_counts()
    total_interactions = len(test_range_model)

    novelty_scores = []
    for recs in recommendations.values():
        for item_id in recs[:k]:
            if item_id in item_popularity:
                # Популярность как доля от общего количества взаимодействий
                popularity = item_popularity[item_id] / total_interactions
                # Новизна = -log(популярность)
                novelty = -np.log(popularity) if popularity > 0 else 0
                novelty_scores.append(novelty)

    avg_novelty = np.mean(novelty_scores) if novelty_scores else 0

    # Собираем все метрики
    metrics = {
        "model_name": model_name,
        "k": k,
        "precision_at_k": avg_precision,
        "recall_at_k": avg_recall,
        "f1_at_k": avg_f1,
        "coverage": coverage,
        "novelty": avg_novelty,
        "num_users_evaluated": len(precisions),
        "num_recommendations": sum(len(recs[:k]) for recs in recommendations.values()),
    }

    return metrics

In [12]:
model5_metrics = calculate_recommendation_metrics(
    recommendations=range_model_recs5,
    test_range_model=test_range_model,
    k=num_recs,
    model_name="model5",
)

In [13]:
model5_metrics

{'model_name': 'model5',
 'k': 10,
 'precision_at_k': np.float64(1.0),
 'recall_at_k': np.float64(0.13956241850241313),
 'f1_at_k': np.float64(0.24215029809511837),
 'coverage': 0.6858085399021706,
 'novelty': np.float64(11.466331776634679),
 'num_users_evaluated': 19233,
 'num_recommendations': 192330}

In [14]:
model8_metrics = calculate_recommendation_metrics(
    recommendations=range_model_recs8,
    test_range_model=test_range_model,
    k=num_recs,
    model_name="model8",
)

In [15]:
model8_metrics

{'model_name': 'model8',
 'k': 10,
 'precision_at_k': np.float64(1.0),
 'recall_at_k': np.float64(0.13956241850241313),
 'f1_at_k': np.float64(0.24215029809511837),
 'coverage': 0.6863979979110435,
 'novelty': np.float64(11.45532126868709),
 'num_users_evaluated': 19233,
 'num_recommendations': 192330}

In [17]:
als_metrics = calculate_recommendation_metrics(
    recommendations=als_recs,
    test_range_model=test_range_model,
    k=num_recs,
    model_name="ALS",
)

In [18]:
als_metrics

{'model_name': 'ALS',
 'k': 10,
 'precision_at_k': np.float64(1.0),
 'recall_at_k': np.float64(0.06074396966961586),
 'f1_at_k': np.float64(0.11442899584161476),
 'coverage': 0.037704629830711794,
 'novelty': np.float64(8.315935037477049),
 'num_users_evaluated': 3724,
 'num_recommendations': 9919690}

In [19]:
sim_metrics = calculate_recommendation_metrics(
    recommendations=sim_recs,
    test_range_model=test_range_model,
    k=num_recs,
    model_name="similar_items",
)

In [20]:
sim_metrics

{'model_name': 'similar_items',
 'k': 10,
 'precision_at_k': np.float64(0.0010737628384687207),
 'recall_at_k': np.float64(0.00011391006527439793),
 'f1_at_k': np.float64(0.0002011026886381323),
 'coverage': 1.6662426705550213,
 'novelty': np.float64(13.483117191112992),
 'num_users_evaluated': 2142,
 'num_recommendations': 1598960}

In [21]:
import mlflow
from utils_mlflow import setup_env, setup_mlflow_client


EXPERIMENT_NAME = "online_recommendations_pr_final"
RUN_NAME = "collect_metrics"


setup_env()
client = setup_mlflow_client()


if client.get_experiment_by_name(EXPERIMENT_NAME) is None:
    experiment_id = client.create_experiment(EXPERIMENT_NAME)
else:
    experiment_id = client.get_experiment_by_name(EXPERIMENT_NAME).experiment_id


mlflow.set_tracking_uri(client.tracking_uri)
mlflow.set_registry_uri(client.tracking_uri)

In [23]:
def add_model_prefix(metrics_dict: dict, model_name: str) -> dict:
    """
    Добавляет префикс модели к именам метрик
    """
    prefixed_metrics = {}
    for key, value in metrics_dict.items():
        if key not in ["model_name"]:
            new_key = f"{model_name}_{key}"
            prefixed_metrics[new_key] = value
    return prefixed_metrics


# Добавляем префиксы
model5_metrics_prefixed = add_model_prefix(model5_metrics, "model5")
model8_metrics_prefixed = add_model_prefix(model8_metrics, "model8")
als_metrics_prefixed = add_model_prefix(als_metrics, "als")
sim_metrics_prefixed = add_model_prefix(sim_metrics, "similarity")

In [24]:
with mlflow.start_run(experiment_id=experiment_id, run_name=RUN_NAME) as run:
    mlflow.log_metrics(model5_metrics_prefixed)
    mlflow.log_metrics(model8_metrics_prefixed)
    mlflow.log_metrics(als_metrics_prefixed)
    mlflow.log_metrics(sim_metrics_prefixed)

    mlflow.log_param("num_recs", num_recs)

🏃 View run collect_metrics at: http://127.0.0.1:5000/#/experiments/17/runs/477114c3531d4936bf6ff68fc3c2e561
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/17
