# Замеры метрик для текстового и гибридного поиска

Хотим проверить докидывает ли метрик векторный поиск как новый источник кандидатов, и как фича в L2 ранкере

In [38]:
import pandas as pd
import numpy as np
from typing import Dict, List, Tuple

def calculate_metrics_for_ranking(df: pd.DataFrame, rank_col: str = 'rank', 
                                  group_by_col: str = 'query_text') -> Dict:
    """Расчет метрик для заданного способа ранжирования"""
    
    # Recall@k
    recall_at_k = {}
    for k in [1, 5, 10, 20]:
        recall = df.groupby(group_by_col).apply(
            lambda x: ((x[rank_col] <= k) & (x['relevant'] == 1)).any()
        ).mean()
        recall_at_k[k] = recall
    
    # MRR@10
    first_relevant_ranks = df[df['relevant'] == 1].groupby(group_by_col)[rank_col].min()
    first_relevant_ranks = first_relevant_ranks[first_relevant_ranks <= 10]
    reciprocal_ranks = 1.0 / first_relevant_ranks
    mrr = reciprocal_ranks.mean() if len(reciprocal_ranks) > 0 else 0
    
    # Precision@10
    df_top10 = df[df[rank_col] <= 10]
    precision_at_10 = df_top10.groupby(group_by_col)['relevant'].mean().mean()
    
    # nDCG@10
    ndcg_at_10 = calculate_ndcg(df, rank_col, group_by_col, k=10)
    
    # Average Precision@10
    ap_at_10 = calculate_average_precision(df, rank_col, group_by_col, k=10)
    
    return {
        'recall': recall_at_k,
        'mrr@10': mrr,
        'precision@10': precision_at_10,
        'ndcg@10': ndcg_at_10,
        'ap@10': ap_at_10,
        'queries_processed': df[group_by_col].nunique(),
        'avg_rank_relevant': df[df['relevant'] == 1][rank_col].mean()
    }

def calculate_ndcg(df: pd.DataFrame, rank_col: str, group_by_col: str, k: int = 10) -> float:
    """Расчет nDCG@k"""
    ndcg_scores = []
    
    for query in df[group_by_col].unique():
        query_results = df[df[group_by_col] == query]
        query_results = query_results.sort_values(rank_col).head(k)
        
        # Получаем релевантности и ранги
        y_true = query_results['relevant'].values
        if len(y_true) == 0:
            continue
            
        # Идеальный порядок (отсортированный по релевантности)
        ideal_order = np.sort(y_true)[::-1]
        
        # Расчет DCG
        dcg = np.sum(y_true / np.log2(np.arange(2, len(y_true) + 2)))
        
        # Расчет идеального DCG
        idcg = np.sum(ideal_order / np.log2(np.arange(2, len(ideal_order) + 2)))
        
        if idcg > 0:
            ndcg = dcg / idcg
            ndcg_scores.append(ndcg)
    
    return np.mean(ndcg_scores) if ndcg_scores else 0

def calculate_average_precision(df: pd.DataFrame, rank_col: str, 
                               group_by_col: str, k: int = 10) -> float:
    """Расчет Average Precision@k"""
    ap_scores = []
    
    for query in df[group_by_col].unique():
        query_results = df[df[group_by_col] == query]
        query_results = query_results.sort_values(rank_col).head(k)
        
        relevant_docs = query_results[query_results['relevant'] == 1]
        if len(relevant_docs) == 0:
            ap_scores.append(0)
            continue
        
        precisions = []
        for idx, (_, row) in enumerate(relevant_docs.iterrows(), 1):
            # Precision на текущем ранге
            precision_at_r = idx / row[rank_col]
            precisions.append(precision_at_r)
        
        ap = np.mean(precisions) if precisions else 0
        ap_scores.append(ap)
    
    return np.mean(ap_scores) if ap_scores else 0

def compare_hybrid_rankings(df: pd.DataFrame) -> Dict:
    """Сравнение разных способов ранжирования в гибридном поиске"""
    
    # Фильтруем только гибридный поиск
    df_hybrid = df[df['search_type'] == 'hybrid'].copy()
    df_text = df[df['search_type'] == 'text'].copy()
    
    # Создаем два разных ранжирования для гибридного поиска
    
    # 1. Оригинальное ранжирование (по score)
    df_hybrid_original = df_hybrid.copy()
    
    # 2. Ранжирование по score_text_only
    df_hybrid_text = df_hybrid.copy()
    
    # Для каждого запроса пересчитываем ранги по score_text_only
    def rank_by_text_score(group):
        group = group.sort_values('score_text_only', ascending=False)
        group['rank_text_only'] = range(1, len(group) + 1)
        return group
    
    df_hybrid_text = df_hybrid_text.groupby('query_text', group_keys=False).apply(rank_by_text_score)
    
    # 3. Ранжирование по комбинации score и score_text_only (например, среднее)
    df_hybrid_combined = df_hybrid.copy()
    df_hybrid_combined['combined_score'] = 0.7 * df_hybrid_combined['score'] + 0.3 * df_hybrid_combined['score_text_only']
    
    def rank_by_combined_score(group):
        group = group.sort_values('combined_score', ascending=False)
        group['rank_combined'] = range(1, len(group) + 1)
        return group
    
    df_hybrid_combined = df_hybrid_combined.groupby('query_text', group_keys=False).apply(rank_by_combined_score)
    
    # Считаем метрики для каждого способа
    metrics_original = calculate_metrics_for_ranking(df_hybrid_original, 'rank', 'query_text')
    metrics_text_only = calculate_metrics_for_ranking(df_hybrid_text, 'rank_text_only', 'query_text')
    metrics_text = calculate_metrics_for_ranking(df_text, 'rank', 'query_text')
    
    return {
        'text': metrics_text,
        'hybrid': metrics_text_only,
        'hybrid_with_l2_feature': metrics_original,
    }


def print_metrics_comparison(metrics_dict: Dict):
    """Красивый вывод сравнения метрик"""
    
    print("\n" + "="*100)
    print("COMPARISON OF HYBRID SEARCH RANKING METHODS")
    print("="*100)
    
    # Создаем DataFrame для удобного отображения
    comparison_data = []
    
    for method_name, metrics in metrics_dict.items():
        comparison_data.append({
            'Method': method_name,
            'MRR@10': f"{metrics['mrr@10']:.4f}",
            'Precision@10': f"{metrics['precision@10']:.4f}",
            'nDCG@10': f"{metrics['ndcg@10']:.4f}",
            'AP@10': f"{metrics['ap@10']:.4f}",
            'Recall@1': f"{metrics['recall'][1]:.2%}",
            'Recall@5': f"{metrics['recall'][5]:.2%}",
            'Recall@10': f"{metrics['recall'][10]:.2%}",
            'Recall@20': f"{metrics['recall'][20]:.2%}",
            'Avg Rank': f"{metrics['avg_rank_relevant']:.1f}",
            'Queries': metrics['queries_processed']
        })
    
    df_comparison = pd.DataFrame(comparison_data)
    
    # Выводим таблицу
    print("\n" + df_comparison.to_string(index=False))
    
    # Рассчитываем улучшения
    if 'hybrid_original' in metrics_dict and 'hybrid_text_only' in metrics_dict:
        orig = metrics_dict['hybrid_original']
        text = metrics_dict['hybrid_text_only']
        
        print("\n" + "-"*80)
        print("IMPROVEMENT OF TEXT-ONLY RANKING OVER ORIGINAL HYBRID:")
        print("-"*80)
        
        improvements = {
            'MRR@10': (text['mrr@10'] - orig['mrr@10']) / orig['mrr@10'] * 100 if orig['mrr@10'] > 0 else 0,
            'Precision@10': (text['precision@10'] - orig['precision@10']) / orig['precision@10'] * 100 if orig['precision@10'] > 0 else 0,
            'Recall@10': (text['recall'][10] - orig['recall'][10]) / orig['recall'][10] * 100 if orig['recall'][10] > 0 else 0,
        }
        
        for metric, improvement in improvements.items():
            direction = "↑" if improvement > 0 else "↓"
            print(f"  {metric}: {direction} {improvement:+.1f}%")
        
        best_method = max(metrics_dict.items(), 
                         key=lambda x: x[1]['mrr@10'] * 0.4 + 
                                      x[1]['precision@10'] * 0.3 + 
                                      x[1]['recall'][10] * 0.3)
        
        print(f"\n★ Best method by weighted score: {best_method[0]} "
              f"(MRR: {best_method[1]['mrr@10']:.4f}, P@10: {best_method[1]['precision@10']:.4f})")

df = pd.read_parquet("../search_results.parquet")

print("="*80)
print("ANALYSIS OF SEARCH RESULTS")
print("="*80)

hybrid_comparison = compare_hybrid_rankings(df)
print_metrics_comparison(hybrid_comparison)


ANALYSIS OF SEARCH RESULTS


  df_hybrid_text = df_hybrid_text.groupby('query_text', group_keys=False).apply(rank_by_text_score)
  df_hybrid_combined = df_hybrid_combined.groupby('query_text', group_keys=False).apply(rank_by_combined_score)
  recall = df.groupby(group_by_col).apply(
  recall = df.groupby(group_by_col).apply(
  recall = df.groupby(group_by_col).apply(
  recall = df.groupby(group_by_col).apply(
  recall = df.groupby(group_by_col).apply(
  recall = df.groupby(group_by_col).apply(
  recall = df.groupby(group_by_col).apply(
  recall = df.groupby(group_by_col).apply(
  recall = df.groupby(group_by_col).apply(
  recall = df.groupby(group_by_col).apply(
  recall = df.groupby(group_by_col).apply(
  recall = df.groupby(group_by_col).apply(



COMPARISON OF HYBRID SEARCH RANKING METHODS

                Method MRR@10 Precision@10 nDCG@10  AP@10 Recall@1 Recall@5 Recall@10 Recall@20 Avg Rank  Queries
                  text 0.4220       0.0222  0.5558 0.0900    5.11%   13.61%    21.32%    30.23%     16.0      999
                hybrid 0.4807       0.0494  0.6016 0.2377   14.91%   34.83%    49.45%    63.86%     16.3      999
hybrid_with_l2_feature 0.8423       0.0906  0.8809 0.7631   68.17%   86.59%    90.59%    92.89%      2.8      999


Добавление векторного источника кандидатов докидывает метрик, но сильный прирост видно именно после добавления фичи в ранжирующую модель