In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import re


In [None]:
import csv
import logging
from typing import Dict, List, Tuple

import numpy as np
import pytrec_eval

def evaluate(
    qrels: Dict[str, Dict[str, int]],
    results: Dict[str, Dict[str, float]],
    k_values: List[int],
) -> Dict[str, float]:
    """
    Score retrieval results against relevance judgments with pytrec_eval.

    Modeled on ``beir.retrieval.evaluation.EvaluateRetrieval.evaluate``.
    NDCG, MAP, recall and precision are all computed at every cutoff, but only
    the recall table is returned (the other tables are emitted at DEBUG log
    level).

    Args:
        qrels: Mapping ``query_id -> {passage_id: relevance}``.
        results: Mapping ``query_id -> {passage_id: score}``.
        k_values: Cutoffs at which each metric is computed.

    Returns:
        ``{"Recall@k": value}`` for every ``k`` in ``k_values``. Each value is
        the mean over the queries scored by pytrec_eval, rounded to 5 decimal
        places. When no query is scored, every value is ``0.0``.
    """
    cutoffs = ",".join(str(k) for k in k_values)
    measures = {
        f"map_cut.{cutoffs}",
        f"ndcg_cut.{cutoffs}",
        f"recall.{cutoffs}",
        f"P.{cutoffs}",
    }
    evaluator = pytrec_eval.RelevanceEvaluator(qrels, measures)
    scores = evaluator.evaluate(results)

    # Display label -> prefix of the per-query key produced by pytrec_eval.
    metric_prefixes = {
        "NDCG": "ndcg_cut_",
        "MAP": "map_cut_",
        "Recall": "recall_",
        "P": "P_",
    }
    tables = {
        label: {f"{label}@{k}": 0.0 for k in k_values}
        for label in metric_prefixes
    }

    if not scores:
        # Averaging over zero queries would divide by zero; report zeros instead.
        logging.warning("No query was scored; returning 0.0 for every cutoff.")
        return tables["Recall"]

    num_queries = len(scores)
    for label, prefix in metric_prefixes.items():
        for k in k_values:
            total = sum(per_query[f"{prefix}{k}"] for per_query in scores.values())
            tables[label][f"{label}@{k}"] = round(total / num_queries, 5)

    for table in tables.values():
        for metric_name, value in table.items():
            logging.debug(f"{metric_name}: {value:.4f}")

    return tables["Recall"]


def load_gt(gt_path: str) -> Dict[str, Dict[str, int]]:
    """
    Load a ground-truth ``.npy`` file and convert it to pytrec_eval qrels.

    The entry at position ``i`` of the loaded array is iterated as the
    relevant passage ids of query ``str(i)`` (0, 1, 2, ...). Every listed
    passage is assigned relevance 1.
    """
    # NOTE(review): allow_pickle=True will unpickle arbitrary objects; only
    # point this at trusted files.
    per_query_passages = np.load(gt_path, allow_pickle=True)
    return {
        str(position): {str(passage_id): 1 for passage_id in relevant}
        for position, relevant in enumerate(per_query_passages)
    }


def load_results(results_path: str) -> Dict[str, Dict[str, float]]:
    """
    Load a tab-separated retrieval run and convert it to pytrec_eval results.

    Accepted row layouts:
    - 4 columns (query_id, passage_id, rank, score): the 4th column is the score.
    - 3 columns (query_id, passage_id, rank): ``1 / rank`` is used as the score.

    Blank lines are ignored. Rows with any other column count are logged and
    skipped. A row too short to carry a passage id is skipped before anything
    is indexed, so it no longer raises ``IndexError``.

    Args:
        results_path: Path to the TSV file (no header row).

    Returns:
        Mapping ``query_id -> {passage_id: score}``. A query whose rows have
        at least two columns but are all malformed still appears, with an
        empty mapping.

    Raises:
        ValueError: If a rank or score field is not numeric.
        ZeroDivisionError: If a 3-column row has rank 0.
    """
    results: Dict[str, Dict[str, float]] = {}
    # newline="" is the csv module's recommended way to open its input.
    with open(results_path, "r", encoding="utf-8", newline="") as f:
        for row in csv.reader(f, delimiter="\t"):
            if not row:
                continue  # Skip empty lines

            if len(row) < 2:
                # No passage id present: row[1] below would raise IndexError.
                logging.warning(f"Skipping malformed line with {len(row)} columns: {row}")
                continue

            query_id, passage_id = row[0], row[1]
            # The query is registered before the layout check, as before.
            query_hits = results.setdefault(query_id, {})

            # Use the real score when present, otherwise a rank-derived proxy.
            if len(row) == 4:
                score = float(row[3])
            elif len(row) == 3:
                score = 1.0 / int(row[2])
            else:
                logging.warning(f"Skipping malformed line with {len(row)} columns: {row}")
                continue

            query_hits[passage_id] = score

    return results

In [None]:

# Read the MVRHNSW results: pair each result file with its "_summary.txt" file.
import os
import re
import pandas as pd
directory = "/data/lijunlin/sigmod2025-results/multi-hnsw-result"
keyword = "clip-multi-clustering"

# ----------------------------
# File pairing
# ----------------------------
# Candidates are *.txt files whose name contains the keyword and does not
# contain "metadata".
all_files = os.listdir(directory)
txt_files = []
for candidate in all_files:
    if not candidate.endswith(".txt"):
        continue
    if keyword not in candidate or "metadata" in candidate:
        continue
    txt_files.append(candidate)

# base name -> {"summary": path, "results": path}; either key may be absent.
file_pairs = {}
for f in txt_files:
    is_summary = f.endswith("_summary.txt")
    suffix = "_summary.txt" if is_summary else ".txt"
    role = "summary" if is_summary else "results"
    base_name = f[: len(f) - len(suffix)]
    file_pairs.setdefault(base_name, {})[role] = os.path.join(directory, f)

# ----------------------------
# Helpers
# ----------------------------
def load_results_tsv(path: str) -> pd.DataFrame:
    """Read a headerless, tab-separated results file into a typed DataFrame.

    Columns are named, in file order, ``query_id``, ``passage_id``, ``rank``
    and ``score``. The first three are parsed as ``int`` and ``score`` as
    ``float``.
    """
    column_types = {"query_id": int, "passage_id": int, "rank": int, "score": float}
    return pd.read_csv(
        path,
        sep="\t",
        header=None,
        names=list(column_types),
        dtype=column_types,
    )

def extract_avg_retrieval_time(summary_text: str) -> float | None:
    # Robust parser for avg retrieval time (ms)
    patterns = [
        r"Average\s+Query\s+Time:\s*([\d.]+)\s*ms",
        r"Avg\s*retrieval\s*time\s*[:=]\s*([\d.]+)\s*ms",
        r"retrieval_time_single_query_average\(ms\)\s*[:=]\s*([\d.]+)",
        r"avg.*?ms\s*[:=]\s*([\d.]+)",
    ]
    for pat in patterns:
        m = re.search(pat, summary_text, flags=re.IGNORECASE)
        if m:
            try:
                return float(m.group(1))
            except ValueError:
                pass
    return None

def df_results_to_dict(df_res: pd.DataFrame) -> dict:
    """Convert a results DataFrame into ``{query_id: {passage_id: score}}``.

    Duplicate (query_id, passage_id) pairs are collapsed to the row with the
    highest score, and both ids are cast to ``str``.

    The mapping is built with a plain loop rather than ``groupby.apply``.
    That avoids pandas' deprecated handling of grouping columns inside
    ``apply``, and an empty frame yields ``{}``.

    Args:
        df_res: Frame with at least ``query_id``, ``passage_id`` and ``score``
            columns.

    Returns:
        Nested dict keyed by string ids, with Python float scores.
    """
    deduped = (
        df_res.sort_values('score', ascending=False)
              .drop_duplicates(['query_id', 'passage_id'])
              .astype({'query_id': str, 'passage_id': str})
    )
    results: dict = {}
    triples = zip(
        deduped['query_id'].tolist(),
        deduped['passage_id'].tolist(),
        deduped['score'].tolist(),
    )
    for query_id, passage_id, score in triples:
        results.setdefault(query_id, {})[passage_id] = score
    return results

# ----------------------------
# Build final DataFrame: one row per base with qps + metrics
# ----------------------------
rows = []
for base, pair in file_pairs.items():
    # Summary file -> average latency in ms -> queries per second.
    avg_ms = None
    summary_path = pair.get('summary')
    if summary_path:
        with open(summary_path, 'r', encoding='utf-8', errors='ignore') as f:
            avg_ms = extract_avg_retrieval_time(f.read())
    qps = (1000.0 / avg_ms) if (avg_ms is not None and avg_ms > 0) else None

    # Results file -> recall table via evaluate().
    metrics = {}
    results_path = pair.get('results')
    if results_path:
        df_res = load_results_tsv(results_path)
        results_dict = df_results_to_dict(df_res)
        metrics = evaluate(qrels=qrels_data, results=results_dict, k_values=K_VALUES) or {}

    row = {
        'Algorithm': "MVRHNSW",
        'QPS': qps,
        "Dataset": "DBpedia-entity",
        # A base that only has a summary file has no metrics; .get() records a
        # missing recall as None instead of raising KeyError.
        "Recall": metrics.get("Recall@10"),
    }
    rows.append(row)

mvrhnsw_df = pd.DataFrame(rows).reset_index(drop=True)
mvrhnsw_df

In [None]:
def extract_limit(row):
    """Return the numeric limit encoded in a result row's identifier.

    Only rows whose ``'Algorithm'`` is ``"IGP"`` are inspected: the digits
    following ``probe_topk_`` in ``'Identifier'`` are returned as an int.
    Every other row, and IGP rows without that pattern, yield 0, which the
    original code treats as "safe to keep".
    """
    identifier, name = row['Identifier'], row['Algorithm']
    if name != "IGP":
        return 0
    found = re.search(r'probe_topk_(\d+)', identifier)
    return int(found.group(1)) if found else 0


In [None]:
# --- (JSON keyword, TSV keyword, display name) for each baseline ---
keyword_pairs = [
    ("clip-multi-clustering-retrieval-IGP", "clip-multi-clustering-IGP", "IGP"),
    ("clip-multi-clustering-retrieval-dessert", "clip-multi-clustering-dessert", "Dessert"),
    ("clip-multi-clustering-retrieval-plaid", "clip-multi-clustering-plaid", "Plaid"),
]

all_results = []

for KEYWORD_JSON, KEYWORD_TSV, name in keyword_pairs:
    logging.info(f"Processing keyword pair: JSON='{KEYWORD_JSON}', TSV='{KEYWORD_TSV}' (Name='{name}')")

    # Find files
    json_files = find_files_with_keyword(JSON_ROOT_DIR, KEYWORD_JSON, ".json")
    tsv_files = [
        os.path.join(TSV_ROOT_DIR, f)
        for f in os.listdir(TSV_ROOT_DIR)
        if f.startswith(KEYWORD_TSV + "-") and f.endswith(".tsv")
    ]

    # Pair files by identifier
    pairs = pair_json_tsv(json_files, tsv_files, KEYWORD_JSON, KEYWORD_TSV)

    # Process each pair
    for identifier, json_file, tsv_file in pairs:
        logging.info(f"Processing pair: {identifier}")

        data = load_json_file(json_file)
        if data is None:
            continue

        # IGP stores its average latency under a different key than the others.
        if name == 'IGP':
            avg_ms = float(data['search_time']['retrieval_time_single_query_average(ms)'])
        else:
            avg_ms = float(data['search_time']['average_query_time_ms'])
        # Same guard as the MVRHNSW cell: a non-positive latency gives no QPS
        # value instead of a ZeroDivisionError.
        qps = (1000.0 / avg_ms) if avg_ms > 0 else None

        results_data = load_results(tsv_file)
        recall = evaluate(qrels=qrels_data, results=results_data, k_values=K_VALUES)

        all_results.append({
            "Identifier": identifier,
            'Recall': recall['Recall@10'],
            'QPS': qps,
            'Algorithm': name,
            "Dataset": "DBpedia-entity"
        })

# Convert to DataFrame
df_all_results = pd.DataFrame(all_results)

df_all_results['limit_value'] = df_all_results.apply(extract_limit, axis=1)

# Keep rows whose limit_value is at most 1000, then drop the helper column.
# NOTE(review): the previous comment said 10000 while the code filters at
# 1000; the code's value is kept. Confirm the intended cutoff.
df_filtered = (
    df_all_results[df_all_results['limit_value'] <= 1000]
    .drop(columns=['limit_value'])
)

# Order by algorithm, then by ascending recall within each algorithm. A
# two-key sort replaces groupby(...).apply(sort_values), whose handling of the
# grouping column is deprecated in recent pandas.
df_sorted = df_filtered.sort_values(by=["Algorithm", "Recall"], ascending=True)

df_sorted


In [None]:
def _manual_curve(recall_values, qps_values, algorithm):
    """Wrap hand-entered recall/QPS points in a frame shaped like the measured results."""
    return pd.DataFrame({
        'Recall': recall_values,
        'QPS': qps_values,
        'Algorithm': algorithm,
        'Dataset': 'DBpedia-entity'
    })


# Hand-entered operating points for Multi-HNSW.
df_temp = _manual_curve(
    [0.845, 0.862, 0.876, 0.874, 0.877, 0.874, 0.875, 0.856, 0.857, 0.867, 0.88],
    [774.494, 745.388, 719.843, 695.595, 719.843, 695.595, 646.601, 566.891, 489.971, 422.421, 334.579],
    'Multi-HNSW',
)

# Hand-entered operating points for SVR-HNSW.
df_temp2 = _manual_curve(
    [0.259, 0.4, 0.472, 0.533, 0.569, 0.667, 0.719, 0.742, 0.773, 0.769, 0.772],
    [11179.2, 8459.66, 7024, 7007.71, 6994.57, 4917.48, 4624.73, 3934.13, 3012.67, 3141.93, 3082.59],
    'SVR-HNSW',
)

# Stack the measured baselines with both hand-entered curves, drop the
# per-run identifier, then append the MVRHNSW rows.
df_combined = pd.concat(
    [df_filtered, df_temp, df_temp2], ignore_index=True
).drop(columns=['Identifier'])
df_combined = pd.concat([df_combined, mvrhnsw_df], ignore_index=True)

In [None]:
# Keep only operating points at or below 4000 QPS. The .copy() makes `df` an
# independent frame, so a later column assignment on it neither triggers
# SettingWithCopyWarning nor depends on df_combined.
df = df_combined.loc[df_combined['QPS'] <= 4000].copy()

In [None]:
# Methods to plot, in plotting and legend order. Each entry is compared
# against the 'Algorithm' column when selecting rows.
algorithms = [
    "IGP",
    "Dessert",
    "Plaid",
    "Multi-HNSW",
    "MVRHNSW",
    "SVR-HNSW",
]

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Set a clean style for the plots
sns.set_style("whitegrid")

# One marker/colour per algorithm, assigned by position in `algorithms`.
markers = ['o', 's', 'D', 'v', '^', '>', '<', 'p', '*', 'h', 'H', '+', 'x']

# Use a high-contrast qualitative colormap ('tab10'); look it up by indexing
# the colormap registry.
from matplotlib import colormaps as cm
cmap = cm['tab10']
color_list = list(getattr(cmap, 'colors', [cmap(i / 10.0) for i in range(10)]))

algorithm_map = {algo: {'marker': markers[i % len(markers)],
                        'color': color_list[i % len(color_list)]}
                 for i, algo in enumerate(algorithms)}

# Grid of up to 6 panels (2 rows, 3 cols), one per dataset.
N_ROWS, N_COLS = 2, 3
fig, axes = plt.subplots(nrows=N_ROWS, ncols=N_COLS, figsize=(15, 10))
axes = axes.flatten()

# Number of panels that actually receive data.
n_panels = min(len(datasets), len(axes))

# Derive QPS from latency when only 'Time (msec)' is present. assign() builds
# a new frame instead of writing a column into a possible slice.
if 'QPS' not in df.columns and 'Time (msec)' in df.columns:
    df = df.assign(QPS=1000.0 / df['Time (msec)'])

for i, ds in enumerate(datasets):
    if i >= len(axes):
        break  # More datasets than panels: ignore the rest
    ax = axes[i]
    df_subset = df[df['Dataset'] == ds]

    for algo in algorithms:
        # Sort by Recall so the connecting line runs left to right.
        data = df_subset[df_subset['Algorithm'] == algo].sort_values(by='Recall')
        style = algorithm_map[algo]
        ax.plot(
            data['Recall'],
            data['QPS'],
            label=algo,
            color=style['color'],
            marker=style['marker'],
            linestyle='-',
            markersize=5,
            linewidth=3
        )

    # Logarithmic Y-axis (disabled)
    # ax.set_yscale('log')

    # Fixed X range so panels are directly comparable.
    ax.set_xlim(0, 1.05)

    # Y label on the first column only.
    if i % N_COLS == 0:
        ax.set_ylabel('QPS', fontsize=14, fontweight='bold')
    # X label on the lowest populated panel of each column. With a full grid
    # this is the bottom row (i >= 3), as before; with fewer datasets the
    # label is no longer lost on hidden axes.
    if i + N_COLS >= n_panels:
        ax.set_xlabel('Recall', fontsize=14, fontweight='bold')

    ax.tick_params(axis='both', labelsize=12, width=1.5)

    # Subplot caption below the axes; falls back to the dataset name.
    ax.text(
        0.5, -0.25,
        subplot_labels.get(ds, ds),
        transform=ax.transAxes,
        ha='center',
        fontsize=14,
        fontweight='bold'
    )

# Shared legend at the top, built from the first panel's lines.
handles, labels = axes[0].get_legend_handles_labels()
fig.legend(
    handles,
    labels,
    loc='upper center',
    bbox_to_anchor=(0.5, 1.02),
    ncol=len(algorithms),
    fontsize=13,
    frameon=False
)

# Main title, placed above the legend to avoid overlapping it.
fig.suptitle('Query Performance Comparison', fontsize=18, y=1.06, fontweight='bold')

# Hide unused subplots
for j in range(n_panels, len(axes)):
    axes[j].axis('off')

# Standard tight_layout; the title and legend sit outside the axes area.
plt.tight_layout(rect=[0, 0, 1, 1])
fig.savefig('query_performance_recreation_better.png', dpi=300, bbox_inches='tight')
plt.show()