In [1]:
import os
import glob
import numpy as np
import struct
import pandas as pd
import threading
import subprocess
import time
from tqdm import tqdm
from utils import *

hw_type = 'gpu'
HOST_ROOT_DIR = '/raid/cuvs-bench-runner'
DATASET_NAME = 'miracl-5M'
ALGORITHM = 'cuvs_cagra'
K = 10 # Top K to return during search
query_batch_sizes = [1, 10, 100] # List of query batch sizes to evaluate

# Compute ground truth. Only needs to be ran once. Note: only works with GPU image (bug?).
compute_ground_truth = True

# Run parameter optimization grid search
run_parameter_sweep = False

# Delete previous results in ./datasets/$DATASET_NAME/result/ 
# and /datasets/$DATASET_NAME/index folder
clear_previous_results = True

# Docker images for CPU and GPU runs
CUVS_BENCH_CPU_DOCKER_IMG = 'rapidsai/cuvs-bench-cpu:25.10a-py3.12-amd64'
CUVS_BENCH_GPU_DOCKER_IMG = 'rapidsai/cuvs-bench:25.10-cuda13.0-py3.12'

SEARCH_MODE = 'latency' # Choose 'latency' or 'throughput'

# TODO: restrict algorithms based on hw_type

# Compute Ground Truth for Vector Search Validation

In [2]:
def generate_docker_run_cmd(command, HOST_ROOT_DIR, hw_type):
    """
    Docker run command wrapper.

    hw_type: str
      Select 'cpu' or 'gpu'.
    """
    if hw_type == 'cpu':
        docker_run_cmd = f"""docker run --rm \
          --user root \
          --shm-size=16GB \
          --entrypoint /bin/bash \
          --workdir /data/benchmarks \
          -v {HOST_ROOT_DIR}:/data/benchmarks \
          {CUVS_BENCH_CPU_DOCKER_IMG} -c "{command}"
          """
    elif hw_type == 'gpu':
        docker_run_cmd = f"""docker run --gpus all --rm \
          --user root \
          --shm-size=16GB \
          --entrypoint /bin/bash \
          --workdir /data/benchmarks \
          -v {HOST_ROOT_DIR}:/data/benchmarks \
          {CUVS_BENCH_GPU_DOCKER_IMG} -c "{command}"
          """
    else:
        raise ValueError(f"Unknown hardware value: {hw_type}. Select 'cpu' or 'gpu'.")

    return(docker_run_cmd)

if compute_ground_truth:
    print('Computing ground truth.... \n')
    
    # Get file extension for base.* file in data directory
    base_file = glob.glob(f'{HOST_ROOT_DIR}/datasets/{DATASET_NAME}/base.*')
    PRECISION_SUFFIX = base_file[0].split('.')[-1]
    
    # Generate ground truth data
    CONTAINER_DATASET_PATH = '/data/benchmarks/datasets'
    CONTAINER_DATASET_PATH = CONTAINER_DATASET_PATH + '/' + DATASET_NAME
    gen_ground_truth_cmd = f'''python -m cuvs_bench.generate_groundtruth \
        {CONTAINER_DATASET_PATH}/base.{PRECISION_SUFFIX} \
        --queries={CONTAINER_DATASET_PATH}/query.{PRECISION_SUFFIX} \
        --output {CONTAINER_DATASET_PATH} -k {K}
        '''
    
    # Generate docker command for generating ground truth in cuvs-bench.
    # Currently implementation ONLY works with GPU.
    ground_truth_docker_cmd = generate_docker_run_cmd(gen_ground_truth_cmd, HOST_ROOT_DIR, 'gpu')
    
    # Run the command
    subprocess.run(ground_truth_docker_cmd, shell=True)
    
elif compute_ground_truth==False:
    print(f'Skipping ground truth computation since compute_ground_truth=False.')

else:
    raise ValueError(f'Unknown compute_ground_truth value. It must be a boolean.')

Computing ground truth.... 

Reading whole dataset
Dataset size    1.9 GB, shape (4999995, 384), dtype int8
Reading queries from file /data/benchmarks/datasets/miracl-5M/query.i8bin
Calculating true nearest neighbors
Step 0/9:
Step 1/9:
Step 2/9:
Step 3/9:
Step 4/9:
Step 5/9:
Step 6/9:
Step 7/9:
Step 8/9:
Step 9/9:
writing /data/benchmarks/datasets/miracl-5M/groundtruth.neighbors.ibin (10005, 10) uint32 ...
writing /data/benchmarks/datasets/miracl-5M/groundtruth.distances.fbin (10005, 10) float32 ...


# Run Parameter Optimization Grid Search

In [3]:
%%time

# Root path scripts directory mounted to within docker container
DOCKER_ROOT_PATH = '/data/benchmarks'

if clear_previous_results:
    subprocess.run(f'rm -r {DOCKER_ROOT_PATH}/datasets/{DATASET_NAME}/result/', shell=True)
    subprocess.run(f'rm -r {DOCKER_ROOT_PATH}/datasets/{DATASET_NAME}/index/', shell=True)

if run_parameter_sweep:
    param_sweep_batch_size = 10000

    cuvs_param_sweep_cmd = generate_cuvs_bench_run_cmd(DATASET_NAME, ALGORITHM, 
                                                        K, param_sweep_batch_size, 'base', 
                                                        SEARCH_MODE, SEARCH_ONLY=False)

    param_sweep_docker_cmd = generate_docker_run_cmd(cuvs_param_sweep_cmd, HOST_ROOT_DIR, hw_type)
    subprocess.run(param_sweep_docker_cmd, shell=True)
    
    # Plotting results
    plotting_cmd = f'''python -m cuvs_bench.plot \
        --dataset {DATASET_NAME} \
        --mode latency \
        --output-filepath /data/benchmarks/datasets/{DATASET_NAME} \
        --dataset-path /data/benchmarks/datasets \
        --algorithms {ALGORITHM} \
        --search -bs {param_sweep_batch_size} -k {K}
        '''
    
    plotting_docker_cmd = generate_docker_run_cmd(plotting_cmd, HOST_ROOT_DIR, hw_type)
    
    # Run the command
    subprocess.run(plotting_docker_cmd, shell=True)
    
elif run_parameter_sweep == False:
    print(f'Skipping parameter grid search since run_parameter_sweep=False. \n')

else:
    raise ValueError(f'Unknown run_parameter_sweep value. It must be a boolean.')

Skipping parameter grid search since run_parameter_sweep=False. 

CPU times: user 3.13 ms, sys: 0 ns, total: 3.13 ms
Wall time: 5.32 ms


rm: cannot remove '/data/benchmarks/datasets/miracl-5M/result/': No such file or directory
rm: cannot remove '/data/benchmarks/datasets/miracl-5M/index/': No such file or directory


# Run Benchmark with Optimal Parameters
Determine build/search parameters that are optimal based on the grid search performed. Update the 'goups.test' field in `./configs/ALGORITHM.yaml` for the specific algorithm desired. 

The following code will run a single index build and then multiple searches at different batch sizes to evaluate search performance. Outputs are provided as a csv file ending with `*merged.csv`.

In [4]:
# Run single index build command:
print(f'Building search index with {ALGORITHM}. \n')
cuvs_bench_build_cmd = generate_cuvs_bench_run_cmd(DATASET_NAME, ALGORITHM, K, 1, 'test', SEARCH_MODE, SEARCH_ONLY=False)
build_telem = run_command_with_telemetry(generate_docker_run_cmd(cuvs_bench_build_cmd, HOST_ROOT_DIR, hw_type), 'build', hw_type)
build_telem_df = pd.Series(build_telem).to_frame().T

# Build stage also does a single search afterwards. 
# --build flag errors out looking for search results.

print(f'Indexing completed in {build_telem_df["build_duration_sec"].values[0]:.2f} s. [Including docker init] \n')

# Run vector search at various batch sizes:
print(f'Running vector search with {ALGORITHM}. \n')
search_telem_store = []
for bs in query_batch_sizes:
    print(f'  Query batch size: {bs}')
    cuvs_bench_search_cmd = generate_cuvs_bench_run_cmd(DATASET_NAME, ALGORITHM, K, bs, 'test', SEARCH_MODE, SEARCH_ONLY=True)
    search_telem = run_command_with_telemetry(generate_docker_run_cmd(cuvs_bench_search_cmd, HOST_ROOT_DIR, hw_type), 'search')
    search_telem['query_batch_size'] = bs
    search_telem_store.append(search_telem)

print('All search tasks have been completed.')
search_telem_df = pd.DataFrame(search_telem_store)

Building search index with cuvs_cagra. 



2025-11-19T21:58:06+00:00
Running /opt/conda/bin/ann/CUVS_CAGRA_ANN_BENCH
Run on (128 X 2395.52 MHz CPU s)
CPU Caches:
  L1 Data 32 KiB (x64)
  L1 Instruction 32 KiB (x64)
  L2 Unified 512 KiB (x64)
  L3 Unified 16384 KiB (x32)
Load Average: 3.17, 6.05, 8.07
command_line: /opt/conda/bin/ann/CUVS_CAGRA_ANN_BENCH --build --data_prefix=/data/benchmarks/datasets --benchmark_out_format=json --benchmark_counters_tabular=true --benchmark_out=/data/benchmarks/datasets/miracl-5M/result/build/cuvs_cagra,test.json.lock --force miracl-5M_cuvs_cagra,test,k10,bs1_d44fab54-c592-11f0-a33b-f2f9a06ebb36.json
dataset: miracl-5M
dim: 384
distance: euclidean
gpu_driver_version: 13.0
gpu_gpuDirectRDMASupported: 1
gpu_hostNativeAtomicSupported: 0
gpu_mem_bus_width: 5120
gpu_mem_freq: 1512000000.000000
gpu_mem_global_size: 85094825984
gpu_mem_shared_size: 167936
gpu_name: NVIDIA A100 80GB PCIe
gpu_pageableMemoryAccess: 0
gpu_pageableMemoryAccessUsesHostPageTables: 0
gpu_runtime_version: 13.0
gpu_sm_count: 108

[I] [21:58:06.881022] Using the dataset file '/data/benchmarks/datasets/miracl-5M/base.i8bin'
[I] [21:58:06.887192] Overwriting file: /data/benchmarks/datasets/miracl-5M/index/cuvs_cagra_test.graph_degree96.intermediate_graph_degree96.graph_build_algoNN_DESCENT
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Benchmark                                                                                                             Time             CPU   Iterations        GPU graph_degree index_size intermediate_graph_degree
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
cuvs_cagra_test.graph_degree96.intermediate_graph_degree96.graph_build_algoNN_DESCENT/process_time/

2025-11-19T21:59:15+00:00
Running /opt/conda/bin/ann/CUVS_CAGRA_ANN_BENCH
Run on (128 X 2395.52 MHz CPU s)
CPU Caches:
  L1 Data 32 KiB (x64)
  L1 Instruction 32 KiB (x64)
  L2 Unified 512 KiB (x64)
  L3 Unified 16384 KiB (x32)
Load Average: 24.92, 12.50, 10.18
command_line: /opt/conda/bin/ann/CUVS_CAGRA_ANN_BENCH --search --data_prefix=/data/benchmarks/datasets --benchmark_counters_tabular=true --override_kv=k:10 --override_kv=n_queries:1 --benchmark_min_warmup_time=1 --benchmark_out_format=json --mode=latency --benchmark_out=/data/benchmarks/datasets/miracl-5M/result/search/cuvs_cagra,test,k10,bs1.json --force miracl-5M_cuvs_cagra,test,k10,bs1_fcfce1e8-c592-11f0-91c9-06fae178afbb.json
dataset: miracl-5M
dim: 384
distance: euclidean
gpu_driver_version: 13.0
gpu_gpuDirectRDMASupported: 1
gpu_hostNativeAtomicSupported: 0
gpu_mem_bus_width: 5120
gpu_mem_freq: 1512000000.000000
gpu_mem_global_size: 85094825984
gpu_mem_shared_size: 167936
gpu_name: NVIDIA A100 80GB PCIe
gpu_pageableMemoryA

[I] [21:59:15.137142] Using the query file '/data/benchmarks/datasets/miracl-5M/query.i8bin'
[I] [21:59:15.137192] Using the ground truth file '/data/benchmarks/datasets/miracl-5M/groundtruth.neighbors.ibin'
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Benchmark                                                                                                             Time             CPU   Iterations        GPU    Latency     Recall end_to_end items_per_second      itopk          k  n_queries search_width total_queries
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

2025-11-19T21:59:24+00:00
Running /opt/conda/bin/ann/CUVS_CAGRA_ANN_BENCH
Run on (128 X 2395.52 MHz CPU s)
CPU Caches:
  L1 Data 32 KiB (x64)
  L1 Instruction 32 KiB (x64)
  L2 Unified 512 KiB (x64)
  L3 Unified 16384 KiB (x32)
Load Average: 21.32, 12.14, 10.09
command_line: /opt/conda/bin/ann/CUVS_CAGRA_ANN_BENCH --search --data_prefix=/data/benchmarks/datasets --benchmark_counters_tabular=true --override_kv=k:10 --override_kv=n_queries:10 --benchmark_min_warmup_time=1 --benchmark_out_format=json --mode=latency --benchmark_out=/data/benchmarks/datasets/miracl-5M/result/search/cuvs_cagra,test,k10,bs10.json --force miracl-5M_cuvs_cagra,test,k10,bs10_0260037c-c593-11f0-ab72-da0b2695bcbf.json
dataset: miracl-5M
dim: 384
distance: euclidean
gpu_driver_version: 13.0
gpu_gpuDirectRDMASupported: 1
gpu_hostNativeAtomicSupported: 0
gpu_mem_bus_width: 5120
gpu_mem_freq: 1512000000.000000
gpu_mem_global_size: 85094825984
gpu_mem_shared_size: 167936
gpu_name: NVIDIA A100 80GB PCIe
gpu_pageableMemo

[I] [21:59:24.171969] Using the query file '/data/benchmarks/datasets/miracl-5M/query.i8bin'
[I] [21:59:24.172017] Using the ground truth file '/data/benchmarks/datasets/miracl-5M/groundtruth.neighbors.ibin'
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Benchmark                                                                                                             Time             CPU   Iterations        GPU    Latency     Recall end_to_end items_per_second      itopk          k  n_queries search_width total_queries
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

2025-11-19T21:59:33+00:00
Running /opt/conda/bin/ann/CUVS_CAGRA_ANN_BENCH
Run on (128 X 2395.52 MHz CPU s)
CPU Caches:
  L1 Data 32 KiB (x64)
  L1 Instruction 32 KiB (x64)
  L2 Unified 512 KiB (x64)
  L3 Unified 16384 KiB (x32)
Load Average: 18.19, 11.77, 9.99
command_line: /opt/conda/bin/ann/CUVS_CAGRA_ANN_BENCH --search --data_prefix=/data/benchmarks/datasets --benchmark_counters_tabular=true --override_kv=k:10 --override_kv=n_queries:100 --benchmark_min_warmup_time=1 --benchmark_out_format=json --mode=latency --benchmark_out=/data/benchmarks/datasets/miracl-5M/result/search/cuvs_cagra,test,k10,bs100.json --force miracl-5M_cuvs_cagra,test,k10,bs100_07c2bf30-c593-11f0-9805-725331ad4b15.json
dataset: miracl-5M
dim: 384
distance: euclidean
gpu_driver_version: 13.0
gpu_gpuDirectRDMASupported: 1
gpu_hostNativeAtomicSupported: 0
gpu_mem_bus_width: 5120
gpu_mem_freq: 1512000000.000000
gpu_mem_global_size: 85094825984
gpu_mem_shared_size: 167936
gpu_name: NVIDIA A100 80GB PCIe
gpu_pageableMe

[I] [21:59:33.204200] Using the query file '/data/benchmarks/datasets/miracl-5M/query.i8bin'
[I] [21:59:33.204248] Using the ground truth file '/data/benchmarks/datasets/miracl-5M/groundtruth.neighbors.ibin'
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Benchmark                                                                                                             Time             CPU   Iterations        GPU    Latency     Recall end_to_end items_per_second      itopk          k  n_queries search_width total_queries
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [5]:
# Load results from ./result/search dir and merge with telemetry
latency_test_csv = glob.glob(f'{HOST_ROOT_DIR}/datasets/{DATASET_NAME}/result/search/{ALGORITHM},test,k{K},bs*,latency.csv')

latency_test_results = pd.concat([pd.read_csv(fn) for fn in latency_test_csv])

# Merge result dataframe with telemetry
latency_test_results = pd.concat([latency_test_results, build_telem_df], axis=1)
latency_test_results[['n_queries', 'k']] = latency_test_results[['n_queries', 'k']].astype(int)
latency_test_results = latency_test_results.merge(search_telem_df, left_on='n_queries', right_on='query_batch_size')

latency_test_results.to_csv(f'{HOST_ROOT_DIR}/datasets/{DATASET_NAME}/{ALGORITHM},test,k{K},latency,merged.csv', index=False)

latency_test_results

Unnamed: 0,algo_name,index_name,recall,throughput,latency,threads,cpu_time,GPU,end_to_end,itopk,...,build_max_cpu_util,build_max_ram_gb,build_avg_gpu_util,build_max_gpu_util,build_max_vram_gb,search_duration_sec,search_avg_cpu_util,search_max_cpu_util,search_max_ram_gb,query_batch_size
0,cuvs_cagra_test,cuvs_cagra_test.graph_degree96.intermediate_gr...,0.95502,18409.491429,0.000543,1,0.543216,0.000533,0.699642,64.0,...,97.6,24.644588,53.911765,100.0,13.326294,8.017712,2.044444,11.9,18.237324,10
1,cuvs_cagra_test,cuvs_cagra_test.graph_degree96.intermediate_gr...,0.951382,1904.189319,0.000525,1,0.525218,0.000515,0.703194,64.0,...,97.6,24.644588,53.911765,100.0,13.326294,8.017409,2.044444,11.9,18.123444,1
2,cuvs_cagra_test,cuvs_cagra_test.graph_degree96.intermediate_gr...,0.95467,140448.937366,0.000712,1,0.712112,0.000702,0.699901,64.0,...,97.6,24.644588,53.911765,100.0,13.326294,8.019078,2.033333,11.9,18.331116,100
