In [1]:
import time

from loguru import logger
from tqdm import tqdm
from dataclasses import dataclass
import h5py


import matplotlib.pyplot as plt
import pprint
import json
import numpy as np
import psutil
from typing import List, Union

# index import
import hnswlib
from rich import print as rprint  

In [2]:
# Dataset location and thread budget shared by the cells below.

# NOTE: despite the name, this is the path of a single HDF5 file (it is passed
# to h5py.File through h5_to_memory), not a directory.
data_dir = '/data/sift_1m_old_dist.h5'

# Value handed to hnsw_index.set_num_threads() for the index build and for the
# multi-threaded ef sweeps.
threads_count = 48

@dataclass
class DataSet:
    """In-memory arrays of one benchmark dataset, as produced by ``h5_to_memory``."""
    # Vectors read from "train_vectors"; these are the items added to the index.
    train: np.ndarray
    # Query vectors read from "test_vectors".
    test: np.ndarray
    # Ground-truth rows read from "neighbors_0", one row per query.
    neighbors: np.ndarray
    # Attribute matrix read from "train_attr_vectors", one row per train vector.
    train_attr: np.ndarray
    # Attribute matrix read from "test_attr_vectors_0", one row per query.
    test_attr: np.ndarray


def get_stats(index, data, neighbors, k=10):
    """Run one batched filtered k-NN query and report throughput and recall.

    Args:
        index: Built index exposing ``knn_query(vectors, attrs, k=...)`` that
            returns ``(labels, distances)``.
        data: Object with ``test`` (query vectors), ``test_attr`` (query
            attribute vectors) and ``neighbors`` (ground truth) attributes.
        neighbors: Ground-truth neighbor ids, one row per query. ``None`` falls
            back to ``data.neighbors``.
        k: Number of neighbors requested per query; also the recall cut-off.

    Returns:
        dict with the keys
        ``"qps_4_threads"`` - queries per second of the batched call (int).
        The key name is kept for existing callers; this function does not set
        the thread count itself, it uses whatever the index is configured for.
        ``"recalls"`` - ``{"top10": mean recall@k rounded to 3 decimals}``.
        ``"query_count"`` - number of queries that contributed to the mean.

    Raises:
        ValueError: if ``index`` is ``None``.
    """
    if index is None:
        raise ValueError("You need to build the index first.")

    ground_truth = data.neighbors if neighbors is None else neighbors
    n_queries = data.test.shape[0]

    # perf_counter is monotonic and high resolution; the floor on the elapsed
    # time keeps a very fast call from dividing by zero.
    started = time.perf_counter()
    labels, _distances = index.knn_query(data.test, data.test_attr, k=k)
    elapsed = time.perf_counter() - started
    qps = int(n_queries / max(elapsed, 1e-9))

    recall_per_query = []
    for found, expected in zip(labels, ground_truth):
        n_expected = min(k, len(expected))
        if n_expected == 0:
            # No ground truth for this query: recall is undefined, leave it out.
            continue
        n_hits = len(np.intersect1d(expected[:k], found))
        recall_per_query.append(n_hits / n_expected)

    mean_recall = float(np.mean(recall_per_query)) if recall_per_query else float("nan")

    return {
        "qps_4_threads": qps,
        "recalls": {"top10": round(mean_recall, 3)},
        "query_count": len(recall_per_query),
    }

def get_stats_deep(index, data, neighbors, k=10):
    """Single-threaded evaluation that also collects traversal metrics.

    The query batch is executed twice: once plain, which is the only part that
    is timed, and once with ``collect_metrics=True`` to obtain hop and
    distance-computation counters. Timing the plain pass alone keeps the
    throughput figure free of the second pass and of metric collection.

    Side effect: the index is switched to one thread via
    ``index.set_num_threads(1)`` and is left that way.

    Args:
        index: Built index exposing ``set_num_threads`` and ``knn_query``;
            with ``collect_metrics=True`` the latter must return
            ``(labels, distances, nhops, valid_ratio, distances_count)``.
        data: Object with ``test``, ``test_attr`` and ``neighbors`` attributes.
        neighbors: Ground-truth neighbor ids, one row per query. ``None`` falls
            back to ``data.neighbors``.
        k: Number of neighbors requested per query; also the recall cut-off.

    Returns:
        dict with the keys ``"qps_uni_threads"`` (int), ``"recalls"``
        (``{"top10": mean recall@k rounded to 3 decimals}``), ``"neighbors"``
        and ``"distances"`` (raw arrays from the metrics pass), ``"nhops"`` and
        ``"distances_comps"`` (max / min / mean / std of the counters).

    Raises:
        ValueError: if ``index`` is ``None``.
    """
    # Validate first so a missing index is reported as ValueError rather than
    # as an AttributeError from the set_num_threads() call below.
    if index is None:
        raise ValueError("You need to build the index first.")

    ground_truth = data.neighbors if neighbors is None else neighbors
    n_queries = data.test.shape[0]

    index.set_num_threads(1)

    # Timed pass: plain query only.
    started = time.perf_counter()
    index.knn_query(data.test, data.test_attr, k=k)
    elapsed = time.perf_counter() - started
    qps = int(n_queries / max(elapsed, 1e-9))

    # Untimed pass: same batch with instrumentation enabled.
    labels, distances, nhops, _valid_ratio, distances_count = index.knn_query(
        data.test, data.test_attr, k=k, collect_metrics=True)
    # Only the first n_queries counters are used.
    # NOTE(review): presumably the returned buffer can be longer than the
    # batch - confirm against the index implementation.
    distances_count = distances_count[:n_queries]

    recall_per_query = []
    for found, expected in zip(labels, ground_truth):
        n_expected = min(k, len(expected))
        if n_expected == 0:
            # No ground truth for this query: recall is undefined, leave it out.
            continue
        n_hits = len(np.intersect1d(expected[:k], found))
        recall_per_query.append(n_hits / n_expected)

    mean_recall = float(np.mean(recall_per_query)) if recall_per_query else float("nan")

    return {
        "qps_uni_threads": qps,
        "recalls": {"top10": round(mean_recall, 3)},
        "neighbors": labels,
        "distances": distances,
        "nhops": {
            "nhops_max": nhops.max(),
            "nhops_min": nhops.min(),
            "nhops_mean": nhops.mean(),
            "nhops_std": nhops.std(),
        },
        "distances_comps": {
            "distances_comps_max": distances_count.max(),
            "distances_comps_min": distances_count.min(),
            "distances_comps_mean": distances_count.mean(),
            "distances_comps_std": distances_count.std(),
        },
    }
def h5_to_memory(exp_env_name, selectivity=None):
    """Read the benchmark arrays from an HDF5 file into a ``DataSet``.

    Args:
        exp_env_name: Path of the HDF5 file, opened read-only.
        selectivity: Optional ``(start, end)`` pair. When truthy, both
            attribute matrices are reduced to columns ``start:end`` and only
            the queries whose remaining attribute row sums to more than zero
            are kept, together with their ground-truth rows.

    Returns:
        DataSet with ``train``, ``test``, ``train_attr``, ``test_attr`` and
        ``neighbors`` held as NumPy arrays.
    """
    with h5py.File(exp_env_name, 'r') as h5_file:
        # Copy every dataset into a NumPy array while the file is open.
        base_vectors = np.array(h5_file["train_vectors"])
        base_attrs = np.array(h5_file["train_attr_vectors"])
        query_vectors = np.array(h5_file["test_vectors"])
        query_attrs = np.array(h5_file["test_attr_vectors_0"])
        ground_truth = np.array(h5_file["neighbors_0"])

    if selectivity:
        first_col, last_col = selectivity
        attr_window = slice(first_col, last_col)
        base_attrs = base_attrs[:, attr_window]
        query_attrs = query_attrs[:, attr_window]

        # Indices of the queries that still have a positive attribute sum
        # inside the selected column window.
        keep = np.flatnonzero(query_attrs.sum(axis=1) > 0)
        query_attrs = query_attrs[keep]
        query_vectors = query_vectors[keep]
        ground_truth = ground_truth[keep]

    return DataSet(
        train=base_vectors,
        test=query_vectors,
        train_attr=base_attrs,
        test_attr=query_attrs,
        neighbors=ground_truth,
    )

# Keep attribute columns 0:100 and drop the queries whose attribute row sums to
# zero inside that window (see h5_to_memory).
selectivity =(0,100)
data = h5_to_memory(data_dir, selectivity=selectivity)



In [3]:
# Build (or load) the index over the train vectors and their attributes.
dim = data.train.shape[1]
dim_attr = data.train_attr.shape[1]
max_num_elements = data.train.shape[0]
# NOTE(review): `dim_attr` (and `data_attr` below) are not arguments of the
# upstream hnswlib package - presumably a fork with attribute support is
# installed; confirm which build this notebook expects.
hnsw_index = hnswlib.Index(space='l2', dim=dim, dim_attr=dim_attr)
# False: rebuild from scratch and overwrite the file at index_name.
# True: load the file at index_name instead of rebuilding.
use_cache = False

index_name = "/data/anas.aitaomar/exp_theory_1m.bin"

if use_cache:
    hnsw_index.load_index(index_name)
else:
    # The printed "index time" covers init_index + add_items only; saving the
    # index happens after the print.
    time_start = time.time() 
    hnsw_index.init_index(max_elements=max_num_elements,
                          ef_construction=200, M=32)
    hnsw_index.set_num_threads(threads_count)
    hnsw_index.add_items(data=data.train, data_attr=data.train_attr)
    print("index time =", time.time() - time_start,"s")
    hnsw_index.save_index(index_name)


# Integer codes passed to hnsw_index.set_search_mode() by the cells below.
# NOTE(review): the meaning of each code is defined by the index library, not
# in this notebook - verify the values against that build.
search_modes = {
    "rwalks":0,
    "hnsw-inline":1,
    "hnsw-post":3
}

index time = 28.747633695602417 s


# RWalks


In [4]:
# ef sweep in "rwalks" search mode: one row of QPS and recall@10 per ef value.
# The rows are collected in their own list instead of a variable named
# `logger`, which would rebind the loguru `logger` imported in the first cell.
rwalks_sweep = []

hybrid_factor = 0.0
pron_factor = 0.0
hnsw_index.set_num_threads(threads_count)
hnsw_index.set_search_mode(search_modes["rwalks"])

for ef in [10, 50, 55, 60, 70, 100, 180, 200, 250, 300, 320, 400, 500]:
    hnsw_index.set_ef(ef)
    hnsw_index.set_hybrid_factor(hybrid_factor)
    hnsw_index.set_pron_factor(pron_factor)

    # score
    stats = get_stats(hnsw_index, data, data.neighbors)
    stats["ef"] = ef
    rwalks_sweep.append({'ef': ef, 'qps': stats["qps_4_threads"], 'recall': stats["recalls"]["top10"]})

# Bare last expression so the notebook displays the collected rows.
rwalks_sweep

[{'ef': 10, 'qps': 48430, 'recall': 0.134},
 {'ef': 50, 'qps': 121484, 'recall': 0.462},
 {'ef': 55, 'qps': 114766, 'recall': 0.485},
 {'ef': 60, 'qps': 99244, 'recall': 0.506},
 {'ef': 70, 'qps': 91661, 'recall': 0.543},
 {'ef': 100, 'qps': 72441, 'recall': 0.623},
 {'ef': 180, 'qps': 45172, 'recall': 0.742},
 {'ef': 200, 'qps': 40566, 'recall': 0.759},
 {'ef': 250, 'qps': 34573, 'recall': 0.795},
 {'ef': 300, 'qps': 29464, 'recall': 0.822},
 {'ef': 320, 'qps': 27514, 'recall': 0.831},
 {'ef': 400, 'qps': 22670, 'recall': 0.857},
 {'ef': 500, 'qps': 18295, 'recall': 0.878}]

In [5]:
# Detailed single-configuration run in "rwalks" mode at ef = 500.
# get_stats_deep() calls set_num_threads(1) on the index, and its result also
# carries the raw "neighbors" and "distances" arrays used by the next cell.
hnsw_index.set_search_mode(search_modes["rwalks"]) 
ef = 500
hybrid_factor = 0.0
pron_factor = 0.0
hnsw_index.set_ef(ef)
hnsw_index.set_hybrid_factor(hybrid_factor)
hnsw_index.set_pron_factor(pron_factor)

# score
stats = get_stats_deep(hnsw_index, data, data.neighbors)
# Prints the whole dict, including the raw neighbor and distance arrays.
rprint(stats)

In [6]:
# Per query, count how many of the returned labels are non-zero.
# NOTE(review): a label of 0 is treated as "slot not filled"; presumably the
# index pads short result lists with 0 - confirm, since 0 may also be a real id.
top_k_count = np.count_nonzero(stats["neighbors"], axis=1).tolist()

print("top10 set count avg =", sum(top_k_count) / len(stats["neighbors"]))

# Largest and mean returned distance per query; the printout below averages
# both over all queries.
_distances = stats["distances"]
_distances_max = _distances.max(axis=1)
_distances_avg = _distances.mean(axis=1)
print(
    f"""
Max distance (avg): {np.mean(_distances_max)}
Mean distance (avg): {np.mean(_distances_avg)}
    """
)

top10 set count avg = 9.544691061787642

Max distance (avg): 76931.1640625
Mean distance (avg): 70371.328125
    


# HNSW-Inline


In [7]:
# ef sweep in "hnsw-inline" search mode: one row of QPS and recall@10 per ef.
# Recorded with the earlier run: num_threads 48 / latency 1m2s
# The rows are collected in their own list instead of a variable named
# `logger`, which would rebind the loguru `logger` imported in the first cell.
inline_sweep = []

hybrid_factor = 0.0
pron_factor = 0.0
hnsw_index.set_num_threads(threads_count)
hnsw_index.set_search_mode(search_modes["hnsw-inline"])

for ef in [10, 20, 50, 500, 1_000_000]:
    hnsw_index.set_ef(ef)
    hnsw_index.set_hybrid_factor(hybrid_factor)
    hnsw_index.set_pron_factor(pron_factor)

    # score
    stats = get_stats(hnsw_index, data, data.neighbors)
    stats["ef"] = ef
    inline_sweep.append({'ef': ef, 'qps': stats["qps_4_threads"], 'recall': stats["recalls"]["top10"]})

# Bare last expression so the notebook displays the collected rows.
inline_sweep

[{'ef': 10, 'qps': 5573, 'recall': 0.758},
 {'ef': 20, 'qps': 3866, 'recall': 0.759},
 {'ef': 50, 'qps': 1924, 'recall': 0.759},
 {'ef': 500, 'qps': 309, 'recall': 0.759},
 {'ef': 1000000, 'qps': 30, 'recall': 0.759}]

In [8]:
# Detailed single-configuration run in "hnsw-inline" mode at ef = 10.
# get_stats_deep() calls set_num_threads(1) on the index, and its result also
# carries the raw "neighbors" and "distances" arrays used by the next cell.
hnsw_index.set_search_mode(search_modes["hnsw-inline"]) 
ef = 10
hybrid_factor = 0.0
pron_factor = 0.0
hnsw_index.set_ef(ef)
hnsw_index.set_hybrid_factor(hybrid_factor)
hnsw_index.set_pron_factor(pron_factor)

# score
stats = get_stats_deep(hnsw_index, data, data.neighbors)
# Prints the whole dict, including the raw neighbor and distance arrays.
rprint(stats)

In [9]:
# Per query, count how many of the returned labels are non-zero.
# NOTE(review): a label of 0 is treated as "slot not filled"; presumably the
# index pads short result lists with 0 - confirm, since 0 may also be a real id.
top_k_count = np.count_nonzero(stats["neighbors"], axis=1).tolist()

print("top10 set count avg =", sum(top_k_count) / len(stats["neighbors"]))

# Largest and mean returned distance per query; the printout below averages
# both over all queries.
_distances = stats["distances"]
_distances_max = _distances.max(axis=1)
_distances_avg = _distances.mean(axis=1)
print(
    f"""
Max distance (avg): {np.mean(_distances_max)}
Mean distance (avg): {np.mean(_distances_avg)}
    """
)

top10 set count avg = 8.518296340731853

Max distance (avg): 74924.34375
Mean distance (avg): 60724.9921875
    
