In [1]:
from usearch.io import load_matrix, save_matrix



In [2]:
vectors = load_matrix('vectors.hbin', view=True)
vectors.shape

(247154006, 1024)

In [3]:
dimensions = load_matrix('dimensions.ibin', view=False)
dimensions.shape

(1024, 1024)

In [4]:
import numpy as np

def select_greedy_dimensions(distance_matrix, num_rows_to_select):
    # Initialize the set with the index of the first row
    selected_indices = {np.argmax(np.sum(distance_matrix, axis=1))}
    
    # While we need more rows
    while len(selected_indices) < num_rows_to_select:
        max_dist = 0
        max_idx = -1
        
        # Find the row with the largest minimum distance to the selected set
        for i in range(distance_matrix.shape[0]):
            if i not in selected_indices:
                # Minimum distance to any row in the selected set
                min_dist = np.min([distance_matrix[i, j] for j in selected_indices])
                if min_dist > max_dist:
                    max_dist = min_dist
                    max_idx = i
        
        # Add the index with the largest minimum distance
        selected_indices.add(max_idx)
    
    return list(selected_indices)

In [5]:
dimensions_selected = select_greedy_dimensions(dimensions, 64)
print(dimensions_selected)
print(f'{len(dimensions_selected) = }')

[640, 257, 6, 135, 774, 395, 908, 140, 655, 785, 148, 916, 661, 404, 794, 158, 416, 162, 35, 292, 805, 679, 169, 938, 560, 434, 51, 692, 315, 700, 573, 829, 703, 959, 961, 193, 835, 331, 75, 973, 338, 979, 471, 218, 604, 352, 226, 482, 868, 995, 870, 872, 250, 366, 751, 752, 750, 371, 884, 755, 115, 631, 378, 509]
len(dimensions_selected) = 64


In [6]:
import numpy as np
from numba import jit, prange

In [7]:
@jit(parallel=True, nopython=False)
def read_dimensions(vectors, dimensions):
    # For every row we want to load just a few dimensions
    # quantizing them into bits on the fly. We then have to copy it to
    # guarantee continuity of the memory layout.
    # 
    #   return np.packbits((vectors[:, dimensions] > 0).astype(np.uint8), axis=1).copy()
    #
    # That, however, is extrememly slow, so let's write a NumBa kernel to
    # do this in parallel.
    nvec, _ = vectors.shape
    ndim_new = len(dimensions)
    nbits = (ndim_new + 7) // 8
    packed_bits = np.zeros((nvec, nbits), dtype=np.uint8)
    
    # Moreover, Numba can't deal with `float16`. We want to find positive values.
    # We can represent the values as `uint16` and then check the binary representation
    # against zero and negative masks.
    negative_mask: np.uint16 = 0x8000
    zero_mask: np.uint16 = 0x0000
    vectors = vectors.view(np.uint16)
    
    for i in prange(nvec):
        # The inner loop goes over the columns.
        # We process 8 columns at a time, packing them into a single byte.
        for j in range(0, ndim_new, 8):
            byte = 0
            # Process each bit; note that we need to handle the case where
            # the number of columns is not a multiple of 8
            for bit in range(8):
                # Shift the bit into the correct position and add it to the byte
                k = dimensions[j + bit]
                scalar = vectors[i, k]
                scalar_is_positive = (scalar & negative_mask) != 0 and (scalar != zero_mask)
                byte |= (scalar_is_positive << (7 - bit))
            # Store the packed byte in the corresponding position
            packed_bits[i, j // 8] = byte

    return packed_bits    

In [13]:
bits_all = read_dimensions(vectors.view(np.uint16), np.arange(1024))
bits_all.shape

(247154006, 128)

In [8]:
bits_first = read_dimensions(vectors.view(np.uint16), np.arange(64))
bits_first.shape

(247154006, 8)

In [12]:
bits_greedy = read_dimensions(vectors.view(np.uint16), np.array(dimensions_selected))
bits_greedy.shape

(247154006, 8)

In [9]:
bits_first

array([[ 35,  98, 135, ..., 159,  17,  13],
       [ 11, 237,  29, ..., 197,  19,   6],
       [ 11, 141,  56, ..., 133,   6,   6],
       ...,
       [143, 133,  52, ..., 162,  26, 130],
       [201, 133, 134, ..., 131,  14, 246],
       [201, 133, 162, ..., 135,  92, 162]], dtype=uint8)

In [13]:
bits_greedy

array([[127,  35,  92, ..., 210,  60, 173],
       [ 61, 133, 218, ..., 214, 100,   8],
       [ 63, 197, 216, ..., 214,  76,  41],
       ...,
       [102, 122, 204, ..., 129,  10,  19],
       [214, 234, 105, ..., 197, 171,  22],
       [ 90, 201,  73, ..., 135,  40,   6]], dtype=uint8)

In [10]:
save_matrix(bits_first, 'vectors-first-64d.bbin')

In [None]:
save_matrix(bits_greedy, 'vectors-greedy-64d.bbin')

In [14]:
save_matrix(bits_all, 'vectors-binary-1024d.bbin')

In [17]:
from usearch.index import Index, MetricKind
index = Index(ndim=len(dimensions_selected), metric=MetricKind.Hamming)
index.hardware_acceleration

'ice'

In [12]:
np.random.shuffle(bits_first)
bits_first.shape

(247154006, 8)

In [18]:
_ = index.add(None, bits_first, log=True)

Add: 100%|██████████| 247154006/247154006 [30:22<00:00, 135580.20vector/s]


In [19]:
matches = index.search(bits_first, 10, log=True)

Search: 100%|██████████| 247154006/247154006 [33:52<00:00, 121617.76vector/s]


In [20]:
index

usearch.Index
- config
-- data type: ScalarKind.B1
-- dimensions: 64
-- metric: MetricKind.Hamming
-- multi: False
-- connectivity: 16
-- expansion on addition :128 candidates
-- expansion on search: 64 candidates
- binary
-- uses OpenMP: 0
-- uses SimSIMD: 1
-- supports half-precision: 1
-- uses hardware acceleration: ice
- state
-- size: 247,154,006 vectors
-- memory usage: 73,001,884,864 bytes
-- max level: 5
--- 0. 247,154,006 nodes
--- 1. 15,341,176 nodes
--- 2. 976,320 nodes
--- 3. 61,632 nodes
--- 4. 3,648 nodes
--- 5. 384 nodes

In [22]:
from usearch.eval import SearchStats

keys = np.arange(bits_first.shape[0])
count_matches: int = matches.count_matches(keys)
stats = SearchStats(
    index_size=len(index),
    count_queries=len(keys),
    count_matches=count_matches,
    visited_members=matches.visited_members,
    computed_distances=matches.computed_distances,
)
stats

SearchStats(index_size=247154006, count_queries=247154006, count_matches=243969333, visited_members=21663765829, computed_distances=504377081463)

In [23]:
stats.mean_recall

0.9871146211564946

In [24]:
stats.mean_efficiency

0.9999917430427498

In [25]:
count_matches_top1: int = matches.count_matches(keys, count=1)
stats_top1 = SearchStats(
    index_size=len(index),
    count_queries=len(keys),
    count_matches=count_matches_top1,
    visited_members=matches.visited_members,
    computed_distances=matches.computed_distances,
)
stats_top1

SearchStats(index_size=247154006, count_queries=247154006, count_matches=241915137, visited_members=21663765829, computed_distances=504377081463)

In [26]:
stats_top1.mean_recall

0.978803220369408

In [28]:
stats_top1.mean_efficiency

0.9999917430427498