In [1]:
!pip install --upgrade --force-reinstall pandas pyarrow usearch faiss-cpu

Collecting pandas
  Using cached pandas-2.2.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (19 kB)
Collecting pyarrow
  Using cached pyarrow-15.0.2-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.0 kB)
Collecting usearch
  Using cached usearch-2.9.2-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (26 kB)
Collecting faiss-cpu
  Using cached faiss_cpu-1.8.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.6 kB)
Collecting numpy<2,>=1.26.0 (from pandas)
  Using cached numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Collecting python-dateutil>=2.8.2 (from pandas)
  Using cached python_dateutil-2.9.0.post0-py2.py3-none-any.whl.metadata (8.4 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2024.1-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2024.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting tqdm (from usearch)
  Downloading tqdm-4.66

In [2]:
import os
import pandas as pd
import numpy as np
import pyarrow.parquet as pq

We could download all 500 GB of content, but we don't need all of it: the original float-based embeddings are unnecessary, and we may not need every language at once. Let's start with just English, comparing Cohere and MixedBread embeddings.

In [3]:
# Peek at a single shard first to check how many rows one file holds.
file_path = 'cohere/en-00000.parquet'
# NOTE: despite its name, `df` is the pyarrow table returned by read_table,
# not a pandas DataFrame.
df = pq.read_table(file_path)
print(f"rows: {len(df):,}")

rows: 100,000


In [4]:
# Inspect the first value of the 'emb' column to see how an embedding is stored.
df['emb'][0]

<pyarrow.FixedSizeBinaryScalar: b'\xbeiOZ\xcb\xc6\xf7\x89=\xe3]\xfa\xaf\x1f\x9d:^\xf0[*\xdb\xd8k\n\xa8\x16q; \xa2\xdc0v\xa5|\xa2.\x80;\xe0$\xce\x1d\xa0\xea\x1d]\x1dg\x80?\x14\x90\xa7\x9e\xdf\xeav%\xa5\x06\x96\x87\xb5j\xb8@\xed\x878;\x81\x8e\xae\x11]\xae3\x10\x19\xb4\xb9\x9f/\xe3\xee\x7f\xdf&}\x8c\xe5$\x03\x89=*\xdf\x0c\x92\x88n\xd4\xd8L\xf1\xf9\xd8\x81^\xc1~\x9ed\xd8QK\xfe\xee\x92\xf8uE\xe8\x00\xdf\xf46'>

In [5]:
def to_array(x):
    """Expose the raw bytes behind `x` as a 1-D array of unsigned 8-bit integers.

    `x` must provide an `as_buffer()` method (in this notebook it is one value
    taken from the 'emb' column). The array is built with `np.frombuffer`
    directly over that buffer.
    """
    return np.frombuffer(x.as_buffer(), dtype=np.uint8)

In [6]:
def vstack_files(dir):
    """Load the 'emb' column of every Parquet file in `dir` into one 2-D array.

    Files are visited in sorted filename order, so the row order of the result
    is deterministic. Each value is converted with `to_array`, which yields one
    uint8 row per embedding.

    Args:
        dir: Path of the directory to scan (not recursive). The name shadows
            the builtin `dir`; it is kept so existing callers keep working.

    Returns:
        np.ndarray with one row per embedding across all files.

    Raises:
        ValueError: if no embeddings were found in the directory.
    """
    blocks = []

    for filename in sorted(os.listdir(dir)):
        if not filename.endswith(".parquet"):
            continue
        file_path = os.path.join(dir, filename)
        # Only the embedding column is used, so avoid decoding the others.
        table = pq.read_table(file_path, columns=['emb'])
        rows = [to_array(x) for x in table['emb']]
        if rows:
            # Stack per file so the large list of per-row array objects can be
            # released before the next file is read.
            blocks.append(np.vstack(rows))

    if not blocks:
        raise ValueError(f"no embeddings found in Parquet files under {dir!r}")
    return np.vstack(blocks)

In [7]:
# Load every shard under 'cohere' into one in-memory matrix, one row per embedding.
embeddings = vstack_files('cohere')
embeddings.shape

(41488110, 128)

First let's benchmark exact search on a small subset of the data with FAISS and USearch, [like in the Sentence Transformers benchmark](https://github.com/UKPLab/sentence-transformers/pull/2549).

In [8]:
brute_force_limit_vectors = 100_000  # number of vectors sampled as the search corpus
brute_force_limit_queries = 1_000  # number of query vectors
brute_force_limit_matches = 10  # nearest neighbours requested per query

Let's sample the vectors and queries at random, so the results are not biased by the order of the data.

In [9]:
# Sample the corpus at random (without replacement) from the full dataset.
brute_force_vectors_indices = np.random.choice(embeddings.shape[0], size=brute_force_limit_vectors, replace=False)
brute_force_vectors = embeddings[brute_force_vectors_indices]
# Query indices are drawn from range(brute_force_limit_vectors), i.e. they are
# positions inside the sampled corpus, so they must index `brute_force_vectors`.
# Indexing `embeddings` with them would take queries from the first rows of the
# dataset and reintroduce the ordering bias the sampling is meant to remove.
brute_force_queries_indices = np.random.choice(brute_force_limit_vectors, size=brute_force_limit_queries, replace=False)
brute_force_queries = brute_force_vectors[brute_force_queries_indices]

## Exact Search

### FAISS

In [10]:
import faiss

In [11]:
# The binary index takes its dimension in bits; every uint8 column packs 8 bits,
# so derive it from the data instead of hard-coding it per dataset.
dim = brute_force_vectors.shape[1] * 8
index = faiss.IndexBinaryFlat(dim)
index.add(brute_force_vectors)

In [12]:
%%timeit -n 1 -r 1
# Timed as one unit: index construction, insertion of the corpus, and the search.
# NOTE(review): names assigned in this cell (index, faiss_distances, faiss_indices)
# are presumably local to the timed run and not visible to later cells — confirm.
index = faiss.IndexBinaryFlat(dim)
index.add(brute_force_vectors)
faiss_distances, faiss_indices = index.search(brute_force_queries, brute_force_limit_matches)

402 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


### USearch

In [13]:
from usearch.index import MetricKind, search

In [14]:
%%timeit -n 1 -r 1
# Exact (exact=True) Hamming-metric search of the queries against the sampled
# corpus through the stand-alone `search` helper; no Index object is built here.
usearch_results = search(brute_force_vectors, brute_force_queries, brute_force_limit_matches, MetricKind.Hamming, exact=True)

795 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


## Approximate Search

### USearch

In [15]:
from usearch.index import Index, MetricKind
# Dimensions are counted in bits for the Hamming metric; derive them from the
# packed uint8 width so the cell also works for embeddings of another size.
index = Index(ndim=embeddings.shape[1] * 8, metric=MetricKind.Hamming)
index.hardware_acceleration

'avx2'

In [16]:
# Insert every embedding into the index, showing a progress bar (log=True).
# NOTE(review): passing None as the first argument presumably lets USearch assign
# sequential keys 0..n-1, which the recall check further down relies on — confirm.
_ = index.add(None, embeddings, log=True)

Add: 100%|██████████| 41488110/41488110 [05:33<00:00, 124240.87vector/s]


In [17]:
# Query the index with every vector in `embeddings`, requesting 10 results per query.
matches = index.search(embeddings, 10, log=True)

Search: 100%|██████████| 41488110/41488110 [02:12<00:00, 312524.52vector/s]


In [18]:
# Display the index summary (configuration, build flags, size and level layout).
index

usearch.Index
- config
-- data type: ScalarKind.B1
-- dimensions: 1024
-- metric: MetricKind.Hamming
-- multi: False
-- connectivity: 16
-- expansion on addition:128 candidates
-- expansion on search: 64 candidates
- binary
-- uses OpenMP: 0
-- uses SimSIMD: 1
-- supports half-precision: 1
-- uses hardware acceleration: avx2
- state
-- size: 41,488,110 vectors
-- memory usage: 17,704,181,120 bytes
-- max level: 4
--- 0. 41,488,110 nodes
--- 1. 2,576,252 nodes
--- 2. 164,927 nodes
--- 3. 11,904 nodes
--- 4. 1,344 nodes

In [19]:
from usearch.eval import SearchStats

# Expected key for query i is i itself.
# NOTE(review): assumes the index assigned keys 0..n-1 in insertion order — confirm.
keys = np.arange(embeddings.shape[0])
# presumably the number of queries whose expected key appears among their matches — verify
count_matches: int = matches.count_matches(keys)
# Bundle the raw counters so recall and efficiency can be read off `stats`.
stats = SearchStats(
    index_size=len(index),
    count_queries=len(keys),
    count_matches=count_matches,
    visited_members=matches.visited_members,
    computed_distances=matches.computed_distances,
)
stats

SearchStats(index_size=41488110, count_queries=41488110, count_matches=40068659, visited_members=3677721960, computed_distances=76061010852)

In [20]:
# Consistent with count_matches / count_queries for the counters printed by the
# previous cell (40068659 / 41488110).
stats.mean_recall

0.965786559088857

In [21]:
# Consistent with 1 - computed_distances / (count_queries * index_size) for the
# printed counters, i.e. the share of pairwise distance computations avoided.
stats.mean_efficiency

0.9999558109371663