In [1]:
!pip install pandas pyarrow datasets usearch faiss-cpu

Collecting pyarrow
  Downloading pyarrow-15.0.2-cp311-cp311-win_amd64.whl (24.8 MB)
                                              0.0/24.8 MB ? eta -:--:--
                                              0.0/24.8 MB 1.3 MB/s eta 0:00:20
                                             0.0/24.8 MB 487.6 kB/s eta 0:00:51
                                             0.1/24.8 MB 871.5 kB/s eta 0:00:29
                                             0.1/24.8 MB 717.5 kB/s eta 0:00:35
                                             0.1/24.8 MB 708.1 kB/s eta 0:00:35
                                             0.2/24.8 MB 655.4 kB/s eta 0:00:38
                                             0.2/24.8 MB 620.6 kB/s eta 0:00:40
                                             0.2/24.8 MB 625.1 kB/s eta 0:00:40
                                             0.2/24.8 MB 602.4 kB/s eta 0:00:41
                                             0.3/24.8 MB 582.4 kB/s eta 0:00:43
                                             

In [2]:
import os
import pandas as pd
import numpy as np
import pyarrow.parquet as pq

We could download all 500 GB of content, but we don't need all of it. We don't need the original float-based embeddings, and we may not need every language at once, so let's start with just English, comparing Cohere and MixedBread embeddings.

In [3]:
# Open a single shard first to check how many rows one file holds.
file_path = 'mixedbread/00000.parquet'
# `df` is kept under this name because the next cell reads its `emb` column.
df = pq.read_table(file_path)
print(f"rows: {len(df):,}")

rows: 57,543


In [4]:
# Show the first value of the `emb` column to see how a single embedding is stored.
df['emb'][0]

<pyarrow.FixedSizeBinaryScalar: b'/#\xecL\x9a\xf6\x98\xba\xda\x08\xe7S]C5a\x07\xcde\xc56\x11\xfbKFz\xe6gp\x7f2\\\xdd\x90B\xdd\x13@t\x1e\xf6\x80\xbb\x85u\xc1\xb6/\x99\xcc\xc5\x85\x11\xb9\x1b\xc7\xe9Zo\x8d\xd0U\x08D\xfd\x17\x81\xe4\x02\xa7x\xfc\x98\xf1\n(\xf6\xa6l\xf4\x8f\xec \x80\xb2\x82\xa1;\xa4\xa9\x96N\xbe\x060\xec\xa0\xab9\x1cZ\x03\xa7X\xc2\xc0?\xcc\x17mq\x13\xc4\xec\x9a\x1f~G\x90\xd5MVU?\xf8\x19T1'>

In [5]:
def to_array(x):
    """Expose the bytes of a binary scalar as a 1-D ``uint8`` NumPy array.

    Args:
        x: Any object providing ``as_buffer()``; in this notebook, one value
            of the ``emb`` column.

    Returns:
        ``numpy.ndarray`` of dtype ``uint8`` built over the scalar's buffer.
    """
    return np.frombuffer(x.as_buffer(), dtype=np.uint8)

In [27]:
def vstack_files(dir):
    """Stack the ``emb`` column of every Parquet file in a directory.

    Files are visited in sorted filename order, so the row order of the
    result is deterministic. Each value of the ``emb`` column is converted
    to a ``uint8`` vector with ``to_array`` and becomes one row.

    Args:
        dir: Path of a directory holding ``.parquet`` files. Entries whose
            name does not end in ``.parquet`` are ignored.

    Returns:
        A 2-D NumPy array with one row per embedding across all files.

    Raises:
        ValueError: If no embeddings were found (for example, the directory
            contains no ``.parquet`` files).
    """
    embeddings = []

    for filename in sorted(os.listdir(dir)):
        if not filename.endswith(".parquet"):
            continue
        # Only the `emb` column is used, so skip reading the other columns.
        table = pq.read_table(os.path.join(dir, filename), columns=['emb'])
        embeddings.extend(to_array(x) for x in table['emb'])

    # np.vstack([]) fails with an unhelpful message; name the real problem.
    if not embeddings:
        raise ValueError(f"No embeddings found in .parquet files under {dir!r}")

    return np.vstack(embeddings)

In [28]:
# Load every shard in the `mixedbread` directory into a single array.
embeddings = vstack_files('mixedbread')
# NOTE(review): vstack_files ends with np.vstack, so a one-dimensional shape recorded
# for this cell would point to output from an older run — re-run to confirm.
embeddings.shape

(5310478080,)

First, let's benchmark exact search on a small subset of the data with FAISS and USearch, [like in the Sentence Transformers benchmark](https://github.com/UKPLab/sentence-transformers/pull/2549).

In [8]:
brute_force_limit_vectors = 100_000  # number of rows sampled from `embeddings` to search over
brute_force_limit_queries = 1_000  # number of query vectors sampled
brute_force_limit_matches = 10  # matches requested per query

Let's draw the vectors and queries at random, so the results are not biased by the order of the data.

In [9]:
# Seeded generator so the sampled benchmark subset is the same on every run.
rng = np.random.default_rng(42)

# Pick distinct rows of the full matrix to form the brute-force corpus.
brute_force_vectors_indices = rng.choice(embeddings.shape[0], size=brute_force_limit_vectors, replace=False)
brute_force_vectors = embeddings[brute_force_vectors_indices]

# Query indices are positions within the sampled corpus (they range over
# brute_force_limit_vectors), so they must index `brute_force_vectors`;
# indexing `embeddings` would silently take the first rows of the full data.
brute_force_queries_indices = rng.choice(brute_force_limit_vectors, size=brute_force_limit_queries, replace=False)
brute_force_queries = brute_force_vectors[brute_force_queries_indices]


## Exact Search

### FAISS

In [10]:
import faiss

In [11]:
# FAISS binary indexes take the dimensionality in bits; each uint8 column of
# the packed vectors holds 8 of them, so derive it from the data instead of
# hard-coding a value that has to be edited per dataset.
dim = brute_force_vectors.shape[1] * 8
index = faiss.IndexBinaryFlat(dim)
index.add(brute_force_vectors)

In [12]:
%%timeit
# Time one batched search: all sampled queries against the FAISS index, brute_force_limit_matches results each.
# NOTE(review): names assigned under %%timeit are presumably not kept afterwards — confirm before reusing them.
faiss_distances, faiss_indices = index.search(brute_force_queries, brute_force_limit_matches)

181 ms ± 35.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### USearch

In [13]:
from usearch.index import MetricKind, search

In [14]:
%%timeit
usearch_results = search(brute_force_vectors, brute_force_queries, brute_force_limit_matches, MetricKind.Tanimoto, exact=True)

973 ms ± 97.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## Approximate Search

### USearch

In [None]:
from usearch.index import Index, MetricKind
# NOTE: this rebinds `index`, which previously referred to the FAISS index.
index = Index(ndim=1024, metric=MetricKind.Hamming)
# Display which hardware acceleration the index reports.
index.hardware_acceleration

In [None]:
# Insert every embedding; keys are passed as None — presumably USearch assigns them itself (TODO confirm).
index.add(None, embeddings, log=True)

In [None]:
# Query the index with every embedding, asking for 10 matches each.
matches = index.search(embeddings, 10, log=True)

In [None]:
# Print the index's string representation.
print(index)