# Export Compressed Metadata

In [20]:
import os
import pandas as pd
import numpy as np
import pyarrow.parquet as pq

In [2]:
# Load a single Parquet shard to sanity-check the dataset layout.
file_path = 'cohere/en-00000.parquet'
df = pq.read_table(file_path)
# `num_rows` is the same value that `len()` reports for a pyarrow Table.
print(f"rows: {df.num_rows:,}")

rows: 100,000


In [3]:
# Display the table: the repr shows the schema followed by a truncated preview of each column.
df

pyarrow.Table
_id: string
url: string
title: string
text: string
emb: list<item: float>
  child 0, item: float
----
_id: [["20231101.en_13194570_0","20231101.en_13194570_1","20231101.en_13194570_2","20231101.en_13194570_3","20231101.en_13194570_4",...,"20231101.en_13212981_1","20231101.en_13212981_2","20231101.en_13212981_3","20231101.en_13212981_4","20231101.en_13212981_5"],["20231101.en_13212981_6","20231101.en_13212981_7","20231101.en_13212983_0","20231101.en_13212983_1","20231101.en_13212983_2",...,"20231101.en_13231439_7","20231101.en_13231439_8","20231101.en_13231439_9","20231101.en_13231439_10","20231101.en_13231439_11"],...,["20231101.en_13345126_1","20231101.en_13345126_2","20231101.en_13345126_3","20231101.en_13345126_4","20231101.en_13345126_5",...,"20231101.en_13364092_0","20231101.en_13364092_1","20231101.en_13364092_2","20231101.en_13364092_3","20231101.en_13364092_4"],["20231101.en_13364092_5","20231101.en_13364095_0","20231101.en_13364095_1","20231101.en_13364095_2","20

In [4]:
def yield_titles_and_links(dir):
    """Lazily yield ``(title, url)`` pairs from every Parquet file in a directory.

    Files are visited in sorted filename order and rows are yielded in the
    order they are stored, so the stream order is deterministic for a fixed
    set of files.

    Parameters:
    dir (str): Directory containing the ``*.parquet`` files. Entries without
        the ``.parquet`` suffix are skipped.

    Yields:
    tuple: ``(title, url)`` as pyarrow scalars, one pair per row.
    """
    for filename in sorted(os.listdir(dir)):
        if not filename.endswith(".parquet"):
            continue
        file_path = os.path.join(dir, filename)
        # Only the two metadata columns are needed. The shards also carry
        # `text` and `emb` columns; not reading them avoids decoding the
        # article text and embeddings of every row.
        table = pq.read_table(file_path, columns=['title', 'url'])
        yield from zip(table['title'], table['url'])

In [5]:
from itertools import islice

# Peek at the first five (title, url) pairs without iterating over the whole dataset.
first_pairs = islice(yield_titles_and_links('cohere'), 5)
head = list(first_pairs)
head

[(<pyarrow.StringScalar: 'Аԥсуа бызшәа'>,
  <pyarrow.StringScalar: 'https://ab.wikipedia.org/wiki/%D0%90%D4%A5%D1%81%D1%83%D0%B0%20%D0%B1%D1%8B%D0%B7%D1%88%D3%99%D0%B0'>),
 (<pyarrow.StringScalar: 'Аԥсуа бызшәа'>,
  <pyarrow.StringScalar: 'https://ab.wikipedia.org/wiki/%D0%90%D4%A5%D1%81%D1%83%D0%B0%20%D0%B1%D1%8B%D0%B7%D1%88%D3%99%D0%B0'>),
 (<pyarrow.StringScalar: 'Аԥсуа бызшәа'>,
  <pyarrow.StringScalar: 'https://ab.wikipedia.org/wiki/%D0%90%D4%A5%D1%81%D1%83%D0%B0%20%D0%B1%D1%8B%D0%B7%D1%88%D3%99%D0%B0'>),
 (<pyarrow.StringScalar: 'Аҟәа'>,
  <pyarrow.StringScalar: 'https://ab.wikipedia.org/wiki/%D0%90%D2%9F%D3%99%D0%B0'>),
 (<pyarrow.StringScalar: 'Аҟәа'>,
  <pyarrow.StringScalar: 'https://ab.wikipedia.org/wiki/%D0%90%D2%9F%D3%99%D0%B0'>)]

In [6]:
def to_str(x):
    """Convert a value to a single-line string suitable for a line-oriented file.

    Newline and carriage-return characters are replaced with spaces so that
    every value occupies exactly one line. Missing values (``None``, or a
    scalar wrapper whose ``as_py()`` returns ``None``) and empty strings are
    replaced with the placeholder ``'missing'``.

    Parameters:
    x: The value to convert; typically a ``str`` or a pyarrow scalar.

    Returns:
    str: The cleaned single-line text, or ``'missing'``.
    """
    # Unwrap scalar wrappers (e.g. pyarrow scalars) first, so that a null
    # surfaces as None instead of being stringified to the text 'None'.
    as_py = getattr(x, 'as_py', None)
    if callable(as_py):
        x = as_py()
    # The None check has to happen before str(): afterwards x is always a str.
    if x is None:
        return 'missing'
    x = str(x).replace('\n', ' ').replace('\r', ' ')
    if len(x) == 0:
        return 'missing'
    return x

In [6]:
# Destination text files: one title / one URL per line, row-aligned with each other.
titles_file, urls_file = 'cohere-titles.txt', 'cohere-urls.txt'

In [7]:
from tqdm import tqdm

# Rough upper estimate of the number of rows; only used to size the progress bar.
upper_bound = int(250e6)

# Export one title and one URL per line, keeping both files row-aligned.
# - The progress bar is managed by `with`, so it is closed even if the export fails midway.
# - `newline='\n'` disables platform newline translation: the files are later split on a
#   raw '\n', so every row must end with exactly that byte on any OS.
with tqdm(total=upper_bound) as pbar, \
        open(titles_file, 'w', encoding='utf-8', newline='\n') as tf, \
        open(urls_file, 'w', encoding='utf-8', newline='\n') as uf:
    for title, url in yield_titles_and_links('cohere'):
        clean_title = to_str(title)
        clean_url = to_str(url)
        tf.write(clean_title + '\n')
        uf.write(clean_url + '\n')
        pbar.update(1)

 99%|█████████▉| 247154006/250000000 [22:48<00:15, 180636.29it/s]


## Validation

Let's make sure that our search works as expected. For that, let's memory-map the titles and URLs with StringZilla and construct the index with USearch. We can start with a small dataset and then scale up.

In [2]:
!pip install stringzilla

Collecting stringzilla
  Downloading stringzilla-3.7.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl.metadata (72 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading stringzilla-3.7.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl (255 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m255.7/255.7 kB[0m [31m24.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: stringzilla
Successfully installed stringzilla-3.7.1


In [4]:
from stringzilla import Str, Strs, File

In [9]:
# Open the exported titles file with StringZilla and split it into one entry per line.
# Every row was written with a trailing '\n', so the split presumably yields one
# extra, empty entry after the last title (i.e. row count + 1) — TODO confirm.
rows_titles = Str(File(titles_file)).split('\n')
len(rows_titles)

<stringzilla.Strs object at 0x70713c3d4270>

In [14]:
# Same treatment for the URLs file; the entry count should equal the one for the titles,
# since both files are written row-by-row in lockstep.
rows_urls = Str(File(urls_file)).split('\n')
len(rows_urls)

247154007

In [12]:
# Index -2 is used for the last real title: the file ends with '\n', so the final
# entry of the split is presumably empty.
rows_titles[-2]

sz.Str('I-Titanic Thompson')

In [39]:
# Spot-check one row; the URL at the same index should describe the same article.
rows_titles[10]

sz.Str('Аҟәа')

In [40]:
# URL stored at the same row index as the title spot-check.
rows_urls[10]

sz.Str('https://ab.wikipedia.org/wiki/%D0%90%D2%9F%D3%99%D0%B0')

In [13]:
# Inspect the StringZilla File object created for the titles file.
File(titles_file)

<stringzilla.File at 0x70713c3390b0>

In [15]:
from usearch.io import load_matrix

In [41]:
# Row limit for loading: None loads every row; use e.g. 10_000_000 for a quicker experiment.
upper_bound = None
# `view=True` is requested so the matrix is viewed from the file instead of being copied.
vectors_hbin = load_matrix('vectors.hbin', count_rows=upper_bound, view=True)
vectors_hbin.shape

(247154006, 1024)

In [42]:
# Peek at the first vector to confirm the element type and value range.
vectors_hbin[0]

memmap([ 0.04727 ,  0.0861  , -0.01083 , ...,  0.03882 , -0.03372 ,
        -0.011894], dtype=float16)

In [67]:
# Row limit for loading: None loads every row; use e.g. 10_000_000 for a quicker experiment.
upper_bound = None
# Shared loader arguments: both matrices are opened as views with the same row limit.
load_options = dict(count_rows=upper_bound, view=True)
# Binary-quantized embeddings (file name suggests 1024 dimensions, bit-packed).
vectors_binary = load_matrix('vectors-binary-1024d.bbin', **load_options)
# Original floating-point embeddings, used as the reference for quality metrics.
vectors_float = load_matrix('vectors.hbin', **load_options)
vectors_binary.shape, vectors_float.shape

((247154006, 128), (247154006, 1024))

In [47]:
from usearch.index import Index

In [69]:
from usearch.index import Index, MetricKind
# Reference index over the full 1024-dimensional float vectors, using cosine distance.
index_float = Index(ndim=1024, metric=MetricKind.Cos)
# Show which hardware-acceleration backend was selected for this metric.
index_float.hardware_acceleration

'haswell'

In [49]:
from usearch.index import Index, MetricKind

# One Hamming-distance index per bit width; each will be fed a prefix of the binary vectors.
index64, index128, index256, index512, index1024 = (
    Index(ndim=bits, metric=MetricKind.Hamming) for bits in (64, 128, 256, 512, 1024)
)

index64.hardware_acceleration

'ice'

In [50]:
# Keys are simply the row numbers, so a key can be used directly to look up its title and URL.
keys = np.arange(len(vectors_binary))

In [51]:
# First 64 bits of each packed vector, written as bits // 8 columns like the other index cells.
_ = index64.add(keys, vectors_binary[:, :64//8], log=True)

Add: 100%|██████████| 247154006/247154006 [30:52<00:00, 133389.97vector/s]


In [52]:
# Index only the first 128 // 8 columns of each packed binary vector.
_ = index128.add(keys, vectors_binary[:, :128//8], log=True)

Add: 100%|██████████| 247154006/247154006 [29:45<00:00, 138450.34vector/s]


In [53]:
# Index only the first 256 // 8 columns of each packed binary vector.
_ = index256.add(keys, vectors_binary[:, :256//8], log=True)

Add: 100%|██████████| 247154006/247154006 [31:09<00:00, 132208.74vector/s]


In [54]:
# Index only the first 512 // 8 columns of each packed binary vector.
_ = index512.add(keys, vectors_binary[:, :512//8], log=True)

Add: 100%|██████████| 247154006/247154006 [30:17<00:00, 135956.80vector/s]


In [55]:
# 1024 // 8 == 128 columns, i.e. the full width of the binary matrix.
_ = index1024.add(keys, vectors_binary[:, :1024//8], log=True)

Add: 100%|██████████| 247154006/247154006 [30:13<00:00, 136264.07vector/s]


In [56]:
# Display the index summary: configuration, build flags, size, memory usage and level sizes.
index1024

usearch.Index
- config
-- data type: ScalarKind.B1
-- dimensions: 1024
-- metric: MetricKind.Hamming
-- multi: False
-- connectivity: 16
-- expansion on addition :128 candidates
-- expansion on search: 64 candidates
- binary
-- uses OpenMP: 0
-- uses SimSIMD: 1
-- supports half-precision: 1
-- uses hardware acceleration: ice
- state
-- size: 247,154,006 vectors
-- memory usage: 105,214,139,584 bytes
-- max level: 5
--- 0. 247,154,006 nodes
--- 1. 15,341,176 nodes
--- 2. 976,320 nodes
--- 3. 61,632 nodes
--- 4. 3,648 nodes
--- 5. 384 nodes

In [70]:
# Build the reference index from the float vectors; assigning to `_` keeps the
# returned value out of the cell output.
_ = index_float.add(keys, vectors_float, log=True)

Add: 100%|██████████| 247154006/247154006 [1:25:28<00:00, 48191.05vector/s]


In [71]:
# Persist the float index to disk.
index_float.save('index-float-1024d.usearch')

In [57]:
# Persist the 1024-bit binary index to disk.
index1024.save('vectors-binary-1024.usearch')

In [58]:
# Persist the 512-bit binary index to disk.
index512.save('vectors-binary-512.usearch')

In [59]:
# Persist the 256-bit binary index to disk.
index256.save('vectors-binary-256.usearch')

In [60]:
# Persist the 128-bit binary index to disk.
index128.save('vectors-binary-128.usearch')

Now let's randomly sample some entries and print the top 10 closest matches for each of them.

In [104]:
# Number of random queries, and how many nearest neighbours to request for each.
lookup_size = int(1e6)
candidates = 10
# Every index paired with the number of leading bits of the binary vectors it was built on.
indexes = [(index64, 64), (index128, 128), (index256, 256), (index512, 512), (index1024, 1024)]

# A fixed seed makes the sampled queries, and every metric derived from them,
# reproducible across kernel restarts.
sampling_seed = 42
keys_queries = np.random.default_rng(sampling_seed).choice(keys, lookup_size, replace=False)

In [105]:
# Query every binary index with the prefix of the sampled vectors that matches its width.
matches_per_index = []
for index, ndim in indexes:
    queries = vectors_binary[keys_queries, :ndim // 8]
    matches_per_index.append(index.search(queries, candidates, log=True))
matches_per_index

Search: 100%|██████████| 1000000/1000000 [00:05<00:00, 196426.19vector/s]
Search: 100%|██████████| 1000000/1000000 [00:02<00:00, 346744.21vector/s]
Search: 100%|██████████| 1000000/1000000 [00:03<00:00, 298790.95vector/s]
Search: 100%|██████████| 1000000/1000000 [00:03<00:00, 303244.66vector/s]
Search: 100%|██████████| 1000000/1000000 [00:03<00:00, 284561.83vector/s]


[usearch.BatchMatches(10000000 across 1000000 queries),
 usearch.BatchMatches(10000000 across 1000000 queries),
 usearch.BatchMatches(10000000 across 1000000 queries),
 usearch.BatchMatches(10000000 across 1000000 queries),
 usearch.BatchMatches(10000000 across 1000000 queries)]

For larger batches you may not want to log the matches, but for now, let's log them to make sure that the search is working as expected.

In [65]:
# Log only a small preview: `keys_queries` can hold up to `lookup_size` entries, and
# printing every match for every index would flood the notebook output.
max_logged_queries = 10
for index_idx, (index, ndim) in enumerate(indexes):
    matches_of_index = matches_per_index[index_idx]
    print(f"Index: {index_idx} - {ndim} bits")
    # The query counter has its own name so that it does not overwrite the index counter.
    for query_idx, (query_key, query_matches) in enumerate(zip(keys_queries, matches_of_index)):
        if query_idx >= max_logged_queries:
            break
        print(f"{query_idx}. Query: #{query_key} = {rows_titles[int(query_key)]}")
        for j, match in enumerate(query_matches):
            print(f"  {j}. Match: {match} = {rows_titles[int(match.key)]}")

Index: 0 - 64 bits
0. Query: #43093098 = Geschichte von Lauchringen
  0. Match: Match(key=43093098, distance=0.0) = Geschichte von Lauchringen
  1. Match: Match(key=111024319, distance=4.0) = Alan Belcher
  2. Match: Match(key=37118816, distance=5.0) = Walter Laqueur
  3. Match: Match(key=54026435, distance=5.0) = BMW Z
  4. Match: Match(key=79245825, distance=5.0) = Online to offline
1. Query: #211804521 = Гийотьер — Габриель Пери (станция метро)
  0. Match: Match(key=232645856, distance=5.0) = Едуард Смірнов
  1. Match: Match(key=232617796, distance=5.0) = Персида Ненадович
  2. Match: Match(key=200516064, distance=6.0) = Рио-Гранде-Сити
  3. Match: Match(key=232653332, distance=6.0) = FELIN
  4. Match: Match(key=172197880, distance=6.0) = 115 (ierakstu kompānija)
2. Query: #22017906 = Arroyo Falero
  0. Match: Match(key=22017906, distance=0.0) = Arroyo Falero
  1. Match: Match(key=21514580, distance=3.0) = Arroyo Platanarito
  2. Match: Match(key=26621054, distance=3.0) = Umbreia Pe

Outputting recall isn't enough to compare the performance of different methods. Instead, we can use NDCG@10 to compare the quality of the search results. We can use the following formula to calculate NDCG@10:

$$
NDCG@10 = \frac{DCG@10}{IDCG@10}
$$

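where DCG@10 is the discounted cumulative gain at 10 and IDCG@10 is the ideal discounted cumulative gain at 10, i.e. the DCG@10 of a perfectly ordered result list. The formula for DCG@10 is:

$$
DCG@10 = \sum_{i=1}^{10} \frac{rel_i}{\log_2(i + 1)}
$$

where $rel_i$ is the relevance of the result at position $i$.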

In [106]:
# Run the same sampled queries against the float index; these results serve as the
# reference ("expected") neighbours in the metric cells that follow.
matches_float = index_float.search(vectors_float[keys_queries], candidates, log=True)
matches_float

Search: 100%|██████████| 1000000/1000000 [00:11<00:00, 90839.21vector/s]


usearch.BatchMatches(10000000 across 1000000 queries)

In [126]:
def sum_geometric_series(a, r, n):
    """
    Return a + a*r + a*r**2 + ... + a*r**(n - 1), the n-term partial sum of a geometric series.

    Parameters:
    a (float): Leading term.
    r (float): Ratio between consecutive terms.
    n (int): How many terms to add up.

    Returns:
    float: The partial sum of the first n terms.
    """
    # With r == 1 the closed form would divide by zero; every term then equals `a`.
    if r != 1:
        return a * (1 - r**n) / (1 - r)
    return a * n


def rank_distance(expected: np.ndarray, predicted: np.ndarray, weight_multiplier: float = 0.9, penalize_missing: bool = True) -> float:
    """
    Weighted, symmetric distance between two equally long rankings of unique items.

    Each ranking is walked from the top. An item present in both rankings
    contributes the absolute difference of its two ranks; an item missing from
    the other ranking contributes ``n - rank`` when ``penalize_missing`` is set
    and nothing otherwise. Contributions are scaled by a weight that starts at 1
    and is multiplied by ``weight_multiplier`` after every position. The sum over
    both directions is divided by ``n * sum_geometric_series(1, weight_multiplier, count_ranks)``.

    Parameters:
    expected: Reference ranking; a NumPy array or any sequence of hashable items.
    predicted: Ranking to evaluate; same length as ``expected``.
    weight_multiplier (float): Per-position decay of the weight.
    penalize_missing (bool): Whether items absent from the other ranking add a penalty.

    Returns:
    float: 0.0 for identical rankings; 1.0 when there is nothing to compare
    (empty input, or no common items while ``penalize_missing`` is False).

    Raises:
    AssertionError: If the lengths differ or either ranking contains duplicates.
    """
    assert len(expected) == len(predicted), "Expected and predicted arrays must be of the same length."
    assert len(set(expected)) == len(expected), "Expected array must not contain duplicate items."
    assert len(set(predicted)) == len(predicted), "Predicted array must not contain duplicate items."

    n = len(expected)
    # Item -> rank lookup tables. They give constant-time lookups and work for plain
    # lists as well as arrays; an element-wise `other == key` only works on arrays.
    rank_in_expected = {key: rank for rank, key in enumerate(expected)}
    rank_in_predicted = {key: rank for rank, key in enumerate(predicted)}
    count_common = sum(1 for key in rank_in_expected if key in rank_in_predicted)

    def one_side(keys, rank_in_other):
        """Accumulate the weighted penalties of `keys` measured against the other ranking."""
        local = 0
        weight = 1
        for rank, key in enumerate(keys):
            other_rank = rank_in_other.get(key)
            if other_rank is not None:
                local += abs(rank - other_rank) * weight
            elif penalize_missing:
                local += (n - rank) * weight
            weight *= weight_multiplier
        return local

    total = one_side(predicted, rank_in_expected) + one_side(expected, rank_in_predicted)

    count_ranks = 2 * (count_common if not penalize_missing else n)
    if count_ranks == 0:
        return 1.0

    # NOTE(review): the normaliser sums `count_ranks` consecutive weights, while each
    # direction above only uses the first n weights — confirm this is the intended scale.
    max_rank_difference = n
    upper_bound = max_rank_difference * sum_geometric_series(1, weight_multiplier, count_ranks)
    return total / upper_bound

In [None]:
from tqdm import tqdm

# One accumulator per binary index, averaged over all sampled queries at the end.
mean_rank_distance = np.zeros(len(matches_per_index), dtype=np.float64)

for lookup_idx in tqdm(range(lookup_size)):
    # `rank_distance` is annotated for NumPy arrays and compares items element-wise,
    # so the keys are passed as arrays rather than as plain lists.
    expected = np.array([m.key for m in matches_float[lookup_idx]])
    for i, matches_of_index in enumerate(matches_per_index):
        received = np.array([m.key for m in matches_of_index[lookup_idx]])
        mean_rank_distance[i] += rank_distance(expected, received, 0.9, True)

mean_rank_distance /= lookup_size
mean_rank_distance

Let's also measure the precision at 10 and the recall at 10. We can use the following formulas to calculate precision and recall:

$$
Precision@10 = \frac{TP}{TP + FP}
$$

$$
Recall@10 = \frac{TP}{TP + FN}
$$

where TP is the number of true positives, FP is the number of false positives, and FN is the number of false negatives.

In [None]:
from tqdm import tqdm

# Fraction of queries whose best float-index match also appears among the results
# returned by each binary index.
mean_recall_at_10 = np.zeros(len(matches_per_index), dtype=np.float64)
for lookup_idx in tqdm(range(lookup_size)):
    top_expected = matches_float[lookup_idx][0]
    for i, matches_of_index in enumerate(matches_per_index):
        # `any` stops at the first hit, like the explicit loop-and-break form.
        was_found = any(int(m.key) == int(top_expected.key) for m in matches_of_index[lookup_idx])
        mean_recall_at_10[i] += int(was_found)

mean_recall_at_10 /= lookup_size
mean_recall_at_10

In [None]:
from tqdm import tqdm

# Share of each binary index's results that also occur in the float-index results,
# averaged over all sampled queries.
mean_precision_at_10 = np.zeros(len(matches_per_index), dtype=np.float64)

for lookup_idx in tqdm(range(lookup_size)):
    expected = [m.key for m in matches_float[lookup_idx]]
    for i, matches_of_index in enumerate(matches_per_index):
        received = [m.key for m in matches_of_index[lookup_idx]]
        shared = np.intersect1d(expected, received)
        hit_ratio = len(shared) / len(received)
        mean_precision_at_10[i] += hit_ratio

mean_precision_at_10 /= lookup_size
mean_precision_at_10