In [None]:
import numpy as np
from datasets import load_dataset
from encode import vectorize_e5, vectorize_uform

In [None]:
# Load the "20220301.en" configuration of the "wikipedia" dataset,
# using ./data/wikipedia/ as the cache directory.
data = load_dataset(
    "wikipedia",
    "20220301.en",
    cache_dir="./data/wikipedia/",
)

# Cell output: the loaded dataset object together with the first "train" record.
(data, data["train"][0])

In [None]:
# Smoke test: take the "text" field of the first ten training records ...
test_batch = list(map(lambda row: data["train"][row]["text"], range(10)))

# ... and display them next to whatever `vectorize_e5` returns for that batch.
(test_batch, vectorize_e5(test_batch))

In [None]:
from usearch.io import load_matrix, save_matrix
from tqdm import tqdm
import os

In [None]:
# Location of the on-disk embeddings matrix.
vectors_path = "./data/wikipedia/abstracts.fbin"

# First run only: create an all-zero float32 placeholder of 6458670 x 768
# and write it to disk. An existing file is never overwritten here.
if os.path.exists(vectors_path) is False:
    matrix = np.zeros(shape=(6458670, 768), dtype=np.float32)
    save_matrix(matrix, vectors_path)

Find all zero rows in the NumPy `matrix` — those are the rows we still need to vectorize.
Iterate through `data["train"][i]["text"]` in batches, printing progress with `tqdm`.
Vectorize using the `vectorize_e5` function, which accepts a list of up to 1000 strings in a single batch.
Once completed, or if terminated, call `save_matrix(matrix, vectors_path)` to preserve progress.

In [None]:
# Read the embeddings matrix back from `vectors_path`.
# NOTE(review): presumably returns an in-memory NumPy array that later cells
# can modify in place — confirm against the usearch.io documentation.
matrix = load_matrix(vectors_path)

In [None]:
# A row still needs vectorizing only when *every* component is zero, i.e. it
# is an untouched placeholder. Testing for "any component equals zero" would
# also flag already-computed embeddings that happen to contain a single 0.0
# and cause them to be re-vectorized.
# `np.any(matrix, axis=1)` is True for rows holding at least one non-zero
# value; negating it yields the all-zero rows without materializing a full
# `matrix == 0` boolean copy of the matrix.
zero_rows = ~np.any(matrix, axis=1)

# Integer row indices of the placeholder rows, in ascending order.
zero_indices = np.flatnonzero(zero_rows)
zero_indices

In [None]:
import time

# NOTE(review): `num_entries` and `start_idx` are not used by the loop below;
# they are kept only in case other cells read them.
num_entries = len(data["train"])
start_idx = 0
batch_size = 100
last_save_time = time.time()
save_interval = 600  # 10 minutes in seconds

# Fill every row listed in `zero_indices` with its embedding, batch by batch.
# The try/finally guarantees that `matrix` is written to disk both on normal
# completion and when the cell is interrupted (e.g. KeyboardInterrupt) or
# fails, so finished batches are never lost.
try:
    # The bar is advanced by the number of samples in each batch, so `total`
    # is the plain sample count. `unit_scale=True` only enables SI prefixes
    # (k, M); passing a number such as `batch_size` would make tqdm multiply
    # both the count and the total by that factor and inflate the display.
    with tqdm(total=len(zero_indices), unit_scale=True, unit="samples") as pbar:
        for i in range(0, len(zero_indices), batch_size):
            batch_indices = zero_indices[i:i+batch_size]
            # Cast to a Python int before indexing the dataset with a NumPy index.
            batch_texts = [data["train"][int(idx)]["text"] for idx in batch_indices]

            # Vectorize using vectorize_e5 function
            # (assumed to return one vector per input text, in order — TODO confirm)
            batch_vectors = vectorize_e5(batch_texts)

            # Update the matrix
            matrix[batch_indices] = batch_vectors

            # Update the progress bar (in samples, not batches)
            pbar.update(len(batch_indices))

            # Checkpoint periodically so a hard crash loses at most
            # `save_interval` seconds of work.
            current_time = time.time()
            if current_time - last_save_time > save_interval:
                save_matrix(matrix, vectors_path)
                last_save_time = current_time
finally:
    # Save the matrix once the loop finishes or is terminated early.
    save_matrix(matrix, vectors_path)

: 