In [1]:
import numpy as np
from datasets import load_dataset
from encode import vectorize_e5, vectorize_uform

  from .autonotebook import tqdm as notebook_tqdm
Fetching 7 files: 100%|██████████| 7/7 [00:00<00:00, 77263.49it/s]


In [2]:
# Load the 2022-03-01 English Wikipedia dump, caching it under ./data/wikipedia/.
data = load_dataset(
    "wikipedia",
    "20220301.en",
    cache_dir="./data/wikipedia/",
)

# Display the dataset summary together with the first training record.
(data, data["train"][0])

(DatasetDict({
     train: Dataset({
         features: ['id', 'url', 'title', 'text'],
         num_rows: 6458670
     })
 }),
 {'id': '12',
  'url': 'https://en.wikipedia.org/wiki/Anarchism',
  'title': 'Anarchism',
  'text': 'Anarchism is a political philosophy and movement that is sceptical of authority and rejects all involuntary, coercive forms of hierarchy. Anarchism calls for the abolition of the state, which it holds to be unnecessary, undesirable, and harmful. As a historically left-wing movement, placed on the farthest left of the political spectrum, it is usually described alongside communalism and libertarian Marxism as the libertarian wing (libertarian socialism) of the socialist movement, and has a strong historical association with anti-capitalism and socialism.\n\nHumans lived in societies without formal hierarchies long before the establishment of formal states, realms, or empires. With the rise of organised hierarchical bodies, scepticism toward authority also rose. 

In [3]:
# Smoke test: take the text of the first ten training records and run them
# through the E5 encoder, displaying both the inputs and the encoder output.
test_batch = list(map(lambda row: data["train"][row]["text"], range(10)))
(test_batch, vectorize_e5(test_batch))

(['Anarchism is a political philosophy and movement that is sceptical of authority and rejects all involuntary, coercive forms of hierarchy. Anarchism calls for the abolition of the state, which it holds to be unnecessary, undesirable, and harmful. As a historically left-wing movement, placed on the farthest left of the political spectrum, it is usually described alongside communalism and libertarian Marxism as the libertarian wing (libertarian socialism) of the socialist movement, and has a strong historical association with anti-capitalism and socialism.\n\nHumans lived in societies without formal hierarchies long before the establishment of formal states, realms, or empires. With the rise of organised hierarchical bodies, scepticism toward authority also rose. Although traces of anarchist thought are found throughout history, modern anarchism emerged from the Enlightenment. During the latter half of the 19th and the first decades of the 20th century, the anarchist movement flourishe

In [4]:
from usearch.io import load_matrix, save_matrix
from tqdm import tqdm
import os

In [5]:
# Location of the on-disk matrix that holds one embedding row per article.
vectors_path = "./data_usearch/wikipedia/abstracts.fbin"

# First run only: create an all-zero float32 matrix with one row per training
# record and persist it, so later cells can resume from whatever is on disk.
if not os.path.exists(vectors_path):
    # Make sure the target directory exists before writing the file.
    os.makedirs(os.path.dirname(vectors_path), exist_ok=True)
    # Row count is taken from the dataset instead of being hard-coded, so the
    # matrix stays in sync with `data`. 768 columns are kept from the original
    # cell (presumably the vectorize_e5 embedding width -- TODO confirm).
    matrix = np.zeros((len(data["train"]), 768), dtype=np.float32)
    save_matrix(matrix, vectors_path)

Find all rows of the NumPy `matrix` that are still entirely zero; those are the rows we still need to vectorize.
Iterate over the corresponding `data["train"][i]["text"]` entries in batches, printing progress with `tqdm`.
Vectorize each batch with the `vectorize_e5` function, which accepts a list of up to 1000 strings per call.
Once completed, or if terminated, call `save_matrix(matrix, vectors_path)` to preserve progress.

In [6]:
# Read the vector matrix back from `vectors_path`, so the cells below work on
# whatever has been persisted there so far.
matrix = load_matrix(vectors_path)


In [7]:
# A row still needs vectorizing only when *every* component is zero.
# `matrix.any(axis=1)` is True for rows holding at least one non-zero value,
# so its negation marks the rows that are entirely zero. (The previous
# `np.any(matrix == 0, axis=1)` flagged rows with *any* zero component, and
# also materialised a full-size boolean copy of the matrix.)
zero_rows = ~matrix.any(axis=1)

# Positions of the all-zero rows, in ascending order.
zero_indices = np.flatnonzero(zero_rows)
zero_indices

array([    200,     201,     202, ..., 6458667, 6458668, 6458669])

In [None]:
# vectorize_e5 accepts at most 1000 strings per call, so batches must not
# exceed that size.
batch_size = 1000
# Writing the whole matrix to disk is expensive, so checkpoint only every
# `checkpoint_every` batches; tune to trade I/O cost against work at risk.
checkpoint_every = 100

train_split = data["train"]

try:
    # tqdm reports progress over the batch start offsets into `zero_indices`.
    batch_starts = range(0, len(zero_indices), batch_size)
    for batch_no, start in enumerate(tqdm(batch_starts), start=1):
        batch_indices = zero_indices[start:start + batch_size]
        # int() converts the NumPy integer into a plain Python index.
        batch_texts = [train_split[int(idx)]["text"] for idx in batch_indices]

        # Vectorize the batch and write the result into the matching rows.
        matrix[batch_indices] = vectorize_e5(batch_texts)

        # Periodic checkpoint, guarding against crashes that skip `finally`.
        if batch_no % checkpoint_every == 0:
            save_matrix(matrix, vectors_path)
finally:
    # Runs on normal completion, on an exception and on a kernel interrupt,
    # so every finished batch is preserved.
    save_matrix(matrix, vectors_path)