# Prepare Arxiv Texts

Takes abstracts (in TSV) and pre-computed E5 vectors of arXiv papers from the HuggingFace portal, repackages the abstracts into a Parquet file, and builds the USearch index. The outputs are:

- `./data/ann-arxiv-2m/abstracts.parquet`
- `./data/ann-arxiv-2m/abstract.e5-base-v2.usearch`

In [None]:
import os
import numpy as np
import pandas as pd
from usearch.io import load_matrix
from usearch.index import Index
from encode import vectorize_e5, vectorize_uform
from tqdm import tqdm

In [None]:
def load_vectors():
    """Read the pre-computed E5 abstract embeddings from disk.

    Returns:
        Whatever `usearch.io.load_matrix` produces for
        `./data/ann-arxiv-2m/abstract.e5-base-v2.fbin`. Later cells read
        `.shape[0]` from it and slice it by rows.
    """
    return load_matrix("./data/ann-arxiv-2m/abstract.e5-base-v2.fbin")


vectors = load_vectors()

In [None]:
# Where the serialized USearch index lives on disk.
index_path = "./data/ann-arxiv-2m/abstract.e5-base-v2.usearch"
index = Index(dtype="f16", metric="cos", ndim=768)

if not os.path.exists(index_path):
    # No saved index yet: build one from the loaded vectors, then persist it.
    batch_size = 1000  # Adjust this based on your preference
    num_vectors = vectors.shape[0]

    # Walk the matrix in fixed-size row slices; each row number doubles as
    # that vector's key in the index. tqdm renders the progress bar.
    for offset in tqdm(range(0, num_vectors, batch_size), desc="Indexing batches"):
        stop = min(offset + batch_size, num_vectors)
        index.add(np.arange(offset, stop), vectors[offset:stop])

    index.save(index_path)
else:
    # A saved index already exists: reuse it instead of re-indexing.
    index.load(index_path)

In [None]:
# Display the index's `hardware_acceleration` attribute as the cell output.
# NOTE(review): presumably this names the SIMD backend USearch selected at
# runtime — confirm against the USearch documentation.
index.hardware_acceleration

In [None]:
# Parse the tab-separated abstracts file and persist a Parquet copy of it.
abstracts_path = "./data/ann-arxiv-2m/title_abstract.tsv"
parquet_path = "data/ann-arxiv-2m/abstracts.parquet"

# `read_table` is `read_csv` with a tab as the default separator.
abstracts = pd.read_table(abstracts_path)
abstracts.to_parquet(parquet_path)

In [None]:
# Render the loaded abstracts DataFrame as the cell output for a quick visual check.
abstracts

In [None]:
# Pick the entry labelled 2 from the "abstract" column to use as a smoke-test query.
sample_abstract = abstracts["abstract"][2]
# Embed it with `vectorize_e5` (imported from the local `encode` module) and display the result.
vectorize_e5(sample_abstract)

In [None]:
# Display the shape of the sample abstract's embedding.
# NOTE(review): presumably this should be compatible with the index's ndim=768 — confirm.
vectorize_e5(sample_abstract).shape

In [None]:
# Embed the sample abstract again and query the index, passing 10 as the number of results requested.
matches = index.search(vectorize_e5(sample_abstract), 10)
# Display the search result object as the cell output.
matches

In [None]:
# Sanity check: the first returned key must be 2. The indexing cell assigns row
# numbers as keys, so the abstract taken from entry 2 is expected to find itself first.
# NOTE(review): this assumes a loaded-from-disk index used the same key scheme
# and that `vectorize_e5` reproduces the pre-computed embedding closely — confirm.
assert matches.keys[0] == 2