In [1]:
import json
import os

from opensearchpy import OpenSearch, RequestsHttpConnection
from tqdm import tqdm

# Configure the OpenSearch client.
# Credentials are read from the environment so that no secret lives in the
# source. NOTE(review): the password that used to be hard-coded here has been
# exposed and should be rotated.
_opensearch_user = os.environ.get("OPENSEARCH_USER")
_opensearch_password = os.environ.get("OPENSEARCH_PASSWORD")
if not _opensearch_user or not _opensearch_password:
    raise RuntimeError(
        "Set the OPENSEARCH_USER and OPENSEARCH_PASSWORD environment "
        "variables before running this script."
    )

client = OpenSearch(
    hosts=[{"host": "opensearch-ds.ifi.uni-heidelberg.de", "port": 443}],
    http_auth=(_opensearch_user, _opensearch_password),
    use_ssl=True,
    verify_certs=True,
    ssl_show_warn=False,
    connection_class=RequestsHttpConnection,
    timeout=120,
)

# Index queried by every helper below.
index_name = "frameintell_arxiv_embeddings"


# Function to get total number of documents
def get_total_docs(query):
    """Count the documents in ``index_name`` whose abstract matches ``query``.

    Args:
        query: Text used in a ``match`` query against the ``abstract`` field.

    Returns:
        The ``count`` value reported by the client, or 0 if the request
        (or reading its response) raised an exception; the error is printed.
    """
    count_body = {"query": {"match": {"abstract": query}}}
    try:
        return client.count(index=index_name, body=count_body)["count"]
    except Exception as err:
        print(f"Error getting document count: {err}")
    return 0


# Function to download and write data using Scroll API
def download_and_write_data(filename, query, batch_size=10000):
    """Stream all documents matching ``query`` into a JSON Lines file.

    Every matching document is written as one line of the form
    ``{"id": ..., "embedding": ...}``, using the ``id`` and ``embedding``
    fields of the hit's ``_source``.

    Args:
        filename: Path of the output file; it is overwritten.
        query: Text used in a ``match`` query against the ``abstract`` field.
        batch_size: Number of hits requested per scroll page.

    Returns:
        None. Progress and the final status are printed.

    Notes:
        A failure while fetching a follow-up page is printed and ends the
        download early; the lines written so far are kept and the final
        message says the download is incomplete. Exceptions from the initial
        search or from writing to the file propagate to the caller.
    """
    total_docs = get_total_docs(query)
    print("Total documents: ", total_docs)

    scroll_id = None
    written = 0
    completed = True

    with open(filename, "w", encoding="utf-8") as f:
        with tqdm(total=total_docs, desc="Downloading") as pbar:
            try:
                # Initialize the scroll
                res = client.search(
                    index=index_name,
                    body={
                        "query": {"match": {"abstract": query}},
                        "_source": ["id", "embedding"],
                    },
                    scroll="10m",
                    size=batch_size,
                )

                scroll_id = res["_scroll_id"]
                hits = res["hits"]["hits"]

                while hits:
                    # One JSON document per line; written as a batch so the
                    # buffered file layer handles the small writes.
                    f.writelines(
                        json.dumps(
                            {
                                "id": hit["_source"]["id"],
                                "embedding": hit["_source"]["embedding"],
                            }
                        )
                        + "\n"
                        for hit in hits
                    )
                    written += len(hits)
                    pbar.update(len(hits))

                    try:
                        # Fetch the next batch using the scroll API
                        res = client.scroll(scroll_id=scroll_id, scroll="10m")
                        hits = res["hits"]["hits"]
                        scroll_id = res["_scroll_id"]
                    except Exception as e:
                        print(f"Error scrolling: {e}")
                        completed = False
                        break
            finally:
                # Release the server-side scroll context instead of leaving
                # it open until the 10-minute keep-alive expires.
                if scroll_id is not None:
                    try:
                        client.clear_scroll(scroll_id=scroll_id)
                    except Exception as e:
                        # Best-effort cleanup: never mask the real outcome.
                        print(f"Error clearing scroll context: {e}")

    if completed:
        print(f"Data downloaded and saved to {filename}")
    else:
        print(f"Download incomplete: only {written} documents saved to {filename}")


# Run the export: every document whose abstract matches "machine learning"
# is written to a JSON Lines file in the current working directory.
output_filename = "embeddings_data.jsonl"
download_and_write_data(filename=output_filename, query="machine learning")

Total documents:  216143


Downloading:   0%|          | 0/216143 [00:00<?, ?it/s]

Hits:  10000


Downloading: 100%|██████████| 216143/216143 [32:24<00:00, 111.13it/s]

Data downloaded and saved to embeddings_data.jsonl



