In [None]:
from opensearchpy import OpenSearch, RequestsHttpConnection
from sentence_transformers import SentenceTransformer
from datetime import datetime
import re
import nltk
from nltk.corpus import stopwords
import os
import json

In [None]:
# Initialize the OpenSearch client.
# Credentials are read from the environment so that they never live in the
# notebook source or its version history. Export OPENSEARCH_USER and
# OPENSEARCH_PASSWORD before starting the kernel.
# NOTE(review): a password was previously hardcoded in this cell; treat it as
# compromised and rotate it.
opensearch_user = os.environ.get("OPENSEARCH_USER")
opensearch_password = os.environ.get("OPENSEARCH_PASSWORD")
if not opensearch_user or not opensearch_password:
    raise RuntimeError(
        "Set the OPENSEARCH_USER and OPENSEARCH_PASSWORD environment variables "
        "before running this notebook."
    )

client = OpenSearch(
    hosts=[{"host": "opensearch-ds.ifi.uni-heidelberg.de", "port": 443}],
    http_auth=(opensearch_user, opensearch_password),
    use_ssl=True,
    verify_certs=True,
    connection_class=RequestsHttpConnection,
    timeout=60,
)

# Create a Point in Time on the metadata index; its id is attached to the
# paginated searches below and released in the final cell.
pit = client.create_point_in_time(index="frameintell_arxiv_metadata", keep_alive="5m")
pit_id = pit["pit_id"]

In [None]:
# Find the oldest paper: sort ascending on update_date and keep a single hit.
oldest_first = [{"update_date": {"order": "asc"}}]
query = {
    "size": 1,
    "_source": ["update_date"],  # only the date is needed here
    "query": {"match_all": {}},
    "sort": oldest_first,
}

# The first (and only) hit carries the earliest update_date; it becomes the
# lower bound ("gte") of the date range used by the main search.
response = client.search(index="frameintell_arxiv_metadata", body=query)
oldest_hit = response["hits"]["hits"][0]
end_date = oldest_hit["_source"]["update_date"]
print(f"Oldest paper update_date: {end_date}")

In [None]:
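# Optional override for test runs: uncomment the next line to use a fixed lower-bound date instead of the oldest paper's update_date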
# end_date = "2024-05-10"

In [None]:
# Upper bound of the date window; "now" is passed through to the range query as-is.
start_date = "now"

# First page of the paginated search. Results are ordered newest-first, with
# _id as a tie breaker so that search_after can resume deterministically.
# No index is named in the search call: the request is bound to the Point in
# Time created earlier.
date_window = {"lte": start_date, "gte": end_date}
query = {
    "size": 100,
    "_source": ["id", "abstract", "update_date"],
    "query": {"range": {"update_date": date_window}},
    "sort": [{"update_date": {"order": "desc"}}, {"_id": "desc"}],
    "pit": {"id": pit_id, "keep_alive": "5m"},
}

# Run the initial search and keep its hits as the first batch.
response = client.search(body=query)
results = response["hits"]["hits"]

In [None]:
# Field definitions for the embeddings index. The vector field is declared
# separately to keep its dimension easy to spot.
embedding_field = {
    "type": "knn_vector",
    "dimension": 384,
}
mappings = {
    "mappings": {
        "properties": {
            "id": {"type": "keyword"},
            "abstract": {"type": "text"},
            "processed_abstract": {"type": "text"},
            "embedding": embedding_field,
            "update_date": {"type": "date"},
        }
    }
}
# NOTE(review): no index settings are sent with this mapping. If approximate
# k-NN queries are intended on "embedding", confirm whether the index needs
# the `index.knn` setting enabled at creation time.

# Create the target index for the embeddings, but only if it is not there yet.
target_index = "frameintell_arxiv_embeddings"
index_already_exists = client.indices.exists(index=target_index)
if not index_already_exists:
    client.indices.create(index=target_index, body=mappings)
    print(f"Index {target_index} created.")

In [None]:
# Download the NLTK stopword corpus, then keep the English entries as a set
# for fast membership checks during abstract cleaning.
nltk.download("stopwords")
english_stopword_list = stopwords.words("english")
stop_words = set(english_stopword_list)


# Process the abstracts
def _normalize_abstract(text):
    """Clean one abstract; anything that is not a string becomes ""."""
    if not isinstance(text, str):
        return ""
    # Newlines -> spaces, lowercase, then drop everything except letters and whitespace.
    letters_only = re.sub(r"[^a-zA-Z\s]", "", text.replace("\n", " ").lower())
    # Whitespace-tokenize and discard stopwords before re-joining with single spaces.
    kept_tokens = (token for token in letters_only.split() if token not in stop_words)
    return " ".join(kept_tokens)


def process_abstracts(abstracts):
    """Return a cleaned copy of each abstract, in input order.

    Each string is lowercased, stripped of digits and special characters,
    split on whitespace, filtered against the module-level ``stop_words`` set,
    and re-joined with single spaces. Non-string entries yield an empty string
    so the output always has one element per input.

    Args:
        abstracts: Iterable of abstracts (strings or other values).

    Returns:
        list[str]: The processed abstracts.
    """
    return [_normalize_abstract(text) for text in abstracts]

In [None]:
# Load the Sentence Transformer model
# Load the embedding model. `trust_remote_code=True` has been dropped: it lets
# code from the model repository execute locally, and this stock
# sentence-transformers checkpoint should not need it.
# NOTE(review): the target index mapping declares a 384-dimensional vector;
# confirm the output size if the model name is ever changed.
model = SentenceTransformer("all-MiniLM-L6-v2")

In [None]:
total_documents = 0
# Embed and index one batch at a time until a search comes back with no hits.
while results:
    # Read abstracts with .get(): a hit whose _source lacks the field yields
    # None, which process_abstracts turns into "" instead of raising KeyError.
    abstracts = [result["_source"].get("abstract") for result in results]
    processed_abstracts = process_abstracts(abstracts)

    # Encode the whole batch at once; .tolist() makes the vectors JSON-serializable.
    embeddings = model.encode(processed_abstracts).tolist()

    # Build the bulk payload: one action line followed by one document per hit.
    bulk_data = []
    for result, abstract, processed_abstract, embedding in zip(
        results, abstracts, processed_abstracts, embeddings
    ):
        source = result["_source"]
        doc = {
            "id": source["id"],
            "abstract": abstract,
            "processed_abstract": processed_abstract,
            "embedding": embedding,
            "update_date": source["update_date"],
        }
        # Using the paper id as _id makes re-indexing overwrite rather than duplicate.
        bulk_data.append({"index": {"_index": target_index, "_id": doc["id"]}})
        bulk_data.append(doc)

    # Index the documents and check the per-item results. Stopping here,
    # before search_after advances, prevents a rejected batch from being
    # skipped silently.
    bulk_response = client.bulk(body=bulk_data)
    if bulk_response.get("errors"):
        failures = [
            item["index"]
            for item in bulk_response.get("items", [])
            if "error" in item.get("index", {})
        ]
        raise RuntimeError(
            f"Bulk indexing failed for {len(failures)} of {len(results)} documents; "
            f"first failure: {failures[:1]}"
        )

    # Progress report: running total and the update_date of the last hit in this batch.
    total_documents += len(results)
    print(total_documents)
    last_update_date = results[-1]["_source"]["update_date"]
    print(f"Last update_date: {last_update_date}")

    # Resume the next page right after the last hit's sort values.
    query["search_after"] = results[-1]["sort"]
    response = client.search(body=query)
    results = response["hits"]["hits"]

In [None]:
# Close the Point in Time and release resources
client.delete_point_in_time(body={"pit_id": pit_id})
client.transport.close()