In [1]:
import os
import zlib

import pandas as pd
from opensearchpy import OpenSearch
from opensearchpy.connection import RequestsHttpConnection

# Connect to OpenSearch.
# Credentials are read from the environment so that no secret is stored in the
# notebook: export OPENSEARCH_USER and OPENSEARCH_PASSWORD before starting the
# kernel (a missing variable raises KeyError on this cell).
# NOTE: the password that used to be hardcoded in this cell should be treated
# as compromised and rotated.
client = OpenSearch(
    hosts=[{"host": "opensearch-ds.ifi.uni-heidelberg.de", "port": 443}],
    http_auth=(os.environ["OPENSEARCH_USER"], os.environ["OPENSEARCH_PASSWORD"]),
    use_ssl=True,
    verify_certs=True,
    ssl_show_warn=False,
    connection_class=RequestsHttpConnection,
    timeout=120,  # seconds
)

In [2]:
# Load the node table. The "id" column is kept as text: OpenSearch document ids
# are strings, so ids that pandas would otherwise parse as numbers (dropping
# leading or trailing zeros) could never match a hit's "_id" in the title lookup.
df = pd.read_csv("nodes.csv", dtype={"id": str})

In [3]:
import colorsys

# Generate a color for each cluster
def generate_color(cluster_number):
    """Map a cluster label to a hex color string of the form "#rrggbb".

    The label is reduced to one of 360 hue steps; saturation and value are
    fixed at 0.9, so clusters differ only in hue.

    Args:
        cluster_number: Cluster label. Strings are bucketed with CRC-32,
            because Python salts ``hash()`` of ``str`` per process and the
            color would otherwise change on every kernel restart. Any other
            label is bucketed with the built-in ``hash()``.

    Returns:
        str: Lowercase hex color, e.g. "#e51616".

    NOTE(review): small consecutive integer labels land on hues one step
    (1/360) apart, so neighbouring cluster numbers get very similar colors.
    """
    if isinstance(cluster_number, str):
        bucket = zlib.crc32(cluster_number.encode("utf-8"))
    else:
        bucket = hash(cluster_number)
    hue = bucket % 360 / 360.0
    saturation = 0.9
    value = 0.9
    r, g, b = colorsys.hsv_to_rgb(hue, saturation, value)
    return "#{:02x}{:02x}{:02x}".format(int(r * 255), int(g * 255), int(b * 255))

# Compute one color per row from its cluster label, then store the result
# as the "color" column of the dataframe
cluster_colors = df["cluster"].apply(generate_color)
df["color"] = cluster_colors

In [4]:
# Fetch all titles and update dates from OpenSearch
def fetch_titles_and_dates_from_opensearch(opensearch_client, ids):
    """Fetch ``title`` and ``update_date`` for the given document ids.

    Runs an ``ids`` query against the ``frameintell_arxiv_metadata`` index and
    pages through the result with the scroll API (up to 10000 hits per page,
    2-minute keep-alive).

    Args:
        opensearch_client: Client object providing ``search``, ``scroll`` and
            ``clear_scroll``.
        ids: Sized collection of document ids to look up.

    Returns:
        dict: Maps each returned document ``_id`` to
        ``{"title": ..., "update_date": ...}``. A field missing from a
        document is ``None``; ids without a matching document are absent.
    """
    id_to_info = {}
    if len(ids) == 0:
        # Nothing to look up, so skip the round trip to the server.
        return id_to_info

    index_name = "frameintell_arxiv_metadata"

    query = {
        "size": 10000,
        "_source": ["title", "update_date"],
        "query": {"ids": {"values": ids}},
    }

    response = opensearch_client.search(index=index_name, body=query, scroll="2m")
    scroll_id = response["_scroll_id"]

    try:
        # An empty page marks the end of the scroll.
        while response["hits"]["hits"]:
            for hit in response["hits"]["hits"]:
                source = hit["_source"]
                id_to_info[hit["_id"]] = {
                    "title": source.get("title"),
                    "update_date": source.get("update_date"),
                }

            response = opensearch_client.scroll(scroll_id=scroll_id, scroll="2m")
            # The server may return a different scroll id on any page; always
            # continue with the most recent one.
            scroll_id = response.get("_scroll_id", scroll_id)
    finally:
        # Release the server-side scroll context instead of leaving it open
        # until the keep-alive expires; a 404 only means it is already gone.
        opensearch_client.clear_scroll(body={"scroll_id": [scroll_id]}, ignore=(404,))

    return id_to_info


# Look up every distinct node id once; rows sharing an id reuse the same result
ids = df["id"].unique().tolist()
id_to_info = fetch_titles_and_dates_from_opensearch(client, ids)

# Add titles and update dates to the dataframe
# (ids without a matching document get None in both columns)
df["title"] = df["id"].map(lambda x: id_to_info.get(x, {}).get("title"))
df["update_date"] = df["id"].map(lambda x: id_to_info.get(x, {}).get("update_date"))

# Parse update_date and change its format from yyyy-MM-dd to dd/MM/yyyy
df["update_date"] = pd.to_datetime(df["update_date"]).dt.strftime("%d/%m/%Y")

# Normalize titles: collapse every run of whitespace (including the newlines of
# wrapped titles) into a single space, so words are never fused together and no
# double spaces remain, then trim leading/trailing whitespace
df["title"] = df["title"].str.replace(r"\s+", " ", regex=True).str.strip()

In [5]:
# Save the enriched dataframe (now including the color, title and update_date
# columns) to a new CSV file; index=False leaves the row index out of the file
df.to_csv("new_nodes.csv", index=False)