In [1]:
import pandas as pd
from itertools import combinations
import numpy as np
from tqdm import tqdm

## Clustering evaluation using Rand Index

In [2]:
# evaluation function with an example
# the truths are the list of arxiv categories for each paper, the preds are the cluster labels
# rand score modified to allow for multiple categories per paper

def modified_rand_score_vectorized(preds, truths):
    n = len(preds)
    
    # Convert predictions to a numpy array for faster operations
    preds = np.array(preds)
    
    # Prepare a label presence matrix
    unique_labels = sorted(set(label for sublist in truths for label in sublist))
    label_index = {label: idx for idx, label in enumerate(unique_labels)}
    
    # Create a boolean matrix of size (n, number of unique labels)
    truth_matrix = np.zeros((n, len(unique_labels)), dtype=bool)
    for i, labels in enumerate(truths):
        truth_matrix[i, [label_index[label] for label in labels]] = True
    
    # Calculate pairwise matrix indicating shared label (the slowest part of the function))
    shared_label_matrix = np.dot(truth_matrix, truth_matrix.T) > 0
    
    # Calculate pairwise equal predictions
    same_pred = preds[:, None] == preds
    
    # Calculate TP, TN, FP, FN using vectorized operations
    TP = np.sum(np.logical_and(shared_label_matrix, same_pred))
    TN = np.sum(np.logical_and(~shared_label_matrix, ~same_pred))
    FP = np.sum(np.logical_and(~shared_label_matrix, same_pred))
    FN = np.sum(np.logical_and(shared_label_matrix, ~same_pred))

    # Correct for self-comparison (diagonal counts as True in both shared_label_matrix and same_pred)
    TP -= n
    
    # Calculate the Rand Index
    rand_index = (TP + TN) / (TP + TN + FP + FN) if (TP + TN + FP + FN) > 0 else 0
    return rand_index

# Example usage
preds = [1, 1, 2, 2]
truths = [['a', 'b'], ['a'], ['c'], ['c', 'd']]
print(modified_rand_score_vectorized(preds, truths))

1.0


##  Example usage with arxiv data

In [3]:
# function to add back the list of categories for each paper
def reorg_category_df(df_categories):
    data_ids, data_categories = [], []
    cur_id, cat = "", []
    data = df_categories.sort_values("id")

    for _, row in tqdm(data.iterrows(), total=data.shape[0]):
        if cur_id != row[0]:
            # save the exising id's categories data to dictionary. if cur_id is "", it indicates the beginning of the loop, no prior records to save
            if cur_id != "":
                data_ids.append(cur_id)
                data_categories.append(",".join(cat))
                cur_id = row[0]
                cat = []
            else:
                cur_id = row[0]
        cat.append(row[1])

    df = pd.DataFrame.from_dict({"id": data_ids, 'categories': data_categories})
    # save it to csv, so you could load it next time without rerunning
    # df.to_csv(save_to_file_name)
    # print(f"Total {df.shape[0]} records. Have saved the file {save_to_file_name} into the data folder.")

    return df

In [4]:
# the data/arxiv-metadata-ext-category.csv file contains N rows of the same paper (same paper id) with N different categories
df_categories = pd.read_csv("data/arxiv-metadata-ext-category.csv",dtype={"id":object,"category_id":object})
# Reorganize to each row as one paper with a column containing the list of categories. Might take 1 to 3+ mins.
df_reorg_category = reorg_category_df(df_categories)

100%|██████████| 4156055/4156055 [01:20<00:00, 51853.04it/s]


In [47]:
# load the clustering results with at least two columns: id and kmeans_label
df_clustering = pd.read_csv("data/bertopic-kmeans60.csv",dtype={"id":object})

In [48]:
df_clustering = df_clustering.merge(df_reorg_category, how='left', on="id")

In [49]:
df_clustering

Unnamed: 0,id,topic_id,categories
0,gr-qc/0209061,11,"gr-qc,cs.CC,quant-ph"
1,q-bio/0607018,36,"q-bio.GN,cs.IT,math-ph,math.IT,math.MP,physics..."
2,q-bio/0604024,50,"q-bio.PE,cs.CE,math.GR,q-bio.OT"
3,q-bio/0403036,36,"q-bio.GN,cs.CE,q-bio.BM,quant-ph"
4,q-bio/0610017,44,"q-bio.BM,cs.CG,cs.DM,math.MG"
...,...,...,...
74384,hep-th/0602072,0,"cs.CC,hep-th"
74385,hep-lat/0505005,31,"hep-lat,cs.CE,physics.comp-ph"
74386,hep-lat/0308005,19,"hep-lat,cs.DC"
74387,hep-lat/0307015,19,"hep-lat,cs.DC"


In [50]:
def split_comma(input):
    try:
        return input.split(",")
    except:
        return []

In [51]:
df_clustering["categories_list"] = df_clustering["categories"].apply(lambda x: split_comma(x))

In [52]:
output_score = []
# to speed up the evaluation, we sample random 10,000 papers 10 times and evaluate the clustering results based on the average score
for i in range(10):
    df_clustering_filter = df_clustering[~df_clustering["categories"].isna()]
    df_sample = df_clustering_filter.sample(10000, random_state = i)
    #score = modified_rand_score_vectorized(df_sample["kmeans_label"], df_sample["categories_list"])
    score = modified_rand_score_vectorized(df_sample["topic_id"], df_sample["categories_list"])
    output_score.append(score)
    print(f"seed: {i}, score: {score}")
    
print(f"Average Rand Index Score: {np.mean(output_score)}")

seed: 0, score: 0.8510759675967596
seed: 1, score: 0.8498602260226022
seed: 2, score: 0.848571097109711
seed: 3, score: 0.8514113811381138
seed: 4, score: 0.8496474647464747
seed: 5, score: 0.8532257625762576
seed: 6, score: 0.8506539853985399
seed: 7, score: 0.8492895089508951
seed: 8, score: 0.8505053305330533
seed: 9, score: 0.8485082308230824
Average Rand Index Score: 0.8502748954895489


## Clustering evaluation using Silhouette Score (if possible)

In [ ]:
# X is a feature array and labels are predicted labels for each sample
# reference: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.silhouette_score.html
# silhouette_score(X, labels)