In [1]:
import torch

In [2]:
# Show whether PyTorch can see a CUDA device (the cell output is the boolean result).
torch.cuda.is_available()

True

In [3]:
import pymongo
import pandas as pd
import numpy as np
from pathlib import Path

In [4]:
# Names of the MongoDB database and collection this notebook reads from.
db_name = "arxiv-db"
collection_name = "arxiv-dataset-collection"

# Connect to the MongoDB server on localhost:21000, then resolve the database
# and the collection step by step.
mongo_client = pymongo.MongoClient(host="localhost", port=21000)
db = mongo_client.get_database(db_name)
collection = db.get_collection(collection_name)
collection

Collection(Database(MongoClient(host=['localhost:21000'], document_class=dict, tz_aware=False, connect=True), 'arxiv-db'), 'arxiv-dataset-collection')

In [11]:
def load_data(cursor):
    """Build a pandas DataFrame from the records yielded by ``cursor``.

    Args:
        cursor: Any input accepted by the ``pd.DataFrame`` constructor; in this
            notebook it is the cursor returned by ``load_data_cursor``.

    Returns:
        The ``pd.DataFrame`` constructed from ``cursor``.
    """
    return pd.DataFrame(cursor)

def load_data_cursor(batch_size=128):
    """Open a cursor over every document in the module-level ``collection``.

    Only the ``title``, ``abstract`` and ``categories`` fields are requested in
    the projection.

    Args:
        batch_size: Value forwarded to the cursor's ``batch_size`` setting.

    Returns:
        The cursor produced by ``collection.find(...).batch_size(batch_size)``.
    """
    match_all = {}
    # Explicit inclusion projection for the same three fields.
    wanted_fields = {"title": 1, "abstract": 1, "categories": 1}
    return collection.find(match_all, wanted_fields).batch_size(batch_size)

In [12]:
# Open a cursor with the default batch size and display its repr.
cursor = load_data_cursor()
cursor

<pymongo.cursor.Cursor at 0x7f8990e7dcd0>

In [13]:
# Pull the first document off the cursor to inspect its structure.
# next() with a default binds `c` even when the cursor yields nothing (it is
# then None), so the following cell cannot fail with a NameError.
c = next(cursor, None)

In [14]:
# Display the document taken from the cursor in the previous cell.
c

{'_id': ObjectId('64729c935ced617335d85d64'),
 'title': 'Calculation of prompt diphoton production cross sections at Tevatron and\n  LHC energies',
 'categories': 'hep-ph',
 'abstract': '  A fully differential calculation in perturbative quantum chromodynamics is\npresented for the production of massive photon pairs at hadron colliders. All\nnext-to-leading order perturbative contributions from quark-antiquark,\ngluon-(anti)quark, and gluon-gluon subprocesses are included, as well as\nall-orders resummation of initial-state gluon radiation valid at\nnext-to-next-to-leading logarithmic accuracy. The region of phase space is\nspecified in which the calculation is most reliable. Good agreement is\ndemonstrated with data from the Fermilab Tevatron, and predictions are made for\nmore detailed tests with CDF and DO data. Predictions are shown for\ndistributions of diphoton pairs produced at the energy of the Large Hadron\nCollider (LHC). Distributions of the diphoton pairs from the decay of 

In [None]:
# # Load and preprocess the ArXiv dataset
# # Replace this with your own dataset loading and preprocessing code
# all_data_df = load_data(load_data_cursor())
# all_data_df.shape

In [9]:
import torch
from transformers import AutoTokenizer, AutoModel
import numpy as np
from sklearn.metrics import silhouette_score


def extract_embeddings(texts, model, tokenizer, device='cuda', batch_size=32):
    """Embed texts with an encoder model and return its ``pooler_output``.

    The texts are tokenized and run through the model in chunks of
    ``batch_size`` so that a large corpus does not have to fit in device memory
    as a single padded batch. Because padding is applied per chunk, values may
    differ from a single-batch run at floating-point rounding level.

    Args:
        texts: A single string or an iterable of strings.
        model: Object callable as ``model(input_ids, attention_mask=...)`` whose
            result can be indexed with ``'pooler_output'``. It is moved to
            ``device`` and switched to eval mode by this function.
        tokenizer: Object callable as ``tokenizer(list_of_str, padding=True,
            truncation=True, return_tensors='pt')`` returning a mapping with
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        device: Device that both the model and the tokenized inputs are moved to.
        batch_size: Number of texts per forward pass; must be at least 1.

    Returns:
        A NumPy array with one row per input text, in input order.

    Raises:
        ValueError: If ``texts`` is empty or ``batch_size`` is smaller than 1.
    """
    # A bare string would otherwise be sliced character by character below.
    if isinstance(texts, str):
        texts = [texts]
    else:
        texts = list(texts)

    if not texts:
        raise ValueError("texts must contain at least one string")
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # The inputs are moved to `device`, so the model has to live there as well;
    # otherwise the forward pass fails with a device-mismatch error.
    model = model.to(device)
    model.eval()

    chunks = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch_texts = texts[start:start + batch_size]
            encoded_input = tokenizer(batch_texts, padding=True, truncation=True, return_tensors='pt')
            input_ids = encoded_input['input_ids'].to(device)
            attention_mask = encoded_input['attention_mask'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)

            # Copy each chunk back to host memory right away so device memory
            # use stays bounded by a single batch.
            chunks.append(outputs['pooler_output'].detach().cpu().numpy())

    return np.concatenate(chunks, axis=0)

In [10]:
# Set up the BERT model and tokenizer
model_name = "bert-base-uncased"  # You can replace this with any other BERT model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Embed the 'abstract' column with the BERT model and tokenizer set up above,
# then display the shape of the resulting array.
# NOTE(review): in the visible cells `all_data_df` is only assigned inside a
# commented-out cell - confirm it is defined before running on a fresh kernel.
bert_embeddings = extract_embeddings(texts=all_data_df['abstract'].tolist(), model=model, tokenizer=tokenizer)
bert_embeddings.shape

In [None]:
from sentence_transformers import SentenceTransformer

# Set up the Sentence-BERT model
sbert_model = SentenceTransformer(model_name)

# Extract embeddings using SBERT.
# A SentenceTransformer is not called as model(input_ids, attention_mask=...)
# and does not return 'pooler_output', so extract_embeddings() cannot drive it;
# its own encode() method tokenizes, batches and pools internally.
# NOTE(review): `abstracts` was not assigned in any visible cell; it is taken
# here from the same column the BERT cell embeds - confirm this is the intent.
abstracts = all_data_df['abstract'].tolist()
embeddings = sbert_model.encode(abstracts, convert_to_numpy=True)

In [None]:
from cuml.cluster import DBSCAN, HDBSCAN, AgglomerativeClustering


def perform_gpu_dbscan(embeddings, eps=0.5, min_samples=5):
    """Fit the DBSCAN estimator imported from ``cuml.cluster`` and return its labels.

    Args:
        embeddings: Data passed to the estimator's ``fit``.
        eps: Forwarded to ``DBSCAN`` as ``eps``.
        min_samples: Forwarded to ``DBSCAN`` as ``min_samples``.

    Returns:
        The fitted estimator's ``labels_`` attribute.
    """
    clusterer = DBSCAN(eps=eps, min_samples=min_samples)
    clusterer.fit(embeddings)
    return clusterer.labels_


def perform_gpu_hdbscan(embeddings, min_cluster_size=5, min_samples=5):
    """Fit the HDBSCAN estimator imported from ``cuml.cluster`` and return its labels.

    Args:
        embeddings: Data passed to the estimator's ``fit``.
        min_cluster_size: Forwarded to ``HDBSCAN`` as ``min_cluster_size``.
        min_samples: Forwarded to ``HDBSCAN`` as ``min_samples``.

    Returns:
        The fitted estimator's ``labels_`` attribute.
    """
    clusterer = HDBSCAN(min_cluster_size=min_cluster_size, min_samples=min_samples)
    clusterer.fit(embeddings)
    return clusterer.labels_


def perform_gpu_agglomerative_clustering(embeddings, n_clusters):
    """Cluster ``embeddings`` with ``AgglomerativeClustering`` from ``cuml.cluster``.

    Args:
        embeddings: Data passed to the estimator's ``fit_predict``.
        n_clusters: Forwarded to ``AgglomerativeClustering`` as ``n_clusters``.

    Returns:
        Whatever ``fit_predict`` returns for ``embeddings``.
    """
    return AgglomerativeClustering(n_clusters=n_clusters).fit_predict(embeddings)


def find_optimal_clusters(embeddings, clustering_algorithm, max_clusters=10, score_fn=None):
    """Pick the cluster count in ``2..max_clusters`` with the highest score.

    For every candidate count the data is clustered with
    ``clustering_algorithm(embeddings, n_clusters)`` and the labeling is scored;
    the count with the highest score is returned (first one on ties).

    Args:
        embeddings: Sized collection of samples (``len(embeddings)`` is used as
            the sample count).
        clustering_algorithm: Callable taking ``(embeddings, n_clusters)`` and
            returning one label per sample. The second positional argument must
            really be a cluster count.
        max_clusters: Largest cluster count to try. It is additionally capped
            at ``len(embeddings) - 1``.
        score_fn: Optional callable ``(embeddings, labels) -> float`` where a
            higher value is better. Defaults to ``silhouette_score``.

    Returns:
        The candidate number of clusters with the highest score.

    Raises:
        ValueError: If there are too few samples to try any candidate, or if no
            candidate produced a labeling that can be scored.
    """
    n_samples = len(embeddings)

    # silhouette_score only accepts 2 <= n_labels <= n_samples - 1, so asking
    # for as many clusters as there are samples can never be scored.
    upper = min(max_clusters, n_samples - 1)
    if upper < 2:
        raise ValueError(
            "need at least 3 samples and max_clusters >= 2 to compare cluster counts"
        )

    # Resolved at call time so a custom metric can be supplied.
    scorer = silhouette_score if score_fn is None else score_fn

    candidates = []
    scores = []
    for n_clusters in range(2, upper + 1):
        labels = clustering_algorithm(embeddings, n_clusters)

        # Skip degenerate labelings (a single cluster, or one cluster per
        # sample) instead of letting the metric raise and abort the sweep.
        n_labels = len(np.unique(np.asarray(labels)))
        if not 2 <= n_labels <= n_samples - 1:
            continue

        candidates.append(n_clusters)
        scores.append(scorer(embeddings, labels))

    if not scores:
        raise ValueError("no candidate cluster count produced a scorable labeling")

    # Skipped candidates mean index + 2 is no longer the cluster count, so look
    # the winner up in the list of candidates that were actually scored.
    return candidates[int(np.argmax(scores))]

In [None]:
# Perform GPU-accelerated DBSCAN
gpu_dbscan_labels = perform_gpu_dbscan(embeddings, eps=0.5, min_samples=5)

# Perform GPU-accelerated HDBSCAN
gpu_hdbscan_labels = perform_gpu_hdbscan(embeddings, min_cluster_size=5, min_samples=5)

# Perform GPU-accelerated Agglomerative Clustering
gpu_agglomerative_labels = perform_gpu_agglomerative_clustering(embeddings, n_clusters=5)

# DBSCAN and HDBSCAN do not take a cluster count: the second positional
# parameter of perform_gpu_dbscan is `eps` and of perform_gpu_hdbscan is
# `min_cluster_size`. Routing them through find_optimal_clusters() would feed
# the values 2..10 into those parameters rather than sweep a number of
# clusters. For these two, report how many clusters the fits above produced.
# Label -1 is excluded because both estimators use it for noise points.
# NOTE(review): assumes the label arrays convert with np.asarray - confirm for
# non-NumPy output types.
optimal_dbscan_clusters = len(set(np.asarray(gpu_dbscan_labels).tolist()) - {-1})
optimal_hdbscan_clusters = len(set(np.asarray(gpu_hdbscan_labels).tolist()) - {-1})

# Agglomerative clustering does take n_clusters, so the score sweep applies.
optimal_agglomerative_clusters = find_optimal_clusters(embeddings, perform_gpu_agglomerative_clustering)

# Print the results
print("GPU-accelerated DBSCAN Clustering:")
print("Number of clusters found:", optimal_dbscan_clusters)
print("Cluster labels:", gpu_dbscan_labels)

print("\nGPU-accelerated HDBSCAN Clustering:")
print("Number of clusters found:", optimal_hdbscan_clusters)
print("Cluster labels:", gpu_hdbscan_labels)

print("\nGPU-accelerated Agglomerative Clustering:")
print("Optimal number of clusters:", optimal_agglomerative_clusters)
print("Cluster labels:", gpu_agglomerative_labels)