In [1]:
import torch

In [2]:
# Check whether PyTorch can see a CUDA device; the result is shown as the cell output.
torch.cuda.is_available()

True

In [3]:
import pymongo
import pandas as pd
import numpy as np
from pathlib import Path

In [4]:
# Names of the MongoDB database and collection the notebook reads from.
db_name = "arxiv-db"
collection_name = "arxiv-dataset-collection"

# Client for the MongoDB server on localhost:21000; `collection` is the handle
# that load_data() queries.
db = pymongo.MongoClient(host="localhost", port=21000).get_database(db_name)
collection = db[collection_name]
# Bare expression so the notebook displays the collection handle.
collection

Collection(Database(MongoClient(host=['localhost:21000'], document_class=dict, tz_aware=False, connect=True), 'arxiv-db'), 'arxiv-dataset-collection')

In [10]:
def load_data():
    """Read the title, abstract and categories of every document into a DataFrame.

    Queries the module-level ``collection`` with an empty filter, so the whole
    collection is materialised in memory.

    Returns:
        pd.DataFrame: One row per document returned by the query.
    """
    projection = {"title", "abstract", "categories"}
    cursor = collection.find({}, projection)
    return pd.DataFrame(cursor)

In [11]:
# Load the ArXiv records (title, abstract, categories) from MongoDB into a DataFrame.
# No preprocessing happens here; load_data() only materialises the query result.
all_data_df = load_data()
# Show (rows, columns) as a quick sanity check on what was loaded.
all_data_df.shape

(2258347, 4)

### Dataset

In [68]:
# Imported in this cell so the class definition also works on a fresh kernel;
# the notebook otherwise only imports Dataset in a later cell.
from torch.utils.data import Dataset


class ArxivDataset(Dataset):
    """Map-style dataset that tokenizes one ArXiv abstract per item.

    Args:
        data: DataFrame with an ``abstract`` column holding the texts.
        tokenizer: Callable invoked as ``tokenizer(text, padding=..., ...)`` that
            returns a mapping with ``input_ids`` and ``attention_mask`` tensors
            carrying a leading batch axis of size 1 (e.g. a Hugging Face
            tokenizer called with ``return_tensors='pt'``).
    """

    def __init__(self, data: pd.DataFrame, tokenizer):
        self.data = data
        self.tokenizer = tokenizer

    def __len__(self):
        """Return the number of rows (abstracts) in the wrapped DataFrame."""
        return len(self.data)

    def __getitem__(self, index):
        """Tokenize the abstract at position ``index``.

        Returns:
            dict: ``input_ids`` and ``attention_mask`` with the tokenizer's
            leading batch axis removed.
        """
        # Samplers hand out positions 0..len-1, so look the row up by position
        # rather than by index label (the frame may not have a 0..n-1 index).
        abstract = self.data['abstract'].iloc[index]

        # Use the tokenizer given to the constructor, not a notebook-level global.
        encoded_input = self.tokenizer(
            abstract,
            padding=True,
            truncation=True,
            return_tensors='pt',
            pad_to_multiple_of=512,
        )

        # squeeze(0) drops only the batch axis added by return_tensors='pt';
        # a bare squeeze() would also collapse any other axis of length 1.
        return {
            'input_ids': encoded_input['input_ids'].squeeze(0),
            'attention_mask': encoded_input['attention_mask'].squeeze(0),
        }

In [67]:
import transformers
# Record the installed transformers version in the cell output.
transformers.__version__

'4.29.2'

In [69]:
import torch
from transformers import AutoTokenizer, AutoModel
import numpy as np
from sklearn.metrics import silhouette_score
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm


def extract_embeddings(model, data_loader: DataLoader, device='cuda'):
    """Run ``model`` over every batch of ``data_loader`` and collect the pooled outputs.

    Args:
        model: Module called as ``model(**batch)`` whose output exposes
            ``pooler_output``. It is moved to ``device`` and put in eval mode.
        data_loader: Yields dict batches of tensors; the keys are forwarded to
            the model as keyword arguments.
        device: Device to run inference on. If a CUDA device is requested but
            CUDA is unavailable, inference runs on the CPU instead.

    Returns:
        torch.Tensor: CPU tensor holding the per-batch ``pooler_output`` tensors
        concatenated along the first (batch) dimension, in loader order.
    """
    device = torch.device(device)
    if device.type == 'cuda' and not torch.cuda.is_available():
        # Keep the default usable on CPU-only machines.
        device = torch.device('cpu')

    # The model and its inputs must live on the same device.
    model = model.to(device)
    # Disable dropout etc. so the embeddings are deterministic.
    model.eval()

    embeddings = []
    with torch.no_grad():
        for batch in tqdm(data_loader, total=len(data_loader)):
            batch = {key: value.to(device) for key, value in batch.items()}
            outputs = model(**batch)
            # Move each result to the CPU right away so GPU memory is not held
            # for the whole dataset.
            embeddings.append(outputs.pooler_output.cpu())

    # Concatenate rather than stack: the final batch is usually smaller than the
    # others, which torch.stack rejects, and stacking would add a batch-index axis.
    return torch.cat(embeddings, dim=0)

In [70]:
def get_data_loader(tokenizer, batch_size=128):
    """Build an unshuffled DataLoader over the module-level ``all_data_df``.

    Args:
        tokenizer: Tokenizer handed to ``ArxivDataset``.
        batch_size: Number of items per batch.

    Returns:
        DataLoader: Iterates the dataset in its original order.
    """
    return DataLoader(
        ArxivDataset(data=all_data_df, tokenizer=tokenizer),
        batch_size=batch_size,
        shuffle=False,
    )

In [71]:
# Set up the BERT model and tokenizer.
# `model_name` is reused further down when the SentenceTransformer is created.
model_name = "bert-base-uncased"  # You can replace this with any other BERT model
tokenizer = AutoTokenizer.from_pretrained(model_name)
# AutoModel loads the bare encoder; the warning in the cell output about unused
# `cls.*` weights comes from this call.
model = AutoModel.from_pretrained(model_name)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [72]:
# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else 'cpu')

In [73]:
# DataLoader over all loaded abstracts, 128 per batch, in original order.
dataloader = get_data_loader(tokenizer=tokenizer, batch_size=128)

In [None]:
# Run BERT over every batch of the loader; this covers the whole dataset, so it is slow.
bert_embeddings = extract_embeddings(model=model, data_loader=dataloader, device=device)
# Display the shape of the collected embeddings.
bert_embeddings.shape

  0%|          | 0/17644 [00:00<?, ?it/s]

### SBERT

In [None]:
from sentence_transformers import SentenceTransformer

# Set up the Sentence-BERT model
sbert_model = SentenceTransformer(model_name)

# Extract embeddings using SBERT.
# SentenceTransformer tokenizes and batches raw strings itself through encode();
# extract_embeddings() takes (model, data_loader, device) and cannot be fed raw text.
abstracts = all_data_df['abstract'].tolist()
embeddings = sbert_model.encode(
    abstracts,
    batch_size=128,
    show_progress_bar=True,
    convert_to_numpy=True,
)

In [None]:
from cuml.cluster import DBSCAN, HDBSCAN, AgglomerativeClustering


def perform_gpu_dbscan(embeddings, eps=0.5, min_samples=5):
    """Fit a DBSCAN estimator on ``embeddings`` and return its ``labels_``.

    Args:
        embeddings: Data passed to ``DBSCAN.fit``.
        eps: Forwarded to the ``DBSCAN`` constructor.
        min_samples: Forwarded to the ``DBSCAN`` constructor.

    Returns:
        The ``labels_`` attribute of the fitted estimator.
    """
    clusterer = DBSCAN(min_samples=min_samples, eps=eps)
    clusterer.fit(embeddings)
    return clusterer.labels_


def perform_gpu_hdbscan(embeddings, min_cluster_size=5, min_samples=5):
    """Fit an HDBSCAN estimator on ``embeddings`` and return its ``labels_``.

    Args:
        embeddings: Data passed to ``HDBSCAN.fit``.
        min_cluster_size: Forwarded to the ``HDBSCAN`` constructor.
        min_samples: Forwarded to the ``HDBSCAN`` constructor.

    Returns:
        The ``labels_`` attribute of the fitted estimator.
    """
    clusterer = HDBSCAN(min_samples=min_samples, min_cluster_size=min_cluster_size)
    clusterer.fit(embeddings)
    return clusterer.labels_


def perform_gpu_agglomerative_clustering(embeddings, n_clusters):
    """Cluster ``embeddings`` into ``n_clusters`` groups with AgglomerativeClustering.

    Args:
        embeddings: Data passed to ``fit_predict``.
        n_clusters: Number of clusters requested from the estimator.

    Returns:
        The labels returned by ``fit_predict``.
    """
    return AgglomerativeClustering(n_clusters=n_clusters).fit_predict(embeddings)


def find_optimal_clusters(embeddings, clustering_algorithm, max_clusters=10):
    """Pick the cluster count in ``2..max_clusters`` with the best silhouette score.

    Args:
        embeddings: Data to cluster; must support ``len()``.
        clustering_algorithm: Callable invoked as
            ``clustering_algorithm(embeddings, n_clusters)`` that returns one
            label per sample. The second argument must really be a cluster count.
        max_clusters: Largest cluster count to try (default 10).

    Returns:
        int: The candidate cluster count with the highest silhouette score.

    Raises:
        ValueError: If there are too few samples to evaluate any candidate, or
            no candidate produced a labelling that can be scored.
    """
    n_samples = len(embeddings)
    # silhouette_score is only defined for 2 <= n_labels <= n_samples - 1.
    upper = min(max_clusters, n_samples - 1)
    if upper < 2:
        raise ValueError(
            "Need at least 3 samples and max_clusters >= 2 to compare cluster counts"
        )

    scores = []
    for n_clusters in range(2, upper + 1):
        labels = clustering_algorithm(embeddings, n_clusters)
        # assumes labels convert to a host NumPy array — TODO confirm for GPU outputs
        n_labels = np.unique(np.asarray(labels)).size
        if 2 <= n_labels <= n_samples - 1:
            scores.append(silhouette_score(embeddings, labels))
        else:
            # Degenerate labelling (one cluster, or every point alone) cannot be
            # scored; rank it below every valid candidate instead of crashing.
            scores.append(-np.inf)

    best = int(np.argmax(scores))
    if not np.isfinite(scores[best]):
        raise ValueError("No candidate cluster count produced a scorable labelling")

    # Candidates start at 2, so shift the index back to a cluster count.
    return best + 2

In [None]:
def _count_clusters(labels):
    """Return the number of distinct cluster ids in ``labels``, ignoring the noise label -1."""
    # assumes labels convert to a host NumPy array — TODO confirm for cuML outputs
    unique_labels = np.unique(np.asarray(labels))
    return int((unique_labels != -1).sum())


# Perform GPU-accelerated DBSCAN
gpu_dbscan_labels = perform_gpu_dbscan(embeddings, eps=0.5, min_samples=5)

# Perform GPU-accelerated HDBSCAN
gpu_hdbscan_labels = perform_gpu_hdbscan(embeddings, min_cluster_size=5, min_samples=5)

# Perform GPU-accelerated Agglomerative Clustering
gpu_agglomerative_labels = perform_gpu_agglomerative_clustering(embeddings, n_clusters=5)

# DBSCAN and HDBSCAN have no n_clusters argument: find_optimal_clusters would pass
# its candidate count as their second positional parameter (eps / min_cluster_size).
# They determine the number of clusters themselves, so report what they found.
optimal_dbscan_clusters = _count_clusters(gpu_dbscan_labels)
optimal_hdbscan_clusters = _count_clusters(gpu_hdbscan_labels)

# Only the agglomerative wrapper takes a cluster count, so only it is searched.
optimal_agglomerative_clusters = find_optimal_clusters(embeddings, perform_gpu_agglomerative_clustering)

# Print the results
print("GPU-accelerated DBSCAN Clustering:")
print("Number of clusters found:", optimal_dbscan_clusters)
print("Cluster labels:", gpu_dbscan_labels)

print("\nGPU-accelerated HDBSCAN Clustering:")
print("Number of clusters found:", optimal_hdbscan_clusters)
print("Cluster labels:", gpu_hdbscan_labels)

print("\nGPU-accelerated Agglomerative Clustering:")
print("Optimal number of clusters:", optimal_agglomerative_clusters)
print("Cluster labels:", gpu_agglomerative_labels)