In [2]:
import spacy
import numpy as np
from sklearn.cluster import  OPTICS, DBSCAN,  AgglomerativeClustering
from sklearn.cluster._hdbscan.hdbscan import HDBSCAN

from pathlib import Path
from pysota.process import Persistence

# Directory that the loading cell passes to `Persistence.load_files`.
results_dir = Path('../results/clean')

# spaCy pipeline 'en_core_web_lg'; later cells call `nlp(...)` / read `.vector`
# from it to turn each abstract into an embedding.
nlp = spacy.load('en_core_web_lg')

In [3]:
# Load the document collection from `results_dir` and show how many entries
# came back, as a quick sanity check before clustering.
db = Persistence.load_files(results_dir)
n_loaded = len(db)
print(n_loaded)

98


In [4]:
# Embed every abstract as one dense vector (`Doc.vector`).
# `nlp.pipe` streams the texts through the pipeline in batches instead of
# invoking the full pipeline once per abstract; the resulting docs (and hence
# their vectors) come back in the same order as `documents`.
documents = [i.abstract for i in db]
doc_vectors = [doc.vector for doc in nlp.pipe(documents)]
X = np.array(doc_vectors)  # one row per document, in `db` order

# Clustering settings.
eps = .5              # only consumed by the DBSCAN alternative below
metric = 'euclidean'  # Ward linkage accepts only 'euclidean'; also names the output folder when saving
N_CLUSTERS = 10       # number of clusters requested from AgglomerativeClustering

# Alternative estimators tried during exploration. To compare, comment out the
# active `cluster = ...` line and enable exactly one of these; they all bind
# the same name so the `fit` call and the grouping loop keep working.
# cluster = DBSCAN(eps=eps, min_samples=2)
# cluster = OPTICS(min_samples=2)
# cluster = HDBSCAN(metric='cosine',  max_cluster_size=20)
cluster = AgglomerativeClustering(n_clusters=N_CLUSTERS, metric=metric, linkage='ward')
cluster.fit(X)

# Group documents by their cluster labels: label -> list of `db` entries.
# `labels_[idx]` is the label assigned to row `idx` of X, i.e. to `db[idx]`.
clusters = {}
for idx, label in enumerate(cluster.labels_):
    clusters.setdefault(label, []).append(db[idx])

In [5]:
# Print the number of documents in each cluster.
# The label -1 is the "noise" bucket (reported separately in the loop below),
# not a real cluster, so it is excluded from the headline count.
n_found = sum(1 for label in clusters if label != -1)
print(f' Number of clusters: {n_found} \n\n')
for label, docs in clusters.items():
    if label == -1:
        print(f"Noise: {len(docs)} documents")
    else:
        print(f"Cluster {label}: {len(docs)} documents")

 Number of clusters: 10 


Cluster 7: 8 documents
Cluster 6: 13 documents
Cluster 2: 32 documents
Cluster 0: 8 documents
Cluster 5: 5 documents
Cluster 4: 5 documents
Cluster 1: 8 documents
Cluster 3: 13 documents
Cluster 9: 1 documents
Cluster 8: 5 documents


In [6]:
# List every cluster with its size and the titles of the documents in it.

# The label -1 is the "noise" bucket, not a real cluster, so it is excluded
# from the headline count.
n_found = sum(1 for label in clusters if label != -1)

# Report eps only when the fitted estimator actually carries one (e.g. DBSCAN).
# Printing the notebook-level `eps` unconditionally was misleading when the
# active estimator (AgglomerativeClustering) never used it.
fitted_eps = getattr(cluster, 'eps', None)
eps_note = f' with eps = {fitted_eps}' if fitted_eps is not None else ''
print(f' Number of clusters: {n_found}{eps_note}\n\n')

for label, docs in clusters.items():
    if label == -1:
        print(f"Noise: {len(docs)} documents")
    else:
        print(f"Cluster {label}: {len(docs)} documents")
    for doc in docs:
        # Titles may contain hard line breaks; flatten them so each title
        # stays on a single bullet line.
        title = doc.title.replace('\n', ' ')
        print(f"  - {title}")
    print('\n\n ========================================================= \n\n')

 Number of clusters: 10 with eps = 0.5


Cluster 7: 8 documents
  - Transformation Properties of Learned Visual Representations
  - Representations for Stable Off-Policy Reinforcement Learning
  - On the Complexity of Representation Learning in Contextual Linear Bandits
  - On the Generalization of Representations in Reinforcement Learning
  - The Utility of Sparse Representations for Control in Reinforcement Learning
  - Learning Sparse Representations Incrementally in Deep Reinforcement Learning
  - Revisiting Factorizing Aggregated Posterior in Learning Disentangled Representations
  - Representer Theorems for Metric and Preference Learning: A Geometric Perspective




Cluster 6: 13 documents
  - Speech representation learning: Learning bidirectional encoders with single-view, multi-view, and multi-task methods
  - Representation Learning for Natural Language Processing
  - Representation Learning: A Statistical Perspective
  - A survey on self-supervised methods for visual represen

In [7]:
# Hand the label -> documents mapping to the project's persistence layer.
# The target folder is named after the distance metric used for clustering.
output_dir = Path(f'../results/clustered/{metric}')
Persistence.save_clusters(clusters, output_dir)