In [1]:
import warnings
import numpy as np
import pandas as pd

from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

import umap
import hdbscan
warnings.filterwarnings("ignore")

In [2]:
# Use the first CUDA GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

In [3]:
device

'cuda:0'

In [4]:
# Load the raw export. The path is relative, so the CSV is resolved against
# the kernel's current working directory.
dataset = pd.read_csv("Bend Export.csv")

In [5]:
# Only the free-text 'Bio' column is used from here on.
dataset = dataset['Bio']

In [6]:
# Drop rows with a missing bio. Rebinding (instead of inplace=True) avoids
# mutating a Series that was sliced out of the loaded frame and keeps the
# cell's effect explicit.
dataset = dataset.dropna()

In [7]:
def mean_pooling(model_output, attention_mask):
    """Average token embeddings along dim 1, counting only unmasked tokens.

    model_output   -- indexable; element 0 holds the token embeddings.
    attention_mask -- tensor with one dimension fewer than the token
                      embeddings; it is broadcast over the last dimension.

    Returns the mask-weighted sum divided by the mask total. The divisor is
    clamped to at least 1e-9 so an all-zero mask does not divide by zero.
    """
    hidden_states = model_output[0]
    # Give the mask a trailing axis and stretch it to the embeddings' shape.
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    weighted_sum = (hidden_states * mask).sum(dim=1)
    token_counts = mask.sum(dim=1).clamp(min=1e-9)
    return weighted_sum / token_counts

In [8]:
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2').to(device)

In [9]:
def emb(text,model,tokenizer):
    """Embed ``text`` with ``model`` and return L2-normalised vectors.

    Parameters
    ----------
    text
        Input handed to ``tokenizer`` with padding and truncation enabled.
    model
        Module called with the tokenizer output as keyword arguments; the
        first element of its output is treated as the token embeddings.
    tokenizer
        Callable returning a mapping that contains ``'attention_mask'`` and
        supports ``.to(device)``.

    Returns
    -------
    numpy.ndarray
        The mean-pooled, L2-normalised embeddings (normalised along dim 1),
        moved to the CPU.
    """
    # Follow the model's own device instead of the notebook-level `device`
    # global, so the function keeps working if the model is moved or reused.
    target_device = next(model.parameters()).device
    encoded_input = tokenizer(text, padding=True, truncation=True, return_tensors='pt').to(target_device)
    # Pooling and normalisation are inference-only too, so keep them under
    # no_grad together with the forward pass.
    with torch.no_grad():
        model_output = model(**encoded_input)
        sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
        sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
    return sentence_embeddings.cpu().numpy()

In [10]:
# Start from an empty list; it will hold one embedding vector per bio.
embeddings_dataset = list()

In [11]:
# Rebuild the list in a single expression so that re-running this cell
# replaces the embeddings instead of appending a second copy of every vector.
# Each call embeds one bio; [0] takes the first (only) row of the result.
embeddings_dataset = [emb(text, model, tokenizer)[0] for text in dataset]

In [12]:
# Peek at the first two vectors only; echoing the whole list writes every
# embedding into the notebook output.
embeddings_dataset[:2]

[array([ 2.37282980e-02,  3.59510221e-02, -7.62424693e-02, -5.27482061e-03,
         2.77207773e-02,  2.83812154e-02,  1.94151681e-02,  3.80236818e-03,
        -5.69263212e-02,  3.02306134e-02,  2.20708642e-02,  2.25117654e-02,
        -1.93897840e-02,  2.41086110e-02, -6.47703037e-02, -2.39373092e-02,
         7.99251646e-02,  7.93227926e-02,  3.48916650e-02,  2.28372645e-02,
        -4.77949483e-03,  1.12914026e-01,  9.97855663e-02,  5.09156883e-02,
        -3.58342729e-03,  9.26262215e-02, -9.13908407e-02, -7.47004198e-03,
         1.41477853e-03, -1.13665707e-01,  2.57755201e-02,  9.15314853e-02,
         1.31805092e-02,  7.93367159e-03,  5.82292816e-03, -8.62157047e-02,
         3.60084203e-04,  2.17202641e-02,  5.06952442e-02,  5.76224066e-02,
         2.64295489e-02, -1.74688213e-02, -7.29416087e-02,  7.31820837e-02,
        -4.40676697e-02,  3.20586711e-02, -6.34892192e-03, -1.82877705e-02,
         1.11862056e-01, -5.46187023e-03,  3.97115089e-02, -6.40187263e-02,
         5.0

In [13]:
# Stack the per-bio vectors into a frame: one row per embedded bio, one
# column per embedding component.
df_embeddings = pd.DataFrame(embeddings_dataset)

In [14]:
df_embeddings.shape

(5115, 384)

In [15]:
# UMAP settings passed as n_neighbors and n_components respectively.
umap_neighbors, umap_n_components = 128, 128

In [16]:
# Project the sentence embeddings with UMAP, with the densMAP option enabled.
umap_embeddings = umap.UMAP(
    # neighbourhood graph
    n_neighbors=umap_neighbors,
    metric='l2',
    local_connectivity=2,
    low_memory=False,
    # layout of the reduced space
    n_components=umap_n_components,
    min_dist=0,
    spread=3,
    # optimisation schedule
    n_epochs=5000,
    learning_rate=0.01,
    negative_sample_rate=8,
    # densMAP settings
    densmap=True,
    dens_lambda=2.5,
    # fixed seed and progress logging
    random_state=42,
    verbose=True,
).fit_transform(df_embeddings)

UMAP(dens_lambda=2.5, densmap=True, learning_rate=0.1, local_connectivity=2, low_memory=False, metric='l2', min_dist=0, n_components=128, n_epochs=5000, n_neighbors=128, negative_sample_rate=8, random_state=42, spread=3, verbose=True)
Mon Jan 30 15:07:31 2023 Construct fuzzy simplicial set
Mon Jan 30 15:07:31 2023 Finding Nearest Neighbors
Mon Jan 30 15:07:31 2023 Building RP forest with 9 trees
Mon Jan 30 15:07:31 2023 NN descent for 12 iterations
	 1  /  12
	 2  /  12
	 3  /  12
	 4  /  12
	 5  /  12
	Stopping threshold met -- exiting after 5 iterations
Mon Jan 30 15:07:43 2023 Finished Nearest Neighbor Search
Mon Jan 30 15:07:46 2023 Construct embedding
Mon Jan 30 15:07:48 2023 Computing original densities


In [None]:
umap_embeddings.shape

(5115, 128)

In [None]:
# HDBSCAN settings passed as min_cluster_size and min_samples; both are 50.
hdbscan_minimal_cluster_size = hdbscan_min_samples = 50

In [None]:
# Fit HDBSCAN on the UMAP output and keep the fitted estimator in `cluster`.
# core_dist_n_jobs and leaf_size are deliberately not passed (library defaults).
cluster = hdbscan.HDBSCAN(
    metric='l2',
    algorithm='best',
    min_cluster_size=hdbscan_minimal_cluster_size,
    min_samples=hdbscan_min_samples,
    cluster_selection_method='leaf',
    cluster_selection_epsilon=0.01,
).fit(umap_embeddings)

In [None]:
# Count how many points fall under each cluster label.
# HDBSCAN uses the label -1 for points it treats as noise (no cluster).
pd.DataFrame(cluster.labels_).value_counts()

-1    4372
 3     248
 1     152
 4     112
 5      94
 2      76
 0      61
dtype: int64

In [None]:
# Promote the bio Series to a one-column frame so more columns can be added.
df_dataset = dataset.to_frame()

In [None]:
# Attach the cluster label of each bio. The assignment is positional, so it
# relies on the labels being in the same row order as `dataset`.
df_dataset['labels'] = cluster.labels_

In [None]:
df_dataset

Unnamed: 0,Bio,labels
379,ADVENTURE AMPLIFIED,2
385,Adventure.,2
390,Adventures in life.,2
392,Adventures with Kristin and Mike.,2
401,"Aiming to fix volatility, one area of life at ...",2
...,...,...
4351,Stop existing ... Start living,2
4366,Striving to capture the best. With a personal ...,2
4615,Trying hard to give people a happier and highe...,2
4618,trying to change lives,2
