In [None]:
pip install python-dev-tools --user --upgrade

In [None]:
conda install -c conda-forge hdbscan

In [None]:
pip install setuptools --upgrade --user

In [11]:
pip install bertopic

Collecting bertopic
  Using cached bertopic-0.15.0-py2.py3-none-any.whl (143 kB)
Collecting hdbscan>=0.8.29
  Using cached hdbscan-0.8.33.tar.gz (5.2 MB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting plotly>=4.7.0
  Using cached plotly-5.18.0-py3-none-any.whl (15.6 MB)
Collecting umap-learn>=0.5.0
  Using cached umap_learn-0.5.4-py3-none-any.whl
Collecting tenacity>=6.2.0
  Using cached tenacity-8.2.3-py3-none-any.whl (24 kB)
Collecting pynndescent>=0.5
  Using cached pynndescent-0.5.10-py3-none-any.whl
Building wheels for collected packages: hdbscan
  Building wheel for hdbscan (pyproject.toml): started
  Building wheel for hdbscan (pyproject.toml): finished with status 'done'
  Created wheel fo



In [1]:
import pandas as pd
from bertopic import BERTopic

# Read the source table once. Its "Definitions" column is the text corpus
# that the topic-modelling cells below operate on.
words = pd.read_csv("words.csv")
docs = words.loc[:, "Definitions"]

In [16]:
# Sentence-transformer checkpoints to compare. Only one is listed right now, so
# the earlier remark that "the last 2" are semantic-search models no longer
# matches this list (presumably more checkpoints were tried before).
embedding_models = [
    "multi-qa-distilbert-dot-v1",
]

# Human-readable labels for the dimensionality-reduction candidates.
mapping_methods_name = [
    "PCA",
    "UMAP_cosine",
    "UMAP_euclidean",
    "Truncated_SVD",
]

# Human-readable labels for the clustering candidates.
clustering_methods_name = [
    "HDBSCAN",
    "K_means_50",
    "Agglomerative_50",
]
# Topic representation not tested yet

In [4]:
#the actual methods are here
from umap import UMAP
from sklearn.decomposition import PCA, TruncatedSVD
from hdbscan import HDBSCAN
from sklearn.cluster import KMeans, AgglomerativeClustering

# One seed shared by every stochastic estimator below, so re-running the grid
# gives repeatable projections and clusters.
RANDOM_SEED = 42

# Dimensionality-reduction candidates, each projecting to 5 components.
# The order must match `mapping_methods_name`.
mapping_methods = [
    PCA(n_components=5, random_state=RANDOM_SEED),
    UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=RANDOM_SEED),
    UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='euclidean', random_state=RANDOM_SEED),
    TruncatedSVD(n_components=5, random_state=RANDOM_SEED),
]

# Clustering candidates. The order must match `clustering_methods_name`.
# HDBSCAN and AgglomerativeClustering take no seed; only KMeans is seeded here.
clustering_methods = [
    HDBSCAN(min_cluster_size=15, metric='euclidean', cluster_selection_method='eom', prediction_data=True),
    KMeans(n_clusters=50, random_state=RANDOM_SEED),
    AgglomerativeClustering(n_clusters=50),
]

In [19]:
import itertools

# The name lists label the estimator lists position by position. Check that
# they still line up, because zip() would otherwise silently drop the extras.
if len(mapping_methods_name) != len(mapping_methods):
    raise ValueError("mapping_methods_name and mapping_methods must have the same length")
if len(clustering_methods_name) != len(clustering_methods):
    raise ValueError("clustering_methods_name and clustering_methods must have the same length")

# Try every (embedding model, reduction method, clustering method) combination
# and save the per-document topic assignment of each trial to its own CSV.
# Pairing each name with its estimator removes the hard-coded list sizes.
for embedding_model, (mapping_name, mapping_model), (clustering_name, clustering_model) in itertools.product(
    embedding_models,
    zip(mapping_methods_name, mapping_methods),
    zip(clustering_methods_name, clustering_methods),
):
    print(f"Trial - Emdedding model: {embedding_model} - Mapping Method: {mapping_name} - Clustering Method: {clustering_name}")
    # BERTopic takes any reducer / clusterer through its umap_model / hdbscan_model slots.
    topic_model = BERTopic(hdbscan_model=clustering_model,
                           embedding_model=embedding_model,
                           umap_model=mapping_model)
    topics, probs = topic_model.fit_transform(docs)
    topic_model.get_document_info(docs).to_csv(f"Topic_assignment_{embedding_model}_{mapping_name}_{clustering_name}.csv")
    

Trial - Emdedding model: multi-qa-distilbert-dot-v1 - Mapping Method: PCA - Clustering Method: HDBSCAN


Downloading (…)nce_bert_config.json: 100%|██████████| 53.0/53.0 [00:00<00:00, 17.6kB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 112/112 [00:00<00:00, 37.5kB/s]
Downloading (…)d2684/tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 682kB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 333/333 [00:00<00:00, 98.3kB/s]
Downloading (…)2684/train_script.py: 100%|██████████| 13.8k/13.8k [00:00<00:00, 4.61MB/s]
Downloading (…)33b02d2684/vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 492kB/s]
Downloading (…)02d2684/modules.json: 100%|██████████| 229/229 [00:00<00:00, 115kB/s]


Trial - Emdedding model: multi-qa-distilbert-dot-v1 - Mapping Method: PCA - Clustering Method: K_means_50
Trial - Emdedding model: multi-qa-distilbert-dot-v1 - Mapping Method: PCA - Clustering Method: Agglomerative_50
Trial - Emdedding model: multi-qa-distilbert-dot-v1 - Mapping Method: UMAP_cosine - Clustering Method: HDBSCAN
Trial - Emdedding model: multi-qa-distilbert-dot-v1 - Mapping Method: UMAP_cosine - Clustering Method: K_means_50
Trial - Emdedding model: multi-qa-distilbert-dot-v1 - Mapping Method: UMAP_cosine - Clustering Method: Agglomerative_50
Trial - Emdedding model: multi-qa-distilbert-dot-v1 - Mapping Method: UMAP_euclidean - Clustering Method: HDBSCAN
Trial - Emdedding model: multi-qa-distilbert-dot-v1 - Mapping Method: UMAP_euclidean - Clustering Method: K_means_50
Trial - Emdedding model: multi-qa-distilbert-dot-v1 - Mapping Method: UMAP_euclidean - Clustering Method: Agglomerative_50
Trial - Emdedding model: multi-qa-distilbert-dot-v1 - Mapping Method: Truncated_SVD

In [2]:
from umap import UMAP
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic

# Final configuration: embed the definitions once with a sentence-transformer,
# then fit BERTopic on those precomputed vectors.
sentence_model = SentenceTransformer("multi-qa-MiniLM-L6-dot-v1")
# encode() is documented for a string or a list of strings. Pass a plain list
# rather than relying on the pandas Series being indexable by position.
embeddings = sentence_model.encode(docs.tolist(), show_progress_bar=False)
# random_state fixes UMAP's stochastic layout so the fitted topics are repeatable.
topic_model = BERTopic(umap_model=UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='euclidean', random_state=42))
topics, probs = topic_model.fit_transform(docs, embeddings)


In [16]:
# Build BERTopic's topic overview figure and show it as the cell output.
topics_figure = topic_model.visualize_topics()
topics_figure

In [3]:

# Build BERTopic's topic heatmap figure and show it as the cell output.
heatmap_figure = topic_model.visualize_heatmap()
heatmap_figure


In [10]:
# Separate 2-D UMAP fitted on the document embeddings, used only for plotting
# and for the coordinate exports in later cells. random_state fixes the layout
# so the plot and the exported coordinates are repeatable across re-runs.
# NOTE: the variable is named `umap` because later cells call `umap.transform`.
umap = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='euclidean', random_state=42)
reduced_embeddings = umap.fit_transform(embeddings)
topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings)

In [48]:
# Project the topic embeddings with the UMAP fitted on the documents and
# display the resulting coordinates.
topic_coordinates = umap.transform(topic_model.topic_embeddings_)
topic_coordinates

array([[ 9.852944 ,  7.264674 ],
       [ 9.960839 ,  7.320249 ],
       [13.054335 , 11.29307  ],
       [ 9.914556 ,  7.401176 ],
       [ 7.389461 , 10.349151 ],
       [10.127419 ,  6.8824153],
       [ 9.195981 ,  9.474053 ],
       [10.506046 , 14.888231 ],
       [ 5.968884 , 10.694704 ],
       [10.318688 ,  6.3307743],
       [ 8.725799 , 12.295392 ],
       [ 9.475035 , 14.645674 ],
       [12.235234 ,  7.679608 ],
       [10.326965 , 10.663405 ],
       [12.154097 ,  6.957266 ],
       [ 6.91311  ,  7.882527 ],
       [ 7.4987354,  4.7076573],
       [ 9.033767 , 13.451228 ],
       [ 8.953979 ,  7.56103  ],
       [ 8.251744 , 10.344549 ],
       [11.071599 ,  6.0218997],
       [ 9.258547 ,  7.156737 ],
       [11.186172 , 11.549475 ],
       [-2.0848603, 10.667484 ],
       [ 9.135873 , 10.39878  ],
       [ 8.5017395,  4.888984 ],
       [ 9.762491 ,  9.147108 ],
       [ 9.09854  , 11.696578 ],
       [12.606381 ,  9.695205 ],
       [ 9.190841 ,  8.64992  ],
       [12

In [18]:
# Echo the UMAP coordinates computed for the documents (inspection only).
reduced_embeddings

array([[12.181383 ,  4.690489 ],
       [ 8.493652 ,  9.843183 ],
       [ 8.273893 , 10.3263645],
       ...,
       [ 6.1776357,  6.6027904],
       [ 7.9651513,  9.32143  ],
       [ 8.885109 ,  6.3367143]], dtype=float32)

In [23]:
# Attach the document coordinates to the original table (joined on the row
# index) and write the combined table to disk.
document_coords = pd.DataFrame(reduced_embeddings)
words_with_coords = words.join(document_coords)
words_with_coords.to_csv("words_and_embedding.csv")

In [46]:
# Project the topic embeddings with the fitted UMAP and preview every row
# except the last one.
# NOTE(review): if the goal is to hide the outlier topic (-1), confirm which
# row holds it. It is presumably the first row, not the last.
topic_coords_preview = pd.DataFrame(umap.transform(topic_model.topic_embeddings_))
topic_coords_preview.iloc[:-1]

Unnamed: 0,0,1
0,9.693974,7.237710
1,10.038517,7.469729
2,13.087567,11.436805
3,10.058129,7.512450
4,7.367224,10.370178
...,...,...
96,6.746112,8.932661
97,6.239762,4.101670
98,8.949113,4.873857
99,10.972302,6.812556


In [59]:
# Write one row per topic: its label plus its 2-D UMAP coordinates.
#
# The rows of topic_embeddings_ follow the sorted topic ids, with the outlier
# topic -1 first when it exists. The previous version dropped the first label
# but the last coordinate row, so the index join paired each label with the
# coordinates of the preceding topic. Labelling the coordinate rows with their
# topic ids makes the join align by topic instead of by position.
# If the number of ids and embedding rows ever differ, the DataFrame
# constructor raises rather than silently misaligning.
topic_ids = sorted(topic_model.topic_labels_)
topic_coords = pd.DataFrame(umap.transform(topic_model.topic_embeddings_), index=topic_ids)
topic_words = pd.DataFrame.from_dict(topic_model.topic_labels_, orient='index', columns=["words"])
# Remove the outlier topic by id. errors="ignore" covers models with no outliers.
topics_and_embedding = topic_words.join(topic_coords).drop(index=-1, errors="ignore")
topics_and_embedding.to_csv("topics_and_embedding.csv")

In [None]:
# Save the per-document topic assignment of the final model.
# The file name has no placeholders, so a plain string replaces the f-string.
topic_model.get_document_info(docs).to_csv("Topic_assignment_final.csv")