## Installing requirements



In [None]:
pip install python-dev-tools --user --upgrade

In [None]:
conda install -c conda-forge hdbscan

In [None]:
pip install setuptools --upgrade --user

In [None]:
pip install bertopic

## Import the word definitions and required libraries

In [2]:
import pandas as pd
from bertopic import BERTopic

# Load the word table; its "Definitions" column provides the documents to model.
words = pd.read_csv("./resources/words.csv")
docs = words.loc[:, "Definitions"]

### Running experiments on combinations of algorithms

In [16]:
# Embedding model names to try (each is passed to BERTopic as `embedding_model`).
# NOTE(review): an earlier comment here said "the last 2" models are specialised
# for semantic search, but only one model is listed — presumably the others were
# removed; confirm.
embedding_models = [
    "multi-qa-distilbert-dot-v1",
]

# Display names for the dimensionality-reduction candidates.
mapping_methods_name = [
    "PCA",
    "UMAP_cosine",
    "UMAP_euclidean",
    "Truncated_SVD",
]

# Display names for the clustering candidates.
clustering_methods_name = [
    "HDBSCAN",
    "K_means_50",
    "Agglomerative_50",
]
# Topic-representation variants have not been tested yet.

In [4]:
#the actual methods are here
from umap import UMAP
from sklearn.decomposition import PCA, TruncatedSVD
from hdbscan import HDBSCAN
from sklearn.cluster import KMeans, AgglomerativeClustering

# Fixed seed for every estimator that accepts one, so repeated runs of the
# experiment grid produce the same topic assignments.
RANDOM_STATE = 42

# Dimensionality-reduction candidates, listed in the same order as
# `mapping_methods_name`.
mapping_methods = [
    PCA(n_components=5, random_state=RANDOM_STATE),
    UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=RANDOM_STATE),
    UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='euclidean', random_state=RANDOM_STATE),
    TruncatedSVD(n_components=5, random_state=RANDOM_STATE),
]

# Clustering candidates, listed in the same order as `clustering_methods_name`.
# HDBSCAN and AgglomerativeClustering expose no `random_state` parameter.
clustering_methods = [
    HDBSCAN(min_cluster_size=15, metric='euclidean', cluster_selection_method='eom', prediction_data=True),
    KMeans(n_clusters=50, random_state=RANDOM_STATE),
    AgglomerativeClustering(n_clusters=50),
]

In [None]:
import itertools

# Pair every display name with its estimator so the grid below needs no index lookups.
named_mappers = zip(mapping_methods_name, mapping_methods)
named_clusterers = zip(clustering_methods_name, clustering_methods)

# Fit one BERTopic model per (embedding, reducer, clusterer) combination and
# write the per-document topic assignment of each trial to its own CSV file.
for embedding_model, (mapper_name, mapper), (clusterer_name, clusterer) in itertools.product(
    embedding_models, named_mappers, named_clusterers
):
    print(f"Trial - Emdedding model: {embedding_model} - Mapping Method: {mapper_name} - Clustering Method: {clusterer_name}")
    topic_model = BERTopic(
        hdbscan_model=clusterer,
        embedding_model=embedding_model,
        umap_model=mapper,
    )
    topics, probs = topic_model.fit_transform(docs)
    assignment_table = topic_model.get_document_info(docs)
    assignment_table.to_csv(f"Topic_assignment_{embedding_model}_{mapper_name}_{clusterer_name}.csv")
    

With the combination chosen, we fit the final model.

In [2]:
from umap import UMAP
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic

# Encode every definition once; the embeddings are handed to BERTopic so it
# does not have to recompute them during fitting.
sentence_model = SentenceTransformer("multi-qa-MiniLM-L6-dot-v1")
embeddings = sentence_model.encode(docs, show_progress_bar=False)

# `embedding_model` is passed as well (as the later refit cell in this notebook
# does) so the model embeds with the same encoder that produced `embeddings`.
# UMAP is seeded so the discovered topics are reproducible across runs.
topic_model = BERTopic(
    umap_model=UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='euclidean', random_state=42),
    embedding_model=sentence_model,
)
topics, probs = topic_model.fit_transform(docs, embeddings)


Visualize the topics selected by the model

In [3]:
# Display the fitted model's topic overview figure as the cell output.
topic_model.visualize_topics()

In [4]:

# Display the fitted model's topic heatmap figure as the cell output.
topic_model.visualize_heatmap()


In [8]:
import numpy as np
from pathlib import Path

# 2-D document coordinates for the scatter plot.
# The CSV below is only written by the "Save the results" cell further down, so
# on a fresh checkout it does not exist yet; in that case compute the coordinates
# from the sentence embeddings instead of failing with FileNotFoundError.
# NOTE(review): the fallback uses the 2-D UMAP recipe from BERTopic's
# `visualize_documents` documentation — confirm it matches how the saved
# coordinates were originally produced.
coordinates_csv = Path("words_and_embedding.csv")
if coordinates_csv.exists():
    words_and_embedding = pd.read_csv(coordinates_csv, index_col="Unnamed: 0")
else:
    projection = UMAP(
        n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine', random_state=42
    ).fit_transform(embeddings)
    # Same layout the CSV round-trip yields: the word columns plus "0" and "1".
    words_and_embedding = words.join(pd.DataFrame(projection, columns=["0", "1"]))

reduced_embeddings = np.array(words_and_embedding[["0", "1"]])
topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings)

In [10]:
from scipy.cluster import hierarchy as sch

def linkage_function(x):
    """Apply SciPy single-linkage clustering with optimal leaf ordering to `x`."""
    return sch.linkage(x, 'single', optimal_ordering=True)


# Build the topic hierarchy using the custom linkage defined above.
hierarchical_topics = topic_model.hierarchical_topics(docs, linkage_function=linkage_function)

# Print the hierarchy as a text tree.
tree = topic_model.get_topic_tree(hierarchical_topics)
print(tree)

  0%|          | 0/105 [00:00<?, ?it/s]

100%|██████████| 105/105 [00:00<00:00, 191.60it/s]


.
├─■──obey_servant_subjugate_respect_resist ── Topic: 85
└─of_to_the_be_in
     ├─of_to_the_be_in
     │    ├─■──particle_final_question_imperative_rabbit ── Topic: 26
     │    └─of_to_the_be_in
     │         ├─of_to_the_in_be
     │         │    ├─of_to_the_in_be
     │         │    │    ├─■──kidneys_sedang_floor_spider_cheep ── Topic: 0
     │         │    │    └─of_to_the_in_rice
     │         │    │         ├─of_to_the_in_rice
     │         │    │         │    ├─■──hidden_hide_hiding_covered_hideandgoseek ── Topic: 84
     │         │    │         │    └─of_to_the_in_rice
     │         │    │         │         ├─■──happy_fortunate_lucky_mystical_worries ── Topic: 92
     │         │    │         │         └─of_to_the_in_rice
     │         │    │         │              ├─■──therefore_then_also_because_but ── Topic: 47
     │         │    │         │              └─of_to_the_in_rice
     │         │    │         │                   ├─of_to_the_in_rice
     │         │    │    

In [11]:
# Display the hierarchy figure built from the `hierarchical_topics` computed above.
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

In [None]:
# Project the topic embeddings with a fitted reducer named `umap`.
# NOTE(review): no visible cell defines `umap` (only the `UMAP` class is
# imported), so this fails with NameError on a fresh kernel — presumably it was
# a fitted reducer from a cell that has since been removed; confirm and restore it.
umap.transform(topic_model.topic_embeddings_)

Save the results.

In [23]:
# Attach the 2-D coordinates to the word table (joined on row index) and save them.
coordinates = pd.DataFrame(reduced_embeddings)
words.join(coordinates).to_csv("words_and_embedding.csv")

In [None]:
# Preview the projected topic embeddings as a DataFrame with the last row dropped.
# NOTE(review): `umap` is not defined in any visible cell — see where it was
# originally created before relying on this cell.
pd.DataFrame(umap.transform(topic_model.topic_embeddings_))[:-1]

In [59]:
# Build a one-column ("words") table from `topic_labels_`, drop its first row,
# join it by index with the projected topic embeddings (last row dropped), and
# save the result to topics_and_embedding.csv.
# NOTE(review): `umap` is not defined in any visible cell.
# NOTE(review): the distance cell later in this notebook indexes the topic
# embeddings with `topic id + 1`, which suggests row 0 belongs to the outlier
# topic. If so, dropping the LAST embedding row here (instead of the first)
# pairs each label with the wrong embedding — verify the alignment.
pd.DataFrame.from_dict(topic_model.topic_labels_, orient='index', columns=["words"])[1:].join(pd.DataFrame(umap.transform(topic_model.topic_embeddings_))[:-1]).to_csv("topics_and_embedding.csv")

In [None]:
# Save the per-document topic assignment of the final model.
document_info = topic_model.get_document_info(docs)
document_info.to_csv("Topic_assignment_final.csv")

In [5]:
from umap import UMAP
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic

# Encode the definitions and refit the chosen configuration, this time with the
# sentence encoder attached to the model via `embedding_model`.
sentence_model = SentenceTransformer("multi-qa-MiniLM-L6-dot-v1")
embeddings = sentence_model.encode(docs, show_progress_bar=False)

# UMAP is seeded so the topics (and every number derived from them below) are
# reproducible across runs.
topic_model = BERTopic(
    umap_model=UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='euclidean', random_state=42),
    embedding_model=sentence_model,
)
topics, probs = topic_model.fit_transform(docs, embeddings)

In [6]:
# One embedding per topic, as computed by the fitted model.
topic_embeddings = topic_model.topic_embeddings_

# Reuse the document embeddings the model was fitted on instead of re-encoding
# every document through BERTopic's private `_extract_embeddings` helper: they
# come from the same `sentence_model`, and this avoids a second full encoding
# pass and a dependency on a non-public API.
docs_embeddings = embeddings


In [16]:
# Display the per-topic summary table of the fitted model as the cell output.
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,1229,-1_to_be_not_one,"[to, be, not, one, go, out, house, of, in, up]","[to not be able to stand up because of fall, s..."
1,0,213,0_oh_here_help_interjection,"[oh, here, help, interjection, write, boat, se...","[to look here and there , anything that is use..."
2,1,103,1_noise_sound_crying_hear,"[noise, sound, crying, hear, loud, yell, cry, ...",[noise of person crying; noise of blowing on s...
3,2,55,2_rice_husked_cooked_unhusked,"[rice, husked, cooked, unhusked, grain, grains...","[cooked rice, cooked rice, type of rice]"
4,3,53,3_speak_think_understand_tell,"[speak, think, understand, tell, magical, clev...",[to speak to a friend so that someone else wil...
...,...,...,...,...,...
98,97,11,97_obey_servant_subjugate_respect,"[obey, servant, subjugate, respect, resist, pe...","[to obey, to obey, to obey]"
99,98,11,98_jump_upon_pounce_leap,"[jump, upon, pounce, leap, jumps, copying, rea...","[jump, to jump, to jump]"
100,99,11,99_walk_walking_leaving_pebbly,"[walk, walking, leaving, pebbly, gets, stilts,...","[to walk fast , to walk go , to walk]"
101,100,11,100_nothing_there_empty_gone,"[nothing, there, empty, gone, nobody, we, woul...","[nothing, completely gone, nothing there , not..."


In [19]:
import numpy as np

# Rebind `topics` (as returned by fit_transform) to a NumPy array so it can be
# used for boolean masking and fancy indexing in the cells below.
# NOTE: this rebinds the name in place; later cells rely on `topics` being an array.
topics = np.array(topics)
# Show the array shape (one entry per document) as the cell output.
topics.shape

(3804,)

In [26]:
# Display the per-document topic ids; -1 is treated as "no cluster" in the next cell.
topics

array([29, -1, 13, ..., 33, -1, -1])

In [30]:
import numpy as np

# Mean Euclidean distance from each document embedding to the embedding of the
# topic it was assigned to, ignoring documents in topic -1.
# Inputs from earlier cells:
#   topic_embeddings - one row per topic
#   docs_embeddings  - one row per document
#   topics           - per-document topic ids as a NumPy array

# Keep only the documents that were assigned to a real topic.
valid_points_mask = (topics != -1)
valid_points = docs_embeddings[valid_points_mask]
valid_assignments = topics[valid_points_mask]

# Distance of every kept document to its topic embedding. The +1 skips row 0 of
# `topic_embeddings`, assumed to hold the -1 topic — TODO confirm.
assigned_centers = topic_embeddings[valid_assignments + 1]
distances = np.linalg.norm(valid_points - assigned_centers, axis=1)

# Per-topic sum of distances and member count, indexed by topic id. Both arrays
# have one slot per row of `topic_embeddings`, so slots that no topic id reaches
# stay at 0.
n_slots = len(topic_embeddings)
distance_sums = np.bincount(valid_assignments, weights=distances, minlength=n_slots)
count_per_cluster = np.bincount(valid_assignments, minlength=n_slots).astype(float)

# Guard against division by zero for slots without members.
count_per_cluster[count_per_cluster == 0] = 1
average_distances = distance_sums / count_per_cluster

print("Average distances for each cluster:", average_distances)

Average distances for each cluster: [4.19412008 3.56826156 3.41267929 3.62278051 3.74118834 3.50947665
 3.3601815  3.57336186 3.42845339 3.37882641 3.57819325 3.87732537
 3.46911142 3.61464258 3.49267273 3.47034795 3.67683369 3.16047205
 3.72110975 3.21069974 3.59953339 3.46978888 3.53733463 3.87747634
 3.41918183 3.42889459 3.21450738 3.62527901 3.2812891  3.12309149
 3.5541458  3.58735541 3.13750129 2.90438263 3.64621373 3.32670499
 3.33687835 3.70243756 3.38392057 3.131729   3.24296014 3.05089232
 3.58639636 3.42576782 3.24241754 3.35343102 3.29217715 3.35310207
 3.44593477 3.52666298 3.26538144 3.52392297 3.1493188  3.53290789
 3.39218622 3.2330702  3.43019675 3.15813217 3.17319679 3.41021062
 3.55220394 3.48508543 3.53956604 3.16284187 2.86056508 3.51863326
 3.42702112 3.15109304 2.72721011 3.10434152 3.12020281 3.0861092
 3.25351095 3.03964227 2.40572375 3.38081098 3.39945762 3.22197118
 2.8330184  2.69783361 3.57423203 2.84090279 3.40777401 2.85795005
 2.97424032 2.90712144 3.03