In [1]:
import pandas as pd
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from collections import Counter
import matplotlib.pyplot as plt
from plotting_utils import save_figure
from umap import UMAP
import hdbscan


import multiprocessing

In [2]:
# load processed_labeled_data_all.csv
df = pd.read_csv('./data/processed_data_all_vol_2.csv')

In [3]:
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic

# 1. Pre-calculate embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedding_model.encode(df['description'].tolist(), show_progress_bar=True)


Batches:   0%|          | 0/1067 [00:00<?, ?it/s]

In [4]:
# adjust umap parameters
# default parameters: n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=None
umap_model = UMAP(n_neighbors=15, # tune
                  n_components=5, # fixed
                  metric='cosine', # fixed
                  random_state=42)

In [5]:
# default -> min_cluster_size=10, min_samples=min_cluster_size
hdbscan_model = hdbscan.HDBSCAN(min_cluster_size=50, # tune
                                min_samples=5, # tune
                                prediction_data=True)

In [6]:
# 2. Initialize BERTopic Model
model = BERTopic(language="english", 
                calculate_probabilities=False, 
                verbose=True, 
                nr_topics=149, # fixed
                min_topic_size=500, # tune
                top_n_words=10, # fixed
                umap_model=umap_model,
                hdbscan_model=hdbscan_model)

# 3. Fit the Model to the pre-calculated embeddings and assign the discovered topics back to DataFrame
topics, _ = model.fit_transform(df['description'].tolist(), embeddings=embeddings)
df['topic'] = topics


# 4. If needed, extract topic representation
topic_representation = model.get_topic_info()
topic_documents = model.get_document_info(df['description'])

Exception origin:
  File "/Users/annabzinkowska/anaconda3/envs/thesis_/lib/python3.10/site-packages/numba/core/types/functions.py", line 486, in __getnewargs__
    raise ReferenceError("underlying object has vanished")

  nn_descent_internal_low_memory_parallel(
2023-12-27 13:18:14,004 - BERTopic - Reduced dimensionality
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling

In [7]:
topic_representation

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,9251,-1_lkb_bal_basic_none,"[lkb, bal, basic, none, cip, iso, fpm, butterf...","[iso fpm polished basic bal lo none cip, iso f..."
1,0,2110,0_legs_pump_shroud_kw,"[legs, pump, shroud, kw, hz, screw, sic, lkh, ...",[pump lkh kw hz sms sic epdm blasted screw leg...
2,1,906,1_maintainable_inch_change_ssv,"[maintainable, inch, change, ssv, shut, nc, tr...","[ssv inch blasted epdm change nc maintainable,..."
3,2,576,2_fgd_dv_st_man,"[fgd, dv, st, man, pneu, qdoc, asme, sf, cl, bpe]",[valve dv st dn fgd man cl cl ra sf ptfe epdm ...
4,3,542,3_sanimidget_sanimagnum_sanimicro_ex,"[sanimidget, sanimagnum, sanimicro, ex, qdoc, ...","[sanimidget weldbpe qdoc ex, sanimicro lf weld..."
...,...,...,...,...,...
143,142,51,142_plug_set_mix_upper,"[plug, set, mix, upper, mixproof, lower, amp, ...","[upper plug dn mixproof plug set, upper plug d..."
144,143,51,143_raw_bend_deg_bs,"[raw, bend, deg, bs, pinstamp, cd, elbow, pol,...","[bend pol raw bs, bend pol raw bs, bend pol ra..."
145,144,50,144_lc_kg_mtr_fs,"[lc, kg, mtr, fs, box, disp, cal, mount, ether...",[lc bl kg legs mtr fs ethernet ip cal box disp...
146,145,50,145_bush_bushing_bearing_item,"[bush, bushing, bearing, item, sanitary, colla...","[bearing bush sanitary item, bearing bush sani..."
