In [1]:
import pandas as pd
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from collections import Counter
import matplotlib.pyplot as plt
from plotting_utils import save_figure
from umap import UMAP
import hdbscan


import multiprocessing

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load ./data/processed_data_all.csv into the working DataFrame.
# `category` is read as text so the '-1' sentinel that a later cell filters on
# is always the string '-1', never an integer in some rows and a string in others.
df = pd.read_csv('./data/processed_data_all.csv', dtype={'category': str})

In [37]:
# Preview the first five rows to sanity-check the loaded columns.
df.head(n=5)

Unnamed: 0,product_id,description,month_mode,quarter_mode,year_mode,day_week_mode,quantity_sum,price_sum,unit_weight,customer_country_mode,customer_country_count,customer_id_count,category,unit_price_mean,description_original
0,10004,cfcf sfy bolted hanger nptf stud viton,2,1,2020,2,6,232.92,0.124,DK,1,1,-1,38.82,"C24FCF-1/2-SFY-S BOLTED HANGER, N..."
1,10005,cfcf sfy bolted hanger nptf stud viton,10,4,2019,2,200,1774.0,0.259,CO,1,1,-1,8.87,"C24FCF-1-SFY-S BOLTED HANGER, N..."
2,10006,cfcf sfy bolted hanger nptf stud viton,10,4,2019,2,300,1848.0,0.276,CO,1,1,-1,6.16,"C24FCF-1 1/2-SFY-S BOLTED HANGER, N..."
3,10015,cfcf sfy bolted hanger nptf stud viton,1,1,2019,2,300,2115.0,0.318,SE,2,2,-1,7.05,"C24FCF-2-SFY-S BOLTED HANGER, N..."
4,10040,cfcf sfy bolted hanger nptf stud viton,1,1,2019,2,400,3119.0,0.363,CO,2,2,-1,7.7975,"C24FCF-3-SFY-S BOLTED HANGER, N..."


In [3]:
# 1. Pre-calculate embeddings
# SentenceTransformer and BERTopic are already imported in the first cell,
# so they are not imported again here.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# One embedding is computed per row of df['description'], in row order.
descriptions = df['description'].tolist()
embeddings = embedding_model.encode(descriptions, show_progress_bar=True)


Batches:  17%|█▋        | 185/1100 [00:38<02:33,  5.95it/s]

In [None]:
# UMAP configuration for the dimensionality-reduction step.
# Defaults noted by the author: n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=None
# NOTE(review): min_dist is listed above but is not passed below, so UMAP's own
# default is used — confirm that this matches the intended 0.0.
umap_params = {
    'n_neighbors': 200,   # tune
    'n_components': 5,    # fixed
    'metric': 'cosine',   # fixed
    'random_state': 42,   # fixed seed
}
umap_model = UMAP(**umap_params)

In [None]:
# HDBSCAN configuration for the clustering step.
# Defaults noted by the author: min_cluster_size=10, min_samples=min_cluster_size
hdbscan_params = {
    'min_cluster_size': 50,   # tune
    'min_samples': 10,        # tune
    'prediction_data': True,
}
hdbscan_model = hdbscan.HDBSCAN(**hdbscan_params)

In [None]:
# 2. Initialize BERTopic Model
# min_topic_size is deliberately not passed: BERTopic only applies it when it
# builds its own HDBSCAN instance. A custom hdbscan_model is supplied here, so
# the minimum cluster size is controlled by hdbscan_model's min_cluster_size.
model = BERTopic(language="english",
                calculate_probabilities=False,
                verbose=True,
                nr_topics="auto", # fixed
                top_n_words=10, # fixed
                umap_model=umap_model,
                hdbscan_model=hdbscan_model)

# 3. Fit the Model to the pre-calculated embeddings and assign the discovered topics back to DataFrame
# The same document list is reused below so every call sees identical input.
docs = df['description'].tolist()
topics, _ = model.fit_transform(docs, embeddings=embeddings)
df['topic'] = topics


# 4. If needed, extract topic representation
topic_representation = model.get_topic_info()
topic_documents = model.get_document_info(docs)

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.
2023-10-05 19:17:46,701 - BERTopic - Reduced dimensionality


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

2023-10-05 19:17:48,600 - BERTopic - Clustered reduced embeddings
2023-10-05 19:17:49,109 - BERTopic - Reduced number of topics from 288 to 136


In [None]:
# Bar chart of the top 10 words for the first 41 topics.
# visualize_barchart returns a Plotly figure, so the matplotlib call
# plt.tight_layout() was removed: it had no effect on this chart and only
# created an empty matplotlib figure in the cell output.
fig = model.visualize_barchart(top_n_topics=41, n_words=10)
fig.show()

<Figure size 640x480 with 0 Axes>

In [82]:
# Render BERTopic's topic overview plot, requesting custom labels.
topics_overview_fig = model.visualize_topics(custom_labels=True)
topics_overview_fig

In [87]:
# Render the topic heatmap, with n_clusters=5 passed to BERTopic.
topic_heatmap_fig = model.visualize_heatmap(n_clusters=5)
topic_heatmap_fig

In [88]:
# Build the topic hierarchy from the descriptions. They are passed as a plain
# list, matching how the documents were given to fit_transform.
hierarchical_topics = model.hierarchical_topics(df['description'].tolist())

100%|██████████| 39/39 [00:00<00:00, 432.49it/s]


In [89]:
# Plot the hierarchy computed in the previous cell.
hierarchy_fig = model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)
hierarchy_fig

In [59]:
# Keep only the rows whose category differs from the '-1' sentinel.
# The column is compared as text so an integer -1 (possible if the CSV column
# was parsed with mixed types) is excluded as well. .copy() makes df_labeled
# an independent frame rather than a view-like slice of df.
df_labeled = df[df['category'].astype(str) != '-1'].copy()

In [91]:
# Count the labeled rows for every (category, topic) pair.
category_topic_groups = df_labeled.groupby(['category', 'topic'])
category_topic_groups.size().reset_index(name='counts')

Unnamed: 0,category,topic,counts
0,A,-1,50
1,A,1,11
2,A,3,49
3,A,6,2
4,A,9,25
5,A,11,10
6,B,-1,23
7,B,1,23
8,B,3,21
9,B,5,1


In [65]:
# Number of rows assigned to each topic, as a two-column frame (topic, counts).
df_topic_count = (
    df.groupby(['topic'])
      .size()
      .reset_index(name='counts')
)
df_topic_count

Unnamed: 0,topic,counts
0,-1,17009
1,0,3129
2,1,2547
3,2,1623
4,3,1464
5,4,1403
6,5,1326
7,6,953
8,7,719
9,8,644
