In [2]:
# Reload edited project modules automatically, so changes to imported code
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# IMPORTS

In [3]:
# stdlib
from pathlib import Path

# data
import pandas as pd

from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from hdbscan import HDBSCAN

# src
from src.general.io import to_pickle, from_pickle
from src.general.io import disable_warnings

In [4]:
# Project helper imported from src.general.io; presumably silences library
# warnings to keep cell output readable — TODO confirm what it suppresses.
disable_warnings()

# PATHS & NAMES

In [11]:
# --- Folders (relative to the notebook's working directory) ---
MODELS_FOLDER = "../models"
REPORT_FOLDER = "../reports/bertopic"
INTERIM_FOLDER = "../data/interim"
PROCESSED_FOLDER = "../data/processed"

# --- Processed data: filename and full path ---
PROCESSED_FILENAME = "features.pkl"
PROCESSED_FILEPATH = f"{PROCESSED_FOLDER}/{PROCESSED_FILENAME}"

# --- Interim artifacts: each filename is followed by its full path ---
SPACY_TOKENIZED_FILENAME = "feedback_docs.pkl"
SPACY_TOKENIZED_FILEPATH = f"{INTERIM_FOLDER}/{SPACY_TOKENIZED_FILENAME}"

RASA_DOCS_FILENAME = "rasa_docs.pkl"
RASA_DOCS_FILEPATH = f"{INTERIM_FOLDER}/{RASA_DOCS_FILENAME}"

# The spelling "embedings" is kept on purpose: it is the on-disk filename.
RASA_EMBEDINGS_FILENAME = "rasa_embedings.pkl"
RASA_EMBEDINGS_FILEPATH = f"{INTERIM_FOLDER}/{RASA_EMBEDINGS_FILENAME}"

# --- Model locations ---
SBERT_PATH = f"{MODELS_FOLDER}/sbert"              # sentence-embedding model
FIT_BERTOPIC_PATH = f"{MODELS_FOLDER}/bertopic"    # cached fitted topic model

# --- Modelling parameters ---
# NOTE(review): neither value is referenced by the cells visible in this
# notebook — confirm whether they should be wired into the models.
SEED = 42
N_TOPICS = 20

# DATA

In [6]:
# Load the documents to model from the interim pickle (rasa_docs.pkl).
# NOTE(review): presumably a list of raw text strings, since they are passed
# straight to BERTopic in the cells below — confirm.
docs = from_pickle(RASA_DOCS_FILEPATH)

# Topic modelling

## BERTopic

This model needs **at least 1000 datapoints**. It is presented here only for future use.

In [7]:
# Sentence encoder, loaded from the local models folder.
# (Per the original note this is presumably "all-MiniLM-L6-v2" — confirm.)
embedding_model = SentenceTransformer(SBERT_PATH)

# Density-based clusterer that groups the document embeddings into topics.
cluster_model = HDBSCAN(
    min_cluster_size=10,
    metric="euclidean",
    cluster_selection_method="eom",
    prediction_data=True,
)

# Unfitted BERTopic pipeline wired to the encoder and clusterer above.
topic_model = BERTopic(
    embedding_model=embedding_model,
    hdbscan_model=cluster_model,
)

In [8]:
# Reuse a previously fitted model when a cached pickle exists; otherwise fit
# on the corpus and store the result so later runs can skip the fitting step.
# NOTE(review): the cache is a pickle — only load files from a trusted source.
try:
    cached_model = from_pickle(FIT_BERTOPIC_PATH)
except FileNotFoundError:
    # No cache yet: fit the freshly built model and persist it.
    topic_model.fit(docs)
    to_pickle(topic_model, FIT_BERTOPIC_PATH)
else:
    # Cache hit: the loaded model replaces the unfitted one.
    topic_model = cached_model

<bertopic._bertopic.BERTopic at 0x1a1b6b0d300>

In [9]:
# Assign every document to a topic with the fitted model.
topics, probs = topic_model.transform(docs)

# Tabulate the assignments and show how many documents landed in each topic
# (BERTopic labels outlier documents with topic -1).
topic_assignments = pd.DataFrame({"topics": topics, "probs": probs})
topic_assignments["topics"].value_counts()

-1      495
 0      155
 1      106
 2      105
 3      102
       ... 
 254     11
 255     11
 256     11
 257     11
 258     10
Name: topics, Length: 260, dtype: int64

In [12]:
# Visualization examples: each figure is exported as a standalone HTML report.

# write_html does not create missing parent directories, so make sure the
# report folder exists first (a no-op when it is already there). Without this
# the cell fails with FileNotFoundError on a fresh checkout.
Path(REPORT_FOLDER).mkdir(parents=True, exist_ok=True)

# Save intertopic distance map as HTML file
topic_model.visualize_topics().write_html(f"{REPORT_FOLDER}/intertopic_dist_map.html")

# Save topic-terms barcharts as HTML file
topic_model.visualize_barchart(top_n_topics = 25).write_html(f"{REPORT_FOLDER}/barchart.html")

# Save documents projection as HTML file
topic_model.visualize_documents(docs).write_html(f"{REPORT_FOLDER}/projections.html")

# Save topics dendrogram as HTML file
# NOTE(review): "hieararchy" is misspelled; the filename is kept unchanged so
# existing links to the report keep working.
topic_model.visualize_hierarchy().write_html(f"{REPORT_FOLDER}/hieararchy.html")

True