### Topic Modeling using BERTopic

- Reference: [The BERTopic algorithm, step by step](https://maartengr.github.io/BERTopic/algorithm/algorithm.html)

In [7]:
import pandas as pd
# Read the training CSV and keep only its first 1000 rows. `.copy()` detaches
# the sample from the full frame so later edits cannot touch (or warn about)
# the original data.
df = pd.read_csv("data/train.csv").iloc[:1000].copy()
print(df.shape)
df.head()

(1000, 2)


Unnamed: 0,text,label
0,Last session of the day http://twitpic.com/67ezh,1
1,Shanghai is also really exciting (precisely -...,2
2,"Recession hit Veronique Branquinho, she has to...",0
3,happy bday!,2
4,http://twitpic.com/4w75p - I like it!!,2


In [8]:
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer

from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer


# UMAP is stochastic: without a fixed seed, every Restart & Run All can yield
# different clusters and therefore different topics. Pinning the seed makes
# the topic assignments shown below reproducible.
RANDOM_SEED = 42

# Step 1 - Extract embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Step 2 - Reduce dimensionality
# NOTE: a fixed `random_state` trades some UMAP parallelism for determinism.
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    random_state=RANDOM_SEED,
)

# Step 3 - Cluster reduced embeddings
# `prediction_data=True` is kept from the original configuration.
hdbscan_model = HDBSCAN(
    min_cluster_size=15,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)

# Step 4 - Tokenize topics
vectorizer_model = CountVectorizer(stop_words="english")

# Step 5 - Create topic representation
ctfidf_model = ClassTfidfTransformer()

# Step 6 - (Optional) Fine-tune topic representations with
# a `bertopic.representation` model
representation_model = KeyBERTInspired()

# All steps together
topic_model = BERTopic(
  embedding_model=embedding_model,          # Step 1 - Extract embeddings
  umap_model=umap_model,                    # Step 2 - Reduce dimensionality
  hdbscan_model=hdbscan_model,              # Step 3 - Cluster reduced embeddings
  vectorizer_model=vectorizer_model,        # Step 4 - Tokenize topics
  ctfidf_model=ctfidf_model,                # Step 5 - Create topic representation
  representation_model=representation_model # Step 6 - (Optional) Fine-tune topic representations
)

In [9]:
# Fit the configured pipeline on the `text` column and keep both values
# returned by `fit_transform`.
documents = df["text"].tolist()
topics, probs = topic_model.fit_transform(documents)

In [10]:
# Echo the topic ids returned by `fit_transform` (presumably one per input
# document, in input order — confirm against the BERTopic docs).
# NOTE(review): the -1 entries are presumably BERTopic's outlier / "no topic"
# label — confirm against the BERTopic docs.
# NOTE(review): this renders the entire list; a slice such as `topics[:10]`
# or a per-topic count would keep the cell output short.
topics

[-1,
 0,
 1,
 -1,
 -1,
 -1,
 4,
 0,
 -1,
 -1,
 -1,
 -1,
 0,
 -1,
 0,
 2,
 10,
 7,
 -1,
 4,
 3,
 -1,
 3,
 2,
 5,
 9,
 1,
 2,
 0,
 3,
 9,
 -1,
 -1,
 0,
 3,
 1,
 8,
 0,
 2,
 6,
 0,
 -1,
 -1,
 -1,
 -1,
 -1,
 0,
 1,
 4,
 4,
 1,
 -1,
 -1,
 9,
 -1,
 4,
 -1,
 -1,
 2,
 3,
 3,
 4,
 -1,
 3,
 -1,
 1,
 -1,
 1,
 1,
 0,
 -1,
 1,
 -1,
 -1,
 -1,
 0,
 0,
 1,
 1,
 -1,
 -1,
 1,
 -1,
 -1,
 -1,
 2,
 -1,
 7,
 -1,
 2,
 -1,
 8,
 -1,
 -1,
 1,
 5,
 1,
 7,
 2,
 2,
 3,
 -1,
 -1,
 2,
 0,
 1,
 -1,
 3,
 -1,
 -1,
 0,
 0,
 6,
 -1,
 4,
 6,
 8,
 -1,
 -1,
 10,
 1,
 -1,
 0,
 0,
 1,
 -1,
 -1,
 4,
 1,
 -1,
 9,
 -1,
 2,
 -1,
 2,
 3,
 2,
 -1,
 -1,
 -1,
 7,
 0,
 0,
 6,
 0,
 -1,
 8,
 -1,
 2,
 5,
 2,
 -1,
 0,
 2,
 2,
 2,
 2,
 -1,
 0,
 8,
 -1,
 -1,
 9,
 5,
 3,
 1,
 1,
 5,
 -1,
 3,
 -1,
 3,
 -1,
 -1,
 0,
 -1,
 8,
 3,
 -1,
 -1,
 3,
 -1,
 7,
 -1,
 0,
 -1,
 -1,
 1,
 6,
 2,
 -1,
 1,
 -1,
 0,
 9,
 -1,
 -1,
 1,
 1,
 2,
 -1,
 3,
 0,
 4,
 7,
 0,
 4,
 1,
 -1,
 -1,
 0,
 1,
 4,
 -1,
 1,
 -1,
 1,
 5,
 8,
 -1,
 -1,
 -1,
 -1,
 1,
 0,
 5,
 -1,
 1