In [1]:
!pip3 install bertopic[all]

Collecting bertopic[all]
  Downloading bertopic-0.9.4-py2.py3-none-any.whl (57 kB)
[?25l[K     |█████▊                          | 10 kB 37.1 MB/s eta 0:00:01[K     |███████████▍                    | 20 kB 28.3 MB/s eta 0:00:01[K     |█████████████████               | 30 kB 18.2 MB/s eta 0:00:01[K     |██████████████████████▊         | 40 kB 15.9 MB/s eta 0:00:01[K     |████████████████████████████▍   | 51 kB 15.6 MB/s eta 0:00:01[K     |████████████████████████████████| 57 kB 5.4 MB/s 
[?25hCollecting plotly>=4.7.0
  Downloading plotly-5.5.0-py2.py3-none-any.whl (26.5 MB)
[K     |████████████████████████████████| 26.5 MB 1.6 MB/s 
Collecting numpy>=1.20.0
  Downloading numpy-1.21.5-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (15.7 MB)
[K     |████████████████████████████████| 15.7 MB 56.6 MB/s 
[?25hCollecting hdbscan>=0.8.27
  Downloading hdbscan-0.8.27.tar.gz (6.4 MB)
[K     |████████████████████████████████| 6.4 MB 65.6 MB/s 
[?25h  Installing build 

## Pass 1: Topic Modeling with BERTopic

This section is mostly borrowed from [BERTopic's original repository](https://github.com/MaartenGr/BERTopic/) and is included here for the video demo.

In [1]:
from bertopic import BERTopic
from sklearn.datasets import fetch_20newsgroups

In [2]:
# Load every post of the 20 Newsgroups corpus and keep only the raw text;
# headers, footers and quoted replies are stripped by the loader.
docs = fetch_20newsgroups(
    subset="all",
    remove=("headers", "footers", "quotes"),
)["data"]

In [3]:
# Sanity check: how many posts did the loader return?
print(f"Number of documents = {len(docs)}")

Number of documents = 18846


In [4]:
# Peek at a single raw post to see the kind of text the model will be fed.
docs[1]

'My brother is in the market for a high-performance video card that supports\nVESA local bus with 1-2MB RAM.  Does anyone have suggestions/ideas on:\n\n  - Diamond Stealth Pro Local Bus\n\n  - Orchid Farenheit 1280\n\n  - ATI Graphics Ultra Pro\n\n  - Any other high-performance VLB card\n\n\nPlease post or email.  Thank you!\n\n  - Matt\n'

In [5]:
# calculate_probabilities=True is requested because the probabilities are wanted
# for visualization. fit_transform returns them as its second value, so keep
# them under a real name: assigning to `_` would throw them away and shadow
# IPython's "last output" variable.
topic_model = BERTopic(language="english", calculate_probabilities=True)
topics, probabilities = topic_model.fit_transform(docs)

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]



In [6]:
# Get the most frequent topics
topic_freq = topic_model.get_topic_freq()

# Topic -1 is the bucket of documents that were not assigned to any topic.
# Summing over a boolean mask yields 0 when no such row exists, whereas
# indexing the filtered column with .iloc[0] would raise IndexError.
is_outlier = topic_freq['Topic'] == -1
outliers = topic_freq.loc[is_outlier, 'Count'].sum()
# Count the real topics directly instead of assuming exactly one outlier row.
num_topics = (~is_outlier).sum()

print(f"{outliers} documents have not been classified")
print(f"The other {topic_freq['Count'].sum() - outliers} documents are {num_topics} topics")

6158 documents have not been classified
The other 12688 documents are 207 topics


In [7]:
# Preview the first rows of the topic-frequency table (Topic id and its Count).
topic_freq.head()

Unnamed: 0,Topic,Count
0,-1,6158
1,0,1841
2,1,598
3,2,526
4,3,482


In [8]:
# Report the document count and topic id stored in the second row (position 1)
# of the frequency table.
print(f"There are {topic_freq['Count'].iloc[1]} documents that are talking about topic ID {topic_freq['Topic'].iloc[1]}")

There are 1841 documents that are talking about topic ID 0


In [11]:
# Show the (word, score) pairs describing the topic found at position 10 of the
# frequency table.
topic_model.get_topic(topic_freq['Topic'].iloc[10])

[('ram', 0.013630524706767991),
 ('drive', 0.010323742623797712),
 ('price', 0.010112561504320645),
 ('sale', 0.009884855852958275),
 ('pc', 0.008830466247363226),
 ('meg', 0.008739011702274443),
 ('os', 0.008714727482861423),
 ('monitor', 0.008706750843030967),
 ('card', 0.008597036404504996),
 ('mac', 0.00844015249791707)]

Let's visualize these topics in the embedding space!

In [12]:
# Render BERTopic's built-in topic visualization for the fitted model.
topic_model.visualize_topics()

We could also reduce the number of topics, but that is not covered here. Refer to the original repository for more information.

## Pass 2: Breaking down BERTopic

The code in this section comes from [this blog post](https://towardsdatascience.com/topic-modeling-with-bert-779f7db187e6); read it for more details. It is included here for explanatory purposes.

In [13]:
!pip install sentence_transformers
!pip install umap-learn
!pip install hdbscan



In [14]:
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
import umap
import hdbscan
import numpy as np
import pandas as pd

In [15]:
# Load a pretrained sentence-embedding model by name and encode every post;
# the result is one vector per document (its shape is inspected in the next cell).
model = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens') # We'll get to this :)
embeddings = model.encode(docs)

Downloading:   0%|          | 0.00/345 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.01k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/555 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/265M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/505 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# (number of documents, embedding dimension)
embeddings.shape

(18846, 768)

Every document is encoded into a 768-dimensional vector. That is still high-dimensional for clustering (especially when dealing with more than 100K documents), so we reduce the dimensionality with UMAP first.

In [16]:
# UMAP is stochastic: without a fixed seed every run produces a different
# projection and therefore different clusters downstream. Pinning random_state
# makes "Restart & Run All" reproduce the same result.
umap_embeddings = umap.UMAP(n_neighbors=15,
                            n_components=10,
                            min_dist=0.0,
                            metric='cosine',
                            random_state=42).fit_transform(embeddings)

In [17]:
# Same number of rows as `embeddings`, now with n_components columns.
umap_embeddings.shape

(18846, 10)

In [18]:
# Density-based clustering of the reduced embeddings. The fitted estimator is
# kept so that its per-document labels (cluster.labels_) can be read below.
cluster = hdbscan.HDBSCAN(
    min_cluster_size=10,
    metric="euclidean",
    cluster_selection_method="eom",
).fit(umap_embeddings)

In [19]:
# Label -1 marks unclustered (noise) documents and is not a cluster. Excluding
# it explicitly, instead of always subtracting 1, keeps the count correct even
# when no document was labelled as noise.
cluster_ids = np.unique(cluster.labels_)
print(f"Number of clusters = {(cluster_ids != -1).sum()}")

Number of clusters = 107


$$
\mathrm{CTFIDF}_i = \frac{t_i}{w_i} \times \log \frac{D}{\sum_n t_n}
$$

$t_i = \text{Number of occurrences of the current word in the documents of the } i^{th} \text{ topic/cluster}$

$w_i = \text{Total number of words (after stop-word removal) in the documents of the } i^{th} \text{ topic/cluster}$

$D = \text{Number of clusters}$

$\sum_n t_n = \text{Total number of occurrences of the current word across all clusters}$

In [20]:
# One row per post, tagged with the cluster label HDBSCAN assigned to it.
docs_df = pd.DataFrame({'document': docs}).assign(topic=cluster.labels_)

# Glue all posts of a cluster together (space-separated) so that each cluster
# becomes a single long "document".
docs_per_topic = (
    docs_df
    .groupby(['topic'])
    .agg({'document': ' '.join})
)

In [21]:
# Build the per-topic frame from docs_df with `topic` as a regular column.
# Unlike `docs_per_topic = docs_per_topic.reset_index()`, re-running this cell
# cannot stack an extra "index" column onto the frame.
docs_per_topic = docs_df.groupby(['topic'], as_index=False).agg({'document': ' '.join})

In [22]:
def CTfIDF(documents):
    """Compute class-based TF-IDF scores for a collection of per-cluster documents.

    Parameters
    ----------
    documents : sequence of str
        One entry per cluster: the text of all documents in that cluster
        joined together.

    Returns
    -------
    tf_idf : numpy.ndarray of shape (num_words, num_clusters)
        Score of every vocabulary word for every cluster. A cluster whose text
        contains no counted words gets a score of 0 for every word.
    count : CountVectorizer
        The fitted vectorizer (English stop words removed), needed later to map
        row indices of `tf_idf` back to words.

    Notes
    -----
    The idf term is log(D / total count of the word), with D the number of
    entries in `documents`; it is negative for words whose total count exceeds D.
    """
    count = CountVectorizer(stop_words='english').fit(documents)
    t = count.transform(documents).toarray()  # (num_clusters, num_words)
    w = t.sum(axis=1)  # (num_clusters,) total counted words per cluster
    # A cluster made only of empty / stop-word-only posts has w == 0. Its counts
    # are all zero too, so dividing by 1 instead yields tf = 0 rather than the
    # NaN that 0/0 would produce (NaN sorts last and would be picked as "top").
    safe_w = np.where(w == 0, 1, w)
    tf = np.divide(t.T, safe_w)  # (num_words, num_clusters)
    D = len(documents)  # scalar: number of clusters
    sum_t = t.sum(axis=0)  # (num_words,) total count of each word
    idf = np.log(np.divide(D, sum_t)).reshape(-1, 1)  # (num_words, 1)
    tf_idf = np.multiply(tf, idf)  # (num_words, num_clusters)
    return tf_idf, count

In [23]:
# Score every vocabulary word against every cluster, using the joined
# per-cluster texts; `count` is the fitted vectorizer returned alongside.
tf_idf, count = CTfIDF(docs_per_topic['document'].values)

In [24]:
def extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, n=20):
    """Return the `n` highest-scoring words of every topic.

    Parameters
    ----------
    tf_idf : numpy.ndarray of shape (num_words, num_topics)
        Word-by-topic score matrix.
    count : fitted vectorizer
        Must expose `get_feature_names_out()` or the legacy
        `get_feature_names()`; entry j names row j of `tf_idf`.
    docs_per_topic : object with a `topic` attribute
        Topic labels, in the same order as the columns of `tf_idf`.
    n : int, default 20
        Number of words to keep per topic. Values <= 0 yield empty lists.

    Returns
    -------
    dict
        Maps each topic label to a list of (word, score) tuples, best first.
    """
    # scikit-learn deprecated get_feature_names() in 1.0 and removed it in 1.2;
    # prefer the replacement and fall back for older installations.
    if hasattr(count, "get_feature_names_out"):
        names = count.get_feature_names_out()
    else:
        names = count.get_feature_names()
    # Normalise to plain str so the result looks the same with either API.
    words = [str(name) for name in names]

    labels = list(docs_per_topic.topic)
    if n <= 0:
        # A slice of [-0:] would select every word instead of none.
        return {label: [] for label in labels}

    scores = tf_idf.T  # (num_topics, num_words)
    # argsort is ascending, so the last n columns hold the n best words.
    indices = scores.argsort()[:, -n:]
    top_n_words = {
        label: [(words[j], scores[i][j]) for j in indices[i]][::-1]
        for i, label in enumerate(labels)
    }
    return top_n_words

def extract_topic_sizes(df):
    """Count the documents assigned to each topic, largest topic first.

    `df` needs a 'topic' and a 'document' column. The result has the columns
    'topic' and 'size' and is ordered by descending size.
    """
    per_topic_counts = df.groupby(['topic'])['document'].count()
    sizes = per_topic_counts.reset_index(name="size")
    return sizes.sort_values(by="size", ascending=False)

# Keep the 10 best-scoring words per cluster and count how many posts each
# cluster holds.
top_n_words = extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, n=10)
topic_sizes = extract_topic_sizes(docs_df)

In [25]:
# The ten largest groups (topic_sizes is sorted by descending size).
topic_sizes.head(10)

Unnamed: 0,topic,size
0,-1,9981
26,25,803
11,10,674
27,26,642
14,13,528
1,0,517
22,21,485
85,84,384
40,39,293
23,22,222


In [26]:
# Top words of the cluster labelled 11 (top_n_words is keyed by cluster label).
top_n_words[11]

[('e_k', 0.0037732745955654638),
 ('fips', 0.0037732745955654638),
 ('cryptologia', 0.0036618948706096397),
 ('unicity', 0.0034917348660744017),
 ('cryptology', 0.0031684452549781317),
 ('ciphertext', 0.003157254085759954),
 ('kah67', 0.003145527349213129),
 ('ciphers', 0.003126203603362653),
 ('p_1', 0.0030837009436712752),
 ('ciphertexts', 0.003013336563203958)]

In [27]:
# Top words of the cluster labelled 78 (top_n_words is keyed by cluster label).
top_n_words[78]

[('nonexistence', 0.002362186791350249),
 ('excitable', 0.002236205523103062),
 ('mythology', 0.002205163718300397),
 ('lilac', 0.002154845940506263),
 ('agnostic', 0.0020948532530115464),
 ('uncritically', 0.0018897494330801991),
 ('myths', 0.001674205265739152),
 ('philosophers', 0.0016538727887252978),
 ('gullible', 0.001613338530583532),
 ('agnosticism', 0.0015605937437761022)]