# BERTopic for topic modeling

1. Applying BERTopic
2. Visualizing topics
3. Preparing representative docs for topics

### import and preprocessing

In [49]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [50]:
import nltk
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download the NLTK 'stopwords', 'wordnet' and 'punkt_tab' packages.
# NOTE(review): none of these resources appear to be used in the visible cells —
# confirm they are still needed.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [43]:
# Load the headline dataset; later cells read its 'title' and 'category' columns.
df = pd.read_csv('datasets/Dataset_10k.csv')

In [44]:
# Replace every spelling of "artificial intelligence" with the short token "ai",
# then collapse runs of consecutive "ai" tokens into a single one.
_AI_PHRASE = re.compile(r'(?i)\s*artificial\s+intelligence\s*')
_AI_RUN = re.compile(r'(ai\s+)+')


def _shorten_ai(title):
    """Return `title` with "artificial intelligence" shortened to "ai".

    The steps mirror the cleaning order used for the saved model:
    substitute the phrase, strip, collapse repeated "ai " runs, strip again.
    """
    shortened = _AI_PHRASE.sub(' ai ', title).strip()
    # NOTE(review): the run pattern has no word boundary, so it also matches the
    # tail of a word ending in "ai" (e.g. "Dubai ai" collapses to "Dubai").
    # Left as is so the titles stay identical to the ones the model was fitted on.
    return _AI_RUN.sub('ai ', shortened).strip()


df['title1'] = df['title'].apply(_shorten_ai)

In [7]:
# Cleaned titles as a plain Python list, in dataset row order.
texts = df['title1'].tolist()

### Applying Bertopic

In [57]:
from sentence_transformers import SentenceTransformer

# Embeddings
# Encode every cleaned title once with the 'all-MiniLM-L6-v2' sentence-transformer;
# the resulting matrix is reused by the document visualisations further down.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = embedding_model.encode(texts, show_progress_bar=True)
embeddings.shape

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

(10000, 384)

In [8]:
pip install bertopic

Collecting bertopic
  Downloading bertopic-0.16.4-py3-none-any.whl.metadata (23 kB)
Collecting hdbscan>=0.8.29 (from bertopic)
  Downloading hdbscan-0.8.40-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (15 kB)
Collecting umap-learn>=0.5.0 (from bertopic)
  Downloading umap_learn-0.5.7-py3-none-any.whl.metadata (21 kB)
Collecting pynndescent>=0.5 (from umap-learn>=0.5.0->bertopic)
  Downloading pynndescent-0.5.13-py3-none-any.whl.metadata (6.8 kB)
Downloading bertopic-0.16.4-py3-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.7/143.7 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading hdbscan-0.8.40-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.6/4.6 MB[0m [31m44.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading umap_learn-0.5.7-py3-none-any.whl (88 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.8/88.8 kB

In [8]:
from umap import UMAP
# UMAP configuration for the BERTopic pipeline.
umap_model = UMAP(
    n_neighbors=15,
    n_components=100,
    min_dist=0.0,
    metric='cosine',
    random_state=42,  # fixed seed
)

In [15]:
from hdbscan import HDBSCAN
# HDBSCAN configuration for the BERTopic pipeline; min_cluster_size=20 is the value
# the representative-doc extraction further down refers to.
hdbscan_model = HDBSCAN(
    min_cluster_size=20,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer for the topic keywords: unigrams and bigrams,
# English stop words removed, min_df=2.
vect_model = CountVectorizer(
    ngram_range=(1, 2),
    stop_words='english',
    min_df=2,
)

In [11]:
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer

# c-TF-IDF transformer for the BERTopic pipeline, with reduce_frequent_words enabled.
ctfidf_model = ClassTfidfTransformer(
    reduce_frequent_words=True,
)

In [12]:
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, PartOfSpeech

# Additional topic representations; each one shows up as its own column
# (KeyBERT, MMR, POS) in the topic-info table.
keybert_model = KeyBERTInspired()
# NOTE(review): presumably requires the spaCy model "en_core_web_sm" to be installed — confirm.
pos_model = PartOfSpeech("en_core_web_sm")
mmr_model = MaximalMarginalRelevance(diversity=0.3)

# Column label -> representation model.
representation_model = dict(
    KeyBERT=keybert_model,
    # OpenAI=openai_model,  # enable only if an OpenAI representation model is defined
    MMR=mmr_model,
    POS=pos_model,
)

In [40]:
# Peek at the first ten cleaned titles.
texts[:10]

['will.i.am and Fyilicia on the AI revolution, inclusivity & The Voice UK ...Tech & Science Daily podcast',
 'Intel Launches World’s First Systems Foundry Designed for the AI Era',
 'The Unique Challenges of Selling Enterprise AI',
 "Contentious California AI bill passes legislature, awaits governor's signature",
 'Exploring Genius, Creation, and Humanity in the Age of AI',
 "Conversational AI improves 'fourth trimester' maternal care at Penn Medicine",
 'Disney harnesses AI to drive streaming ad technology',
 'Will Rinehart: Unpacking the Executive Order on ai',
 'Video: Where Bitcoin and ai Meet',
 "NVIDIA's relentless rally: AI chip giant eyes new heights\u200b"]

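**NOTE:** 
The following cell contains the code that builds and trains the BERTopic model. It is kept commented out because the trained model has been saved to the file `tmod2` and is loaded from that file in the cell after it.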

In [9]:

from bertopic import BERTopic
# from umap import UMAP
# from hdbscan import HDBSCAN
# from sklearn.feature_extraction.text import CountVectorizer
# from bertopic.vectorizers import ClassTfidfTransformer
# from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, PartOfSpeech


# umap_model = UMAP(n_neighbors=15, n_components=100, min_dist=0.0, metric='cosine', random_state=42)
# hdbscan_model = HDBSCAN(min_cluster_size=20, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
# vect_model = CountVectorizer(ngram_range=(1,2),stop_words='english',min_df=2)
# ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)


# # KeyBERT
# keybert_model = KeyBERTInspired()
# # Part-of-Speech
# pos_model = PartOfSpeech("en_core_web_sm")
# # MMR
# mmr_model = MaximalMarginalRelevance(diversity=0.3)


# # All representation models
# representation_model = {
#     "KeyBERT": keybert_model,
#     "MMR": mmr_model,
#     "POS": pos_model
# }

# topic_model = BERTopic(

#   # Pipeline models
#   embedding_model=embedding_model,
#   umap_model=umap_model,
#   hdbscan_model=hdbscan_model,
#   vectorizer_model=vect_model,
#   representation_model=representation_model,
#   ctfidf_model=ctfidf_model,
#   # Hyperparameters
#   top_n_words=10,
#   verbose=True
# )

# # Train model
# topics, probs = topic_model.fit_transform(texts, embeddings)

# # Show topics
# topic_model.get_topic_info()

# topic_model.save('tmod2', serialization="pickle")

In [51]:
# Load the previously trained model from the "tmod2" file instead of re-fitting it.
# NOTE(review): the disabled training cell saves with serialization="pickle"; unpickling
# can execute arbitrary code, so only load a tmod2 file that comes from a trusted source.
topic_model = BERTopic.load("tmod2")

In [52]:
# Show the sub-models stored inside the loaded BERTopic object.
topic_model.umap_model,topic_model.hdbscan_model,topic_model.vectorizer_model,topic_model.ctfidf_model

(UMAP(angular_rp_forest=True, metric='cosine', min_dist=0.0, n_components=100, n_jobs=1, random_state=42, tqdm_kwds={'bar_format': '{desc}: {percentage:3.0f}%| {bar} {n_fmt}/{total_fmt} [{elapsed}]', 'desc': 'Epochs completed', 'disable': True}),
 HDBSCAN(min_cluster_size=20, prediction_data=True),
 CountVectorizer(min_df=2, ngram_range=(1, 2), stop_words='english'),
 ClassTfidfTransformer(reduce_frequent_words=True))

In [13]:
# Per-topic summary table; `c` is reused further down when collecting representative docs.
c = topic_model.get_topic_info()
c.head()

Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,MMR,POS,Representative_Docs
0,-1,3310,-1_human_chief_good_ibm,"[human, chief, good, ibm, officer, ai officer,...","[ai officer, ai policy, launches ai, chief ai,...","[human, ibm, ai officer, chief ai, sap, white ...","[human, chief, good, officer, house, machine, ...",[News | ESSC | UAH scientist earns National ai...
1,0,808,0_health_care_healthcare_medical,"[health, care, healthcare, medical, medicine, ...","[healthcare ai, ai healthcare, ai medical, hea...","[healthcare, ai healthcare, ai health, healthc...","[health, care, healthcare, medical, medicine, ...","[ai in Health Care, AI in Health Care: Powerin..."
2,1,412,1_education_schools_school_students,"[education, schools, school, students, classro...","[ai education, ai schools, ai academic, ai cla...","[education, schools, students, ai education, a...","[education, schools, school, students, classro...",[Exploring the impact of ai on higher educatio...
3,2,281,2_generative ai_generative_ai generative_use g...,"[generative ai, generative, ai generative, use...","[ai generative, generative ai, ai creative, br...","[generative ai, ai generative, use generative,...","[generative, usage, enterprise, creative, diff...",[Executive Conversations: Putting generative A...
4,3,260,3_jobs_job_hr_hiring,"[jobs, job, hr, hiring, workers, ai jobs, empl...","[ai workplace, ai jobs, workforce ai, ai job, ...","[jobs, hr, ai jobs, employers, ai workplace, w...","[jobs, job, hiring, workers, employers, employ...",[AI will affect 40% of jobs and probably worse...


In [46]:
# Attach the model's stored topic assignments to the headlines.
# NOTE(review): assumes df rows are in the same order as the documents the model was
# fitted on — confirm.
df['Topic'] = topic_model.topics_

In [47]:
# Check that the 'title1' and 'Topic' columns are present.
df.head()

Unnamed: 0.1,Unnamed: 0,date,title,source,number_of_characters_title,number_of_words_title,day_of_week,month,year,quarter,is_weekend,category,title1,Topic
0,0,2024-10-14,"will.i.am and Fyilicia on the AI revolution, i...",Evening Standard,122,20,Monday,October,2024,4,False,Other,"will.i.am and Fyilicia on the AI revolution, i...",72
1,1,2024-02-21,Intel Launches World’s First Systems Foundry D...,Investor Relations :: Intel Corporation (INTC),117,18,Wednesday,February,2024,1,False,Other,Intel Launches World’s First Systems Foundry D...,-1
2,2,2024-02-05,The Unique Challenges of Selling Enterprise AI,Emerge,54,9,Monday,February,2024,1,False,Career,The Unique Challenges of Selling Enterprise AI,11
3,3,2024-08-28,Contentious California AI bill passes legislat...,Reuters,88,11,Wednesday,August,2024,3,False,Other,Contentious California AI bill passes legislat...,27
4,4,2024-10-15,"Exploring Genius, Creation, and Humanity in th...",University of Aberdeen,82,14,Tuesday,October,2024,4,False,Other,"Exploring Genius, Creation, and Humanity in th...",-1


In [48]:
# Store the topic assigned to each headline.
# index=False: the frame already carries an "Unnamed: 0" column (apparently a row index
# written by an earlier save), so writing the index again would add one more unnamed
# column every time the file is read back and saved.
df.to_csv('datasets/Dataset_10k_topics.csv', index=False)

### Visualizing Topics

The interactive visualizations are only rendered in Google Colab.
Refer to the "bertopic visualizations" folder for the saved outputs.

Topic = -1 indicates the headlines that couldn't be clustered with any of the other topics.

In [24]:
# Overview plot of all topics.
topic_model.visualize_topics() # stored as topic_visualization.png

In [12]:
# Hierarchical view of how the topics relate to each other.
topic_model.visualize_hierarchy() # stored as visualize-hierachy.png

In [25]:
# Keyword bar charts; top_n_topics=88 covers topics 0-87, i.e. every non-outlier topic.
topic_model.visualize_barchart(top_n_topics=88) # stored as topic_keywords.png

In [41]:
pip install plotly



In [53]:
# Category label and cleaned title of every headline, both in dataset row order.
classes = df['category'].tolist()
texts = df['title1'].tolist()
# Topic representation computed separately for each category.
topics_per_class = topic_model.topics_per_class(texts, classes=classes)

4it [00:00,  6.59it/s]


In [54]:
# Plot the per-category topic representation for the 88 non-outlier topics.
topic_model.visualize_topics_per_class(topics_per_class,top_n_topics=88)

In [55]:
# Build the same per-category figure again, this time keeping a handle to it
fig = topic_model.visualize_topics_per_class(topics_per_class, top_n_topics=88)

# Save the interactive Plotly figure as a standalone HTML file
fig.write_html("topics_per_class_visualization.html")


In [67]:
# Plot the headlines from their precomputed embeddings with the stored topic
# assignments; hide_annotations=True is passed to leave annotations off the plot.
topic_model.visualize_documents(docs=texts,topics=topic_model.topics_,embeddings=embeddings,hide_annotations=True)

In [69]:
# Rebuild the document plot from the same texts, topics and precomputed embeddings
# (hide_annotations is left at its default here) and keep a handle to the figure
fig = topic_model.visualize_documents(texts, topic_model.topics_, embeddings=embeddings)

# Save the interactive Plotly figure as a standalone HTML file
fig.write_html("consistent_documents_visualization.html")

print("The visualization has been saved as 'consistent_documents_visualization.html'")

The visualization has been saved as 'consistent_documents_visualization.html'


### Creating Representative Docs for each topic

The BERTopic model returns 3 representative docs for each topic, but in order to gain more insight into (and a better overview of) each topic it is better to have more representative docs. Here we try to get up to 20 representative docs for each topic using the model's private **_extract_representative_docs** method.

In [16]:
# Representative docs stored for the first row of the topic table (Topic -1).
c['Representative_Docs'].iloc[0]
# Only 3 representative docs are returned, let's try to get more

['News | ESSC | UAH scientist earns National ai Research Resource (NAIRR) funding to build AI foundation model for heliophysics',
 'Yann LeCun, chief AI scientist at Meta: ‘Human-level ai is going to take a long time’',
 'Microsoft quadruples its investment in AI and Cloud infrastructure in Spain to promote the deployment of responsible and secure ai in companies and public administration – Centro de noticias']

In [17]:
len(topic_model.topics_) # topics_ holds one topic id per headline, assigned when the model was trained

10000

In [18]:
from collections import defaultdict

# One row per headline: the cleaned title plus the topic the model assigned to it.
documents_df = pd.DataFrame(texts, columns=["Document"])
documents_df['Topic'] = topic_model.topics_

# Sort by topic so the keys of topics_dict are inserted in ascending topic order.
ddf = documents_df.sort_values(by='Topic')

# topic id -> row labels of the documents assigned to that topic.
# Series.items() yields each document's own row label; enumerate() would only give its
# position inside the sorted frame, which does not identify the document.
topics_dict = defaultdict(list)
for doc_id, topic in ddf['Topic'].items():
    topics_dict[topic].append(doc_id)


In [19]:
# Topic ids found in the data; -1 is the outlier topic.
topics_dict.keys()

dict_keys([-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87])

In [20]:
# Request 20 representative docs per topic (20 because HDBSCAN was given
# min_cluster_size = 20). `_extract_representative_docs` is a private BERTopic
# method; only the first element of what it returns is kept in `d`.
representative_output = topic_model._extract_representative_docs(
    c_tf_idf=topic_model.c_tf_idf_,
    documents=ddf,
    topics=topics_dict,
    nr_samples=500,
    nr_repr_docs=20,
)
d = representative_output[0]

In [21]:
c['Representative_Docs'].iloc[3] # representative docs of Topic 2 (row 3, because row 0 is Topic -1)

['Executive Conversations: Putting generative AI to work in omnichannel customer service with Prashant Singh, Chief Operating Officer at LeadSquared | Amazon Web Services',
 'DataRobot Announces New Enterprise-Grade Functionality to Close the Generative AI Confidence Gap and Accelerate Adoption',
 'What’s Next for Generative AI?']

In [22]:
d[2] # the extended set of representative docs extracted for Topic 2

['Gartner Says Generative AI for Procurement Has Hit Peak of Inflated Expectations',
 'Think 2024 On Demand | watsonx: Scale The Impact Of Generative AI',
 'Empowering minds: a round table on Generative AI and Education in Asia-Pacific',
 'New data shows that workers are bringing their own generative AI to work',
 'Generative AI vs. predictive AI: Understanding the differences',
 'Joint statement on competition in generative AI foundation models and AI products',
 'Exploring the Impact of Generative AI on Software Engineering and Career Paths',
 'How to use generative AI beyond efficiency: Just talk to it for half an hour every week – OK Tomorrow founder Nilesh Ashra',
 'Realizing the potential of generative AI in human services: Use cases to transform program delivery',
 'What’s Next for Generative AI?',
 'Generative AI vs Machine Learning: Key Differences and Use Cases',
 'Canva adds a new generative AI platform to its growing creative empire',
 'Charting a New Frontier for Generativ

In [45]:
# Check whether all the default representative docs (c) are present in the extracted ones (d).
# Each topic id is paired with its own row of `c` through the 'Topic' column rather than
# through a "row i+1 is topic i" offset, which only holds while the outlier topic -1
# occupies row 0 of the table.
for topic, default_docs in zip(c['Topic'], c['Representative_Docs']):
    topic = int(topic)
    if topic == -1:
        continue  # the outlier topic was never part of this check
    missing = sum(doc not in d[topic] for doc in default_docs)
    print(f'topic {topic}: missing rep docs: {missing}')


topic 0: missing rep docs: 3
topic 1: missing rep docs: 1
topic 2: missing rep docs: 0
topic 3: missing rep docs: 1
topic 4: missing rep docs: 0
topic 5: missing rep docs: 0
topic 6: missing rep docs: 0
topic 7: missing rep docs: 0
topic 8: missing rep docs: 0
topic 9: missing rep docs: 0
topic 10: missing rep docs: 0
topic 11: missing rep docs: 0
topic 12: missing rep docs: 0
topic 13: missing rep docs: 0
topic 14: missing rep docs: 0
topic 15: missing rep docs: 0
topic 16: missing rep docs: 0
topic 17: missing rep docs: 0
topic 18: missing rep docs: 0
topic 19: missing rep docs: 0
topic 20: missing rep docs: 0
topic 21: missing rep docs: 0
topic 22: missing rep docs: 0
topic 23: missing rep docs: 0
topic 24: missing rep docs: 0
topic 25: missing rep docs: 0
topic 26: missing rep docs: 0
topic 27: missing rep docs: 0
topic 28: missing rep docs: 0
topic 29: missing rep docs: 0
topic 30: missing rep docs: 0
topic 31: missing rep docs: 0
topic 32: missing rep docs: 0
topic 33: missing re

In [33]:
# Combine the extracted representative docs (d) with BERTopic's default ones (c).
# dict.fromkeys drops duplicates while keeping first-seen order (extracted docs first,
# then the defaults); a plain set() gives an order that is not stable across kernel
# restarts, which would make the saved file differ from run to run.
repdocs = {}
for topic, default_docs in zip(c['Topic'], c['Representative_Docs']):
    topic = int(topic)  # plain int keys so the dict can be serialised with json
    if topic == -1:
        continue  # the outlier topic is left out, as before
    repdocs[topic] = list(dict.fromkeys(list(d[topic]) + list(default_docs)))

# Preview the first two topics instead of echoing the whole dictionary.
dict(list(repdocs.items())[:2])

{0: ['AI in healthcare: The future of patient care and health management',
  'ai could help detect heart failure risk early on, study shows',
  'Customizable AI tool developed at Stanford Medicine helps pathologists identify diseased cells | News Center',
  'AI-enhanced integration of genetic and medical imaging data for risk assessment of Type 2 diabetes',
  '(PDF) Organizational readiness for ai in health care: insights for decision-making and practice',
  'AI-based system to guide stroke treatment decisions may help prevent another stroke',
  'ai -based assessment of PD-L1 expression in diffuse large B cell lymphoma | npj Precision Oncology',
  'AI in Health Care: Powering Patient Outcomes',
  'Harnessing ai (AI) in Anaesthesiology: Enhancing Patient Outcomes and Clinical Efficiency',
  'AI-based selection of individuals for supplemental MRI in population-based breast cancer screening: the randomized ScreenTrustMRI trial',
  'ai applied to coronary artery calcium scans (AI-CAC) sign

In [None]:
import json

# Serialise the merged representative docs and write them to disk as JSON.
with open("datasets/rep_docs2.json", "w") as out_file:
    out_file.write(json.dumps(repdocs))
