In [1]:
!pip install gensim pyLDAvis umap-learn bertopic
!pip install scipy==1.11.4 --force-reinstall

Collecting bertopic
  Downloading bertopic-0.17.3-py3-none-any.whl.metadata (24 kB)
Collecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn

In [2]:
import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.datasets import fetch_20newsgroups
import gensim
from gensim import corpora
import nltk
from nltk.corpus import stopwords
from bertopic import BERTopic
import logging
import warnings
os.environ["TOKENIZERS_PARALLELISM"] = "false"  # Avoid tokenizer fork warnings

warnings.filterwarnings("ignore")
nltk.download('stopwords')

# Load sample dataset
data = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))
docs = data.data[:2000]  # Limit to speed up processing

# --------------------- #
# 📌 Preprocessing
# --------------------- #
from sklearn.pipeline import Pipeline
import re
import spacy
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])

def preprocess(doc):
    doc = re.sub(r"\S*@\S*\s?", "", doc)  # Remove emails
    doc = re.sub(r"\'", "", doc)
    doc = re.sub(r"\W+", " ", doc.lower())  # Remove non-alphanumeric
    doc = nlp(doc)
    return " ".join([token.lemma_ for token in doc if token.is_alpha and not token.is_stop and len(token) > 2])

docs_clean = [preprocess(doc) for doc in docs]

2025-07-20 15:45:03.998405: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753026304.218407      13 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753026304.280266      13 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# --------------------- #
# LDA
# --------------------- #
tokenized = [doc.split() for doc in docs_clean]
dictionary = corpora.Dictionary(tokenized)
corpus = [dictionary.doc2bow(text) for text in tokenized]
lda_model = gensim.models.LdaModel(corpus=corpus, id2word=dictionary, num_topics=5, random_state=42, passes=10)

print("\n LDA Topics:")
for i, topic in lda_model.print_topics(num_words=10):
    print(f"Topic {i}: {topic}")



 LDA Topics:
Topic 0: 0.007*"like" + 0.005*"good" + 0.005*"know" + 0.005*"year" + 0.005*"think" + 0.004*"thing" + 0.004*"go" + 0.004*"get" + 0.003*"new" + 0.003*"time"
Topic 1: 0.008*"people" + 0.005*"say" + 0.004*"armenian" + 0.004*"president" + 0.004*"government" + 0.004*"think" + 0.003*"time" + 0.003*"year" + 0.003*"state" + 0.003*"go"
Topic 2: 0.007*"think" + 0.006*"know" + 0.005*"time" + 0.004*"game" + 0.004*"like" + 0.004*"people" + 0.004*"way" + 0.003*"go" + 0.003*"year" + 0.003*"child"
Topic 3: 0.008*"god" + 0.006*"people" + 0.005*"know" + 0.004*"think" + 0.004*"say" + 0.004*"time" + 0.004*"jesus" + 0.003*"thing" + 0.003*"believe" + 0.003*"like"
Topic 4: 0.008*"use" + 0.007*"file" + 0.006*"system" + 0.005*"image" + 0.005*"jpeg" + 0.005*"program" + 0.004*"work" + 0.004*"window" + 0.004*"know" + 0.004*"drive"


In [4]:
# --------------------- #
# NMF
# --------------------- #
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(docs_clean)

nmf_model = NMF(n_components=5, random_state=42)
nmf_model.fit(tfidf)

print("\n NMF Topics:")
feature_names = tfidf_vectorizer.get_feature_names_out()
for topic_idx, topic in enumerate(nmf_model.components_):
    print(f"Topic {topic_idx}: {' '.join([feature_names[i] for i in topic.argsort()[:-11:-1]])}")


 NMF Topics:
Topic 0: think people like know good year time right game thing
Topic 1: window file program run thank use work application server mail
Topic 2: drive scsi disk hard mac controller floppy external ide thank
Topic 3: god jesus bible believe christian sin christ people law human
Topic 4: card driver bus video color monitor vlb isa cache controller


In [5]:
# --------------------- #
# BERTopic
# --------------------- #
print("\n Running BERTopic...")
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")  # Better for short texts

bertopic = BERTopic(embedding_model=embedding_model, min_topic_size=10, verbose=True)
topics, _ = bertopic.fit_transform(docs_clean)

print("\n BERTopic Topics:")
print(bertopic.get_topic_info().head(10))  # Print top 10 topics



 Running BERTopic...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

2025-07-20 15:46:51,214 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/63 [00:00<?, ?it/s]

2025-07-20 15:47:30,121 - BERTopic - Embedding - Completed ✓
2025-07-20 15:47:30,122 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-07-20 15:47:45,120 - BERTopic - Dimensionality - Completed ✓
2025-07-20 15:47:45,121 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-07-20 15:47:45,202 - BERTopic - Cluster - Completed ✓
2025-07-20 15:47:45,214 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-07-20 15:47:45,397 - BERTopic - Representation - Completed ✓



 BERTopic Topics:
   Topic  Count                        Name  \
0     -1     32       -1_pen_team_game_blue   
1      0   1751     0_know_people_think_use   
2      1     77      1_year_win_game_player   
3      2     57           2_det_tor_nyr_bos   
4      3     36  3_inguiry_wrong_sure_think   
5      4     25                  4_uhhhh___   
6      5     22      5_game_hockey_espn_abc   

                                      Representation  \
0  [pen, team, game, blue, philadelphia, face, jo...   
1  [know, people, think, use, like, time, work, s...   
2  [year, win, game, player, hit, team, good, bas...   
3  [det, tor, nyr, bos, chi, mtl, shot, game, tea...   
4          [inguiry, wrong, sure, think, , , , , , ]   
5                          [uhhhh, , , , , , , , , ]   
6  [game, hockey, espn, abc, coverage, watch, cit...   

                                 Representative_Docs  
0  [pocklington want wake power hold political of...  
1  [afraid go kill worry love christian guara

In [6]:
# --------------------- #
# Topic Coherence Comparison
# --------------------- #
from gensim.models.coherencemodel import CoherenceModel

def compute_coherence(model, texts, dictionary, topn=10):
    if isinstance(model, gensim.models.ldamodel.LdaModel):
        return CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v').get_coherence()
    
    elif isinstance(model, NMF):
        topics = []
        for topic_weights in model.components_:
            top_words = [feature_names[i] for i in topic_weights.argsort()[:-topn - 1:-1]]
            topics.append(top_words)
        return CoherenceModel(topics=topics, texts=texts, dictionary=dictionary, coherence='c_v').get_coherence()
    
    elif isinstance(model, BERTopic):
        # FIXED: Extract only the words from each topic
        topics = [
            [word for word, _ in topic_words]
            for topic_id, topic_words in model.get_topics().items()
            if topic_id >= 0
        ]
        return CoherenceModel(topics=topics, texts=texts, dictionary=dictionary, coherence='c_v').get_coherence()
    
    else:
        return None

print("\n Coherence Scores:")
print("LDA:", compute_coherence(lda_model, tokenized, dictionary))
print("NMF:", compute_coherence(nmf_model, tokenized, dictionary))
print("BERTopic:", compute_coherence(bertopic, tokenized, dictionary))



 Coherence Scores:
LDA: 0.4602518183073701
NMF: 0.6013958308359433
BERTopic: 0.5789760060016346
