# Topic modeling of text using transformers

In [None]:
# Import Libraries
!pip install transformers tf-keras
!pip install bertopic sentence-transformers umap-learn hdbscan
!pip install datasets
from datasets import load_dataset
from transformers import pipeline
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
import umap
import hdbscan
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd
import numpy as np




In [None]:
# Load text
# Only the first 200 rows of the AG News training split are fetched so the
# notebook stays quick to run end to end.
dataset = load_dataset("ag_news", split="train[:200]")
# Read the whole "text" column at once and materialise it as a plain list of strings.
docs = list(dataset["text"])

In [None]:
# Generate embeddings
# Each document is encoded into one dense sentence vector.
model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(docs, show_progress_bar=True)

# Dimensionality reduction with UMAP
# UMAP is stochastic: without a fixed random_state the projection -- and therefore
# the clusters and topic keywords derived from it -- change on every run.
# Pinning the seed makes Restart & Run All reproducible.
umap_model = umap.UMAP(n_neighbors=15, n_components=5, metric='cosine', random_state=42)
reduced_embeddings = umap_model.fit_transform(embeddings)

# Clustering with HDBSCAN
# Runs on the reduced 5-dimensional vectors; the label -1 is treated as noise
# by the keyword-extraction cell further down.
clusterer = hdbscan.HDBSCAN(min_cluster_size=5, metric='euclidean', cluster_selection_method='eom')
labels = clusterer.fit_predict(reduced_embeddings)


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

In [None]:
# Create DataFrame
# One row per document, tagged with the cluster label assigned to it.
df = pd.DataFrame({'Document': docs, 'Cluster': labels})

# Extract topic keywords per cluster using TF-IDF
# Number of keywords reported for each cluster.
TOP_N_KEYWORDS = 10

# Fit a single TF-IDF model on the whole corpus so the IDF term reflects how
# distinctive a word is across *all* clusters. (Fitting a fresh vectorizer per
# cluster with max_features=10 only keeps the 10 most frequent words of that
# cluster and returns them alphabetically -- the TF-IDF scores were never used.)
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(df['Document'])
vocabulary = vectorizer.get_feature_names_out()

topics = {}
for label in sorted(set(labels)):
    if label == -1:
        continue  # Skip noise points
    # Rows of the TF-IDF matrix belonging to this cluster.
    cluster_mask = (df['Cluster'] == label).to_numpy()
    # Average TF-IDF weight of every vocabulary term within the cluster;
    # np.asarray(...).ravel() flattens the 1 x n_terms result to a 1-D vector.
    mean_scores = np.asarray(tfidf_matrix[cluster_mask].mean(axis=0)).ravel()
    # Indices of the highest-scoring terms, best first.
    top_indices = np.argsort(mean_scores)[::-1][:TOP_N_KEYWORDS]
    topics[label] = vocabulary[top_indices]

In [None]:
# Display extracted topics
# Header first, then one "Topic <id>: kw1, kw2, ..." line per cluster.
print("\nExtracted Topics:\n")
for topic_id in topics:
    keyword_list = ", ".join(topics[topic_id])
    print(f"Topic {topic_id}: {keyword_list}")


Extracted Topics:

Topic 0: ad, al, arabia, cheney, government, gt, lt, president, saudi, strong
Topic 1: air, asian, bangkok, best, blues, business, destinations, fighter, needs, strike
Topic 2: 151, ap, birds, dolphin, new, north, ocean, said, scientists, species
Topic 3: claims, com, earth, hubble, meteor, nasa, perseid, planet, shower, space
Topic 4: 36, ap, astronauts, launch, manned, million, rocket, said, space, team
Topic 5: auction, company, google, interview, ipo, offering, playboy, public, reuters, search
Topic 6: 3d, doom, elements, gameboy, games, gondry, michel, music, visual, works
Topic 7: economy, lynn, market, new, oil, opec, outlook, prices, reuters, week
Topic 8: 3d, application, code, developers, java, just, logger, microsoft, mozilla, sun


# Using BERTopic

In [None]:
# Create a BERTopic model (uses transformers under the hood)
# Two deviations from the bare BERTopic() defaults:
#   * vectorizer_model drops English stop words when building topic
#     representations; with the defaults the topics are dominated by
#     "the, to, and, of, ...".
#   * umap_model mirrors BERTopic's default UMAP settings but pins random_state
#     so the discovered topics are reproducible between runs.
topic_model = BERTopic(
    vectorizer_model=CountVectorizer(stop_words="english"),
    umap_model=umap.UMAP(
        n_neighbors=15,
        n_components=5,
        min_dist=0.0,
        metric="cosine",
        random_state=42,
    ),
)

In [None]:
# Fit the model to your documents
# fit_transform returns two values, unpacked here into `topics` and `probs`.
# NOTE: this rebinds `topics`, which the TF-IDF section above used for its
# label -> keywords dict, so re-running that section's display cell after this
# one will not work without re-running the extraction cell first.
topics, probs = topic_model.fit_transform(documents=docs)

In [None]:
# Display topics
# Summary table from BERTopic (the saved output shows the columns Topic, Count,
# Name, Representation and Representative_Docs). Leaving the DataFrame as the
# cell's last expression renders it as a rich table instead of the wrapped,
# truncated text that print() produces.
topic_info = topic_model.get_topic_info()
topic_info

   Topic  Count              Name  \
0     -1     32  -1_the_to_and_of   
1      0     99    0_the_to_of_in   
2      1     69    1_ap_the_to_of   

                                      Representation  \
0        [the, to, and, of, in, on, for, is, by, as]   
1  [the, to, of, in, and, for, that, is, on, reut...   
2  [ap, the, to, of, spacecom, and, in, space, fo...   

                                 Representative_Docs  
0  [NTP in Debian \\The Network Time Daemon (NTP ...  
1  [Why Windows isn't Unix \\"I first heard about...  
2  [Marine Expedition Finds New Species (AP) AP -...  
