In [1]:
%pip install -r requirements.txt -q


Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import warnings

# Notebook-wide noise reduction: set the Hugging Face Hub flag that disables its
# symlink warning, and ignore every Python warning raised by later cells.
os.environ.update({"HF_HUB_DISABLE_SYMLINKS_WARNING": "1"})
warnings.simplefilter("ignore")


# Lab 5 - Clustering and Topic Modeling

This notebook explores text clustering and topic modeling using the ArXiv NLP dataset. The pipeline consists of:
1. Embedding documents with sentence-transformers
2. Dimensionality reduction with UMAP
3. Clustering with HDBSCAN
4. Topic modeling with BERTopic

## 1. Load Dataset


In [3]:
from datasets import load_dataset

# Load the ArXiv NLP corpus and keep only its train split.
dataset = load_dataset("maartengr/arxiv_nlp")["train"]

# Pull out the two text columns used by the rest of the notebook.
abstracts, titles = (dataset[column] for column in ("Abstracts", "Titles"))
print(f"Loaded {len(abstracts)} documents")


Generating train split: 44949 examples [00:00, 89076.15 examples/s]


Loaded 44949 documents


## 2. Create Embeddings

Using `thenlper/gte-small` - a compact but effective embedding model optimized for semantic similarity.


In [4]:
from pathlib import Path

import numpy as np
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer('thenlper/gte-small')

# Encoding the whole corpus is the slowest step of the notebook, so the result is
# cached on disk: a restarted kernel (or an interrupted run) can reload it instead
# of re-encoding every abstract.
EMBEDDINGS_CACHE = Path("embeddings_gte_small.npy")

embeddings = np.load(EMBEDDINGS_CACHE) if EMBEDDINGS_CACHE.exists() else None

# Re-encode when there is no cache, or when the cached array does not have one row
# per abstract (e.g. it was written for a different version of the dataset).
if embeddings is None or embeddings.shape[0] != len(abstracts):
    embeddings = embedding_model.encode(abstracts, show_progress_bar=True)
    np.save(EMBEDDINGS_CACHE, embeddings)

print(f"Embeddings shape: {embeddings.shape}")


Batches:   1%|          | 12/1405 [01:00<1:57:15,  5.05s/it]


KeyboardInterrupt: 

## 3. Dimensionality Reduction with UMAP

Reducing from 384 dimensions to 5 to enable effective clustering. UMAP preserves both local and global structure.


In [None]:
from umap import UMAP

# 5-component projection of the document embeddings, built with a fixed random_state.
umap_model = UMAP(
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
)
reduced_embeddings = umap_model.fit_transform(embeddings)


## 4. Clustering with HDBSCAN

HDBSCAN automatically determines cluster count and handles outliers (labeled as -1).


In [None]:
from hdbscan import HDBSCAN

hdbscan_model = HDBSCAN(min_cluster_size=50, metric='euclidean', cluster_selection_method='eom')
hdbscan_model.fit(reduced_embeddings)
clusters = hdbscan_model.labels_

# Label -1 marks outliers, not a cluster, so it must not be included in the
# cluster count; report the outliers separately instead.
n_clusters = len(set(clusters) - {-1})
n_outliers = int((clusters == -1).sum())
print(f"Number of clusters: {n_clusters}")
print(f"Outliers (label -1): {n_outliers} of {len(clusters)} documents")


In [None]:
import numpy as np

# Inspect first 3 documents in cluster 0
cluster = 0
print(f"Sample documents from cluster {cluster}:\n")

# Materialise the abstracts once, outside the loop: converting inside the loop
# rebuilt the full list of abstracts for every printed sample.
abstract_list = list(abstracts)
for index in np.where(clusters==cluster)[0][:3]:
    print(abstract_list[index][:200] + "...\n")


## 5. Cluster Visualization

Reducing to 2D for visualization. Outliers shown in grey.


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# A separate 2-component UMAP projection, used only for plotting.
projection_2d = UMAP(
    n_components=2,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
)
reduced_2d = projection_2d.fit_transform(embeddings)

# One row per document: 2-D coordinates, title, and cluster label stored as a string.
df = pd.DataFrame(reduced_2d, columns=["x", "y"])
df["title"] = titles
df["cluster"] = list(map(str, clusters))

# Split outliers (label "-1") from documents that were assigned to a cluster.
is_outlier = df["cluster"] == "-1"
clusters_df = df.loc[~is_outlier, :]
outliers_df = df.loc[is_outlier, :]

fig, ax = plt.subplots(figsize=(12, 8))
# Outliers are drawn first, faint and grey, so the coloured clusters sit on top.
ax.scatter(outliers_df.x, outliers_df.y, alpha=0.05, s=2, c="grey")
ax.scatter(
    clusters_df.x,
    clusters_df.y,
    c=clusters_df.cluster.astype(int),
    alpha=0.6,
    s=2,
    cmap='tab20b',
)
ax.axis('off')
ax.set_title("Document Clusters (grey = outliers)")
plt.show()


## 6. Topic Modeling with BERTopic

BERTopic extracts keywords for each cluster using c-TF-IDF, creating interpretable topic descriptions.


In [None]:
from bertopic import BERTopic

# Assemble BERTopic from the embedding, UMAP and HDBSCAN objects created in the
# earlier cells.
bertopic_pipeline = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    verbose=True,
)

# The precomputed embeddings are supplied together with the documents.
topic_model = bertopic_pipeline.fit(abstracts, embeddings)


In [None]:
# Summary table of the fitted topics; only the first 10 rows are displayed.
topic_model.get_topic_info().head(10)


In [None]:
# Get top keywords for topic 0
# (later cells unpack each returned entry as a (word, score) pair)
topic_model.get_topic(0)


## 7. Representation Models

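Improving topic representations with KeyBERT-inspired keyword extraction for more coherent keywords. `MaximalMarginalRelevance` (MMR) is imported as an alternative representation model, but only `KeyBERTInspired` is applied in the cell below.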


In [None]:
from copy import deepcopy
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance

# Snapshot the current keywords first: update_topics() below overwrites them in place.
original_topics = deepcopy(topic_model.topic_representations_)

# Update with KeyBERTInspired
representation_model = KeyBERTInspired()
topic_model.update_topics(abstracts, representation_model=representation_model)

# Compare original vs updated for topic 0 (top five words of each representation)
original_words = [word for word, _score in original_topics[0][:5]]
keybert_words = [word for word, _score in topic_model.get_topic(0)[:5]]
print("Original:", original_words)
print("KeyBERT:", keybert_words)


## 8. Activity: Topic Labels with Groq API

This activity implements the `update_topics_with_groq` function, which generates human-readable topic labels using an LLM served through the Groq API.


In [None]:
from groq import Groq
from dotenv import load_dotenv
import time
import random

# Load variables from a local .env file into the process environment.
# NOTE(review): Groq() is created without arguments below, so the API key is
# presumably read from the environment (e.g. GROQ_API_KEY) — confirm.
load_dotenv()

def _parse_topic_label(raw_reply):
    """Extract the bare label from a model reply of the form ``topic: <label>``.

    Returns an empty string when the reply is ``None`` or contains no label text.
    """
    text = (raw_reply or "").strip()
    if text.lower().startswith("topic:"):
        text = text[len("topic:"):].strip()
    # The model may append an explanation on later lines; only the first line is the label.
    return text.splitlines()[0].strip() if text else ""


def update_topics_with_groq(topic_model, documents, model="meta-llama/llama-4-scout-17b-16e-instruct"):
    """Label every non-outlier topic of ``topic_model`` with an LLM-generated name.

    For each topic (topic ``-1`` is skipped) a prompt is built from up to five
    representative documents and the top ten keywords, sent to the Groq chat
    completions API, and the reply is stored as the topic's label. Each request
    is tried up to three times with exponential backoff between attempts; if
    all attempts fail the topic falls back to the label ``"Topic <id>"``.

    Args:
        topic_model: Fitted topic model exposing ``get_topics()``,
            ``get_representative_docs(topic_id)`` and ``set_topic_labels(labels)``.
        documents: Not used by this function; kept so existing calls keep working.
        model: Name of the Groq-hosted chat model to query.

    Returns:
        The same ``topic_model`` object, after ``set_topic_labels`` has been called
        with a ``{topic_id: label}`` dict.
    """
    # Created without arguments, so credentials are expected to come from the
    # environment — NOTE(review): confirm the variable name the client reads.
    client = Groq()
    topics = topic_model.get_topics()
    updated_labels = {}
    max_attempts = 3

    prompt_template = """I have a topic that contains the following documents:
{documents}

The topic is described by the following keywords: '{keywords}'.

Based on the documents and keywords, what is this topic about?
Respond with only: topic: <short topic label>"""

    for topic_id in topics:
        if topic_id == -1:
            continue

        # Guard against a missing (None) result so one topic without representative
        # documents cannot abort the whole labelling run.
        rep_docs = (topic_model.get_representative_docs(topic_id) or [])[:5]
        docs_text = "\n".join(rep_docs)
        keywords = ", ".join([w for w, _ in topics[topic_id][:10]])

        prompt = prompt_template.format(documents=docs_text, keywords=keywords)

        for attempt in range(max_attempts):
            try:
                response = client.chat.completions.create(
                    model=model,
                    messages=[
                        {"role": "system", "content": "You are a helpful assistant that creates concise topic labels."},
                        {"role": "user", "content": prompt}
                    ],
                    temperature=0,
                    max_tokens=500
                )
                label = _parse_topic_label(response.choices[0].message.content)
                if not label:
                    # Treat an empty reply like any other failure so it is retried.
                    raise ValueError("model returned an empty topic label")
                updated_labels[topic_id] = label
                # Short randomized pause between successful requests.
                time.sleep(random.uniform(0.5, 1.0))
                break
            except Exception as e:
                # Deliberately broad: labelling is best-effort and must not crash the notebook.
                print(f"Retry {attempt+1} for topic {topic_id}: {e}")
                # Back off only when another attempt follows; sleeping after the
                # final failure would just delay the fallback.
                if attempt < max_attempts - 1:
                    time.sleep(2 ** attempt)
        else:
            updated_labels[topic_id] = f"Topic {topic_id}"

    topic_model.set_topic_labels(updated_labels)
    return topic_model

# Run the function (uncomment when API key is set)
# topic_model = update_topics_with_groq(topic_model, abstracts)
# topic_model.get_topic_info().head(10)


## 9. Word Cloud Visualization


In [None]:
from wordcloud import WordCloud

# Recompute the topic representations with top_n_words=100; the word cloud below is capped at max_words=100.
# NOTE(review): no representation_model is passed here, so this presumably replaces
# the KeyBERT-based keywords set in section 7 — confirm that is intended.
topic_model.update_topics(abstracts, top_n_words=100)

def create_wordcloud(model, topic):
    """Draw the keywords of one topic as a word cloud.

    Args:
        model: Topic model whose ``get_topic(topic)`` yields ``(word, weight)`` pairs.
        topic: Id of the topic to draw; it is also shown in the figure title.
    """
    plt.figure(figsize=(10, 5))

    # Word -> weight mapping handed to WordCloud as the frequencies.
    frequencies = dict(model.get_topic(topic))

    cloud_options = {
        "background_color": "white",
        "max_words": 100,
        "width": 1600,
        "height": 800,
    }
    cloud = WordCloud(**cloud_options)
    cloud.generate_from_frequencies(frequencies)

    plt.imshow(cloud, interpolation="bilinear")
    plt.axis("off")
    plt.title(f"Topic {topic}")
    plt.show()

# Render the word cloud for topic 0.
create_wordcloud(topic_model, topic=0)
