In [None]:
pip install --quiet "bertopic[visualization]" sentence-transformers umap-learn hdbscan gensim wordcloud plotly

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0m [31m84.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m63.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m59.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m32.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:


import os, re
from pathlib import Path
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import warnings
warnings.filterwarnings("ignore")

# BERTopic & components
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from sklearn.feature_extraction.text import TfidfVectorizer
from umap import UMAP
import hdbscan

# BERTopic visuals
from bertopic.plotting import (
    visualize_barchart,
    visualize_hierarchy,
    visualize_topics,
    visualize_documents,
    visualize_term_rank,
    visualize_topics_over_time,
)

# Gensim utils (used internally by BERTopic.Coherence if needed)
from gensim.corpora import Dictionary
from gensim.models import CoherenceModel
from gensim.utils import simple_preprocess

# -------------------------
# USER KNOBS — adjust for lab/demo
# -------------------------
# Directory that load_texts_simple() walks recursively to collect documents.
DATA_ROOT = r"/kaggle/input/20-newsgroups/mini_newsgroups/mini_newsgroups"
# Model name handed to SentenceTransformer(...) to embed the documents.
EMBEDDING_MODEL = "all-MiniLM-L6-v2"
# Passed to UMAP as n_neighbors / min_dist / n_components respectively.
UMAP_N_NEIGHBORS = 15
UMAP_MIN_DIST = 0.1
UMAP_N_COMPONENTS = 5
# Passed to hdbscan.HDBSCAN as min_cluster_size.
HDBSCAN_MIN_CLUSTER_SIZE = 40
# Passed to BERTopic as top_n_words; also caps the words per topic that are
# fed into the coherence computation.
TOP_N_WORDS = 12
# Passed to BERTopic as calculate_probabilities.
CALCULATE_PROBABILITIES = False
# Seed passed to UMAP as random_state (the only consumer in this script).
RANDOM_STATE = 42
# -------------------------


def load_texts_simple(root_dir, max_files=None, min_tokens=6):
    """Recursively read every file under *root_dir* as text.

    Files are visited in sorted path order so the returned lists (and any
    ``max_files`` subset) are identical on every run; ``Path.rglob`` on its
    own yields entries in filesystem order, which is not guaranteed stable.

    Args:
        root_dir: Directory to walk recursively.
        max_files: Stop once this many documents have been kept. ``None`` (or
            any other falsy value such as ``0``) means "no limit".
        min_tokens: Minimum number of whitespace-separated tokens a file must
            contain to be kept.

    Returns:
        A ``(docs, paths)`` tuple of equal-length lists: the stripped text of
        each kept file and the corresponding path as a string.
    """
    docs, paths = [], []
    for file in sorted(Path(root_dir).rglob("*")):
        if not file.is_file():
            continue
        try:
            # errors="replace" already prevents decode failures, so only
            # OS-level problems (permissions, vanished file, ...) can occur.
            text = file.read_text(encoding="utf8", errors="replace").strip()
        except OSError:
            # Best-effort loader: an unreadable file is skipped, not fatal.
            continue
        if not text:
            continue
        if len(text.split()) < min_tokens:
            continue
        docs.append(text)
        paths.append(str(file))
        if max_files and len(docs) >= max_files:
            break
    return docs, paths

# -------------------------
# Light preprocessing: remove headers, urls, emails, signatures
# (keeps natural text so embeddings remain meaningful)
# -------------------------
# Compiled once at import time; light_preprocess() runs once per document.
# The header pattern is anchored per line (re.M) and does NOT consume the
# newline, so several header lines in a row are all removed.
_HEADER_LINE_RE = re.compile(
    r"^(?:from|subject|to|cc|reply-to|message-id):.*$", re.I | re.M
)
_URL_RE = re.compile(r"http\S+|www\.\S+")
_EMAIL_RE = re.compile(r"\S+@\S+")
_DASH_RUN_RE = re.compile(r"[-]{2,}.*")
_NON_TEXT_RE = re.compile(r"[^A-Za-z0-9'\s]")
_WHITESPACE_RE = re.compile(r"\s+")


def light_preprocess(text):
    """Return *text* with mail headers, URLs, e-mail addresses and noise removed.

    Steps, in order:
      1. drop whole lines starting with From/Subject/To/Cc/Reply-To/Message-ID
         (case-insensitive);
      2. drop URLs (``http...`` / ``www....``);
      3. drop tokens containing ``@``;
      4. drop everything from a run of two or more dashes to the end of that
         line (crude signature / separator removal);
      5. replace every character that is not a letter, digit, apostrophe or
         whitespace with a space;
      6. collapse whitespace runs to a single space and strip the ends.

    Args:
        text: Raw document text.

    Returns:
        The cleaned, single-line string (possibly empty).
    """
    text = _HEADER_LINE_RE.sub(" ", text)
    text = _URL_RE.sub(" ", text)
    text = _EMAIL_RE.sub(" ", text)
    text = _DASH_RUN_RE.sub(" ", text)   # crude signature removal
    text = _NON_TEXT_RE.sub(" ", text)
    return _WHITESPACE_RE.sub(" ", text).strip()


# -------------------------
# Load the raw corpus
# -------------------------
docs, paths = load_texts_simple(DATA_ROOT, max_files=None, min_tokens=6)
if not docs:
    # Stop here with an actionable message; otherwise the failure only shows
    # up later as an obscure error from the embedding / UMAP steps.
    raise FileNotFoundError(
        f"No usable documents found under DATA_ROOT={DATA_ROOT!r}"
    )

# -------------------------
# Apply light preprocessing
# -------------------------
# docs_clean stays index-aligned with docs / paths (one entry per document).
print("Applying light preprocessing...")
docs_clean = [light_preprocess(d) for d in tqdm(docs)]

# -------------------------
# Embeddings (precompute)
# -------------------------
# Every cleaned document is encoded once; the matrix is reused by the UMAP
# step below and passed to BERTopic.fit_transform later in the script.
print("Loading SentenceTransformer:", EMBEDDING_MODEL)
embedder = SentenceTransformer(EMBEDDING_MODEL)

print("Computing embeddings...")
embeddings = embedder.encode(
    docs_clean,
    show_progress_bar=True,
    convert_to_numpy=True,
)
print("Embeddings shape:", embeddings.shape)

# -------------------------
# UMAP reduction
# -------------------------
# NOTE(review): this same umap_model instance is handed to BERTopic further
# down, which presumably fits it again — confirm whether this standalone fit
# is only needed for the diagnostics printed here.
print("Fitting UMAP...")
umap_model = UMAP(
    n_neighbors=UMAP_N_NEIGHBORS,
    n_components=UMAP_N_COMPONENTS,
    min_dist=UMAP_MIN_DIST,
    metric="cosine",
    random_state=RANDOM_STATE,
)
embeddings_reduced = umap_model.fit_transform(embeddings)
print("UMAP reduced shape:", embeddings_reduced.shape)

# -------------------------
# HDBSCAN clustering
# -------------------------
print("Fitting HDBSCAN...")
hdbscan_model = hdbscan.HDBSCAN(
    min_cluster_size=HDBSCAN_MIN_CLUSTER_SIZE,
    metric="euclidean",
    cluster_selection_method="eom",
    prediction_data=True,
)
labels = hdbscan_model.fit_predict(embeddings_reduced)

# Label -1 is treated as noise and excluded from the cluster count.
noise_mask = labels == -1
n_clusters = len(np.unique(labels[~noise_mask]))
n_noise = int(np.count_nonzero(noise_mask))
print(f"HDBSCAN found {n_clusters} clusters (excluding -1). Noise points: {n_noise}")

# -------------------------
# BERTopic (KeyBERTInspired + c-TF-IDF)
# -------------------------
# BERTopic's `vectorizer_model` is expected to produce raw term counts; the
# class-based TF-IDF weighting is applied by BERTopic on top of them. Passing
# a TfidfVectorizer here would weight the terms twice, so a CountVectorizer
# with the same n-gram / document-frequency settings is used instead.
from sklearn.feature_extraction.text import CountVectorizer

print("Building BERTopic model (KeyBERTInspired + TF-IDF)...")
# NOTE(review): BERTopic presumably fits the vectorizer on one concatenated
# document per topic, in which case min_df / max_df count *topics*, not
# documents, and min_df=5 can fail when only a few topics are found — confirm.
vectorizer = CountVectorizer(ngram_range=(1, 2), max_df=0.85, min_df=5)
representation = KeyBERTInspired()

topic_model = BERTopic(
    embedding_model=embedder,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer,
    representation_model=representation,
    top_n_words=TOP_N_WORDS,
    calculate_probabilities=CALCULATE_PROBABILITIES,
    verbose=True,
)

print("Fitting BERTopic (this will extract topics using c-TF-IDF)...")
# The precomputed embeddings are passed so the documents are not re-encoded.
topics, probs = topic_model.fit_transform(docs_clean, embeddings)

# -------------------------
# Print topic info
# -------------------------
print("\n=== TOPIC INFO (top rows) ===")
print(topic_model.get_topic_info().head(15).to_string(index=False))

# -------------------------
# Coherence
# -------------------------
# ---- Tokenize docs for coherence ----
# Use the analyzer of the vectorizer the topic model was built with, so the
# coherence dictionary contains the same kind of terms (including n-grams)
# that appear in the topic representations. A plain unigram tokenizer would
# leave every multi-word topic term without a dictionary entry.
analyzer = topic_model.vectorizer_model.build_analyzer()
tokenized_docs = [analyzer(doc) for doc in docs_clean]

# ---- Build dictionary ----
dictionary = Dictionary(tokenized_docs)

# ---- Extract top words from each topic for coherence ----
# Topic -1 is skipped. Empty padding terms and terms that never occur in the
# tokenized corpus are dropped, because the coherence model can only score
# terms that have a dictionary id.
topics_dict = topic_model.get_topics()
topic_words = []
for tid, words in topics_dict.items():
    if tid == -1:
        continue
    known_words = [w for w, _ in words[:TOP_N_WORDS] if w in dictionary.token2id]
    if known_words:
        topic_words.append(known_words)

# ---- Compute coherence ----
if topic_words:
    cm = CoherenceModel(topics=topic_words, texts=tokenized_docs,
                        dictionary=dictionary, coherence='c_v')
    print("Overall Coherence (c_v):", round(cm.get_coherence(), 4))
else:
    # Nothing to score (e.g. every document ended up in the outlier topic).
    cm = None
    print("Overall Coherence (c_v): n/a (no scorable topics)")

# -------------------------
# Visualizations (display only, no saving)
# - visualize_barchart, visualize_hierarchy, visualize_topics,
#   visualize_documents -> figures displayed with .show()
# - WordClouds (further below) -> matplotlib plt.show()
# -------------------------
# 1) barchart
fig_b = visualize_barchart(topic_model, top_n_topics=15)
fig_b.show()

# 2) hierarchy
fig_h = visualize_hierarchy(topic_model)
fig_h.show()

# 3) topic map
fig_t = visualize_topics(topic_model)
fig_t.show()

# 4) documents (embedding space)
# Reuse the embeddings computed earlier instead of letting the plot function
# encode the whole corpus a second time.
fig_d = visualize_documents(topic_model, docs_clean, embeddings=embeddings)
fig_d.show()

# WordClouds for top 5 non-outlier topics
# (the first five rows of get_topic_info() after dropping topic -1)
print("\nDisplaying WordClouds for top topics...")
topic_info = topic_model.get_topic_info()
top_topics = topic_info[topic_info.Topic != -1].head(5).Topic.tolist()
for tid in top_topics:
    # `or []` guards against a falsy result for a topic without terms.
    topic_terms = topic_model.get_topic(int(tid)) or []
    # Keep only real terms with a positive weight: empty placeholder terms and
    # zero/negative weights cannot be drawn and would distort (or break) the
    # frequency normalisation inside WordCloud.
    words_freq = {word: weight for word, weight in topic_terms if word and weight > 0}
    if not words_freq:
        continue
    wc = WordCloud(width=900, height=400, background_color="white").generate_from_frequencies(words_freq)
    plt.figure(figsize=(12, 5))
    plt.imshow(wc, interpolation='bilinear')
    plt.axis("off")
    plt.title(f"WordCloud — Topic {tid}")
    plt.show()



AttributeError: module 'sklearn.metrics._dist_metrics' has no attribute 'DistanceMetric64'