In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import base64
from io import BytesIO
import re
from datetime import datetime

# Stopwords --> 
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Sentence embeddings
from sentence_transformers import SentenceTransformer

# Clustering
from sklearn.cluster import MiniBatchKMeans
from sklearn.decomposition import IncrementalPCA

# LLM summarization
from transformers import pipeline

# Keyword extraction
from sklearn.feature_extraction.text import TfidfVectorizer

# -----------------------------
# Step 1: Load dataset
# -----------------------------
df = pd.read_csv("./abcnews-date-text.csv")
texts = df["headline_text"].dropna().astype(str).tolist()
print("Loaded", len(texts), "headlines")

# -----------------------------
# Step 2: Fast preprocessing
# -----------------------------
def fast_preprocess(text):
    text = re.sub(r"[^a-zA-Z\s]", "", text.lower())
    return " ".join([w for w in text.split() if w not in ENGLISH_STOP_WORDS and len(w) > 2])

print("Preprocessing texts...")
processed_texts = [fast_preprocess(t) for t in texts]

# -----------------------------
# Step 3: Embeddings (Sentence-BERT, GPU if available)
# -----------------------------
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"
print(">>> Using device:", device)

embedder = SentenceTransformer("all-MiniLM-L6-v2", device=device)
X = embedder.encode(processed_texts, batch_size=512, show_progress_bar=True)

# -----------------------------
# Step 4: Clustering (MiniBatchKMeans for scalability)
# -----------------------------
n_clusters = 50  # adjust based on how granular you want
print(f"Clustering into {n_clusters} clusters...")
clusterer = MiniBatchKMeans(n_clusters=n_clusters, batch_size=10000, random_state=42)
labels = clusterer.fit_predict(X)

# -----------------------------
# Step 5: Dimensionality reduction for visualization
# -----------------------------
print("Reducing dimensions with IncrementalPCA...")
ipca = IncrementalPCA(n_components=2, batch_size=10000)
reduced = ipca.fit_transform(X)

# -----------------------------
# Step 6: Cluster analysis (keywords + summarization)
# -----------------------------
clustered = pd.DataFrame({"headline": texts, "processed": processed_texts, "cluster": labels})

# TF-IDF for keywords
vectorizer = TfidfVectorizer(max_features=5000)
vectorizer.fit(processed_texts)

# Summarizer (GPU if available)
summarizer = pipeline("summarization", model="google/flan-t5-small",
                      device=0 if torch.cuda.is_available() else -1)

def summarize_cluster(headlines):
    if len(headlines) == 0:
        return "No data"
    text = " ".join(headlines[:10])
    summary = summarizer(text, max_length=30, min_length=5, do_sample=False)
    return summary[0]["summary_text"]

cluster_labels = {}
report_rows = []

# Analyze only top clusters by size (saves summarization time)
top_clusters = clustered["cluster"].value_counts().head(15).index

for c in top_clusters:
    cluster_headlines = clustered[clustered.cluster == c].headline.tolist()
    cluster_processed = clustered[clustered.cluster == c].processed.tolist()

    # Keywords
    cluster_texts = " ".join(cluster_processed)
    keywords = vectorizer.transform([cluster_texts]).toarray().flatten()
    top_idx = keywords.argsort()[-10:][::-1]
    top_words = [vectorizer.get_feature_names_out()[i] for i in top_idx]

    # LLM summary
    summary = summarize_cluster(cluster_headlines)

    cluster_labels[c] = summary if summary != "No data" else ", ".join(top_words[:3])

    report_rows.append({
        "cluster_id": int(c),
        "size": len(cluster_headlines),
        "top_keywords": ", ".join(top_words),
        "summary": summary,
        "sample_headlines": " | ".join(cluster_headlines[:5])
    })

# -----------------------------
# Step 7: Visualization with labels
# -----------------------------
plt.figure(figsize=(12, 8))
sns.scatterplot(x=reduced[:, 0], y=reduced[:, 1], hue=labels, palette="tab10", s=10, legend=None)

# Annotate cluster centers with summaries
for c in top_clusters:
    cluster_points = reduced[labels == c]
    if len(cluster_points) == 0: continue
    center = cluster_points.mean(axis=0)
    plt.text(center[0], center[1], cluster_labels.get(c, str(c)),
             fontsize=9, weight='bold',
             bbox=dict(facecolor='white', alpha=0.6, boxstyle='round,pad=0.3'))

plt.title("Headline Clusters (MiniBatchKMeans + IncrementalPCA)")
buf = BytesIO()
plt.savefig(buf, format='png', bbox_inches='tight')
buf.seek(0)
main_plot_b64 = base64.b64encode(buf.read()).decode('utf-8')
plt.close()

# -----------------------------
# Step 8: Export report (CSV/Excel/HTML)
# -----------------------------
report_df = pd.DataFrame(report_rows)
report_df.to_csv("cluster_report.csv", index=False)
report_df.to_excel("cluster_report.xlsx", index=False)
print("\nCluster report saved as CSV and Excel.")

html_parts = []
html_parts.append(f"<h1>Unsupervised NLP Cluster Report</h1>")
html_parts.append(f"<p>Generated: {datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')} UTC</p>")
html_parts.append("<h2>Main cluster visualization</h2>")
html_parts.append(f"<img src=\"data:image/png;base64,{main_plot_b64}\" style=\"max-width:100%;height:auto;\"/>")

html_parts.append("<h2>Cluster details (Top 15 by size)</h2>")
for row in report_rows:
    html_parts.append(f"<div style='border:1px solid #ddd;padding:10px;margin:10px 0;border-radius:6px;'>")
    html_parts.append(f"<h3>Cluster {row['cluster_id']} (size={row['size']}): {row['summary']}</h3>")
    html_parts.append(f"<p><b>Top keywords:</b> {row['top_keywords']}</p>")
    html_parts.append(f"<p><b>Sample headlines:</b> {row['sample_headlines']}</p>")
    html_parts.append("</div>")

html_content = "\n".join(html_parts)
with open("cluster_report.html", "w", encoding="utf-8") as f:
    f.write("<html><head><meta charset='utf-8'></head><body>")
    f.write(html_content)
    f.write("</body></html>")

print("HTML report saved to cluster_report.html")
print("\nPipeline complete ✅")


  from .autonotebook import tqdm as notebook_tqdm


Loaded 1244184 headlines
Preprocessing texts...
>>> Using device: cuda


Batches: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2431/2431 [04:18<00:00,  9.39it/s]


Clustering into 50 clusters...
Reducing dimensions with IncrementalPCA...


Device set to use cuda:0
Both `max_new_tokens` (=256) and `max_length`(=30) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=30) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=30) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=30) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_clas

ModuleNotFoundError: No module named 'openpyxl'

In [3]:
report_df.to_excel("cluster_report.xlsx", index=False)
print("\nCluster report saved as CSV and Excel.")

html_parts = []
html_parts.append(f"<h1>Unsupervised NLP Cluster Report</h1>")
html_parts.append(f"<p>Generated: {datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')} UTC</p>")
html_parts.append("<h2>Main cluster visualization</h2>")
html_parts.append(f"<img src=\"data:image/png;base64,{main_plot_b64}\" style=\"max-width:100%;height:auto;\"/>")

html_parts.append("<h2>Cluster details (Top 15 by size)</h2>")
for row in report_rows:
    html_parts.append(f"<div style='border:1px solid #ddd;padding:10px;margin:10px 0;border-radius:6px;'>")
    html_parts.append(f"<h3>Cluster {row['cluster_id']} (size={row['size']}): {row['summary']}</h3>")
    html_parts.append(f"<p><b>Top keywords:</b> {row['top_keywords']}</p>")
    html_parts.append(f"<p><b>Sample headlines:</b> {row['sample_headlines']}</p>")
    html_parts.append("</div>")

html_content = "\n".join(html_parts)
with open("cluster_report.html", "w", encoding="utf-8") as f:
    f.write("<html><head><meta charset='utf-8'></head><body>")
    f.write(html_content)
    f.write("</body></html>")

print("HTML report saved to cluster_report.html")
print("\nPipeline complete ✅")


Cluster report saved as CSV and Excel.
HTML report saved to cluster_report.html

Pipeline complete ✅


  html_parts.append(f"<p>Generated: {datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')} UTC</p>")
