In [2]:
"""
Step 2: Topic Modeling Script
-----------------------------
- Loads cleaned dataset from Step 1
- Generates embeddings using SentenceTransformers
- Runs BERTopic to cluster reviews into topics
- Saves results into data/sample/

Usage:
    python scripts/topic_modeling.py
"""

import os
import pandas as pd

# Try importing sentence-transformers, fall back to plain transformers if unavailable
try:
    from sentence_transformers import SentenceTransformer
    USE_ST = True
except ImportError:
    print("⚠️ sentence-transformers not installed. Falling back to transformers embedding.")
    from transformers import AutoTokenizer, AutoModel
    import torch
    import numpy as np
    USE_ST = False

from bertopic import BERTopic

# Paths (relative; resolved against the current working directory at run time)
# Input CSV read by main(); it must contain a "review_text" column.
CLEANED_FILE = "sample_reviews_corrected.csv"
# Output CSV written by main(): the input rows plus a "topic" column.
TOPIC_OUTPUT_FILE = "data/sample/reviews_with_topics.csv"
# Destination handed to BERTopic's save() for the fitted topic model.
MODEL_OUTPUT_DIR = "data/sample/topic_model"

def get_embeddings(texts, model_name="all-MiniLM-L6-v2"):
    """
    Generate one embedding vector per input text.

    - If sentence-transformers is available (USE_ST is True), encode with
      ``SentenceTransformer(model_name)``.
    - Otherwise, use plain transformers: tokenize, run the model, mean-pool
      the token embeddings over the attention mask, and L2-normalize.

    Args:
        texts: Sequence of strings to embed (sliced in batches of 32).
        model_name: Model identifier. In the fallback path a bare name such
            as "all-MiniLM-L6-v2" is first tried as given and, if it cannot
            be resolved, retried as "sentence-transformers/<model_name>".

    Returns:
        A 2-D array with one row per text. In the fallback path an empty
        ``texts`` yields an array of shape (0, hidden_size).

    Raises:
        OSError: In the fallback path, if the model cannot be resolved under
            either the given name or the namespaced name.
    """
    if USE_ST:
        model = SentenceTransformer(model_name)
        return model.encode(texts, show_progress_bar=True, batch_size=32)

    # Fallback: transformers + mean pooling
    def load(name):
        return AutoTokenizer.from_pretrained(name), AutoModel.from_pretrained(name)

    try:
        tokenizer, model = load(model_name)
    except OSError:
        # SentenceTransformer accepts bare names, but AutoModel needs the full
        # repo id. Only retry when the caller did not already give a namespace.
        if "/" in model_name:
            raise
        tokenizer, model = load(f"sentence-transformers/{model_name}")

    def mean_pooling(model_output, attention_mask):
        # Average token vectors, counting only non-padding positions.
        token_embeddings = model_output.last_hidden_state
        mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        # clamp guards against division by zero for an all-padding row.
        return (token_embeddings * mask_expanded).sum(1) / mask_expanded.sum(1).clamp(min=1e-9)

    batch_size = 32
    all_embeds = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt")
        with torch.no_grad():
            outputs = model(**inputs)
        embeddings = mean_pooling(outputs, inputs["attention_mask"])
        # Unit-length rows, so cosine similarity reduces to a dot product.
        embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
        all_embeds.append(embeddings.cpu().numpy())

    if not all_embeds:
        # np.vstack([]) raises ValueError; return a well-shaped empty matrix.
        # NOTE(review): assumes the model config exposes hidden_size — confirm
        # for non-BERT-style architectures.
        return np.empty((0, model.config.hidden_size), dtype=np.float32)
    return np.vstack(all_embeds)


def main():
    """
    Run the topic-modeling step end to end.

    Reads CLEANED_FILE, embeds the "review_text" column, fits BERTopic on the
    texts with those precomputed embeddings, stores the assigned topic ids in
    a new "topic" column, then writes the CSV to TOPIC_OUTPUT_FILE and saves
    the fitted model to MODEL_OUTPUT_DIR.

    Raises:
        FileNotFoundError: If CLEANED_FILE does not exist.
    """
    # Guard: the cleaned dataset must already be on disk.
    input_present = os.path.exists(CLEANED_FILE)
    if not input_present:
        raise FileNotFoundError(f"❌ Missing file: {CLEANED_FILE}. Run preprocessing first.")

    reviews = pd.read_csv(CLEANED_FILE)
    print(f"✅ Loaded {len(reviews)} reviews for topic modeling")

    # Embed every review; values are coerced to str before encoding.
    review_column = reviews["review_text"]
    documents = review_column.astype(str).tolist()
    print("🔄 Generating embeddings...")
    vectors = get_embeddings(documents)

    # Fit BERTopic on the documents using the precomputed vectors.
    print("🔄 Running BERTopic clustering...")
    clusterer = BERTopic(verbose=True)
    topic_ids, _probabilities = clusterer.fit_transform(documents, vectors)

    reviews["topic"] = topic_ids
    print(f"✅ Assigned topics to {len(reviews)} reviews")

    # Persist the dataset with the new topic column.
    output_folder = os.path.dirname(TOPIC_OUTPUT_FILE)
    os.makedirs(output_folder, exist_ok=True)
    reviews.to_csv(TOPIC_OUTPUT_FILE, index=False)
    print(f"💾 Saved topic results → {TOPIC_OUTPUT_FILE}")

    # Persist the fitted model so it can be reloaded later.
    clusterer.save(MODEL_OUTPUT_DIR)
    print(f"💾 Saved BERTopic model → {MODEL_OUTPUT_DIR}")


# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()


✅ Loaded 5000 reviews for topic modeling
🔄 Generating embeddings...


Batches: 100%|████████████████████████████████| 157/157 [00:22<00:00,  6.99it/s]
2025-09-28 16:55:59,946 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm


🔄 Running BERTopic clustering...


2025-09-28 16:56:11,522 - BERTopic - Dimensionality - Completed ✓
2025-09-28 16:56:11,524 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-09-28 16:56:11,645 - BERTopic - Cluster - Completed ✓
2025-09-28 16:56:11,652 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-09-28 16:56:12,030 - BERTopic - Representation - Completed ✓


✅ Assigned topics to 5000 reviews
💾 Saved topic results → data/sample/reviews_with_topics.csv
💾 Saved BERTopic model → data/sample/topic_model
