In [2]:
!pip install --quiet bertopic sentence-transformers umap-learn hdbscan gensim

In [3]:
import pandas as pd
df = pd.read_csv("/home/ismail/code/belachkar/voicelens_capstone_project/raw_data/review_for_amazon.csv",
                 encoding='latin-1')
df.head()

Unnamed: 0,name,n_review,country,comment,rating,date
0,Graham MOORE,21,GB,Uncaring and incompetent\r\r\n\r\r\nImpossible...,1,2022-06-20
1,popadog,5,GB,Amazon maybe the quickest way to get<U+0085>\r...,2,2022-06-20
2,Andrew Torok,6,US,Not fair!\r\r\n\r\r\nIn genera! I am an Amazon...,1,2022-06-20
3,Jerry Jocoy,15,US,Amazon Prime is crap\r\r\n\r\r\nAmazon Prime i...,1,2022-06-20
4,steve erickson,3,US,Terrible delivery services\r\r\n\r\r\nTerrible...,1,2022-06-19


In [None]:

import re
import string
import pandas as pd
import numpy as np

# NLP
import spacy
from spacy.tokens import Doc
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

# Utils
from typing import List

# Load spaCy model (English)
# For production, prefer: python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])
STOPWORDS = set(stopwords.words("english"))


[nltk_data] Downloading package stopwords to /home/ismail/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
def clean_text(text: str) -> str:
    """
    Clean raw text: lowercase, remove URLs, punctuation, etc.
    """
    if not isinstance(text, str):
        return ""

    text = text.lower()

    # Remove URLs
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)

    # Remove HTML tags
    text = re.sub(r"<.*?>", "", text)

    # Remove digits
    text = re.sub(r"\d+", "", text)

    # Remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))

    # Remove extra spaces
    text = re.sub(r"\s+", " ", text).strip()

    return text


In [None]:
def lemmatize_text(text: str) -> str:
    """
    Lemmatize text using spaCy, removing stopwords & short tokens.
    """
    if not isinstance(text, str):
        return ""

    doc = nlp(text)
    tokens = [
        token.lemma_
        for token in doc
        if token.lemma_ not in STOPWORDS
        and len(token.lemma_) > 2
        and token.lemma_.isalpha()
    ]
    return " ".join(tokens)


In [None]:
def preprocess_dataframe(df: pd.DataFrame, text_column: str = "comment") -> pd.DataFrame:
    """
    Apply full preprocessing pipeline on the dataset.

    Parameters:
        df (pd.DataFrame): input dataframe
        text_column (str): column containing raw text

    Returns:
        pd.DataFrame: dataframe with processed_text column
    """
    df = df.copy()

    # Clean
    df["clean_text"] = df[text_column].apply(clean_text)

    # Lemmatize
    df["processed_text"] = df["clean_text"].apply(lemmatize_text)

    # Drop rows where processed text is empty
    df = df[df["processed_text"].str.strip() != ""]

    df.reset_index(drop=True, inplace=True)

    return df


In [25]:
df_clean = preprocess_dataframe(df, text_column="comment")
df_clean.head()


Unnamed: 0,name,n_review,country,comment,rating,date,clean_text,processed_text
0,Graham MOORE,21,GB,Uncaring and incompetent\r\r\n\r\r\nImpossible...,1,2022-06-20,uncaring and incompetent impossible to deal wi...,uncaring incompetent impossible deal customer ...
1,popadog,5,GB,Amazon maybe the quickest way to get<U+0085>\r...,2,2022-06-20,amazon maybe the quickest way to get amazon ma...,amazon maybe quick way get amazon maybe quick ...
2,Andrew Torok,6,US,Not fair!\r\r\n\r\r\nIn genera! I am an Amazon...,1,2022-06-20,not fair in genera i am an amazon junkie i lov...,fair genera amazon junkie love tthose package ...
3,Jerry Jocoy,15,US,Amazon Prime is crap\r\r\n\r\r\nAmazon Prime i...,1,2022-06-20,amazon prime is crap amazon prime is crap firs...,amazon prime crap amazon prime crap first orde...
4,steve erickson,3,US,Terrible delivery services\r\r\n\r\r\nTerrible...,1,2022-06-19,terrible delivery services terrible delivery s...,terrible delivery service terrible delivery se...


In [None]:
# ---- Embedding Model for BERTopic ----
from sentence_transformers import SentenceTransformer

def load_embedding_model(model_name: str = "all-MiniLM-L6-v2"):
    """
    Load a sentence embedding model for BERTopic.

    Args:
        model_name (str): Name of the SentenceTransformer model.

    Returns:
        model: Loaded embedding model.
    """
    print(f"Loading embedding model: {model_name} ...")
    model = SentenceTransformer(model_name)
    print("Embedding model loaded successfully.")
    return model

embedding_model = load_embedding_model()


Loading embedding model: all-MiniLM-L6-v2 ...
Embedding model loaded successfully.


In [None]:
# ---- Topic Modeling Step ----

from bertopic import BERTopic
from umap import UMAP
import hdbscan


def build_topic_model(
    embedding_model,
    n_neighbors: int = 15,
    n_components: int = 5,
    min_cluster_size: int = 10
):
    """
    Build a BERTopic model with UMAP + HDBSCAN + SentenceTransformer embeddings.

    Args:
        embedding_model: Preloaded SentenceTransformer model.
        n_neighbors (int): UMAP neighbors.
        n_components (int): UMAP dimensions.
        min_cluster_size (int): Minimum cluster size for HDBSCAN.

    Returns:
        BERTopic: Configured BERTopic model.
    """

    # Dimensionality Reduction
    umap_model = UMAP(
        n_neighbors=n_neighbors,
        n_components=n_components,
        metric="cosine",
        random_state=42
    )

    # Clustering
    hdbscan_model = hdbscan.HDBSCAN(
        min_cluster_size=min_cluster_size,
        metric='euclidean',
        cluster_selection_method='eom',
        prediction_data=True
    )

    # BERTopic Model
    topic_model = BERTopic(
        embedding_model=embedding_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        calculate_probabilities=True,
        verbose=True
    )

    print("BERTopic model successfully initialized.")
    return topic_model


# ---- Build the model ----
topic_model = build_topic_model(embedding_model)


BERTopic model successfully initialized.


In [None]:
# ---- Fit BERTopic on your processed text ----

def fit_topic_model(topic_model, df, text_column="processed_text"):
    """
    Fit BERTopic on cleaned text data.

    Args:
        topic_model: BERTopic model instance.
        df (DataFrame): Preprocessed dataframe.
        text_column (str): Column containing processed text.

    Returns:
        topics, probabilities, topic_info
    """
    docs = df[text_column].tolist()
    print(f"Fitting BERTopic on {len(docs)} documents...")

    topics, probabilities = topic_model.fit_transform(docs)

    print("Model fitted successfully.")
    topic_info = topic_model.get_topic_info()

    return topics, probabilities, topic_info


# ---- Run the fitting ----
topics, probs, topic_info = fit_topic_model(topic_model, df_clean)


2025-11-18 17:48:41,743 - BERTopic - Embedding - Transforming documents to embeddings.


Fitting BERTopic on 12948 documents...


Batches: 100%|██████████| 405/405 [02:01<00:00,  3.33it/s]
2025-11-18 17:50:43,758 - BERTopic - Embedding - Completed ✓
2025-11-18 17:50:43,760 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-11-18 17:51:39,620 - BERTopic - Dimensionality - Completed ✓
2025-11-18 17:51:39,623 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-11-18 17:51:41,168 - BERTopic - Cluster - Completed ✓
2025-11-18 17:51:41,189 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-11-18 17:51:42,144 - BERTopic - Representation - Completed ✓


Model fitted successfully.


In [None]:
def assign_topics_to_df(df, topics, probabilities):
    """
    Assigns BERTopic output back to the dataframe.

    Args:
        df (DataFrame): Preprocessed dataframe
        topics (list[int]): Topic numbers for each document
        probabilities (np.ndarray or list): Probabilities from BERTopic

    Returns:
        DataFrame with added columns:
            - dominant_topic
            - topic_probability (probability of dominant topic)
    """
    df = df.copy()
    df["dominant_topic"] = topics

    # If probabilities is 2D, take probability of the assigned topic
    if isinstance(probabilities, np.ndarray) and probabilities.ndim == 2:
        probs_for_dominant = [probabilities[i, t] if t >= 0 else 0.0
                              for i, t in enumerate(topics)]
        df["topic_probability"] = probs_for_dominant
    else:
        df["topic_probability"] = probabilities

    return df


df_topics = assign_topics_to_df(df_clean, topics, probs)
df_topics.head()


Unnamed: 0,name,n_review,country,comment,rating,date,clean_text,processed_text,dominant_topic,topic_probability
0,Graham MOORE,21,GB,Uncaring and incompetent\r\r\n\r\r\nImpossible...,1,2022-06-20,uncaring and incompetent impossible to deal wi...,uncaring incompetent impossible deal customer ...,0,0.825409
1,popadog,5,GB,Amazon maybe the quickest way to get<U+0085>\r...,2,2022-06-20,amazon maybe the quickest way to get amazon ma...,amazon maybe quick way get amazon maybe quick ...,0,0.927613
2,Andrew Torok,6,US,Not fair!\r\r\n\r\r\nIn genera! I am an Amazon...,1,2022-06-20,not fair in genera i am an amazon junkie i lov...,fair genera amazon junkie love tthose package ...,0,0.885888
3,Jerry Jocoy,15,US,Amazon Prime is crap\r\r\n\r\r\nAmazon Prime i...,1,2022-06-20,amazon prime is crap amazon prime is crap firs...,amazon prime crap amazon prime crap first orde...,0,0.980481
4,steve erickson,3,US,Terrible delivery services\r\r\n\r\r\nTerrible...,1,2022-06-19,terrible delivery services terrible delivery s...,terrible delivery service terrible delivery se...,0,0.968909


In [None]:
def generate_topic_labels(topic_model, topic_info, top_n_words: int = 3):
    """
    Generate human-readable topic labels using BERTopic's top words.

    Args:
        topic_model: Fitted BERTopic model.
        topic_info (DataFrame): topic_model.get_topic_info()
        top_n_words (int): Number of representative words to include in the label.

    Returns:
        dict: {topic_id: topic_label}
    """
    topic_labels = {}

    for topic_id in topic_info.Topic:
        if topic_id == -1:
            topic_labels[topic_id] = "Outliers"
            continue

        # Get top words for this topic
        words = [word for word, _ in topic_model.get_topic(topic_id)[:top_n_words]]

        # Make a short, readable label
        label = " ".join(words).title()
        topic_labels[topic_id] = label

    return topic_labels


In [32]:
# Generate labels
topic_labels = generate_topic_labels(topic_model, topic_info)

# Add a new column 'Topic' with human-readable labels
df_topics["Topic"] = df_topics["dominant_topic"].map(topic_labels)

df_topics[["processed_text", "dominant_topic", "Topic", "topic_probability"]].head(10)


Unnamed: 0,processed_text,dominant_topic,Topic,topic_probability
0,uncaring incompetent impossible deal customer ...,0,Amazon Service Customer,0.825409
1,amazon maybe quick way get amazon maybe quick ...,0,Amazon Service Customer,0.927613
2,fair genera amazon junkie love tthose package ...,0,Amazon Service Customer,0.885888
3,amazon prime crap amazon prime crap first orde...,0,Amazon Service Customer,0.980481
4,terrible delivery service terrible delivery se...,0,Amazon Service Customer,0.968909
5,day bait switch delivery nothing frustrating o...,0,Amazon Service Customer,0.897454
6,ever use amazone ever use amazone faulty item ...,0,Amazon Service Customer,0.750962
7,cost return two item cost return two item defe...,0,Amazon Service Customer,0.94828
8,order small kitchen appliance order small kitc...,0,Amazon Service Customer,0.897604
9,amazon need quit call number amazon need quit ...,0,Amazon Service Customer,0.843111
