In [2]:
!pip install --quiet bertopic sentence-transformers umap-learn hdbscan gensim

In [21]:
# ================================
# ðŸ“¦ Setup & Dependencies
# ================================

import re
import string
import pandas as pd
import numpy as np

# NLP
import spacy
from spacy.tokens import Doc
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

# Utils
from typing import List

# Load spaCy model (English)
# For production, prefer: python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])
STOPWORDS = set(stopwords.words("english"))


[nltk_data] Downloading package stopwords to /home/ismail/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
def clean_text(text: str) -> str:
    """
    Clean raw text: lowercase, remove URLs, punctuation, etc.
    """
    if not isinstance(text, str):
        return ""

    text = text.lower()

    # Remove URLs
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)

    # Remove HTML tags
    text = re.sub(r"<.*?>", "", text)

    # Remove digits
    text = re.sub(r"\d+", "", text)

    # Remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))

    # Remove extra spaces
    text = re.sub(r"\s+", " ", text).strip()

    return text


In [None]:
def lemmatize_text(text: str) -> str:
    """
    Lemmatize text using spaCy, removing stopwords & short tokens.
    """
    if not isinstance(text, str):
        return ""

    doc = nlp(text)
    tokens = [
        token.lemma_
        for token in doc
        if token.lemma_ not in STOPWORDS
        and len(token.lemma_) > 2
        and token.lemma_.isalpha()
    ]
    return " ".join(tokens)


In [None]:
def preprocess_dataframe(df: pd.DataFrame, text_column: str = "comment") -> pd.DataFrame:
    """
    Apply full preprocessing pipeline on the dataset.

    Parameters:
        df (pd.DataFrame): input dataframe
        text_column (str): column containing raw text

    Returns:
        pd.DataFrame: dataframe with processed_text column
    """
    df = df.copy()

    # Clean
    df["clean_text"] = df[text_column].apply(clean_text)

    # Lemmatize
    df["processed_text"] = df["clean_text"].apply(lemmatize_text)

    # Drop rows where processed text is empty
    df = df[df["processed_text"].str.strip() != ""]

    df.reset_index(drop=True, inplace=True)

    return df


In [25]:
df_clean = preprocess_dataframe(df, text_column="comment")
df_clean.head()


Unnamed: 0,name,n_review,country,comment,rating,date,clean_text,processed_text
0,Graham MOORE,21,GB,Uncaring and incompetent\r\r\n\r\r\nImpossible...,1,2022-06-20,uncaring and incompetent impossible to deal wi...,uncaring incompetent impossible deal customer ...
1,popadog,5,GB,Amazon maybe the quickest way to get<U+0085>\r...,2,2022-06-20,amazon maybe the quickest way to get amazon ma...,amazon maybe quick way get amazon maybe quick ...
2,Andrew Torok,6,US,Not fair!\r\r\n\r\r\nIn genera! I am an Amazon...,1,2022-06-20,not fair in genera i am an amazon junkie i lov...,fair genera amazon junkie love tthose package ...
3,Jerry Jocoy,15,US,Amazon Prime is crap\r\r\n\r\r\nAmazon Prime i...,1,2022-06-20,amazon prime is crap amazon prime is crap firs...,amazon prime crap amazon prime crap first orde...
4,steve erickson,3,US,Terrible delivery services\r\r\n\r\r\nTerrible...,1,2022-06-19,terrible delivery services terrible delivery s...,terrible delivery service terrible delivery se...


In [None]:
# ---- Embedding Model for BERTopic ----
from sentence_transformers import SentenceTransformer

def load_embedding_model(model_name: str = "all-MiniLM-L6-v2"):
    """
    Load a sentence embedding model for BERTopic.

    Args:
        model_name (str): Name of the SentenceTransformer model.

    Returns:
        model: Loaded embedding model.
    """
    print(f"Loading embedding model: {model_name} ...")
    model = SentenceTransformer(model_name)
    print("Embedding model loaded successfully.")
    return model

embedding_model = load_embedding_model()


Loading embedding model: all-MiniLM-L6-v2 ...
Embedding model loaded successfully.


In [None]:
# ---- Topic Modeling Step ----

from bertopic import BERTopic
from umap import UMAP
import hdbscan


def build_topic_model(
    embedding_model,
    n_neighbors: int = 15,
    n_components: int = 5,
    min_cluster_size: int = 10
):
    """
    Build a BERTopic model with UMAP + HDBSCAN + SentenceTransformer embeddings.

    Args:
        embedding_model: Preloaded SentenceTransformer model.
        n_neighbors (int): UMAP neighbors.
        n_components (int): UMAP dimensions.
        min_cluster_size (int): Minimum cluster size for HDBSCAN.

    Returns:
        BERTopic: Configured BERTopic model.
    """

    # Dimensionality Reduction
    umap_model = UMAP(
        n_neighbors=n_neighbors,
        n_components=n_components,
        metric="cosine",
        random_state=42
    )

    # Clustering
    hdbscan_model = hdbscan.HDBSCAN(
        min_cluster_size=min_cluster_size,
        metric='euclidean',
        cluster_selection_method='eom',
        prediction_data=True
    )

    # BERTopic Model
    topic_model = BERTopic(
        embedding_model=embedding_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        calculate_probabilities=True,
        verbose=True
    )

    print("BERTopic model successfully initialized.")
    return topic_model


# ---- Build the model ----
topic_model = build_topic_model(embedding_model)


BERTopic model successfully initialized.


In [None]:
# ---- Fit BERTopic on your processed text ----

def fit_topic_model(topic_model, df, text_column="processed_text"):
    """
    Fit BERTopic on cleaned text data.

    Args:
        topic_model: BERTopic model instance.
        df (DataFrame): Preprocessed dataframe.
        text_column (str): Column containing processed text.

    Returns:
        topics, probabilities, topic_info
    """
    docs = df[text_column].tolist()
    print(f"Fitting BERTopic on {len(docs)} documents...")

    topics, probabilities = topic_model.fit_transform(docs)

    print("Model fitted successfully.")
    topic_info = topic_model.get_topic_info()

    return topics, probabilities, topic_info


# ---- Run the fitting ----
topics, probs, topic_info = fit_topic_model(topic_model, df_clean)


2025-11-18 17:48:41,743 - BERTopic - Embedding - Transforming documents to embeddings.


Fitting BERTopic on 12948 documents...


Batches: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 405/405 [02:01<00:00,  3.33it/s]
2025-11-18 17:50:43,758 - BERTopic - Embedding - Completed âœ“
2025-11-18 17:50:43,760 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-11-18 17:51:39,620 - BERTopic - Dimensionality - Completed âœ“
2025-11-18 17:51:39,623 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-11-18 17:51:41,168 - BERTopic - Cluster - Completed âœ“
2025-11-18 17:51:41,189 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-11-18 17:51:42,144 - BERTopic - Representation - Completed âœ“


Model fitted successfully.


In [None]:
def assign_topics_to_df(df, topics, probabilities):
    """
    Assigns BERTopic output back to the dataframe.

    Args:
        df (DataFrame): Preprocessed dataframe
        topics (list[int]): Topic numbers for each document
        probabilities (np.ndarray or list): Probabilities from BERTopic

    Returns:
        DataFrame with added columns:
            - dominant_topic
            - topic_probability (probability of dominant topic)
    """
    df = df.copy()
    df["dominant_topic"] = topics

    # If probabilities is 2D, take probability of the assigned topic
    if isinstance(probabilities, np.ndarray) and probabilities.ndim == 2:
        probs_for_dominant = [probabilities[i, t] if t >= 0 else 0.0
                              for i, t in enumerate(topics)]
        df["topic_probability"] = probs_for_dominant
    else:
        df["topic_probability"] = probabilities

    return df


df_topics = assign_topics_to_df(df_clean, topics, probs)
df_topics.head()


Unnamed: 0,name,n_review,country,comment,rating,date,clean_text,processed_text,dominant_topic,topic_probability
0,Graham MOORE,21,GB,Uncaring and incompetent\r\r\n\r\r\nImpossible...,1,2022-06-20,uncaring and incompetent impossible to deal wi...,uncaring incompetent impossible deal customer ...,0,0.825409
1,popadog,5,GB,Amazon maybe the quickest way to get<U+0085>\r...,2,2022-06-20,amazon maybe the quickest way to get amazon ma...,amazon maybe quick way get amazon maybe quick ...,0,0.927613
2,Andrew Torok,6,US,Not fair!\r\r\n\r\r\nIn genera! I am an Amazon...,1,2022-06-20,not fair in genera i am an amazon junkie i lov...,fair genera amazon junkie love tthose package ...,0,0.885888
3,Jerry Jocoy,15,US,Amazon Prime is crap\r\r\n\r\r\nAmazon Prime i...,1,2022-06-20,amazon prime is crap amazon prime is crap firs...,amazon prime crap amazon prime crap first orde...,0,0.980481
4,steve erickson,3,US,Terrible delivery services\r\r\n\r\r\nTerrible...,1,2022-06-19,terrible delivery services terrible delivery s...,terrible delivery service terrible delivery se...,0,0.968909


In [None]:
def generate_topic_labels(topic_model, topic_info, top_n_words: int = 3):
    """
    Generate human-readable topic labels using BERTopic's top words.

    Args:
        topic_model: Fitted BERTopic model.
        topic_info (DataFrame): topic_model.get_topic_info()
        top_n_words (int): Number of representative words to include in the label.

    Returns:
        dict: {topic_id: topic_label}
    """
    topic_labels = {}

    for topic_id in topic_info.Topic:
        if topic_id == -1:
            topic_labels[topic_id] = "Outliers"
            continue

        # Get top words for this topic
        words = [word for word, _ in topic_model.get_topic(topic_id)[:top_n_words]]

        # Make a short, readable label
        label = " ".join(words).title()
        topic_labels[topic_id] = label

    return topic_labels


In [32]:
# Generate labels
topic_labels = generate_topic_labels(topic_model, topic_info)

# Add a new column 'Topic' with human-readable labels
df_topics["Topic"] = df_topics["dominant_topic"].map(topic_labels)

df_topics[["processed_text", "dominant_topic", "Topic", "topic_probability"]].head(10)


Unnamed: 0,processed_text,dominant_topic,Topic,topic_probability
0,uncaring incompetent impossible deal customer ...,0,Amazon Service Customer,0.825409
1,amazon maybe quick way get amazon maybe quick ...,0,Amazon Service Customer,0.927613
2,fair genera amazon junkie love tthose package ...,0,Amazon Service Customer,0.885888
3,amazon prime crap amazon prime crap first orde...,0,Amazon Service Customer,0.980481
4,terrible delivery service terrible delivery se...,0,Amazon Service Customer,0.968909
5,day bait switch delivery nothing frustrating o...,0,Amazon Service Customer,0.897454
6,ever use amazone ever use amazone faulty item ...,0,Amazon Service Customer,0.750962
7,cost return two item cost return two item defe...,0,Amazon Service Customer,0.94828
8,order small kitchen appliance order small kitc...,0,Amazon Service Customer,0.897604
9,amazon need quit call number amazon need quit ...,0,Amazon Service Customer,0.843111


In [3]:
import pandas as pd
df = pd.read_csv("/home/ismail/code/belachkar/voicelens_capstone_project/raw_data/review_for_amazon.csv",
                 encoding='latin-1')
df.head()

Unnamed: 0,name,n_review,country,comment,rating,date
0,Graham MOORE,21,GB,Uncaring and incompetent\r\r\n\r\r\nImpossible...,1,2022-06-20
1,popadog,5,GB,Amazon maybe the quickest way to get<U+0085>\r...,2,2022-06-20
2,Andrew Torok,6,US,Not fair!\r\r\n\r\r\nIn genera! I am an Amazon...,1,2022-06-20
3,Jerry Jocoy,15,US,Amazon Prime is crap\r\r\n\r\r\nAmazon Prime i...,1,2022-06-20
4,steve erickson,3,US,Terrible delivery services\r\r\n\r\r\nTerrible...,1,2022-06-19


In [9]:
import random
import numpy as np
import os


RANDOM_STATE = 42
random.seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)
os.environ['PYTHONHASHSEED'] = str(RANDOM_STATE)

In [5]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize


nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


STOPWORDS = set(stopwords.words('english'))
LEMMATIZER = WordNetLemmatizer()
def clean_text(text: str) -> str:
    """Clean a single text string.


    Steps:
    - Lowercase
    - Remove URLs, emails
    - Remove non-letter characters (keep spaces)
    - Strip repeated spaces
    - Basic normalisation
    """
    if pd.isna(text):
        return ""
    text = str(text)
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|\S+@\S+', ' ', text)
    text = re.sub(r'[^a-z\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text




def preprocess_text_for_tokens(text: str, stopwords_set=STOPWORDS) -> list:
    """Return a list of lemmatized tokens after cleaning.


    This function is intentionally small and deterministic. It will be used for
    both coherence calculations and for any token-based analysis.
    """
    text = clean_text(text)
    tokens = word_tokenize(text)
    tokens = [t for t in tokens if len(t) > 2 and t not in stopwords_set]
    tokens = [LEMMATIZER.lemmatize(t) for t in tokens]
    return tokens




def preprocess_dataframe(df: pd.DataFrame, text_col: str = 'comment') -> pd.DataFrame:
    """Preprocess dataframe in place and return a copy with a new column 'clean_text'.


    - Drops rows with empty comments
    - Adds 'clean_text' (string)
    - Adds 'tokens' (list of tokens)
    """
    df = df.copy()
    df['clean_text'] = df[text_col].astype(str).apply(clean_text)
    df = df[df['clean_text'].str.strip() != '']
    df['tokens'] = df['clean_text'].apply(preprocess_text_for_tokens)
    df = df.reset_index(drop=True)
    return df

[nltk_data] Downloading package punkt to /home/ismail/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/ismail/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ismail/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [7]:
from sentence_transformers import SentenceTransformer


EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2' # compact & good for BERTopic




def generate_embeddings(text_list: list, model_name: str = EMBEDDING_MODEL_NAME, batch_size: int = 32):
    """Return numpy array of embeddings for a list of texts.


    The function caches the model object (as an attribute) to avoid reloads when
    called multiple times.
    """
    if not hasattr(generate_embeddings, '_model'):
        generate_embeddings._model = SentenceTransformer(model_name)
    model = generate_embeddings._model
    embeddings = model.encode(text_list, show_progress_bar=True, batch_size=batch_size)
    return embeddings

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
from bertopic import BERTopic
from umap import UMAP
import hdbscan



def train_bertopic_model(embeddings: np.ndarray,
        texts: list,
        n_neighbors: int = 15,
        n_components: int = 5,
        min_cluster_size: int = 15,
        random_state: int = RANDOM_STATE):
    """Train a BERTopic model using UMAP + HDBSCAN and return the trained model.


    Parameters are exposed to allow later tuning.
    """
    umap_model = UMAP(n_neighbors=n_neighbors,
        n_components=n_components,
        metric='cosine',
        random_state=random_state)


    hdbscan_model = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size,
        metric='euclidean',
        cluster_selection_method='eom')


    topic_model = BERTopic(umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        calculate_probabilities=False,
        verbose=True,
        random_state=random_state)


    topics, _ = topic_model.fit_transform(texts, embeddings)
    return topic_model

In [13]:
def assign_topics(df: pd.DataFrame, model: BERTopic, embeddings: np.ndarray) -> pd.DataFrame:
    """Assign dominant_topic and human-readable Topic label to the dataframe.


    Returns a copy of df with columns ['dominant_topic', 'Topic_label'] added.
    """
    df = df.copy()
    topics, probs = model.transform(df['clean_text'].tolist(), embeddings)
    df['dominant_topic'] = topics


    # Build human-readable labels using representative words
    topic_info = model.get_topic_info()


    def make_label(topic_id: int):
        if topic_id == -1:
            return 'Outlier'
        top_words = [w for w, _ in model.get_topic(topic_id)][:5]
        # heuristic: join first 2-3 words into a short label
        label = ' '.join([w.capitalize() for w in top_words[:3]])
        return label


    df['Topic'] = df['dominant_topic'].apply(make_label)
    return df

In [16]:
nltk.download("wordnet", download_dir="/home/ismail/.pyenv/versions/voicelens/nltk_data")
nltk.download("omw-1.4", download_dir="/home/ismail/.pyenv/versions/voicelens/nltk_data")
nltk.data.path.append("/home/ismail/.pyenv/versions/voicelens/nltk_data")

df_clean = preprocess_dataframe(df)
df_clean.head()

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/ismail/.pyenv/versions/voicelens/nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/ismail/.pyenv/versions/voicelens/nltk_data...


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/home/ismail/nltk_data'
    - '/home/ismail/.pyenv/versions/voicelens/nltk_data'
    - '/home/ismail/.pyenv/versions/voicelens/share/nltk_data'
    - '/home/ismail/.pyenv/versions/voicelens/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
    - '/home/ismail/.pyenv/versions/voicelens/nltk_data'
**********************************************************************


In [None]:
from collections import Counter
return topic_dict




def compute_coherence_from_model(model: BERTopic, texts_tokens: list, top_n: int = 10):
"""Compute coherence using Gensim's C_V coherence over topic top words.


texts_tokens: list of token lists (preprocessed)
"""
topic_words = topic_top_words(model, top_n=top_n)
# Build gensim dictionary and corpus
dictionary = Dictionary(texts_tokens)
# Gensim's CoherenceModel requires list of topic word lists
topics_for_gensim = [topic_words[t] for t in sorted(topic_words.keys())]
cm = CoherenceModel(topics=topics_for_gensim, texts=texts_tokens, dictionary=dictionary, coherence='c_v')
coherence = cm.get_coherence()
return coherence




def topic_diversity_score(model: BERTopic, top_n: int = 25):
"""Compute topic diversity as (#unique words in top_n across topics) / (top_n * #topics).


Higher means more diverse topics.
"""
topic_words = topic_top_words(model, top_n=top_n)
all_words = [w for words in topic_words.values() for w in words]
unique = set(all_words)
n_topics = len(topic_words)
denom = top_n * n_topics if n_topics > 0 else 1
return len(unique) / denom




def distribution_stability(model: BERTopic, df: pd.DataFrame, embeddings: np.ndarray, n_splits: int = 5, sample_frac: float = 0.5):
"""Estimate distribution stability by sampling and computing average JS divergence between topic distributions.


We sample pairs and compute the Jensen-Shannon distance between their topic frequency distributions.
"""
rng = np.random.default_rng(RANDOM_STATE)
js_scores = []
for _ in range(n_splits):
idx_a = rng.choice(len(df), size=int(len(df) * sample_frac), replace=False)
idx_b = rng.choice(len(df), size=int(len(df) * sample_frac), replace=False)
texts_a = df.iloc[idx_a]['clean_text'].tolist()
texts_b = df.iloc[idx_b]['clean_text'].tolist()
emb_a = embeddings[idx_a]
emb_b = embeddings[idx_b]
topics_a, _ = model.transform(texts_a, emb_a)
topics_b, _ = model.transform(texts_b, emb_b)
freq_a = Counter(topics_a)
freq_b =