In [1]:
import pandas as pd
# Raw Amazon review export, decoded as latin-1.
# NOTE(review): this is an absolute, machine-specific path — consider deriving
# it from a project-relative data directory so the notebook runs elsewhere.
REVIEWS_CSV = "/home/ismail/code/belachkar/voicelens_capstone_project/raw_data/review_for_amazon.csv"
df = pd.read_csv(REVIEWS_CSV, encoding='latin-1')
df.head()

Unnamed: 0,name,n_review,country,comment,rating,date
0,Graham MOORE,21,GB,Uncaring and incompetent\r\r\n\r\r\nImpossible...,1,2022-06-20
1,popadog,5,GB,Amazon maybe the quickest way to get<U+0085>\r...,2,2022-06-20
2,Andrew Torok,6,US,Not fair!\r\r\n\r\r\nIn genera! I am an Amazon...,1,2022-06-20
3,Jerry Jocoy,15,US,Amazon Prime is crap\r\r\n\r\r\nAmazon Prime i...,1,2022-06-20
4,steve erickson,3,US,Terrible delivery services\r\r\n\r\r\nTerrible...,1,2022-06-19


In [2]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.utils import simple_preprocess

# Fetch the NLTK resources used by the preprocessing helpers below.
for resource in ("stopwords", "wordnet"):
    nltk.download(resource)

# Shared, module-level objects read by remove_stopwords / lemmatize_tokens.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

def clean_text(text: str) -> str:
    """
    Basic text cleaning: lowercase, remove URLs, replace every non-letter
    character with a space, and collapse runs of whitespace.

    Parameters
    ----------
    text : str
        Raw review text. Any non-string value (e.g. ``None`` or the float
        NaN that pandas uses for an empty CSV cell) is treated as an empty
        document instead of raising.

    Returns
    -------
    str
        Lowercase text made of ASCII letters separated by single spaces,
        with no leading/trailing whitespace. May be the empty string.
    """
    # Missing comments arrive as None / NaN; calling .lower() on them would
    # raise AttributeError and abort the whole DataFrame.apply.
    if not isinstance(text, str):
        return ""

    text = text.lower()
    # Only strip real URLs: the match must start at a word boundary and
    # include "://" or "www.", so ordinary words that merely contain
    # "http" or "www" (e.g. "awwww") are left alone.
    text = re.sub(r"\b(?:https?://|www\.)\S+", "", text)
    text = re.sub(r"[^a-zA-Z\s]", " ", text)                 # keep letters only
    return re.sub(r"\s+", " ", text).strip()                 # remove extra spaces

def tokenize(text: str) -> list:
    """
    Split text into tokens with gensim's ``simple_preprocess``,
    called with accent stripping enabled (``deacc=True``).
    """
    tokens = simple_preprocess(text, deacc=True)
    return tokens

def remove_stopwords(tokens: list) -> list:
    """
    Drop every token found in the module-level ``stop_words`` set
    (English stopwords). The order of the remaining tokens is preserved.
    """
    kept = []
    for token in tokens:
        if token in stop_words:
            continue
        kept.append(token)
    return kept

def lemmatize_tokens(tokens: list) -> list:
    """
    Run each token through the module-level WordNet ``lemmatizer`` and
    return the lemmas as a new list, in the same order.
    """
    return list(map(lemmatizer.lemmatize, tokens))

def preprocess_pipeline(text: str) -> list:
    """
    Turn one raw document into its final list of processed tokens.

    Stages, applied in order:
    1. clean_text        - normalise the raw string
    2. tokenize          - split into tokens
    3. remove_stopwords  - drop English stopwords
    4. lemmatize_tokens  - reduce tokens to their base form
    """
    result = clean_text(text)
    # Each remaining stage takes a token list and returns a token list.
    for stage in (tokenize, remove_stopwords, lemmatize_tokens):
        result = stage(result)
    return result


[nltk_data] Downloading package stopwords to /home/ismail/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ismail/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
from gensim.corpora import Dictionary

# Run the full preprocessing pipeline on every review in the "comment" column
processed_docs = df['comment'].apply(preprocess_pipeline)

# Build the vocabulary: words → integer ids
dictionary = Dictionary(processed_docs)

# Prune the vocabulary to improve LDA quality
MIN_DOC_COUNT = 10       # keep tokens that appear in at least 10 documents
MAX_DOC_FRACTION = 0.5   # remove tokens that appear in more than 50% of documents
dictionary.filter_extremes(no_below=MIN_DOC_COUNT, no_above=MAX_DOC_FRACTION)

# Bag-of-Words representation, one entry per document
corpus = list(map(dictionary.doc2bow, processed_docs))

print("Dictionary size:", len(dictionary))
print("Example BOW for first document:", corpus[0][:10])


Dictionary size: 3244
Example BOW for first document: [(0, 1), (1, 2), (2, 2), (3, 1), (4, 1), (5, 1), (6, 2), (7, 1), (8, 1), (9, 2)]


In [4]:
from gensim.models.ldamodel import LdaModel

# Number of topics to fit (can tune later)
NUM_TOPICS = 10

# All training settings in one place, unpacked into the constructor below.
lda_settings = dict(
    corpus=corpus,
    id2word=dictionary,
    num_topics=NUM_TOPICS,
    random_state=42,   # fixed seed so repeated runs give the same topics
    chunksize=2000,
    passes=10,
    alpha='auto',
    eta='auto',
    eval_every=None,   # no periodic perplexity evaluation during training, for speed
)
lda_model = LdaModel(**lda_settings)

print("LDA model successfully trained!")


LDA model successfully trained!


In [8]:
def get_dominant_topic(model, bow):
    """
    Pick the most probable topic for one bag-of-words document.

    Calls ``model.get_document_topics(bow)``, which is expected to yield
    ``(topic_id, probability)`` pairs, and returns the ``topic_id`` of the
    pair with the highest probability (the first one on ties). Returns
    ``None`` when the model reports no topics for the document.
    """
    distribution = model.get_document_topics(bow)
    if distribution:
        best_pair = max(distribution, key=lambda pair: pair[1])
        return best_pair[0]
    return None

# Dominant topic for every document in the corpus
topic_ids = [get_dominant_topic(lda_model, doc_bow) for doc_bow in corpus]

# 'dominant_topic' keeps the numeric IDs; 'Topic' is a temporary placeholder
# that holds the same IDs for now.
for column in ('dominant_topic', 'Topic'):
    df[column] = topic_ids

print("Dominant topic successfully assigned to all rows!")
df.head()


Dominant topic successfully assigned to all rows!


Unnamed: 0,name,n_review,country,comment,rating,date,Topic,dominant_topic
0,Graham MOORE,21,GB,Uncaring and incompetent\r\r\n\r\r\nImpossible...,1,2022-06-20,9,9
1,popadog,5,GB,Amazon maybe the quickest way to get<U+0085>\r...,2,2022-06-20,0,0
2,Andrew Torok,6,US,Not fair!\r\r\n\r\r\nIn genera! I am an Amazon...,1,2022-06-20,2,2
3,Jerry Jocoy,15,US,Amazon Prime is crap\r\r\n\r\r\nAmazon Prime i...,1,2022-06-20,1,1
4,steve erickson,3,US,Terrible delivery services\r\r\n\r\r\nTerrible...,1,2022-06-19,7,7


In [9]:
def get_topic_keywords(model, num_words=5):
    """
    Build a readable label for every topic of an LDA model.

    For each topic id in ``range(model.num_topics)`` the top ``num_words``
    entries from ``model.show_topic`` are taken and their words joined with
    ", ". Returns a dict mapping topic id -> label string.
    """
    return {
        topic_id: ", ".join(
            term for term, _ in model.show_topic(topic_id, topn=num_words)
        )
        for topic_id in range(model.num_topics)
    }


# Label each topic with its top keywords
LABEL_WORDS = 4
topic_labels = get_topic_keywords(lda_model, num_words=LABEL_WORDS)

topic_labels

{0: 'account, card, email, order',
 1: 'prime, package, delivery, get',
 2: 'review, product, book, star',
 3: 'seller, company, product, money',
 4: 'price, great, love, good',
 5: 'would, said, told, back',
 6: 'service, customer, good, always',
 7: 'kindle, driver, parcel, delivery',
 8: 'item, day, order, delivery',
 9: 'customer, service, worst, company'}

In [10]:
# =======================================================
# SECTION 7 — MAP TOPIC IDs TO HUMAN LABELS
# =======================================================

def assign_topic_labels(df, topic_id_col, topic_labels_dict, label_col="Topic"):
    """
    Map numeric LDA topic IDs to human-readable topic labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the topic IDs. It is modified in place: the label
        column is added, or overwritten if it already exists.
    topic_id_col : str
        Name of the column containing the numeric topic IDs.
    topic_labels_dict : dict
        Mapping of topic ID -> label string. IDs with no entry in the
        mapping end up as missing values (NaN) in the label column.
    label_col : str, default "Topic"
        Name of the column that receives the labels. The default keeps the
        original hard-coded behaviour.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in, returned for convenience.
    """
    df[label_col] = df[topic_id_col].map(topic_labels_dict)
    return df


# Replace the placeholder IDs in "Topic" with the keyword labels
df = assign_topic_labels(df, "dominant_topic", topic_labels)

# Preview the review text next to its topic ID and label
preview_columns = ["comment", "dominant_topic", "Topic"]
df[preview_columns].head()


Unnamed: 0,comment,dominant_topic,Topic
0,Uncaring and incompetent\r\r\n\r\r\nImpossible...,9,"customer, service, worst, company"
1,Amazon maybe the quickest way to get<U+0085>\r...,0,"account, card, email, order"
2,Not fair!\r\r\n\r\r\nIn genera! I am an Amazon...,2,"review, product, book, star"
3,Amazon Prime is crap\r\r\n\r\r\nAmazon Prime i...,1,"prime, package, delivery, get"
4,Terrible delivery services\r\r\n\r\r\nTerrible...,7,"kindle, driver, parcel, delivery"
