In [1]:
import pandas as pd
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from wordcloud import WordCloud
import matplotlib.pyplot as plt

In [5]:
# Load the pre-cleaned review table produced by the earlier cleaning step.
reviews_csv = "../data/cleaned_reviews.csv"
df = pd.read_csv(reviews_csv)

# Cleaned review texts with missing entries removed.
cleaned_column = df['cleaned_review']
reviews = cleaned_column.dropna()

# Preview the first rows as the cell output.
df.head()

Unnamed: 0,review_id,user_id,product_id,review_title,review_content,cleaned_review,cleaned_length
0,"R3HXWT0LRP0NMF,R2AJM3LFTLZHFO,R6AQJGUP6P86,R1K...","AG3D6O4STAQKAY2UVGEUV46KN35Q,AHMY5CWJMMK5BJRBB...",B07JW9H4J1,"Satisfied,Charging is really fast,Value for mo...",Looks durable Charging is fine tooNo complains...,look durable charging fine toono complainschar...,36
1,"RGIQEG07R9HS2,R1SMWZQ86XIN8U,R2J3Y1WL29GWDE,RY...","AECPFYFQVRUWC3KGNLJIOREFP5LQ,AGYYVPDD7YG7FYNBX...",B098NS6PVG,"A Good Braided Cable for Your Type C Device,Go...",I ordered this cable to connect my phone to An...,ordered cable connect phone android auto car c...,113
2,"R3J3EQQ9TZI5ZJ,R3E7WBGK7ID0KV,RWU79XKQ6I1QF,R2...","AGU3BBQ2V2DDAMOAKGFAWDDQ6QHA,AESFLDV2PT363T2AQ...",B096MSW6CT,"Good speed for earlier versions,Good Product,W...","Not quite durable and sturdy,https://m.media-a...",quite durable sturdy good nice productworking ...,10
3,"R3EEUZKKK9J36I,R3HJVYCLYOY554,REDECAZ7AMPQC,R1...","AEWAZDZZJLQUYVOVGBEUKSLXHQ5A,AG5HTSFRRE6NL3M5S...",B08HDJ86NZ,"Good product,Good one,Nice,Really nice product...","Good product,long wire,Charges good,Nice,I bou...",good productlong wirecharges goodnicei bought ...,40
4,"R1BP4L2HH9TFUP,R16PVJEXKV6QZS,R2UPDB81N66T4P,R...","AE3Q6KSUK5P75D5HFYHCRAOLODSA,AFUGIFH5ZAFXRDSZH...",B08CF3B7N1,"As good as original,Decent,Good one for second...","Bought this instead of original apple, does th...",bought instead original apple work fast apple ...,227


In [6]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

# Fetch the NLTK corpora needed below (skipped by NLTK when already up to date).
for corpus_name in ('stopwords', 'wordnet'):
    nltk.download(corpus_name)

# English stop-word set and a WordNet lemmatizer for text normalisation.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/bezatezera/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/bezatezera/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [7]:
# Bag-of-words features: drop terms that appear in more than 90% of the reviews
# or in fewer than 10 reviews, plus scikit-learn's built-in English stop words.
vectorizer = CountVectorizer(max_df=0.9, min_df=10, stop_words='english')

# CountVectorizer raises on NaN documents, so missing reviews are replaced by
# empty strings instead of being dropped: this keeps one row of X per row of
# df, which the later `df['topic'] = ...` assignment relies on.
X = vectorizer.fit_transform(df['cleaned_review'].fillna(''))

In [8]:
# Five-topic LDA on the document-term matrix; the fixed random_state makes the
# fitted topics repeatable from run to run.
lda_settings = {'n_components': 5, 'random_state': 42}
lda = LatentDirichletAllocation(**lda_settings)

# Left as the last expression so the fitted estimator is shown as cell output.
lda.fit(X)

0,1,2
,n_components,5
,doc_topic_prior,
,topic_word_prior,
,learning_method,'batch'
,learning_decay,0.7
,learning_offset,10.0
,max_iter,10
,batch_size,128
,evaluate_every,-1
,total_samples,1000000.0


In [10]:
def display_topics_improved(model, feature_names, no_top_words):
    """Print every topic's highest-weighted words and return them as a dict.

    Args:
        model: Fitted topic model exposing ``components_``, one row of
            per-term weights for each topic.
        feature_names: Sequence mapping a column index of ``components_`` to
            its vocabulary term.
        no_top_words: How many of the highest-weighted terms to keep per topic.

    Returns:
        dict mapping ``'Topic <n>'`` (numbered from 1) to that topic's terms,
        ordered from highest to lowest weight.
    """
    rule = '=' * 60
    topics_dict = {}

    for number, weights in enumerate(model.components_, start=1):
        label = f'Topic {number}'
        # argsort is ascending, so reverse it before taking the leading slice.
        ranked_ids = weights.argsort()[::-1][:no_top_words]
        top_words = [feature_names[term_id] for term_id in ranked_ids]
        topics_dict[label] = top_words

        print(f"\n{rule}")
        print(f"📌 {label}")
        print(rule)
        print(", ".join(top_words))

    return topics_dict

# Print the 10 highest-weighted words of every topic and keep them for export.
topics = display_topics_improved(lda, vectorizer.get_feature_names_out(), 10)

# Save topics to file
import json
from pathlib import Path

topics_path = Path('../results/topics.json')
# Create ../results when it is missing (e.g. a fresh checkout) so the write
# below cannot fail with FileNotFoundError.
topics_path.parent.mkdir(parents=True, exist_ok=True)
with topics_path.open('w', encoding='utf-8') as f:
    json.dump(topics, f, indent=4)
print("\nTopics saved to results/topics.json")


📌 Topic 1
cable, good, charging, product, quality, fast, usb, charge, work, charger

📌 Topic 2
product, good, use, easy, water, like, quality, time, work, used

📌 Topic 3
good, quality, product, sound, price, use, like, using, work, mouse

📌 Topic 4
watch, good, product, feature, screen, day, price, time, like, heater

📌 Topic 5
good, phone, camera, quality, battery, price, picture, feature, like, screen

Topics saved to results/topics.json


In [11]:
import pyLDAvis
import pyLDAvis.lda_model 

pyLDAvis.enable_notebook()

# Use the scikit-learn adapter rather than the generic pyLDAvis.prepare():
# lda.components_ holds unnormalised topic-word weights, whereas
# pyLDAvis.prepare() requires every row of topic_term_dists to sum to 1.
# The adapter normalises those rows and derives the document-topic
# distributions, document lengths, vocabulary and term frequencies from the
# fitted model, the document-term matrix and the vectorizer.
# Left as the last expression so the interactive panel renders as cell output.
pyLDAvis.lda_model.prepare(lda, X, vectorizer)


In [15]:
# Label each review with its most probable topic. The stored ids are the
# 0-based row indices of lda.components_, i.e. one less than the
# "Topic <n>" numbers printed by display_topics_improved.
doc_topic_dists = lda.transform(X)
df['topic'] = doc_topic_dists.argmax(axis=1)

# Export the labelled reviews to the current working directory.
df.to_csv("reviews_with_topics.csv", index=False)


In [None]:
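# In the pyLDAvis plot, the bubble labelled 1 is the largest, which means that topic accounts for
# the largest share of the tokens in the dataset (not necessarily a majority of the reviews).
# NOTE: by default pyLDAvis renumbers topics by size (sort_topics=True), so its "topic 1" is not
# guaranteed to be the "Topic 1" printed above; check the top words before matching the two.
# Topic 1 (as printed above):
# ['cable', 'good', 'charging', 'product', 'quality', 'fast', 'usb', 'charge', 'work', 'charger']

# It is likely about charging accessories.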