In [None]:
# !pip install -U spacy
# !python -m spacy download en_core_web_sm
# !pip install spacy matplotlib seaborn wordcloud scikit-learn ipywidgets

In [None]:
# !pip install gensim

In [None]:
# !pip install --upgrade numpy==1.23.5 scipy==1.9.3 gensim==4.3.0

In [5]:
!pip install bertopic

Collecting bertopic
  Downloading bertopic-0.17.0-py3-none-any.whl.metadata (23 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloa

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


LatentDirichletAllocation

In [None]:
import pandas as pd
import re
import spacy
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
# import ipywidgets as widgets
from IPython.display import display, Markdown

df = pd.read_csv("/content/iphone.csv")  # Adjust if your path changes
# 1. Load data
reviews = df['reviewDescription'].dropna()

# 2. Load spaCy model
nlp = spacy.load('en_core_web_sm')

# 3. Clean & Lemmatize
def clean_and_lemmatize(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = re.sub(r'\b\w{1,2}\b', '', text)
    text = text.lower()
    doc = nlp(text)
    return ' '.join([token.lemma_ for token in doc if not token.is_stop])

cleaned_reviews = reviews.apply(clean_and_lemmatize)

# 4. Vectorize
vectorizer = CountVectorizer(
    stop_words='english',
    ngram_range=(1, 3),
    max_df=0.8,
    min_df=5,
    max_features=3000
)
X = vectorizer.fit_transform(cleaned_reviews)

# 5. LDA Model
n_topics = 5
lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
lda.fit(X)

# 6. Topic distribution
topic_distribution = lda.transform(X)
dominant_topics = np.argmax(topic_distribution, axis=1)

# 7. Keyword extraction
feature_names = vectorizer.get_feature_names_out()
topic_keywords = {}
keyword_to_topic = {}

for topic_idx, topic in enumerate(lda.components_):
    top_keywords = [feature_names[i] for i in topic.argsort()[:-11:-1]]
    topic_keywords[topic_idx] = top_keywords
    for kw in top_keywords:
        keyword_to_topic[kw] = topic_idx

# 8. Dropdown keyword list
dropdown_keywords = sorted(set(keyword_to_topic.keys()))

# 11. Optional WordCloud per Topic
def show_wordcloud(topic_idx):
    words = ' '.join(topic_keywords[topic_idx])
    wc = WordCloud(width=400, height=200, background_color='white').generate(words)
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    plt.title(f"WordCloud for Topic {topic_idx}")
    plt.show()

# 12. UI: Dropdown + Output
output = widgets.Output()

def display_reviews(change):
    with output:
        output.clear_output()
        if not change.new:
            return
        keyword = change.new
        topic_idx = keyword_to_topic[keyword]
        relevant_idxs = np.where(dominant_topics == topic_idx)[0]

        print(f"\n🔍 Reviews mentioning: '{keyword}' (Topic {topic_idx})\n")
        count = 0
        for idx in relevant_idxs:
            raw_review = reviews.iloc[idx].lower()
            if keyword in raw_review:
                print(f"• {reviews.iloc[idx]}\n")
                count += 1
            if count >= 5:
                break
        if count == 0:
            print("No reviews found mentioning this keyword.")

        # Show WordCloud and Top Words
        show_wordcloud(topic_idx)

# # Dropdown widget
# dropdown = widgets.Dropdown(
#     options=dropdown_keywords,
#     description='🔑 Keyword:',
#     layout=widgets.Layout(width='50%'),
#     style={'description_width': 'initial'}
# )

# dropdown.observe(display_reviews, names='value')

# # UI Display
# display(Markdown("## 🧠 Topic Modeling Review Explorer"))
# display(Markdown("Select a keyword to explore related reviews and topic details."))
# display(dropdown)
# display(output)

## 🧠 Topic Modeling Review Explorer

Select a keyword to explore related reviews and topic details.

Dropdown(description='🔑 Keyword:', layout=Layout(width='50%'), options=('amazon', 'android', 'apple', 'battery…

Output()

**USING BERTTOPIC**





In [None]:
import pandas as pd
import re
import spacy
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import numpy as np
# import ipywidgets as widgets
from IPython.display import display, Markdown
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Load spaCy model for lemmatization
nlp = spacy.load('en_core_web_sm')

def clean_and_lemmatize(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = re.sub(r'\b\w{1,2}\b', '', text)
    text = text.lower()
    doc = nlp(text)
    return ' '.join([token.lemma_ for token in doc if not token.is_stop])

# Preprocess reviews
reviews = df['reviewDescription'].dropna()
cleaned_reviews = reviews.apply(clean_and_lemmatize).tolist()

# Load sentence transformer model (better embeddings)
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Create embeddings
embeddings = embedding_model.encode(cleaned_reviews, show_progress_bar=True)

# Initialize BERTopic with tuned parameters
topic_model = BERTopic(
    embedding_model=embedding_model,
    nr_topics="auto",
    min_topic_size=30,
    calculate_probabilities=True,
    verbose=True
)

# Fit model on data and embeddings
topics, probs = topic_model.fit_transform(cleaned_reviews, embeddings)

# Extract topic info and keywords
topic_info = topic_model.get_topic_info()
topic_keywords = {}
for topic_id in topic_info['Topic'].unique():
    if topic_id == -1:  # outlier topic
        continue
    keywords = topic_model.get_topic(topic_id)
    topic_keywords[topic_id] = [kw[0] for kw in keywords]

# Map keywords to topics for dropdown
keyword_to_topic = {}
for tid, kws in topic_keywords.items():
    for kw in kws:
        keyword_to_topic[kw] = tid

dropdown_keywords = sorted(keyword_to_topic.keys())

# === Topic Coherence (TC) Calculation ===
def topic_coherence(topic_words, embedding_model):
    """
    Computes average pairwise cosine similarity between topic words' embeddings.
    Higher means more coherent topic.
    """
    # Embed each word (sentence transformer expects sentences, so words are fine)
    word_embeddings = embedding_model.encode(topic_words)
    # Compute pairwise cosine similarity matrix
    sim_matrix = cosine_similarity(word_embeddings)
    # Extract upper triangle without diagonal
    n = len(topic_words)
    upper_triangle_sims = [sim_matrix[i, j] for i in range(n) for j in range(i+1, n)]
    # Average similarity
    return np.mean(upper_triangle_sims)

# Compute TC for all topics
tc_scores = []
for tid, kws in topic_keywords.items():
    tc = topic_coherence(kws, embedding_model)
    tc_scores.append((tid, tc))

# === Topic Diversity (TD) Calculation ===
all_keywords = [kw for kws in topic_keywords.values() for kw in kws]
unique_keywords = set(all_keywords)
td_score = len(unique_keywords) / len(all_keywords)

# Display TC and TD scores
print("Topic Coherence (per topic):")
for tid, score in tc_scores:
    print(f"  Topic {tid}: {score:.4f}")
print(f"\nTopic Diversity (TD) score: {td_score:.4f}")

# WordCloud function
def show_wordcloud(topic_id):
    words = ' '.join(topic_keywords.get(topic_id, []))
    wc = WordCloud(width=600, height=300, background_color='white').generate(words)
    plt.figure(figsize=(6, 3))
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    plt.title(f"WordCloud for Topic {topic_id}")
    plt.show()

# Output widget for reviews
output = widgets.Output()

def display_reviews(change):
    with output:
        output.clear_output()
        if not change.new:
            return
        keyword = change.new
        topic_id = keyword_to_topic[keyword]

        relevant_idxs = [i for i, t in enumerate(topics) if t == topic_id]

        print(f"\n🔍 Reviews mentioning keyword: '{keyword}' (Topic {topic_id})\n")
        count = 0
        for idx in relevant_idxs:
            raw_review = reviews.iloc[idx].lower()
            if keyword in raw_review:
                print(f"• {reviews.iloc[idx]}\n")
                count += 1
            if count >= 5:
                break
        if count == 0:
            print("No reviews found mentioning this keyword.")

        show_wordcloud(topic_id)

# # Dropdown UI
# dropdown = widgets.Dropdown(
#     options=dropdown_keywords,
#     description='🔑 Keyword:',
#     layout=widgets.Layout(width='50%'),
#     style={'description_width': 'initial'}
# )
# dropdown.observe(display_reviews, names='value')

# # Display UI
# display(Markdown("## 🧠 Improved BERTopic Review Explorer"))
# display(Markdown("Select a keyword to explore related reviews and wordcloud."))
# display(dropdown)
# display(output)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/93 [00:00<?, ?it/s]

2025-06-08 18:10:16,710 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-06-08 18:10:40,769 - BERTopic - Dimensionality - Completed ✓
2025-06-08 18:10:40,770 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-06-08 18:10:41,027 - BERTopic - Cluster - Completed ✓
2025-06-08 18:10:41,028 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-06-08 18:10:41,118 - BERTopic - Representation - Completed ✓
2025-06-08 18:10:41,119 - BERTopic - Topic reduction - Reducing number of topics
2025-06-08 18:10:41,128 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-06-08 18:10:41,180 - BERTopic - Representation - Completed ✓
2025-06-08 18:10:41,182 - BERTopic - Topic reduction - Reduced number of topics from 20 to 20


Topic Coherence (per topic):
  Topic 0: 0.3014
  Topic 1: 0.2860
  Topic 2: 0.3928
  Topic 3: 0.3574
  Topic 4: 0.3534
  Topic 5: 0.3702
  Topic 6: 0.2584
  Topic 7: 0.3269
  Topic 8: 0.3617
  Topic 9: 0.7230
  Topic 10: 0.2796
  Topic 11: 0.3864
  Topic 12: 0.3785
  Topic 13: 0.3028
  Topic 14: 0.5021
  Topic 15: 0.3058
  Topic 16: 0.3006
  Topic 17: 0.4035
  Topic 18: 0.2700

Topic Diversity (TD) score: 0.6947


## 🧠 Improved BERTopic Review Explorer

Select a keyword to explore related reviews and wordcloud.

Dropdown(description='🔑 Keyword:', layout=Layout(width='50%'), options=('', 'aaa', 'aaya', 'activate', 'amazin…

Output()