In [1]:
#Import Libraries

#for reading and data-manipulation
import os
import numpy as np
import pandas as pd

In [2]:
#for visualization
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from wordcloud import WordCloud
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

In [3]:
#for data preprocessing
import time
from contractions import contractions_dict
import re
from collections import Counter
from wordcloud import STOPWORDS
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
# for ML model Implementation
from gensim import corpora
from gensim.models import CoherenceModel, LdaModel
from gensim.models.coherencemodel import CoherenceModel
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import ParameterGrid
from sklearn.decomposition import NMF
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from bertopic import BERTopic
from sklearn.metrics import silhouette_score
import hdbscan
from sentence_transformers import SentenceTransformer
from umap import UMAP



In [5]:
import warnings
warnings.simplefilter("ignore", category=DeprecationWarning)
# warnings.simplefilter("ignore", category=SettingWithCopyWarning)


In [6]:
df = pd.read_csv('Cell_Phone_Reviews/phone_user_review_file_1.csv', encoding='ISO-8859-1')
df.head()

Unnamed: 0,phone_url,date,lang,country,source,domain,score,score_max,extract,author,product
0,/cellphones/samsung-galaxy-s8/,5/2/2017,en,us,Verizon Wireless,verizonwireless.com,10.0,10.0,As a diehard Samsung fan who has had every Sam...,CarolAnn35,Samsung Galaxy S8
1,/cellphones/samsung-galaxy-s8/,4/28/2017,en,us,Phone Arena,phonearena.com,10.0,10.0,Love the phone. the phone is sleek and smooth ...,james0923,Samsung Galaxy S8
2,/cellphones/samsung-galaxy-s8/,5/4/2017,en,us,Amazon,amazon.com,6.0,10.0,Adequate feel. Nice heft. Processor's still sl...,R. Craig,"Samsung Galaxy S8 (64GB) G950U 5.8"" 4G LTE Unl..."
3,/cellphones/samsung-galaxy-s8/,5/2/2017,en,us,Samsung,samsung.com,9.2,10.0,Never disappointed. One of the reasons I've be...,Buster2020,Samsung Galaxy S8 64GB (AT&T)
4,/cellphones/samsung-galaxy-s8/,5/11/2017,en,us,Verizon Wireless,verizonwireless.com,4.0,10.0,I've now found that i'm in a group of people t...,S Ate Mine,Samsung Galaxy S8


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 374910 entries, 0 to 374909
Data columns (total 11 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   phone_url  374910 non-null  object 
 1   date       374910 non-null  object 
 2   lang       374910 non-null  object 
 3   country    374910 non-null  object 
 4   source     374910 non-null  object 
 5   domain     374910 non-null  object 
 6   score      366691 non-null  float64
 7   score_max  366691 non-null  float64
 8   extract    371934 non-null  object 
 9   author     371630 non-null  object 
 10  product    374910 non-null  object 
dtypes: float64(2), object(9)
memory usage: 31.5+ MB


In [8]:
df = df[df['product'].str.contains('Samsung', na=False)]
df = df[df['lang']=='en']

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 64368 entries, 0 to 369253
Data columns (total 11 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   phone_url  64368 non-null  object 
 1   date       64368 non-null  object 
 2   lang       64368 non-null  object 
 3   country    64368 non-null  object 
 4   source     64368 non-null  object 
 5   domain     64368 non-null  object 
 6   score      64359 non-null  float64
 7   score_max  64359 non-null  float64
 8   extract    64162 non-null  object 
 9   author     64331 non-null  object 
 10  product    64368 non-null  object 
dtypes: float64(2), object(9)
memory usage: 5.9+ MB


In [10]:
df = df[['phone_url', 'score', 'extract']].dropna()
df.rename(columns={'phone_url': 'unique_identifier', 'score': 'score', 'extract': 'brief_review'}, inplace=True)

In [11]:
df.head()

Unnamed: 0,unique_identifier,score,brief_review
0,/cellphones/samsung-galaxy-s8/,10.0,As a diehard Samsung fan who has had every Sam...
1,/cellphones/samsung-galaxy-s8/,10.0,Love the phone. the phone is sleek and smooth ...
2,/cellphones/samsung-galaxy-s8/,6.0,Adequate feel. Nice heft. Processor's still sl...
3,/cellphones/samsung-galaxy-s8/,9.2,Never disappointed. One of the reasons I've be...
4,/cellphones/samsung-galaxy-s8/,4.0,I've now found that i'm in a group of people t...


In [12]:
# Handling missing values
df.isnull().sum()

unique_identifier    0
score                0
brief_review         0
dtype: int64

In [13]:
#handling duplicate values 
len(df[df.duplicated(subset=['brief_review'])])

861

In [14]:
len(df[df.duplicated(subset=['unique_identifier'])])


64108

In [15]:
# Drop duplicate 
df_cleaned = df.drop_duplicates(subset = ["brief_review", "unique_identifier"])
print(df_cleaned.head())

                unique_identifier  score  \
0  /cellphones/samsung-galaxy-s8/   10.0   
1  /cellphones/samsung-galaxy-s8/   10.0   
2  /cellphones/samsung-galaxy-s8/    6.0   
3  /cellphones/samsung-galaxy-s8/    9.2   
4  /cellphones/samsung-galaxy-s8/    4.0   

                                        brief_review  
0  As a diehard Samsung fan who has had every Sam...  
1  Love the phone. the phone is sleek and smooth ...  
2  Adequate feel. Nice heft. Processor's still sl...  
3  Never disappointed. One of the reasons I've be...  
4  I've now found that i'm in a group of people t...  


In [16]:
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 63611 entries, 0 to 369253
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   unique_identifier  63611 non-null  object 
 1   score              63611 non-null  float64
 2   brief_review       63611 non-null  object 
dtypes: float64(1), object(2)
memory usage: 1.9+ MB


In [17]:
# Expand Contraction

# Function to expand contractions using the contractions_dict
def expand_contractions(text):
    # Regular expression pattern to match contractions
    contractions_pattern = re.compile('({})'.format('|'.join(contractions_dict.keys())),
                                      flags=re.IGNORECASE | re.DOTALL)

    def expand_match(contraction):
        match = contraction.group(0)
        expanded = contractions_dict.get(match.lower())
        return expanded

    expanded_text = contractions_pattern.sub(expand_match, text)
    return expanded_text

# Apply the expand_contractions function to the "brief_review" column
df_cleaned['brief_review'] = df_cleaned['brief_review'].apply(expand_contractions)

In [18]:
# Convert text in the "brief_review" column to lowercase
df_cleaned['brief_review'] = df_cleaned['brief_review'].str.lower()

In [19]:
# Remove Punctuations

# Function to remove punctuations from text
def remove_punctuations(text):
    # Create a translation table to remove punctuations
    translator = str.maketrans('', '', string.punctuation +'\n')

    # Apply the translation table to remove punctuations
    text_without_punctuations = text.translate(translator)
    return text_without_punctuations

# Apply the remove_punctuations function to the "brief_review" column
df_cleaned['brief_review'] = df_cleaned['brief_review'].apply(remove_punctuations)

In [20]:
# Function to remove URLs from text
def remove_urls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return re.sub(url_pattern, '', text)

# Function to remove words containing digits from text
def remove_words_with_digits(text):
    return ' '.join(word for word in text.split() if not any(char.isdigit() for char in word))

# Function to remove non-ASCII characters (special characters)
def remove_special_characters(text):
    # Replace non-ASCII characters with a space
    return re.sub(r'[^\x00-\x7F]+', ' ', text)

# Apply the remove_urls function to the "brief_review" column
df_cleaned['brief_review'] = df_cleaned['brief_review'].apply(remove_urls)

# Apply the remove_words_with_digits function to the "brief_review" column
df_cleaned['brief_review'] = df_cleaned['brief_review'].apply(remove_words_with_digits)

# Apply the remove_special_characters function to the "brief_review" column
df_cleaned['brief_review'] = df_cleaned['brief_review'].apply(remove_special_characters)

# Verify the cleaned data
df_cleaned['brief_review'].head()


0    as a diehard samsung fan who has had every sam...
1    love the phone the phone is sleek and smooth a...
2    adequate feel nice heft processors still slugg...
3    never disappointed one of the reasons been a l...
4    now found that in a group of people that have ...
Name: brief_review, dtype: object

In [21]:
#THIS IS THE CODE TO REMOVE THE REVIEWS WITH WORD COUNT<=5
# Function to filter reviews based on word count
def filter_short_reviews(text, min_word_count=5):
    return len(text.split()) >= min_word_count

# Apply the filter function to the "brief_review" column and keep only reviews with word count >= 5
df_cleaned = df_cleaned[df_cleaned['brief_review'].apply(filter_short_reviews)]

# Verify the filtered data
df_cleaned['brief_review'].head()


0    as a diehard samsung fan who has had every sam...
1    love the phone the phone is sleek and smooth a...
2    adequate feel nice heft processors still slugg...
3    never disappointed one of the reasons been a l...
4    now found that in a group of people that have ...
Name: brief_review, dtype: object

In [22]:
 # Remove Stopwords
# Download the list of stopwords if not already downloaded
nltk.download('stopwords')

# Get the list of English stopwords
stop_words = set(stopwords.words('english'))

# Add unnecessary words to the list of English stopwords
stop_unnecessary_words = stop_words.union(set(['mr', 'people', 'would', 'year', 'said', 'say', 'also', 'wale', 'could', 'chars']))


# Function to remove stopwords from text
def remove_stopwords(text):
    words = text.split()
    words = [word for word in words if len(word)>2]
    filtered_words = [word for word in words if word.lower() not in stop_unnecessary_words]
    return ' '.join(filtered_words)

# Apply the remove_stopwords function to the "brief_review" column
df_cleaned['brief_review'] = df_cleaned['brief_review'].apply(remove_stopwords)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/binitkc/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [23]:
# Remove white spaces from the "brief_review" column
df_cleaned['brief_review'] = df_cleaned['brief_review'].str.replace('\s+', ' ', regex=True)

In [24]:
# Shuffle the data
df_shuffled = df_cleaned.sample(frac=1, random_state=42).reset_index(drop=True)
df_shuffled.head()

Unnamed: 0,unique_identifier,score,brief_review
0,/cellphones/samsung-galaxy-grand-prime/,6.0,volume bit low overall good phone
1,/cellphones/samsung-galaxy-s6-edge-sm-g925f/,10.0,good design delivery
2,/cellphones/samsung-galaxy-s6-edge-sm-g925f/,9.6,samsung galaxy edge phone sleek smooth quality...
3,/cellphones/samsung-galaxy-note-4/,9.0,midst researching next sphone found upgrade ve...
4,/cellphones/samsung-galaxy-s6/,10.0,really happy product thank


In [25]:
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 60712 entries, 0 to 369253
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   unique_identifier  60712 non-null  object 
 1   score              60712 non-null  float64
 2   brief_review       60712 non-null  object 
dtypes: float64(1), object(2)
memory usage: 1.9+ MB


In [38]:
# We need to convert the df (series) to list for passing to BERTopic.
sentenceList = df_cleaned["brief_review"].tolist()
samplesentenceList = sentenceList[1:10001]

In [39]:
# Step 1: Embed the Texts
embedding_model = SentenceTransformer('paraphrase-MiniLM-L3-v2')
embeddings = embedding_model.encode(samplesentenceList)

In [40]:
# Step 2: Dimensionality Reduction using UMAP
# Option 1: UMAP
umap_model = UMAP(n_components=5, random_state=42)
reduced_embeddings = umap_model.fit_transform(embeddings)

In [41]:
# KMeans Clustering
kmeans = KMeans(n_clusters=5, random_state=42)
kmeans_labels = kmeans.fit_predict(reduced_embeddings)
kmeans_silhouette = silhouette_score(reduced_embeddings, kmeans_labels)
print(f"KMeans Silhouette Score: {kmeans_silhouette}")

KMeans Silhouette Score: 0.35315001010894775


In [42]:
# HDBSCAN Clustering
hdbscan_model = hdbscan.HDBSCAN(min_cluster_size=5, min_samples=1, cluster_selection_method='eom')
hdbscan_labels = hdbscan_model.fit_predict(reduced_embeddings)

# Note: Silhouette score works best when there are multiple clusters, and HDBSCAN may assign some points as noise (-1 label).
# Filter out noise points for Silhouette calculation
filtered_embeddings = reduced_embeddings[hdbscan_labels != -1]
filtered_labels = hdbscan_labels[hdbscan_labels != -1]

# Only calculate silhouette score if there are clusters
if len(set(filtered_labels)) > 1:
    hdbscan_silhouette = silhouette_score(filtered_embeddings, filtered_labels)
    print(f"HDBSCAN Silhouette Score: {hdbscan_silhouette}")
else:
    print("HDBSCAN resulted in only one cluster or noise.")

HDBSCAN Silhouette Score: 0.3159528076648712


In [43]:
# Hyper-parameter Tuning.


# Step 2: Define hyperparameter grids
kmeans_params = {'n_clusters': [2, 3, 5, 6,  7, 10, 15, 20, 25, 30, 35, 40, 45, 50]}
hdbscan_params = {
    'min_cluster_size': [5, 6, 8, 10, 15, 20, 25, 30],
    'min_samples': [1, 5, 8, 10, 12, 15],
    'cluster_selection_method': ['eom', 'leaf']
}

# Step 3: Evaluate KMeans
kmeans_results = []
for n_clusters in kmeans_params['n_clusters']:
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    labels = kmeans.fit_predict(reduced_embeddings)
    score = silhouette_score(reduced_embeddings, labels)
    kmeans_results.append({'Algorithm': 'KMeans', 'n_clusters': n_clusters, 'Silhouette Score': score})

# Step 4: Evaluate HDBSCAN
hdbscan_results = []
for min_cluster_size in hdbscan_params['min_cluster_size']:
    for min_samples in hdbscan_params['min_samples']:
        for method in hdbscan_params['cluster_selection_method']:
            hdbscan_model = hdbscan.HDBSCAN(
                min_cluster_size=min_cluster_size,
                min_samples=min_samples,
                cluster_selection_method=method
            )
            labels = hdbscan_model.fit_predict(reduced_embeddings)
            # Skip noise points for Silhouette Score calculation
            if len(set(labels)) > 1:  # Ensure there are clusters
                score = silhouette_score(reduced_embeddings, labels)
                hdbscan_results.append({
                    'Algorithm': 'HDBSCAN',
                    'min_cluster_size': min_cluster_size,
                    'min_samples': min_samples,
                    'cluster_selection_method': method,
                    'Silhouette Score': score
                })

# Step 5: Combine results and display in a table
all_results = pd.DataFrame(kmeans_results + hdbscan_results)
sorted_results = all_results.sort_values(by='Silhouette Score', ascending=False)

# print(sorted_results)


In [44]:
# Print the best result
best_result = sorted_results.iloc[0]
print("\nBest Result:")
print(best_result)


Best Result:
Algorithm                     KMeans
n_clusters                       6.0
Silhouette Score            0.376034
min_cluster_size                 NaN
min_samples                      NaN
cluster_selection_method         NaN
Name: 3, dtype: object


In [45]:
# Define KMeans model with the best parameters (e.g., n_clusters=5 from your tuning results)
kmeans_model = KMeans(
    n_clusters=6,  # Best parameter based on your tuning
    random_state=42,
    prediction_data=True
)

In [46]:
topic_model = BERTopic(
    embedding_model="paraphrase-MiniLM-L3-v2",  # or another pre-trained model
    hdbscan_model=kmeans_model,
    calculate_probabilities=True,
    verbose=True
)

In [47]:
topics, probs = topic_model.fit_transform(samplesentenceList)

2024-11-17 10:24:41,009 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

2024-11-17 10:24:49,042 - BERTopic - Embedding - Completed ✓
2024-11-17 10:24:49,042 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-11-17 10:24:56,857 - BERTopic - Dimensionality - Completed ✓
2024-11-17 10:24:56,858 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-11-17 10:24:56,879 - BERTopic - Cluster - Completed ✓
2024-11-17 10:24:56,883 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-11-17 10:24:56,960 - BERTopic - Representation - Completed ✓


In [48]:
# Now retrieve topics and their words
from gensim.corpora.dictionary import Dictionary
topic_words = []
for topic_id in range(len(topic_model.get_topics())):
    # `get_topic` should return a list of word-score pairs for each topic
    topic = topic_model.get_topic(topic_id)
    if topic:  # Make sure the topic is not empty
        words = [word for word, _ in topic]
        topic_words.append(words)

# Tokenize the original documents for coherence calculation
tokenized_docs = [doc.split() for doc in samplesentenceList]

# Create a dictionary and calculate coherence
dictionary = Dictionary(tokenized_docs)
coherence_model = CoherenceModel(
    topics=topic_words,
    texts=tokenized_docs,
    dictionary=dictionary,
    coherence="c_v"
)
coherence_score = coherence_model.get_coherence()
print(f"Coherence Score: {coherence_score}")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Coherence Score: 0.46803253675543327


In [49]:
def topic_diversity_score(topic_model, top_n_words=10):
    all_words = []
    for topic in topic_model.get_topics().values():
        words = [word for word, _ in topic[:top_n_words]]
        all_words.extend(words)
    unique_words = set(all_words)
    return len(unique_words) / len(all_words)

diversity_score = topic_diversity_score(topic_model)
print(f"Topic Diversity Score: {diversity_score}")


Topic Diversity Score: 0.5333333333333333


In [50]:
# Specify the path where you want to save the model
model_path = "Trained_models/bertopic_samsung_model"

# Save the trained BERTopic model
topic_model.save(model_path)




In [51]:
model_path = "Trained_models/bertopic_samsung_model"

In [52]:
# Load the saved BERTopic model
topic_model = BERTopic.load(model_path)

In [53]:
print(f"Model calculate_probabilities: {topic_model.calculate_probabilities}")

Model calculate_probabilities: True


In [54]:


# Example new document
new_document = "The camera is good "

# Get the topic and probability distribution
topic, probabilities = topic_model.transform([new_document])

# Extract topic names and probabilities
topics = topic_model.get_topic_info()  # Get topic names
topic_labels = topics.set_index("Topic")["Name"].to_dict()
topic_labels = {k: topic_labels[k] if k in topic_labels else f"Topic {k}" for k in range(len(probabilities[0]))}

# Prepare data for visualization
top_topics = {topic_labels[i]: prob for i, prob in enumerate(probabilities[0]) if prob > 0.01}  # Filter significant probabilities
labels = list(top_topics.keys())
values = list(top_topics.values())

# Plot pie chart
plt.figure(figsize=(8, 8))
plt.pie(values, labels=labels, autopct='%1.1f%%', startangle=140)
plt.title("Topic Distribution for New Document")
plt.show()


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2024-11-17 10:25:39,092 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2024-11-17 10:25:39,555 - BERTopic - Dimensionality - Completed ✓
2024-11-17 10:25:39,556 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2024-11-17 10:25:39,561 - BERTopic - Cluster - Completed ✓


TypeError: 'NoneType' object is not subscriptable