In [1]:
import numpy as np
import pandas as pd
import plotly.express as px

In [2]:
from google.colab import drive

In [3]:
# Connect to Google Drive
drive.mount('/content/gdrive')
google_dir = '/content/gdrive/MyDrive/ANLP'

Mounted at /content/gdrive


In [4]:
# Copy data file to Colab instance to avoid quota issues with Google Drive
!cp '/content/gdrive/MyDrive/ANLP/Reviews.csv' '/content/'

In [5]:
# Load and check data
data = pd.read_csv('/content/Reviews.csv')
data.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [6]:
# Filter out reviews with helpfulness votes
helpful_df = data.loc[data['HelpfulnessDenominator'] >= 10].copy()

# Create Target Variable
helpful_df['helpful_score'] = helpful_df['HelpfulnessNumerator'] / helpful_df['HelpfulnessDenominator']

In [7]:
import spacy
import spacy.cli

In [8]:
# Using the large model over the standard version to get vectors. Requires additional install.
spacy.cli.download('en_core_web_lg')
nlp = spacy.load('en_core_web_lg')

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


In [9]:
# Apply spacy's nlp function to text. 
# Note: approx. 14mins load time. 
helpful_df['nlp'] = helpful_df['Text'].apply(nlp)

In [10]:
!pip install bertopic

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting bertopic
  Downloading bertopic-0.14.1-py2.py3-none-any.whl (120 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m120.7/120.7 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
Collecting hdbscan>=0.8.29
  Downloading hdbscan-0.8.29.tar.gz (5.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m57.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting umap-learn>=0.5.0
  Downloading umap-learn-0.5.3.tar.gz (88 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.2/88.2 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentence-transformers>=0.4.1
  Downloading sentence-transformers-2.

In [11]:
from bertopic import BERTopic
from umap import UMAP

In [12]:
helpful_df['text_processed'] = helpful_df['nlp'].apply(lambda x: [word.lemma_ for word in x if not word.is_stop])

In [13]:
helpful_df['text_processed'] = helpful_df['text_processed'].apply((' ').join)

In [14]:
# Model fit
# Runtime approx. 15mins
X_train = helpful_df['text_processed']

umap_model = UMAP(random_state=99)
topic_model = BERTopic(embedding_model=nlp, umap_model=umap_model)
topics, probs = topic_model.fit_transform(X_train)

In [82]:
fig = topic_model.visualize_topics()
fig.show()

In [71]:
distance_map = pd.DataFrame({'x': fig.data[0]['x'],
                             'y': fig.data[0]['y']})
distance_map

Unnamed: 0,x,y
0,7.812526,3.617671
1,16.664549,-6.151808
2,2.591467,12.624893
3,9.573107,20.802774
4,16.632582,-6.129272
...,...,...
564,9.685658,0.549910
565,0.284355,2.986609
566,2.273794,-4.895193
567,2.786040,12.433862


In [16]:
topic_details = pd.concat([helpful_df[['helpful_score']].reset_index(),
                           topic_model.get_document_info(X_train)], axis=1)
topic_details.head()

Unnamed: 0,index,helpful_score,Document,Topic,Name,Top_n_words,Probability,Representative_document
0,32,1.0,McCann Instant Oatmeal great oatmeal scrape mi...,-1,-1_taste_br_water_tea,taste - br - water - tea - like - product - ch...,0.0,False
1,33,1.0,good instant oatmeal good oatmeal brand . us...,513,513_coco_pb2_dissappointe_spoon,coco - pb2 - dissappointe - spoon - pretzel - ...,1.0,False
2,82,1.0,"know product title say Molecular Gastronomy , ...",-1,-1_taste_br_water_tea,taste - br - water - tea - like - product - ch...,0.0,False
3,158,0.894737,"< span class=""tiny "" > length : : 0:26 Mins < ...",-1,-1_taste_br_water_tea,taste - br - water - tea - like - product - ch...,0.0,False
4,213,0.3,"canidae , Felidae change formula . cat like ...",-1,-1_taste_br_water_tea,taste - br - water - tea - like - product - ch...,0.0,False


In [50]:
rank_topics = topic_details.groupby('Topic')['helpful_score'].mean()
rank_topics = rank_topics.sort_values(ascending=False)
rank_topics = pd.merge(left=rank_topics.reset_index(),
                       right=topic_model.get_topic_info(),
                       how='left',
                       on='Topic')

In [51]:
rank_topics.head(10)

Unnamed: 0,Topic,helpful_score,Count,Name
0,504,1.0,12,504_throwing_barf_pate_kirkland
1,281,1.0,20,281_russel_mutt_bone_petsmart
2,548,1.0,11,548_redesign_brewer_adapter_keurig
3,278,1.0,20,278_infusion_fittingly_hay_unadulterated
4,537,1.0,11,537_prescription_tremendously_vet_okay
5,430,1.0,14,430_hey_unsalted_salsa_accident
6,184,1.0,25,184_midwife_labor_trimester_centimeter
7,471,1.0,12,471_months_pile_beg_shed
8,479,1.0,12,479_obligate_companion_massive_carnivore
9,354,0.995192,16,354_fierce_advocate_fulfill_comprise


In [53]:
rank_topics.tail(10)

Unnamed: 0,Topic,helpful_score,Count,Name
560,558,0.159324,10,558_overpriced_eternity_borderline_paranoid
561,546,0.130303,11,546_srewe_covering_ring_safe
562,404,0.113534,15,404_cancel_ignored_sent_idiot
563,171,0.102897,26,171_garlic_onion_jerky_primal
564,489,0.077381,12,489_ale_beverage_ginger_people
565,186,0.076923,25,186_sage_gag_nasty_not
566,401,0.066667,15,401_ladies_cloyingly_sophisticated_unexpected
567,364,0.06,15,364_delicately_nearby_munch_buttery
568,195,0.041667,25,195_pregnant_pregnancy_fan_maybe
569,174,0.00601,26,174_charge_receive_week_product


In [18]:
rank_topics.sort_values('Count', ascending=False).head(20)

Unnamed: 0,Topic,helpful_score,Count,Name
328,-1,0.767733,9948,-1_taste_br_water_tea
447,0,0.666812,269,0_00_95_99_19
220,1,0.829051,233,1_pb_pb2_reg_gummy
267,2,0.805141,232,2_oil_olive_coconut_virgin
437,3,0.680259,150,3_hungry_science_percent_deliberately
458,4,0.656721,145,4_energy_ostrich_pros_hour
443,5,0.669734,141,5_00_2l_equilent_equilalent
144,6,0.87321,140,6_raman_soup_rice_speck
204,7,0.837659,136,7_tree_plant_grow_garden
494,8,0.590704,117,8_email_service_send_order


In [19]:
# Create dataframe of vectorised text.
vectors = helpful_df['nlp'].apply(lambda x: x.vector)
vec_df = vectors.apply(pd.Series)
vec_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
32,-1.323802,1.032652,-2.337208,-0.111059,3.76683,-0.041592,0.173712,3.87851,-0.715135,-0.504095,...,0.677222,-1.280813,1.062335,-0.518964,-1.577349,0.981302,0.494414,-0.455574,-2.148609,0.903731
33,-1.340788,1.367154,-2.432186,0.159615,3.363936,-0.144899,0.814567,3.691298,-1.419114,-0.034321,...,0.70832,-0.879604,1.487295,-0.799887,-1.704227,0.51126,0.882618,0.044586,-2.574825,1.159596
82,-1.105859,0.835509,-2.696097,-0.247309,1.856147,-0.053594,0.384191,4.065519,-2.479806,0.900214,...,0.581942,-0.722913,0.525695,-1.452901,-2.035355,0.041754,0.817126,0.323921,-3.517361,1.079162
158,-1.37346,0.747225,-1.2656,-0.159294,2.669593,0.045321,1.209111,3.1696,-2.023155,-0.155882,...,1.271905,0.149229,1.511957,-0.618358,-1.302749,1.035043,1.441601,-0.496323,-2.476612,0.582859
213,-1.334758,1.621257,-3.277381,-0.617078,3.131202,0.339972,-0.001275,3.923903,-1.103019,0.307257,...,0.962377,-0.649345,0.305418,-0.671857,-1.519091,0.11125,0.235039,-0.734187,-3.506289,1.445702


In [20]:
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA
from sklearn.neighbors import NearestNeighbors

In [77]:

pca_distance = PCA(n_components=0.9)
pca_dist_fit = pca_distance.fit_transform(distance_map)

silhouette_dist = []

for k in range(2, 33):
  # initialise kmeans
  kmeans_dist = KMeans(init="random", n_clusters=k, n_init=10, max_iter=500, random_state=99)
  kmeans_dist.fit(pca_dist_fit)
  cluster_labels = kmeans_dist.labels_
 
  # silhouette score
  silhouette_dist.append(silhouette_score(pca_dist_fit, cluster_labels))

# Plot
fig = px.line(x=range(2, 33), y=silhouette_dist, 
              labels={'x': 'K', 'y': 'Silhouette Coefficient'},
              title='Finding K with Silhouette Method',
              width=950
              )
fig.update_layout(template='plotly_dark')
fig.show()

In [116]:
# Clustering algorithm
k = 24 
kmeans_dist = KMeans(init="random", n_clusters=k, n_init=10, max_iter=500, random_state=99)
kmeans_dist.fit(pca_dist_fit)

# Merge results 
distance_map['kmeans'] = kmeans_dist.labels_
distance_map['kmeans'] = distance_map['kmeans'].astype(str)
distance_map['helpful'] = rank_topics['helpful_score']
distance_map

Unnamed: 0,x,y,kmeans,helpful
0,7.812526,3.617671,17,1.000000
1,16.664549,-6.151808,19,1.000000
2,2.591467,12.624893,10,1.000000
3,9.573107,20.802774,3,1.000000
4,16.632582,-6.129272,19,1.000000
...,...,...,...,...
564,9.685658,0.549910,5,0.077381
565,0.284355,2.986609,18,0.076923
566,2.273794,-4.895193,15,0.066667
567,2.786040,12.433862,10,0.060000


In [118]:
fig_dist = px.scatter(distance_map, x='x', y='y', color='kmeans',
                      height=fig.layout['height'], width=fig.layout['width'])


fig_dist.update_traces(marker=dict(size=fig.data[0]['marker']['size'],
                                   line=dict(width=2, color='DarkSlateGrey'),
                                   sizemode='area',
                                   sizeref=0.5
                                   ),
                       customdata=fig.data[0]['customdata'],
                       hovertemplate=fig.data[0]['hovertemplate']
                      )


fig_dist.update_layout(showlegend=False, template='plotly_white')
fig_dist.show()

In [120]:
fig_dist = px.scatter(distance_map, x='x', y='y', color='helpful',
                      height=fig.layout['height'], width=fig.layout['width'])


fig_dist.update_traces(marker=dict(size=fig.data[0]['marker']['size'],
                                   line=dict(width=2, color='DarkSlateGrey'),
                                   sizemode='area',
                                   sizeref=0.2
                                   ),
                       customdata=fig.data[0]['customdata'],
                       hovertemplate=fig.data[0]['hovertemplate']
                      )


fig_dist.update_layout(showlegend=False, template='plotly_white')
fig_dist.show()

In [58]:
remove_outlier = vec_df.loc[topic_details.loc[topic_details['Topic'] != -1, 'index']]
pca = PCA(n_components=0.9)
pca_fitted = pca.fit_transform(remove_outlier)

In [59]:
pca.n_components_

50

In [60]:
# Use nearest neighbors to find a good epsilon input

# Fit Model
nn_model = NearestNeighbors(n_neighbors=2)
nn = nn_model.fit(pca_fitted)
distances, _indices = nn.kneighbors(pca_fitted)

# Sort and plot distances
distances = np.sort(distances, axis=0)
fig = px.line(distances[:, 1],
              labels={'ner_len': 'Sentence Length'},
              color_discrete_sequence=px.colors.qualitative.Dark24,
              title='Distance of Nearest Neighbours')
fig.update_layout(template='plotly_dark')
fig.show()

In [61]:
# Rule of thumb: >= dimensions + 1
min_samples = pca.n_components_ + 2

# Clustering model
clustering = DBSCAN(eps=11.5, min_samples=min_samples).fit(pca_fitted)

In [63]:
# Merge results 
new_df = topic_details.loc[topic_details['Topic'] != -1].copy()

new_df['dbscan'] = clustering.labels_
new_df.head()

Unnamed: 0,index,helpful_score,Document,Topic,Name,Top_n_words,Probability,Representative_document,dbscan,kmeans
1,33,1.0,good instant oatmeal good oatmeal brand . us...,513,513_coco_pb2_dissappointe_spoon,coco - pb2 - dissappointe - spoon - pretzel - ...,1.0,False,0,1
6,324,0.263158,cancel order . cancel problem . positive n...,404,404_cancel_ignored_sent_idiot,cancel - ignored - sent - idiot - delay - requ...,1.0,False,0,0
7,381,0.538462,condiment overpriced terrible . classic disgus...,20,20_sauce_chili_cottage_wanh,sauce - chili - cottage - wanh - articifial - ...,1.0,False,0,1
8,522,0.914894,discover Kettle chip sea salt vinegar shop Tra...,111,111_kettle_chip_mindy_prophecy,kettle - chip - mindy - prophecy - poetic - dr...,0.865643,True,0,0
9,523,0.866667,need salt hide taste potato chip . chip prove ...,402,402_potato_chip_mask_hide,potato - chip - mask - hide - prove - trans - ...,0.904373,True,0,0


In [64]:
new_df.groupby('dbscan').agg({'index':'count', 'helpful_score': ['mean', 'median']})

Unnamed: 0_level_0,index,helpful_score,helpful_score
Unnamed: 0_level_1,count,mean,median
dbscan,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
-1,275,0.464024,0.461538
0,14688,0.767949,0.9
1,71,0.50252,0.5


In [65]:
frequent_topics = new_df.groupby(['dbscan', 'Name'], as_index=False)['index'].count()
frequent_topics = frequent_topics.sort_values(['dbscan', 'index'], ascending=[True, False])
frequent_topics.groupby('dbscan').head(5)

Unnamed: 0,dbscan,Name,index
0,-1,0_00_95_99_19,54
16,-1,186_sage_gag_nasty_not,25
47,-1,431_99_49_19_wrong,13
23,-1,228_isolate_soy_tapicoa_transfat,11
21,-1,213_watmore_resifet_fest_ken,10
183,0,1_pb_pb2_reg_gummy,233
294,0,2_oil_olive_coconut_virgin,232
73,0,0_00_95_99_19,213
405,0,3_hungry_science_percent_deliberately,150
516,0,4_energy_ostrich_pros_hour,143


In [37]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

In [66]:
silhouette_avg = []

for k in range(2, 12):
  # initialise kmeans
  kmeans = KMeans(init="random", n_clusters=k, n_init=10, max_iter=500, random_state=99)
  kmeans.fit(pca_fitted)
  cluster_labels = kmeans.labels_
 
  # silhouette score
  silhouette_avg.append(silhouette_score(pca_fitted, cluster_labels))

In [67]:
# Plot
fig = px.line(x=range(2, 12), y=silhouette_avg, 
              labels={'x': 'K', 'y': 'Silhouette Coefficient'},
              title='Finding K with Silhouette Method',
              width=950
              )
fig.update_layout(template='plotly_dark')
fig.show()

In [41]:
# Clustering algorithm
k = 3 
kmeans = KMeans(init="random", n_clusters=k, n_init=10, max_iter=500, random_state=99)
kmeans.fit(pca_fitted)

# Merge results 
topic_details['kmeans'] = kmeans.labels_
topic_details.groupby('kmeans').agg({'index':'count', 'helpful_score': ['mean', 'median']})

Unnamed: 0_level_0,index,helpful_score,helpful_score
Unnamed: 0_level_1,count,mean,median
kmeans,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
0,10194,0.733172,0.882353
1,12480,0.79497,0.909091
2,2308,0.730132,0.858804
