In [45]:
embeddings=0

In [63]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap import UMAP
from bertopic.representation import KeyBERTInspired,MaximalMarginalRelevance,PartOfSpeech
from sklearn.feature_extraction.text import CountVectorizer
from hdbscan import HDBSCAN
import pandas as pd

### Method for Getting Topic Models from Text ###

df = pd.read_csv("./Data/small_data.csv")

doc = df['review_body']
        
## Initialize Models ##

# For Encoding Doccuments / Pre Processing
embedded_model = SentenceTransformer("all-MiniLM-l6-v2")

# For Controlling Number of Topics
hdbscan_model = HDBSCAN(min_cluster_size=12, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

# For Removing Stop Words, Post Encoding
vectorizer_model = CountVectorizer(stop_words="english", min_df=2, ngram_range=(1, 2))

# For Reducing Size of Embeddings
umap_model = UMAP(n_neighbors=15, n_components=3, min_dist=0.0, metric='cosine', random_state=42)

# For Different Representations 
key_model = KeyBERTInspired()
pos_model = PartOfSpeech("en_core_web_sm")
mmr_model = MaximalMarginalRelevance(diversity=0.3)

# Make Dictionary of Different Representations
representation_model={
        "KeyBERT":key_model,
        "POS":pos_model,
        "MMR":mmr_model
}


# Encode Doccuments with SentenceTransformer
embeddings = embedded_model.encode(doc)

# Initialize our Topic Model
topic_model = BERTopic(
        embedding_model=embedded_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        vectorizer_model=vectorizer_model,
        representation_model=representation_model,
        top_n_words=10,
        verbose=True
)

# Fit our doc and encoded doc to topic model
topics, probs = topic_model.fit_transform(doc,embeddings)

# Automatically generate labels using KeyBERT
keybert_topic_labels = {topic: ",".join(list(zip(*values))[0][:3]) for topic, values in topic_model.topic_aspects_["KeyBERT"].items()}
topic_model.set_topic_labels(keybert_topic_labels)


2023-11-18 20:51:28,672 - BERTopic - Reduced dimensionality
2023-11-18 20:51:28,685 - BERTopic - Clustered reduced embeddings


In [64]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,CustomName,Representation,KeyBERT,POS,MMR,Representative_Docs
0,-1,90,-1_great_34_quality_price,"great product,quality,great quality","[great, 34, quality, price, br, work, works, p...","[great product, quality, great quality, device...","[great, quality, price, power, battery, good, ...","[great, 34, quality, price, br, work, works, p...",[I have ordered 3 packages of these over the p...
1,0,42,0_speaker_br_sound_br br,"sound quality,bluetooth speakers,speaker","[speaker, br, sound, br br, great, good, size,...","[sound quality, bluetooth speakers, speaker, s...","[speaker, sound, great, good, size, speakers, ...","[speaker, br, sound, br br, great, good, size,...","[First off, I bought this product on sale for ..."
2,1,34,1_headphones_br_sound_br br,"headphones,headphone,speakers headphones","[headphones, br, sound, br br, pair, just, don...","[headphones, headphone, speakers headphones, s...","[headphones, br, sound, pair, bass, listening,...","[headphones, br, sound, br br, pair, just, don...",[Excellent sound. Good price. Ear pads are les...
3,2,31,2_product_great_great product_good,"great product,great quality,product","[product, great, great product, good, fast, se...","[great product, great quality, product, fantas...","[product, great, great product, good, fast, se...","[product, great, great product, good, fast, se...","[Great product., great product, great service...."
4,3,23,3_works_works great_did_work,"works great,worked great,works fine","[works, works great, did, work, does job, grea...","[works great, worked great, works fine, works,...","[great, job, use, problems, weeks, product, go...","[works, works great, did, work, does job, grea...","[Works great!, works great, works great]"
5,4,17,4_tv_hdmi_end_br,"hdmi,tv,cable","[tv, hdmi, end, br, app, receiver, line, great...","[hdmi, tv, cable, cables, sony, speakers, rece...","[hdmi, end, app, receiver, line, great, cable,...","[tv, hdmi, end, br, app, receiver, line, great...",[Such a great option to extend those HDMI cord...
6,5,13,5_expected_exactly_described_fit,"better expected,expected,better","[expected, exactly, described, fit, performed,...","[better expected, expected, better, perfectly,...","[fit, , , , , , , , , ]","[expected, exactly, described, fit, performed,...","[wasnt as loud as expected, exactly as promise..."


In [54]:
topic_model.get_topic(1, full=True)

{'Main': [('headphones', 0.08279678174701796),
  ('br', 0.07594886849349898),
  ('sound', 0.06473104219968136),
  ('br br', 0.04959050817602452),
  ('pair', 0.0494141097703832),
  ('just', 0.049073378136489945),
  ('don', 0.04821706592908403),
  ('bass', 0.04557480390406242),
  ('like', 0.04453615055571751),
  ('use', 0.0418405355342957)],
 'KeyBERT': [('headphones', 0.5959835),
  ('headphone', 0.5579121),
  ('speakers headphones', 0.5243578),
  ('sound quality', 0.47302055),
  ('bass', 0.3699308),
  ('listening', 0.3338617),
  ('sound', 0.3094788),
  ('noise', 0.30541444),
  ('music', 0.27672616),
  ('listen', 0.27580574)],
 'POS': [('headphones', 0.08279678174701796),
  ('br', 0.07594886849349898),
  ('sound', 0.06473104219968136),
  ('pair', 0.0494141097703832),
  ('bass', 0.04557480390406242),
  ('listening', 0.03958825748262999),
  ('great', 0.03923017216314647),
  ('noise', 0.0348915938713856),
  ('good', 0.03177691825486004),
  ('music', 0.028930239557450423)],
 'MMR': [('headph

In [65]:
topic_distr, _ = topic_model.approximate_distribution(df['review_body'], window=8, stride=4)

100%|██████████| 1/1 [00:00<00:00, 14.76it/s]


In [66]:
print(embeddings)

[[-0.06370269  0.10720373 -0.05701106 ...  0.02399345  0.03330268
   0.05962891]
 [-0.0614793  -0.01381194 -0.02273538 ...  0.00036705  0.03666481
   0.02547075]
 [-0.07048792 -0.03245116 -0.01553081 ... -0.00090661  0.00291771
   0.0166677 ]
 ...
 [-0.05204808 -0.01369277 -0.02070646 ...  0.00923447 -0.01133341
   0.0401332 ]
 [-0.08399222  0.05477742 -0.00889046 ...  0.05990229  0.07735123
  -0.02825404]
 [ 0.00614069 -0.03187365 -0.02695071 ... -0.16712242 -0.00936904
  -0.02204718]]


In [67]:
reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)

topic_model.visualize_documents(doc,reduced_embeddings=reduced_embeddings, custom_labels=True, hide_annotations=True)

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed