# **BERTopic - Tutorial**

In [1]:
!pip install bertopic[visualization] --quiet

[K     |████████████████████████████████| 4.7MB 11.3MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
[K     |████████████████████████████████| 307kB 56.5MB/s 
[K     |████████████████████████████████| 71kB 12.1MB/s 
[K     |████████████████████████████████| 13.2MB 48.5MB/s 
[K     |████████████████████████████████| 1.5MB 55.8MB/s 
[K     |████████████████████████████████| 1.1MB 52.4MB/s 
[K     |████████████████████████████████| 2.9MB 51.2MB/s 
[K     |████████████████████████████████| 890kB 55.6MB/s 
[?25h  Building wheel for hdbscan (PEP 517) ... [?25l[?25hdone
  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone


# **Imports**

In [1]:
import numpy as np
import pandas as pd
from copy import deepcopy
from bertopic import BERTopic

# **Load data**

In [2]:
# Load the dataset from "data.csv", resolved relative to the notebook's working directory.
df = pd.read_csv("data.csv")

In [3]:
# Preview the first three rows to check that the file loaded as expected.
df.head(3)

Unnamed: 0,Reviews
0,Saw the car on the road today and i found it t...
1,Built quality is very solid and a quick peppy ...
2,Even it's base model has all features and safe...


In [4]:
# Collect the text of the "Reviews" column into a plain Python list, one entry per row.
docs = df["Reviews"].tolist()

In [5]:
# Show the first five documents in full (the DataFrame preview truncates long text).
docs[:5]

['Saw the car on the road today and i found it to have good enough road presence.',
 'Built quality is very solid and a quick peppy engine.',
 "Even it's base model has all features and safety like traction control abs ebd etc",
 'The diesel engine is providing extreme performance to the car like a beast.',
 "It's very comfortable and doesn't make us tired at all."]

# **Creating Topics**

In [6]:
# Create a BERTopic model configured for English-language documents.
# NOTE(review): no random seed is set anywhere in this notebook; topic ids and counts
# shown in the outputs below may differ between runs — confirm and seed if needed.
model = BERTopic(language="english")

In [7]:
# Fit the topic model on `docs` and keep both values returned by `fit_transform`.
# Presumably `topics` holds one topic id per document and `probs` the associated
# probabilities — confirm against the BERTopic documentation.
# NOTE(review): the progress bar in the output looks like a one-time model download.
topics, probs = model.fit_transform(docs)

100%|██████████| 245M/245M [00:10<00:00, 23.1MB/s]


# We can then extract the most frequent topics:

Topic `-1` collects outlier documents that were not assigned to any topic.

In [8]:
# Display the table of topic ids with the number of documents counted for each.
model.get_topic_freq()

Unnamed: 0,Topic,Count
0,-1,154
1,3,152
2,14,139
3,10,60
4,8,39
5,2,33
6,0,30
7,7,27
8,12,27
9,1,21


# Get Individual Topics

In [9]:
# Inspect topic 0: the result is a list of (word, score) pairs.
model.get_topic(0)

[('sound', 0.20647518388927114),
 ('lighting', 0.05904045659014696),
 ('bose', 0.052776100755048946),
 ('lights', 0.03936030439343131),
 ('awesome', 0.036118521776356785),
 ('good', 0.035078170500737096),
 ('amazing', 0.033548658623619845),
 ('great', 0.03219834532747863),
 ('gimmicky', 0.024852892350148082),
 ('adaptive', 0.024852892350148082)]

In [10]:
# Inspect topic 2: the result is a list of (word, score) pairs.
model.get_topic(2)

[('cost', 0.12703704731543825),
 ('maintenance', 0.10057315391189506),
 ('service', 0.07040120773832655),
 ('costs', 0.06814275683211399),
 ('little', 0.057560357128908717),
 ('low', 0.04022926156475803),
 ('less', 0.032539541046116914),
 ('costlier', 0.03036859017445161),
 ('goodwill', 0.03036859017445161),
 ('servicing', 0.028305154509779017)]

In [11]:
# Inspect topic 14, one of the largest topics in the frequency table: a list of (word, score) pairs.
model.get_topic(14)

[('car', 0.08357972680098304),
 ('good', 0.04357671366284399),
 ('safety', 0.04106929322713067),
 ('suv', 0.03988365816260247),
 ('drive', 0.03256642096270172),
 ('excellent', 0.0309635676225596),
 ('comfortable', 0.02290996022730286),
 ('amazing', 0.022227541436662838),
 ('airbags', 0.021626624133744835),
 ('nice', 0.019017652727957822)]

# **Visualize Topics**

In [12]:
# Render the fitted model's topic visualization as the cell output.
model.visualize_topics()