# BERT Topic Modeling

Roughly following: https://towardsdatascience.com/meet-bertopic-berts-cousin-for-advanced-topic-modeling-ea5bf0b7faa3

BERTopic documentation: https://maartengr.github.io/BERTopic/index.html

In [30]:
# Setting to ignore warnings
import warnings
warnings.filterwarnings("ignore")

# Import libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

from bertopic import BERTopic

# Text processing libraries
import re
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import emoji
import contractions  # from https://github.com/kootenpv/contractions
import string

pd.set_option('display.max_colwidth', 100)

## 1 Prepare Data

In [31]:
# Read data and clean a bit
spotify = pd.read_csv("../data/raw/spotify_review_kaggle.csv")
data_in = spotify.copy()
data_in = data_in.drop_duplicates(subset="Review")  # Drop dupes in Reviews
data_in.drop(["Total_thumbsup",
             "Reply"], axis=1, inplace=True)
data_in["Length"] = data_in["Review"].str.split(" ").str.len()
data_in = data_in[data_in.Length < 150]
data_in.drop(["Length"], axis=1, inplace=True)
data = data_in
data.head(10)

Unnamed: 0,Time_submitted,Review,Rating
0,2022-07-09 15:00:00,"Great music service, the audio is high quality and the app is easy to use. Also very quick and f...",5
1,2022-07-09 14:21:22,Please ignore previous negative rating. This app is super great. I give it five stars+,5
2,2022-07-09 13:27:32,"This pop-up ""Get the best Spotify experience on Android 12"" is too annoying. Please let's get ri...",4
3,2022-07-09 13:26:45,Really buggy and terrible to use as of recently,1
4,2022-07-09 13:20:49,Dear Spotify why do I get songs that I didn't put on my playlist??? And why do we have shuffle p...,1
5,2022-07-09 13:20:20,The player controls sometimes disappear for no reason. App restart forgets what I was playing bu...,3
6,2022-07-09 13:19:21,I love the selection and the lyrics are provided with the song you're listening to!,5
7,2022-07-09 13:17:22,Still extremely slow when changing storage to external sd card.. I'm convinced this is done on p...,3
8,2022-07-09 13:16:49,"It's a great app and the best mp3 music app I have ever used but there is one problem that, why ...",5
9,2022-07-09 13:11:32,"I'm deleting this app, for the following reasons: This app now has a failing business model. Whe...",1


In [32]:
## Define a function to do some text cleaning
def clean_text(text):
    text = emoji.demojize(text)  # fix emojis
    text = contractions.fix(text)  # fix contractions
    text = re.sub(r'[^\x00-\x7f]', r'', text)  # remove strange fonts
    text = re.sub(r"\d+", "number", text)  # replace numbers with "number"
    text = re.sub(r'[^\w\s]', '', text)  # remove non-alphanumeric chars
    text = re.sub(r'\b\w{1}\b', '', text) # remove words less than 3 characters
    text = text.replace('_', ' ')  # replace underscores with space
    text = text.strip()  # strip extra spaces
    return text.lower()

In [33]:
## Test our text cleaner
clean_text("I ❤️ Spotify, even though it is critizised for exploiting artists")

'red heart spotify even though it is critizised for exploiting artists'

In [34]:
data["Review"] = data["Review"].apply(clean_text)

In [35]:
## Tokenize/remove punctuations
tokenizer = RegexpTokenizer(r'\w+')

In [36]:
data["Review"] = data["Review"].apply(tokenizer.tokenize)

In [37]:
data.sample(10)

Unnamed: 0,Time_submitted,Review,Rating
21491,2022-05-16 16:45:54,"[great, experience, but, missed, bengali, osts, for, some, great, movies, and, also, few, old, s...",5
48390,2022-03-03 06:43:45,"[exceptional, enjoying, the, recommendation, tab, based, on, the, music, on, the, playlist]",5
3768,2022-07-01 07:50:29,"[cannot, find, saved, playlists, on, latest, android, update]",1
18720,2022-05-25 00:46:21,"[great, but, recently, crashing, the, ad, will, run, and, then, skip, playlist, completely, show...",2
5955,2022-06-26 12:27:01,"[awesome, music, but, could, have, been, better, like, youtube, music]",5
36168,2022-04-14 03:03:47,"[great, app, and, has, worked, flawlessly, until, the, last, update, or, so, where, the, bar, fo...",2
37053,2022-04-13 05:43:39,"[music, randomly, stops, playing, after, the, lastest, update, as, well, as, the, progress, bar,...",1
36259,2022-04-14 00:11:07,"[this, app, is, so, buggy, lately, there, is, no, control, bar, to, pause, music, when, its, pla...",2
38284,2022-04-11 18:44:32,"[great, fun, use, it, mainly, for, podcasts, and, there, are, no, ads, harder, to, use, it, for,...",5
58061,2022-01-27 05:02:59,"[pointless, app, unless, you, are, willing, to, pay, for, premium, just, plays, random, music, s...",2


In [38]:
# Get NLTK English stopwords
our_stopwords = set(stopwords.words('english'))
additional_stopwords = ["spotify", "app", "apps"]
our_stopwords.update(additional_stopwords)
#our_stopwords.remove("not")
# Remove stopwords
# We loop through each list of words and remove stopwords
data['Review'] = data['Review'].apply(lambda x: [word for word in x if word not in (our_stopwords)])
data.sample(5)

Unnamed: 0,Time_submitted,Review,Rating
52541,2022-02-14 06:03:07,"[shown, offline, cannot, anything, tried, everything, think, way]",1
7095,2022-06-23 19:20:17,"[redecillas, aap, sirisoly, work, properly, even, open, porply, angry, faceangry, facewhen, play...",1
14440,2022-06-06 01:22:16,"[love, adds, listen, favorite, songs, different, highly, recommended]",5
21720,2022-05-16 03:23:18,"[able, listen, music, way, sort]",2
41708,2022-03-27 23:39:43,"[love, really, amazing, smiling, face, hearts, keep, great, work, wish, could, rate, number]",5


In [39]:
## Lemmatise tokens
## Download these if needed
# nltk.download('wordnet')
# nltk.download('omw-1.4')
lemmatizer = WordNetLemmatizer()

In [60]:
## Test lemmatizer
[lemmatizer.lemmatize(word) for word in ["reasons", "plays", "playlists", "apps", "app", "leaves", "leaf"]]

['reason', 'play', 'playlist', 'apps', 'app', 'leaf', 'leaf']

In [41]:
def lemmatize_text(text):
    return [lemmatizer.lemmatize(w) for w in text]

In [42]:
data["Review"] = data["Review"].apply(lemmatize_text)

In [43]:
data.head(10)

Unnamed: 0,Time_submitted,Review,Rating
0,2022-07-09 15:00:00,"[great, music, service, audio, high, quality, easy, use, also, quick, friendly, support]",5
1,2022-07-09 14:21:22,"[please, ignore, previous, negative, rating, super, great, give, five, star]",5
2,2022-07-09 13:27:32,"[popup, get, best, experience, android, number, annoying, please, let, u, get, rid]",4
3,2022-07-09 13:26:45,"[really, buggy, terrible, use, recently]",1
4,2022-07-09 13:20:49,"[dear, get, song, put, playlist, shuffle, play]",1
5,2022-07-09 13:20:20,"[player, control, sometimes, disappear, reason, restart, forgets, playing, fix, issue]",3
6,2022-07-09 13:19:21,"[love, selection, lyric, provided, song, listening]",5
7,2022-07-09 13:17:22,"[still, extremely, slow, changing, storage, external, sd, card, convinced, done, purpose, know, ...",3
8,2022-07-09 13:16:49,"[great, best, mpnumber, music, ever, used, one, problem, cannot, play, song, find, song, despite...",5
9,2022-07-09 13:11:32,"[deleting, following, reason, failing, business, model, whether, streaming, service, like, consu...",1


In [44]:
data["Review"] = data["Review"].apply(lambda l: " ".join(l))
data.head()

Unnamed: 0,Time_submitted,Review,Rating
0,2022-07-09 15:00:00,great music service audio high quality easy use also quick friendly support,5
1,2022-07-09 14:21:22,please ignore previous negative rating super great give five star,5
2,2022-07-09 13:27:32,popup get best experience android number annoying please let u get rid,4
3,2022-07-09 13:26:45,really buggy terrible use recently,1
4,2022-07-09 13:20:49,dear get song put playlist shuffle play,1


In [45]:
data = data.reset_index()

In [46]:
model = BERTopic(verbose=True,embedding_model='paraphrase-MiniLM-L3-v2', min_topic_size = 50)
review_topics, _ = model.fit_transform(data["Review"])

Batches: 100%|██████████| 1918/1918 [01:42<00:00, 18.80it/s]
2022-11-17 11:45:06,888 - BERTopic - Transformed documents to Embeddings
2022-11-17 11:45:36,325 - BERTopic - Reduced dimensionality
2022-11-17 11:45:42,316 - BERTopic - Clustered reduced embeddings


In [47]:
freq = model.get_topic_info()
print("Number of topics: {}".format(len(freq)))
freq.head(7)

Number of topics: 100


Unnamed: 0,Topic,Count,Name
0,-1,22040,-1_number_premium_song_playlist
1,0,3705,0_podcasts_podcast_episode_listening
2,1,3638,1_playing_stop_pause_bar
3,2,1976,2_song_find_love_favorite
4,3,1664,3_shuffle_play_playlist_song
5,4,1462,4_play_song_want_skip
6,5,1459,5_log_account_logged_login


In [48]:
a_topic = freq.iloc[1]["Topic"] # Select the 1st topic
model.get_topic(a_topic) # Show the words and their c-TF-IDF scores

[('podcasts', 0.0552918741570749),
 ('podcast', 0.0442602852512311),
 ('episode', 0.01988098107136382),
 ('listening', 0.009827260996056608),
 ('music', 0.008484711044390393),
 ('listen', 0.008222913076240183),
 ('stop', 0.0078054419491149435),
 ('great', 0.007470288882926339),
 ('use', 0.007050719968073989),
 ('also', 0.006993121395420004)]

In [49]:
model.visualize_barchart(top_n_topics=12)

In [50]:
model.visualize_topics()

In [62]:
model.visualize_hierarchy(top_n_topics=100)

In [68]:
# Select most 3 similar topics
similar_topics, similarity = model.find_topics("bug", top_n = 3)
similar_topics

[19, 54, 76]

In [69]:
most_similar = similar_topics[0]
print("Most Similar Topic Info: \n{}".format(model.get_topic(most_similar)))
print("Similarity Score: {}".format(similarity[0]))

Most Similar Topic Info: 
[('update', 0.061313719673048404), ('bug', 0.059843803315980534), ('buggy', 0.048612295777688636), ('worse', 0.027212425565392533), ('last', 0.02266253935247174), ('fix', 0.022074954979976545), ('new', 0.02033044690807893), ('latest', 0.020000775725816197), ('fixed', 0.015302865144464656), ('recent', 0.014855789269411125)]
Similarity Score: 0.7959357803399751


## Dynamic Topic Modeling (DTM)
https://maartengr.github.io/BERTopic/getting_started/topicsovertime/topicsovertime.html

In [54]:
data["Time_submitted"] = pd.to_datetime(data["Time_submitted"])
data["Week"] = data["Time_submitted"].dt.isocalendar().week

In [55]:
## Topics over time
topics_over_time = model.topics_over_time(data["Review"], data["Week"])

28it [00:08,  3.42it/s]


In [65]:
fig = model.visualize_topics_over_time(topics_over_time, top_n_topics=100, normalize_frequency=True)
fig.update_layout(xaxis_title="Week number")
fig.update_xaxes(range=[0, 27])
fig.show()