In [None]:
import pandas as pd

from bertopic import BERTopic
from transformers import pipeline
from sklearn.feature_extraction.text import CountVectorizer

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [2]:
df_main = pd.read_csv('reviews.csv.zip')

In [3]:
df_main.head()

Unnamed: 0,Time_submitted,Review,Rating,Total_thumbsup,Reply
0,2022-07-09 15:00:00,"Great music service, the audio is high quality...",5,2,
1,2022-07-09 14:21:22,Please ignore previous negative rating. This a...,5,1,
2,2022-07-09 13:27:32,"This pop-up ""Get the best Spotify experience o...",4,0,
3,2022-07-09 13:26:45,Really buggy and terrible to use as of recently,1,1,
4,2022-07-09 13:20:49,Dear Spotify why do I get songs that I didn't ...,1,1,


In [4]:
df_main.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61594 entries, 0 to 61593
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Time_submitted  61594 non-null  object
 1   Review          61594 non-null  object
 2   Rating          61594 non-null  int64 
 3   Total_thumbsup  61594 non-null  int64 
 4   Reply           216 non-null    object
dtypes: int64(2), object(3)
memory usage: 2.3+ MB


In [5]:
# use only time submitted and reviews for analyses and final output
df = df_main.drop(['Rating','Total_thumbsup','Reply'], axis=1)
df.head()

Unnamed: 0,Time_submitted,Review
0,2022-07-09 15:00:00,"Great music service, the audio is high quality..."
1,2022-07-09 14:21:22,Please ignore previous negative rating. This a...
2,2022-07-09 13:27:32,"This pop-up ""Get the best Spotify experience o..."
3,2022-07-09 13:26:45,Really buggy and terrible to use as of recently
4,2022-07-09 13:20:49,Dear Spotify why do I get songs that I didn't ...


## Topic analysis

In [6]:
t_model = BERTopic(
    language='english', 
    embedding_model='all-MiniLM-L6-v2', 
    nr_topics='auto', 
    top_n_words=8, 
    calculate_probabilities=True, 
    verbose=True)

In [7]:
topics, probs = t_model.fit_transform(df['Review'])

Batches: 100%|██████████| 1925/1925 [00:35<00:00, 53.88it/s] 
2022-11-22 23:54:52,533 - BERTopic - Transformed documents to Embeddings
2022-11-22 23:55:22,669 - BERTopic - Reduced dimensionality
2022-11-23 00:07:15,018 - BERTopic - Clustered reduced embeddings
2022-11-23 00:07:19,280 - BERTopic - Reduced number of topics from 391 to 191


In [14]:
t_model.get_topic_info().head()

Unnamed: 0,Topic,Count,Name
0,-1,27152,-1_to_the_and_it
1,0,5589,0_love_music_best_spotify
2,1,4545,1_ads_ad_30_too
3,2,2631,2_playlist_song_play_songs
4,3,2110,3_podcasts_podcast_episode_listening


In [15]:
gen_topic = t_model.get_topics()

In [16]:
gen_topic

{-1: [('to', 0.004957986270023924),
  ('the', 0.004932824975719216),
  ('and', 0.004901828184766566),
  ('it', 0.00489113145943461),
  ('app', 0.0047237839723945725),
  ('music', 0.0046005728390924325),
  ('is', 0.0045802088638491585),
  ('you', 0.004492401766820211)],
 0: [('love', 0.012299522934202615),
  ('music', 0.011179164141198574),
  ('best', 0.010782570181302064),
  ('spotify', 0.010394649854181615),
  ('easy', 0.009319947859039935),
  ('app', 0.008413446116803308),
  ('great', 0.008207941477388212),
  ('use', 0.006866411473162016)],
 1: [('ads', 0.030932098801262575),
  ('ad', 0.017434187202798126),
  ('30', 0.01045060238481711),
  ('too', 0.009345399254742755),
  ('many', 0.008638388919014627),
  ('minutes', 0.00806949072566124),
  ('after', 0.008004659735111545),
  ('are', 0.007835183513949563)],
 2: [('playlist', 0.014793923601259987),
  ('song', 0.01221226061223215),
  ('play', 0.010167697856024768),
  ('songs', 0.009646484888232996),
  ('want', 0.009570891087164867),
  (

### post processing for topic modelling

In [23]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [39]:
topic_words = []
for i in topics:
    temp_word = []
    for j in gen_topic[i]:
        temp_word.append(j[0])
    l = ' '.join(list(set(temp_word)))
    topic_words.append(l)

In [43]:
topic_words

['music and the you it to is app',
 'stars rating give because five rate would star',
 'music and the you it to is app',
 'spotify music great love use easy best app',
 'play listen want songs playlist plays song random',
 'android playback pause player disappearing disappear controls control',
 'still lyrics available feature the showing but see',
 'music and the you it to is app',
 'music and the you it to is app',
 'music and the you it to is app',
 'music and the you it to is app',
 'internet data wifi connection working no connected network',
 'music and the you it to is app',
 'music and the you it to is app',
 'music and the you it to is app',
 'music and the you it to is app',
 'update pause control playing disappearing disappears bar bottom',
 'music and the you it to is app',
 'music and the you it to is app',
 '30 after many ads ad are minutes too',
 'music service great platform ever stream streaming best',
 'music and the you it to is app',
 'me email out logged login log 

In [46]:
clean_topics = []
for i in topic_words:
    filter_sent = []
    temp = i.split()
    for w in temp:
        if w not in stop_words:
            filter_sent.append(w)
    lemma_words = []
    for w in filter_sent:
        lemma_words.append(lemmatizer.lemmatize(w))
    clean_topics.append(' '.join(list(set(lemma_words))))


### add topic column to dataframe

In [47]:
df['topics'] = pd.DataFrame(clean_topics, columns=['topics'])['topics']

In [52]:
df

Unnamed: 0,Time_submitted,Review,topics
0,2022-07-09 15:00:00,"Great music service, the audio is high quality...",music app
1,2022-07-09 14:21:22,Please ignore previous negative rating. This a...,rating give five rate would star
2,2022-07-09 13:27:32,"This pop-up ""Get the best Spotify experience o...",music app
3,2022-07-09 13:26:45,Really buggy and terrible to use as of recently,spotify music great easy use love best app
4,2022-07-09 13:20:49,Dear Spotify why do I get songs that I didn't ...,play listen want playlist song random
...,...,...,...
61589,2022-01-01 03:01:29,Even though it was communicated that lyrics fe...,still available lyric feature showing see
61590,2022-01-01 02:13:40,"Use to be sooo good back when I had it, and wh...",worth version without premium pay free
61591,2022-01-01 01:02:29,This app would be good if not for it taking ov...,music app
61592,2022-01-01 00:49:23,The app is good hard to navigate and won't jus...,music app
