In [126]:
## Topic modelling is a technique to extract hidden topics from large volume of text
# Latent Dirichlet Allocation (LDA) is unsupervised algorithm that we are going to use 
# LDA represents topics as word probabilities and allows for uncovering hidden topics as it clusters
# the words based ontheir co-occurrence in a respective document

# LDA discovers topics into a collection of documents
# LDA tags each document with topics

# LDA gives us 2 things - 1. Cluster of words by topic 2. Cluster of documents by topic

## Assumptions for LDA - 
# 1. Documents with similar topics use similar groups of words
# 2. Latent topics can be found by searching for groups of words that frequently occur together in documents across corpus
# 3. Documents are probability distributions over latent topics that signifies certain document will contain more words of specific topic
# 4. Topics themselves are probability distribution over words

## Goal - 
# To find out the topics around what users speak about, their concerns if we can interpret form topics

## One can use any vectorizer and not just count vectorizer e.g. tfidf! So, can experiment around it!

## Steps that we will follow are below:
# 1. Get the data
# 2. Data cleaning
# 3. Get the tokens
# 4. 

In [127]:
# Setting to ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [128]:
# Import libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
import nltk
import numpy as np
from nltk.tokenize import RegexpTokenizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Text processing libraries
import re
from nltk.corpus import stopwords
import emoji
import contractions  # from https://github.com/kootenpv/contractions
import string


In [129]:
# Read data and clean a bit
spotify = pd.read_csv("../data/raw/spotify_review_kaggle.csv")
data_in = spotify.copy()
data_in = data_in.drop_duplicates(subset="Review")  # Drop dupes in Reviews
data_in.drop(["Time_submitted", "Total_thumbsup",
             "Reply"], axis=1, inplace=True)
data_in["Length"] = data_in["Review"].str.split(" ").str.len()
data_in = data_in[data_in.Length < 150]
data_in.drop(["Length"], axis=1, inplace=True)
data = data_in
data.head(10)


Unnamed: 0,Review,Rating
0,"Great music service, the audio is high quality and the app is easy to use. Also very quick and friendly support.",5
1,Please ignore previous negative rating. This app is super great. I give it five stars+,5
2,"This pop-up ""Get the best Spotify experience on Android 12"" is too annoying. Please let's get rid of this.",4
3,Really buggy and terrible to use as of recently,1
4,Dear Spotify why do I get songs that I didn't put on my playlist??? And why do we have shuffle play?,1
5,The player controls sometimes disappear for no reason. App restart forgets what I was playing but fixes the issue.,3
6,I love the selection and the lyrics are provided with the song you're listening to!,5
7,"Still extremely slow when changing storage to external sd card.. I'm convinced this is done on purpose, spotify knows of this issue and has done N...",3
8,"It's a great app and the best mp3 music app I have ever used but there is one problem that, why can't we play some songs or find some songs? despi...",5
9,"I'm deleting this app, for the following reasons: This app now has a failing business model. Whether streaming services like it, or not: the consu...",1


### Clean data

In [130]:
# Get NLTK English stopwords
our_stopwords = set(stopwords.words('english'))
our_stopwords.remove("not")

In [131]:
# Define a function to do some text cleaning
def clean_text(text):
    text = emoji.demojize(text)  # fix emojis
    text = contractions.fix(text)  # fix contractions
    text = re.sub(r'[^\x00-\x7f]', r'', text)  # remove strange fonts
    text = re.sub(r"\d+", "number", text)  # replace numbers with "number"
    text = re.sub(r'[^\w\s]', '', text)  # remove non-alphanumeric chars
    text = text.replace('_', ' ')  # replace underscores with space
    text = text.strip()  # strip extra spaces
    return text.lower()

In [132]:
# Test our text cleaner
clean_text("I ❤️ Spotify, even though it is critizised for exploiting artists")


'i red heart spotify even though it is critizised for exploiting artists'

In [133]:
# Apply to all data
data["Review"] = data["Review"].apply(clean_text)

In [134]:
## Tokenize/remove punctuations
tokenizer = RegexpTokenizer(r'\w+')

In [135]:
data["Tokens"] = data["Review"].apply(tokenizer.tokenize)

In [136]:
data.head(10)

Unnamed: 0,Review,Rating,Tokens
0,great music service the audio is high quality and the app is easy to use also very quick and friendly support,5,"[great, music, service, the, audio, is, high, quality, and, the, app, is, easy, to, use, also, very, quick, and, friendly, support]"
1,please ignore previous negative rating this app is super great i give it five stars,5,"[please, ignore, previous, negative, rating, this, app, is, super, great, i, give, it, five, stars]"
2,this popup get the best spotify experience on android number is too annoying please let us get rid of this,4,"[this, popup, get, the, best, spotify, experience, on, android, number, is, too, annoying, please, let, us, get, rid, of, this]"
3,really buggy and terrible to use as of recently,1,"[really, buggy, and, terrible, to, use, as, of, recently]"
4,dear spotify why do i get songs that i did not put on my playlist and why do we have shuffle play,1,"[dear, spotify, why, do, i, get, songs, that, i, did, not, put, on, my, playlist, and, why, do, we, have, shuffle, play]"
5,the player controls sometimes disappear for no reason app restart forgets what i was playing but fixes the issue,3,"[the, player, controls, sometimes, disappear, for, no, reason, app, restart, forgets, what, i, was, playing, but, fixes, the, issue]"
6,i love the selection and the lyrics are provided with the song you are listening to,5,"[i, love, the, selection, and, the, lyrics, are, provided, with, the, song, you, are, listening, to]"
7,still extremely slow when changing storage to external sd card i am convinced this is done on purpose spotify knows of this issue and has done not...,3,"[still, extremely, slow, when, changing, storage, to, external, sd, card, i, am, convinced, this, is, done, on, purpose, spotify, knows, of, this,..."
8,it is a great app and the best mpnumber music app i have ever used but there is one problem that why cannot we play some songs or find some songs ...,5,"[it, is, a, great, app, and, the, best, mpnumber, music, app, i, have, ever, used, but, there, is, one, problem, that, why, cannot, we, play, some..."
9,i am deleting this app for the following reasons this app now has a failing business model whether streaming services like it or not the consumer ...,1,"[i, am, deleting, this, app, for, the, following, reasons, this, app, now, has, a, failing, business, model, whether, streaming, services, like, i..."


In [137]:
# Remove stopwords
# We loop through each list of words and remove stopwords
data['Tokens'] = data['Tokens'].apply(lambda x: [word for word in x if word not in (our_stopwords)])
data.head(5)

Unnamed: 0,Review,Rating,Tokens
0,great music service the audio is high quality and the app is easy to use also very quick and friendly support,5,"[great, music, service, audio, high, quality, app, easy, use, also, quick, friendly, support]"
1,please ignore previous negative rating this app is super great i give it five stars,5,"[please, ignore, previous, negative, rating, app, super, great, give, five, stars]"
2,this popup get the best spotify experience on android number is too annoying please let us get rid of this,4,"[popup, get, best, spotify, experience, android, number, annoying, please, let, us, get, rid]"
3,really buggy and terrible to use as of recently,1,"[really, buggy, terrible, use, recently]"
4,dear spotify why do i get songs that i did not put on my playlist and why do we have shuffle play,1,"[dear, spotify, get, songs, not, put, playlist, shuffle, play]"


In [143]:
# Transformations
X = data['Tokens']

In [144]:
# Split data in train and test
X_train, X_test = train_test_split(X, test_size=0.25, random_state=42)
print(X_train.shape, X_test.shape)

(46009,) (15337,)


In [145]:
# Count Vectorizer
def dummy(doc):
    return doc

count_vectorizer = CountVectorizer(tokenizer=dummy, preprocessor=dummy, max_features = 10000, ngram_range = (1,3))

X_train_counts = count_vectorizer.fit_transform(X_train)

In [None]:
#Get number of features from vectorizer and it shall match number of features that we provide to it
len(count_vectorizer.get_feature_names())

In [None]:
# Showing words randomly

for i in range(10):
    random_word_id = np.random.randint(0,10000)
    print(count_vectorizer.get_feature_names()[random_word_id])

In [146]:
# Fit model
LDA = LatentDirichletAllocation(n_components=4,random_state=42)
LDA.fit(X_train_counts)

In [149]:
# number of topics (its euqal to number of components we provide to model)
n_components = len(LDA.components_)
n_components

4

In [150]:
# 
single_topic = LDA.components_[0]
top_10_word_indices = single_topic.argsort()[-10:]

for i in top_10_word_indices:
    print(count_vectorizer.get_feature_names()[i])

get
spotify
song
like
app
premium
songs
not
ads
number


In [151]:
#
for index,topic in enumerate(LDA.components_):
    print(f'THE TOP 10 WORDS FOR TOPIC #{index}')
    print([count_vectorizer.get_feature_names()[index] for index in topic.argsort()[-10:]])
    print('\n')

THE TOP 10 WORDS FOR TOPIC #0
['get', 'spotify', 'song', 'like', 'app', 'premium', 'songs', 'not', 'ads', 'number']


THE TOP 10 WORDS FOR TOPIC #1
['bar', 'smiling', 'please', 'stop', 'songs', 'music', 'playing', 'update', 'face', 'app']


THE TOP 10 WORDS FOR TOPIC #2
['fix', 'playing', 'even', 'spotify', 'songs', 'cannot', 'song', 'play', 'app', 'not']


THE TOP 10 WORDS FOR TOPIC #3
['like', 'songs', 'listen', 'best', 'good', 'great', 'love', 'spotify', 'app', 'music']




In [152]:
topic_results = LDA.transform(X_train_counts)


In [153]:
# Split original data
data_train, _ = train_test_split(data, test_size=0.25, random_state=42)

In [154]:
pd.set_option('display.max_colwidth', 150)
data_train['Topic'] = topic_results.argmax(axis=1)

data_train[["Review", "Topic"]].head(10)

Unnamed: 0,Review,Topic
29893,great app best music platform but the group session feature needs work still they should add a controll feature that let us someone be host of the...,3
29959,spotifys best feature is the notifications for new releases by artists you follow but that feature has not been working for months after several u...,2
54863,would have gave number stars if spotify completely supported joe rogans freedom of expression and did not delete those number episodes i am not a ...,0
54056,have to uninstall and reinstall multiple times to receive any new music podcast etc very buggy and i have android lol,1
24842,good quantity of music and quality,3
2890,my experience with spotify is great i love this platform thanks spotify team,3
9323,great to be able to create own playlist from number of choices,3
34086,there is one request for spotify please add recently added songs,1
38814,i highly suggest not getting this i have tried number months now to cancel my subscription it says its canceled yet they keep taking funds from me...,2
60771,i really liked spotify at first but then it got updated or something and now it has gotten very annoying i cannot enter my playlists to pick songs...,0


In [166]:
topic_results[[5]]

array([[0.01802183, 0.01813698, 0.01808897, 0.94575222]])

In [158]:
LDA.perplexity(X_train_counts)

1724.032346537392

In [159]:
LDA.score(X_train_counts)

-7863772.507264096

In [161]:
# Fit model
perplexities = []
scores = []
for n_topic in range(2,20,5):
    LDA = LatentDirichletAllocation(n_components=n_topic,random_state=42)
    LDA.fit(X_train_counts)

    perplexities.append(LDA.perplexity(X_train_counts))
    scores.append(LDA.score(X_train_counts))

In [163]:
perplexities

[1873.4875890604776, 1604.864441814075, 1586.136041368198, 1561.2975854831523]

In [164]:
scores

[-7951497.133128195,
 -7788192.290062701,
 -7775805.968601594,
 -7759151.124461923]

In [None]:
## To-Do
# Topic and its visualization against word
# Check against test data
# Coherence measure
# Think of considering app as stopword as its present in almost all topics
# Make use of time column from data and see how topic changes over time