In [1]:
# imports
import pandas as pd
import numpy as np

# nlp packages
import nltk
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.tokenize import regexp_tokenize, word_tokenize, RegexpTokenizer
import matplotlib.pyplot as plt
import string
import re

# vectorizers 
from nltk.tokenize import word_tokenize 
from nltk.tokenize import sent_tokenize 
import itertools
from nltk.corpus import stopwords

# lemmatizers 
from nltk import WordNetLemmatizer
from nltk.corpus import wordnet 
from nltk import pos_tag

# vader sentiment intensity analyzer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [2]:
spotify_df = pd.read_csv('data/reviews.csv',usecols=['Review','Rating', 'Total_thumbsup', 'Reply'])
spotify_df = spotify_df.rename(columns={'Review':'review', 
                           'Rating':'rating',
                           'Total_thumbsup':'total_thumbs_up',
                           'Reply':'reply'})

In [3]:
spotify_df.head()

Unnamed: 0,review,rating,total_thumbs_up,reply
0,"Great music service, the audio is high quality...",5,2,
1,Please ignore previous negative rating. This a...,5,1,
2,"This pop-up ""Get the best Spotify experience o...",4,0,
3,Really buggy and terrible to use as of recently,1,1,
4,Dear Spotify why do I get songs that I didn't ...,1,1,


In [4]:
# checking if NaN values exist 
spotify_df.info()

# looks like reply is only column with NaN 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61594 entries, 0 to 61593
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   review           61594 non-null  object
 1   rating           61594 non-null  int64 
 2   total_thumbs_up  61594 non-null  int64 
 3   reply            216 non-null    object
dtypes: int64(2), object(2)
memory usage: 1.9+ MB


There are no NaN values in the `review` column, which is where our reviews are; `reply` has the majority of NaNs. `reply` denotes whether Spotify's customer service team replied to the review on the Google Play Store

In [5]:
# generating a new column that maps whether Spotify replied or not 
spotify_df['replied'] = spotify_df['reply'].notna().astype(int)
'''
0 = did not reply 
1 = replied
'''

'\n0 = did not reply \n1 = replied\n'

In [6]:
# viewing updated df and records where reply == True 
spotify_df[spotify_df.replied == 1]

# updated dataframe since the exact reply is not relevant 
spotify_df = spotify_df.drop(columns=['reply'])
spotify_df

Unnamed: 0,review,rating,total_thumbs_up,replied
0,"Great music service, the audio is high quality...",5,2,0
1,Please ignore previous negative rating. This a...,5,1,0
2,"This pop-up ""Get the best Spotify experience o...",4,0,0
3,Really buggy and terrible to use as of recently,1,1,0
4,Dear Spotify why do I get songs that I didn't ...,1,1,0
...,...,...,...,...
61589,Even though it was communicated that lyrics fe...,1,6,0
61590,"Use to be sooo good back when I had it, and wh...",1,0,0
61591,This app would be good if not for it taking ov...,2,10,0
61592,The app is good hard to navigate and won't jus...,2,1,0


In [7]:
# sanity check 
spotify_df.isna().sum()

review             0
rating             0
total_thumbs_up    0
replied            0
dtype: int64

In order to begin preprocessing our text data, we need to normalize our data so that all reviews have the same format. 

In [8]:
# converting all review text to lowercase to normalize text 
spotify_df.review = spotify_df.review.str.lower()

# sanity check 
spotify_df.review

0        great music service, the audio is high quality...
1        please ignore previous negative rating. this a...
2        this pop-up "get the best spotify experience o...
3          really buggy and terrible to use as of recently
4        dear spotify why do i get songs that i didn't ...
                               ...                        
61589    even though it was communicated that lyrics fe...
61590    use to be sooo good back when i had it, and wh...
61591    this app would be good if not for it taking ov...
61592    the app is good hard to navigate and won't jus...
61593    its good but sometimes it doesnt load the musi...
Name: review, Length: 61594, dtype: object

In [9]:
# grabbing a randomly selected review from our dataset 
first_review = spotify_df.review.iloc[1047]
first_review

'after the latest update it keeps glitching, sometimes it doesnt work at all, and stopped working on my android auto,tried uninstalling and reinstalling but still does the same'

After normalizing, we'll need to tokenize each word; there are two methods to tokenize: `word_tokenize` and `sent_tokenize` which split each text by each word or sentence respectively. 

In [10]:
# seeing different tokenizing methods with our selected review 
print('word tokenized method:')
print(word_tokenize(first_review,language='english'))
print('\n')
print('sentence tokenizer method:')
print(sent_tokenize(first_review,language='english'))

word tokenized method:
['after', 'the', 'latest', 'update', 'it', 'keeps', 'glitching', ',', 'sometimes', 'it', 'doesnt', 'work', 'at', 'all', ',', 'and', 'stopped', 'working', 'on', 'my', 'android', 'auto', ',', 'tried', 'uninstalling', 'and', 'reinstalling', 'but', 'still', 'does', 'the', 'same']


sentence tokenizer method:
['after the latest update it keeps glitching, sometimes it doesnt work at all, and stopped working on my android auto,tried uninstalling and reinstalling but still does the same']


We can also tokenize each word after tokenizing each sentence for a more nuanced method 

In [11]:
# tokenized word in each sentence 
print('word-sentence tokenizer method:')
print([word_tokenize(sent) for sent in sent_tokenize(first_review)])

# produces a nested list where each list is a sentence and each sublist contains tokenized word

word-sentence tokenizer method:
[['after', 'the', 'latest', 'update', 'it', 'keeps', 'glitching', ',', 'sometimes', 'it', 'doesnt', 'work', 'at', 'all', ',', 'and', 'stopped', 'working', 'on', 'my', 'android', 'auto', ',', 'tried', 'uninstalling', 'and', 'reinstalling', 'but', 'still', 'does', 'the', 'same']]


Let's use a function to: 
1. Initialize WordNetLemmatizer()
2. Change NLTK's POS tagger to WordNet format 
3. Remove stop words and punctuation
4. Return a corpus of normalized and tokenized text 

In [12]:
# importing stop_words 
stop_words = stopwords.words('english')

In [13]:
# takes in untokenized document and returns fully normalized token list
def process_doc(doc):

    #initialize lemmatizer
    wnl = WordNetLemmatizer()

    # helper function to change nltk's part of speech tagging to a wordnet format.
    def pos_tagger(nltk_tag):
        if nltk_tag.startswith('J'):
            return wordnet.ADJ
        elif nltk_tag.startswith('V'):
            return wordnet.VERB
        elif nltk_tag.startswith('N'):
            return wordnet.NOUN
        elif nltk_tag.startswith('R'):
            return wordnet.ADV
        else:         
            return None
        
    # remove stop words and punctuations, then lower case
    doc_norm = [tok.lower() for tok in word_tokenize(doc) if ((tok.isalpha()) & (tok.lower() not in stop_words)) ]

    #  POS detection on the result will be important in telling Wordnet's lemmatizer how to lemmatize
    
    # creates list of tuples with tokens and POS tags in wordnet format
    wordnet_tagged = list(map(lambda x: (x[0], pos_tagger(x[1])), pos_tag(doc_norm))) 
    doc_norm = [wnl.lemmatize(token, pos) for token, pos in wordnet_tagged if pos is not None]
    
    return doc_norm

In [14]:
# testing our function on our randomly selected review 
print('original review:')
print(first_review)
print('\n')
print('processed review')
process_doc(first_review)

original review:
after the latest update it keeps glitching, sometimes it doesnt work at all, and stopped working on my android auto,tried uninstalling and reinstalling but still does the same


processed review


['late',
 'update',
 'keep',
 'glitching',
 'sometimes',
 'doesnt',
 'work',
 'stop',
 'work',
 'android',
 'auto',
 'try',
 'uninstalling',
 'reinstalling',
 'still']

We can see that using our helper function to process our sample review that words such as "the", "it", and "on my" have been removed to only generate words of importance 

In [15]:
spotify_df

Unnamed: 0,review,rating,total_thumbs_up,replied
0,"great music service, the audio is high quality...",5,2,0
1,please ignore previous negative rating. this a...,5,1,0
2,"this pop-up ""get the best spotify experience o...",4,0,0
3,really buggy and terrible to use as of recently,1,1,0
4,dear spotify why do i get songs that i didn't ...,1,1,0
...,...,...,...,...
61589,even though it was communicated that lyrics fe...,1,6,0
61590,"use to be sooo good back when i had it, and wh...",1,0,0
61591,this app would be good if not for it taking ov...,2,10,0
61592,the app is good hard to navigate and won't jus...,2,1,0


In [16]:
# creating a new Series object that has our processed,tokenized text 
normalized_corpus = spotify_df.review.apply(process_doc)

In [17]:
# seeing our new corpus 
normalized_corpus

0        [great, music, service, audio, high, quality, ...
1        [please, ignore, previous, negative, rating, a...
2        [get, best, spotify, experience, android, anno...
3                 [really, buggy, terrible, use, recently]
4        [dear, spotify, get, song, put, playlist, shuf...
                               ...                        
61589    [even, communicate, lyric, feature, available,...
61590    [use, sooo, good, back, download, free, versio...
61591    [app, good, take, device, start, comp, spotify...
61592    [app, good, hard, navigate, let, play, song, c...
61593    [good, sometimes, doesnt, load, music, play, s...
Name: review, Length: 61594, dtype: object

In [21]:
# exporting corpus for lda topic modeling
# normalized_corpus.to_csv('data/review-corpus.csv',index=False)

In [22]:
# flattening our corpus to see how many unique tokens we have 
flattened_corpus = pd.Series(list(itertools.chain(*normalized_corpus)))

# amount of unique tokens
len(flattened_corpus.unique())

19451

In [32]:
print(normalized_corpus.iloc[0])

['great', 'music', 'service', 'audio', 'high', 'quality', 'app', 'easy', 'use', 'also', 'quick', 'friendly', 'support']


Let's look at the frequency distribution for our flattened corpus!

In [62]:
# seeing the ten most common words in our flattened corpus 
fdist = FreqDist(flattened_corpus)
fdist.most_common(10)

[('app', 34869),
 ('song', 32689),
 ('music', 25725),
 ('play', 19226),
 ('spotify', 18936),
 ('listen', 12910),
 ('use', 11473),
 ('ad', 10399),
 ('playlist', 9963),
 ('get', 9926)]

In [113]:
# seeing the ten least common words in our flattened_corpus 
fdist.most_common()[-10:]

[('aslways', 1),
 ('descoverd', 1),
 ('unonstalling', 1),
 ('basicly', 1),
 ('favourties', 1),
 ('autovolume', 1),
 ('zambia', 1),
 ('frkng', 1),
 ('strenght', 1),
 ('needy', 1)]

In [117]:
# number of tokens with less than five occurances 
(flattened_corpus.value_counts() < 5).sum()

14776

In [55]:
flat_corpus = normalized_corpus.apply(
    " ".join)
flat_corpus

0        great music service audio high quality app eas...
1        please ignore previous negative rating app sup...
2        get best spotify experience android annoy plea...
3                       really buggy terrible use recently
4          dear spotify get song put playlist shuffle play
                               ...                        
61589    even communicate lyric feature available user ...
61590    use sooo good back download free version pick ...
61591    app good take device start comp spotify start ...
61592    app good hard navigate let play song click pla...
61593    good sometimes doesnt load music play second s...
Name: review, Length: 61594, dtype: object

We now have a flattened corpus however we still have ~14k tokens with less than 5 occurrences so we will need to further clean it up when we vectorize. 

## VADER Sentiment Analysis

In order to create a binary classification problem, we will use VADER `SentimentIntensityAnalyzer` to extract the polarity scores and use the composite score to generate class labels 

In [121]:
# instantiating our SentimentIntensityAnalyzer
sent = SentimentIntensityAnalyzer()

Let's get the sentiment for our randomly selected review to better understand what `SentimentIntensityAnalyzer()` does

In [123]:
print('Original Review:')
print(first_review)
print('\n')
print('VADER SIA output')
sent.polarity_scores(first_review)

Original Review:
after the latest update it keeps glitching, sometimes it doesnt work at all, and stopped working on my android auto,tried uninstalling and reinstalling but still does the same


VADER SIA output


{'neg': 0.051, 'neu': 0.949, 'pos': 0.0, 'compound': -0.1154}

For our first review, VADER generated the polarity across three different metrics: `neg`, `neu`, and `pos`. `compound` represents the sum of all scores normalized between -1 and 1. Looking at the `compound` score, we can see that this particular review is most likely a negative review.

We'll use VADER to calculate the compound scores for each review and then use the `compound` score to map out our class labels

In [127]:
# creating a function that extracts each score and appends them to a list 
score_comp = []
score_pos = []
score_neg = []
sent = SentimentIntensityAnalyzer()
for i in range(0,spotify_df.shape[0]):
    score = sent.polarity_scores(spotify_df.iloc[i][0])
    compound_score = score['compound']
    score_comp.append(compound_score)
    # extracting positive sentiment 
    pos_score = score['pos']
    score_pos.append(pos_score)
    # extracting negative sentiment 
    neg_score = score['neg']
    score_neg.append(neg_score)

In [154]:
# creating new columns in our original dataframe that maps the VADER results
spotify_df['comp_score'] = score_comp
spotify_df['pos_score'] = score_pos
spotify_df['neg_score'] = score_neg

In [155]:
# viewing updated dataframe
spotify_df.head()

Unnamed: 0,review,rating,total_thumbs_up,replied,comp_score,pos_score,neg_score
0,"great music service, the audio is high quality...",5,2,0,0.9211,0.436,0.0
1,please ignore previous negative rating. this a...,5,1,0,0.6249,0.404,0.243
2,"this pop-up ""get the best spotify experience o...",4,0,0,0.5859,0.258,0.107
3,really buggy and terrible to use as of recently,1,1,0,-0.5209,0.0,0.296
4,dear spotify why do i get songs that i didn't ...,1,1,0,0.7149,0.26,0.0


In [160]:
spotify_df.loc[spotify_df.comp_score == 0]

Unnamed: 0,review,rating,total_thumbs_up,replied,comp_score,pos_score,neg_score
22,i logged out on my acc but when i tried to ope...,3,0,0,0.0,0.000,0.000
29,everything is perfect... just add light theme,4,0,0,0.0,0.000,0.000
33,am unable to use on my ph .....it gets downloa...,1,0,0,0.0,0.000,0.000
39,wide range of songs collection,5,0,0,0.0,0.000,0.000
43,my account was glitching so i logged out to tr...,2,0,0,0.0,0.000,0.000
...,...,...,...,...,...,...,...
61170,the current ui needs work. there's certain ele...,3,93,0,-0.0,0.136,0.097
61221,it pauses itself in between songs until i turn...,4,0,0,0.0,0.000,0.000
61285,"still doesn't recognise local files, wastes da...",4,0,0,0.0,0.000,0.000
61343,"the decision to ""retire"" car view without a re...",1,3,0,0.0,0.000,0.000


In [166]:
pos_mask = spotify_df.loc[:, 'comp_score'] > 0
neg_mask = spotify_df.loc[:, 'comp_score'] < 0
neutral_mask = spotify_df.loc[:, 'comp_score'] == 0

In [167]:
print(f'Number of Positive Reviews:')
print(len(spotify_df[pos_mask]))
print('\n')
print(f'Number of Negative Reviews:')
print(len(spotify_df[neg_mask]))
print('\n')
print(f'Number of Neutral Reviews:')
print(len(spotify_df[neutral_mask]))

Number of Positive Reviews:
40065


Number of Negative Reviews:
17508


Number of Neutral Reviews:
4021


**INSERT TEXT HERE**

In [194]:
# creating a helper function that takes the compound scores for each record and generates the correct sentiment label
def sentiment_labels(score):
    if score > 0: 
        return 'pos'
    elif score < 0:
        return 'neg'
    else: 
        return 'neutral'

In [212]:
# creating new column that contains our sentiment labels
spotify_df['sentiment'] = spotify_df.comp_score.apply(sentiment_labels)

In [217]:
# previewing dataframe
spotify_df.head()

Unnamed: 0,review,rating,total_thumbs_up,replied,comp_score,pos_score,neg_score,sentiment
0,"great music service, the audio is high quality...",5,2,0,0.9211,0.436,0.0,pos
1,please ignore previous negative rating. this a...,5,1,0,0.6249,0.404,0.243,pos
2,"this pop-up ""get the best spotify experience o...",4,0,0,0.5859,0.258,0.107,pos
3,really buggy and terrible to use as of recently,1,1,0,-0.5209,0.0,0.296,neg
4,dear spotify why do i get songs that i didn't ...,1,1,0,0.7149,0.26,0.0,pos


## Exporting Processed Data

In [218]:
# exporting our preprocessed data which contains our VADER sentiment analysis score and new class labels 
spotify_df.to_csv('data/preprocessed-reviews.csv',index=False)

In [219]:
# exporting our flattened corpus ready for vectorization 
flat_corpus.to_csv('data/spotify-reviews.csv',index=False)

# Review

In [27]:
from spellchecker import SpellChecker

In [28]:
from textblob import TextBlob

In [29]:
spotify_df.review

0        great music service, the audio is high quality...
1        please ignore previous negative rating. this a...
2        this pop-up "get the best spotify experience o...
3          really buggy and terrible to use as of recently
4        dear spotify why do i get songs that i didn't ...
                               ...                        
61589    even though it was communicated that lyrics fe...
61590    use to be sooo good back when i had it, and wh...
61591    this app would be good if not for it taking ov...
61592    the app is good hard to navigate and won't jus...
61593    its good but sometimes it doesnt load the musi...
Name: review, Length: 61594, dtype: object

In [363]:
spell = SpellChecker()
misspelled = spell.unknown(spotify_df.review)
print(len(misspelled))

3892


In [94]:
def correct_text(sent):
    blob = TextBlob(str(sent))
    correct_blob = blob.correct()
    return correct_blob

In [96]:
test_toks = flattened_toks.values[-5::]
#test_blob = TextBlob(test_toks)
#correct_blob = test_blob.correct()
#print(correct_blob)

In [121]:
first_five = flattened_toks[-20::]
first_five
#first_five.apply(lambda x: str(TextBlob(x).correct()))

896159          need
896160          play
896161          song
896162         click
896163          good
896164     sometimes
896165        doesnt
896166          load
896167         music
896168          play
896169        second
896170          song
896171          stop
896172           say
896173    connection
896174        diesnt
896175          work
896176          time
896177          work
896178         great
dtype: object

In [97]:
correct_text(test_toks)

TextBlob("['doesn' 'work' 'time' 'work' 'great']")

In [39]:
#misspelled_toks = TextBlob(misspelled_toks)
#corrected = misspelled_toks.correct()
#print(corrected)

In [43]:
spotify_df[spotify_df.review.str.contains('prob😕')]

Unnamed: 0,review,rating,total_thumbs_up,reply,replied,tok_norm
23736,the best app for playlist but worst while logi...,1,0,,0,"[best, app, playlist, worst, loging, everytime..."


In [125]:
spotify_df

Unnamed: 0,review,rating,total_thumbs_up,reply,replied,process_tok
0,"great music service, the audio is high quality...",5,2,,0,"[great, music, service, audio, high, quality, ..."
1,please ignore previous negative rating. this a...,5,1,,0,"[please, ignore, previous, negative, rating, a..."
2,"this pop-up ""get the best spotify experience o...",4,0,,0,"[get, best, spotify, experience, android, anno..."
3,really buggy and terrible to use as of recently,1,1,,0,"[really, buggy, terrible, use, recently]"
4,dear spotify why do i get songs that i didn't ...,1,1,,0,"[dear, spotify, get, song, put, playlist, shuf..."
...,...,...,...,...,...,...
61589,even though it was communicated that lyrics fe...,1,6,,0,"[even, communicate, lyric, feature, available,..."
61590,"use to be sooo good back when i had it, and wh...",1,0,,0,"[use, sooo, good, back, download, free, versio..."
61591,this app would be good if not for it taking ov...,2,10,,0,"[app, good, take, device, start, comp, spotify..."
61592,the app is good hard to navigate and won't jus...,2,1,,0,"[app, good, hard, navigate, let, play, song, c..."


In [159]:
spotify_df[spotify_df.review.str.contains('diesnt')].index

Int64Index([21827, 61593], dtype='int64')

In [160]:
ninth_review = spotify_df.review[21827]
ninth_review

"app keeps letting songs play for 10 seconds when try to select and then stops. it diesnt recognise it's in playback mode (best way i can describe). you have to turn phone off and then on to reset and make it work again. such a pain as you cant even just close and re open app to reset. incredibly infuriating."