In [2]:
# imports
import pandas as pd
import numpy as np

In [3]:
spotify_df = pd.read_csv('data/reviews.csv',usecols=['Review','Rating', 'Total_thumbsup', 'Reply'])
spotify_df = spotify_df.rename(columns={'Review':'review', 
                           'Rating':'rating',
                           'Total_thumbsup':'total_thumbs_up',
                           'Reply':'reply'})

In [4]:
import nltk
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.tokenize import regexp_tokenize, word_tokenize, RegexpTokenizer
import matplotlib.pyplot as plt
import string
import re
import numpy as np

In [5]:
from nltk.tokenize import word_tokenize 
from nltk.tokenize import sent_tokenize 

In [6]:
import itertools

In [7]:
from nltk.corpus import stopwords

In [8]:
from nltk import WordNetLemmatizer # lemmatizer using WordNet
from nltk.corpus import wordnet # imports WordNet
from nltk import pos_tag # nltk's native part of speech tagging

In [9]:
spotify_df.head()

Unnamed: 0,review,rating,total_thumbs_up,reply
0,"Great music service, the audio is high quality...",5,2,
1,Please ignore previous negative rating. This a...,5,1,
2,"This pop-up ""Get the best Spotify experience o...",4,0,
3,Really buggy and terrible to use as of recently,1,1,
4,Dear Spotify why do I get songs that I didn't ...,1,1,


In [10]:
spotify_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61594 entries, 0 to 61593
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   review           61594 non-null  object
 1   rating           61594 non-null  int64 
 2   total_thumbs_up  61594 non-null  int64 
 3   reply            216 non-null    object
dtypes: int64(2), object(2)
memory usage: 1.9+ MB


no NaN values in `review` column

In [11]:
# generating a new column that maps whether Spotify replied or not 
spotify_df['replied'] = spotify_df['reply'].notna().astype(int)
'''
0 = did not reply 
1 = replied
'''

'\n0 = did not reply \n1 = replied\n'

In [12]:
# viewing updated df and records where reply == True 
spotify_df[spotify_df.replied == 1]

Unnamed: 0,review,rating,total_thumbs_up,reply,replied
73,"Great song selection, amazing audio quality. H...",3,8,"Hey, thanks for the feedback! We'd love to hel...",1
699,Seems like the random stopping of music has be...,4,4,"Hey, thanks for the report. If you haven't tri...",1
1151,They would add their songs to MY PLAYLIST! I c...,1,1,"Hey, thanks for the feedback! You can always a...",1
1277,I paid last month for Premium Family & my wife...,5,0,Hi James! We’re sorry to hear that. We’d love ...,1
1668,I really don't care about the random pauses in...,3,0,"Hi again. If you haven't tried Premium yet, we...",1
...,...,...,...,...,...
61116,Why can't you skip back without premium? Spoti...,4,3,Hey! Have you tried restarting your device or ...,1
61257,"Ads are fine, if anything they're a necessity....",1,5,"Hey, thanks for the feedback. Suggested tracks...",1
61374,Considering stopping my subscription due to th...,1,0,Hey! We'd love to help get this fixed. If you ...,1
61397,I am 100% sure this is the worst music app I e...,3,0,Hey! We're sorry to hear you feel that way abo...,1


In [13]:
# converting all review text to lowercase to normalize text 
spotify_df.review = spotify_df.review.str.lower()
spotify_df.review

0        great music service, the audio is high quality...
1        please ignore previous negative rating. this a...
2        this pop-up "get the best spotify experience o...
3          really buggy and terrible to use as of recently
4        dear spotify why do i get songs that i didn't ...
                               ...                        
61589    even though it was communicated that lyrics fe...
61590    use to be sooo good back when i had it, and wh...
61591    this app would be good if not for it taking ov...
61592    the app is good hard to navigate and won't jus...
61593    its good but sometimes it doesnt load the musi...
Name: review, Length: 61594, dtype: object

In [14]:
first_review = spotify_df.review.iloc[0]
first_review

'great music service, the audio is high quality and the app is easy to use. also very quick and friendly support.'

In [15]:
# seeing different tokenizing methods 
print('word tokenized method:')
print(word_tokenize(first_review,language='english'))
print('\n')
print('sentence tokenizer method:')
print(sent_tokenize(first_review,language='english'))

word tokenized method:
['great', 'music', 'service', ',', 'the', 'audio', 'is', 'high', 'quality', 'and', 'the', 'app', 'is', 'easy', 'to', 'use', '.', 'also', 'very', 'quick', 'and', 'friendly', 'support', '.']


sentence tokenizer method:
['great music service, the audio is high quality and the app is easy to use.', 'also very quick and friendly support.']


In [16]:
# tokenized word in each sentence 
print([word_tokenize(sent) for sent in sent_tokenize(first_review)])

[['great', 'music', 'service', ',', 'the', 'audio', 'is', 'high', 'quality', 'and', 'the', 'app', 'is', 'easy', 'to', 'use', '.'], ['also', 'very', 'quick', 'and', 'friendly', 'support', '.']]


In [17]:
# tokenizing word 
corpus = [word_tokenize(review) for review in spotify_df.review]
print(corpus[0:4])

[['great', 'music', 'service', ',', 'the', 'audio', 'is', 'high', 'quality', 'and', 'the', 'app', 'is', 'easy', 'to', 'use', '.', 'also', 'very', 'quick', 'and', 'friendly', 'support', '.'], ['please', 'ignore', 'previous', 'negative', 'rating', '.', 'this', 'app', 'is', 'super', 'great', '.', 'i', 'give', 'it', 'five', 'stars+'], ['this', 'pop-up', '``', 'get', 'the', 'best', 'spotify', 'experience', 'on', 'android', '12', "''", 'is', 'too', 'annoying', '.', 'please', 'let', "'s", 'get', 'rid', 'of', 'this', '.'], ['really', 'buggy', 'and', 'terrible', 'to', 'use', 'as', 'of', 'recently']]


In [18]:
stop_words = stopwords.words('english')

In [19]:
# takes in untokenized document and returns fully normalized token list
def process_doc(doc):

    #initialize lemmatizer
    wnl = WordNetLemmatizer()

    # helper function to change nltk's part of speech tagging to a wordnet format.
    def pos_tagger(nltk_tag):
        if nltk_tag.startswith('J'):
            return wordnet.ADJ
        elif nltk_tag.startswith('V'):
            return wordnet.VERB
        elif nltk_tag.startswith('N'):
            return wordnet.NOUN
        elif nltk_tag.startswith('R'):
            return wordnet.ADV
        else:         
            return None
        
    # remove stop words and punctuations, then lower case
    doc_norm = [tok.lower() for tok in word_tokenize(doc) if ((tok.isalpha()) & (tok.lower() not in stop_words)) ]

    #  POS detection on the result will be important in telling Wordnet's lemmatizer how to lemmatize
    
    # creates list of tuples with tokens and POS tags in wordnet format
    wordnet_tagged = list(map(lambda x: (x[0], pos_tagger(x[1])), pos_tag(doc_norm))) 
    doc_norm = [wnl.lemmatize(token, pos) for token, pos in wordnet_tagged if pos is not None]
    
    return doc_norm

In [20]:
# using function on first review 
process_doc(first_review)

['great',
 'music',
 'service',
 'audio',
 'high',
 'quality',
 'app',
 'easy',
 'use',
 'also',
 'quick',
 'friendly',
 'support']

In [21]:
spotify_df['process_tok'] = spotify_df.review.apply(process_doc)

In [22]:
# new column for reviews with processed tokens 
spotify_df

Unnamed: 0,review,rating,total_thumbs_up,reply,replied,process_tok
0,"great music service, the audio is high quality...",5,2,,0,"[great, music, service, audio, high, quality, ..."
1,please ignore previous negative rating. this a...,5,1,,0,"[please, ignore, previous, negative, rating, a..."
2,"this pop-up ""get the best spotify experience o...",4,0,,0,"[get, best, spotify, experience, android, anno..."
3,really buggy and terrible to use as of recently,1,1,,0,"[really, buggy, terrible, use, recently]"
4,dear spotify why do i get songs that i didn't ...,1,1,,0,"[dear, spotify, get, song, put, playlist, shuf..."
...,...,...,...,...,...,...
61589,even though it was communicated that lyrics fe...,1,6,,0,"[even, communicate, lyric, feature, available,..."
61590,"use to be sooo good back when i had it, and wh...",1,0,,0,"[use, sooo, good, back, download, free, versio..."
61591,this app would be good if not for it taking ov...,2,10,,0,"[app, good, take, device, start, comp, spotify..."
61592,the app is good hard to navigate and won't jus...,2,1,,0,"[app, good, hard, navigate, let, play, song, c..."


In [24]:
flattened_toks = pd.Series(list(itertools.chain(*spotify_df.process_tok)))
len(flattened_toks.unique())

19451

In [25]:
flattened_toks.values

array(['great', 'music', 'service', ..., 'time', 'work', 'great'],
      dtype=object)

In [26]:
flattened_toks.unique

<bound method Series.unique of 0           great
1           music
2         service
3           audio
4            high
           ...   
896174     diesnt
896175       work
896176       time
896177       work
896178      great
Length: 896179, dtype: object>

In [27]:
from spellchecker import SpellChecker

In [28]:
from textblob import TextBlob

In [29]:
spotify_df.review

0        great music service, the audio is high quality...
1        please ignore previous negative rating. this a...
2        this pop-up "get the best spotify experience o...
3          really buggy and terrible to use as of recently
4        dear spotify why do i get songs that i didn't ...
                               ...                        
61589    even though it was communicated that lyrics fe...
61590    use to be sooo good back when i had it, and wh...
61591    this app would be good if not for it taking ov...
61592    the app is good hard to navigate and won't jus...
61593    its good but sometimes it doesnt load the musi...
Name: review, Length: 61594, dtype: object

In [363]:
spell = SpellChecker()
misspelled = spell.unknown(spotify_df.review)
print(len(misspelled))

3892


In [94]:
def correct_text(sent):
    blob = TextBlob(str(sent))
    correct_blob = blob.correct()
    return correct_blob

In [96]:
test_toks = flattened_toks.values[-5::]
#test_blob = TextBlob(test_toks)
#correct_blob = test_blob.correct()
#print(correct_blob)

In [121]:
first_five = flattened_toks[-20::]
first_five
#first_five.apply(lambda x: str(TextBlob(x).correct()))

896159          need
896160          play
896161          song
896162         click
896163          good
896164     sometimes
896165        doesnt
896166          load
896167         music
896168          play
896169        second
896170          song
896171          stop
896172           say
896173    connection
896174        diesnt
896175          work
896176          time
896177          work
896178         great
dtype: object

In [97]:
correct_text(test_toks)

TextBlob("['doesn' 'work' 'time' 'work' 'great']")

In [71]:
fdist = FreqDist(flattened_toks)
# top 20 most common tokens 
fdist.most_common(20)

[('app', 34869),
 ('song', 32689),
 ('music', 25723),
 ('play', 19226),
 ('spotify', 18935),
 ('listen', 12910),
 ('use', 11473),
 ('ad', 10399),
 ('playlist', 9963),
 ('get', 9926),
 ('good', 9698),
 ('love', 9439),
 ('premium', 8940),
 ('great', 7454),
 ('want', 7064),
 ('even', 7022),
 ('work', 6889),
 ('update', 6731),
 ('time', 6687),
 ('fix', 6092)]

In [73]:
flattened_toks.value_counts().tail(20)

reached        1
sop            1
icebreaker     1
mooder         1
iove           1
netweok        1
vrey           1
tgey           1
premeim        1
severally      1
alo            1
gradient       1
preveiw        1
vgood          1
travellling    1
goldan         1
resistant      1
bttr           1
benwfitnit     1
needy          1
dtype: int64

In [39]:
#misspelled_toks = TextBlob(misspelled_toks)
#corrected = misspelled_toks.correct()
#print(corrected)

In [43]:
spotify_df[spotify_df.review.str.contains('prob😕')]

Unnamed: 0,review,rating,total_thumbs_up,reply,replied,tok_norm
23736,the best app for playlist but worst while logi...,1,0,,0,"[best, app, playlist, worst, loging, everytime..."


In [125]:
spotify_df

Unnamed: 0,review,rating,total_thumbs_up,reply,replied,process_tok
0,"great music service, the audio is high quality...",5,2,,0,"[great, music, service, audio, high, quality, ..."
1,please ignore previous negative rating. this a...,5,1,,0,"[please, ignore, previous, negative, rating, a..."
2,"this pop-up ""get the best spotify experience o...",4,0,,0,"[get, best, spotify, experience, android, anno..."
3,really buggy and terrible to use as of recently,1,1,,0,"[really, buggy, terrible, use, recently]"
4,dear spotify why do i get songs that i didn't ...,1,1,,0,"[dear, spotify, get, song, put, playlist, shuf..."
...,...,...,...,...,...,...
61589,even though it was communicated that lyrics fe...,1,6,,0,"[even, communicate, lyric, feature, available,..."
61590,"use to be sooo good back when i had it, and wh...",1,0,,0,"[use, sooo, good, back, download, free, versio..."
61591,this app would be good if not for it taking ov...,2,10,,0,"[app, good, take, device, start, comp, spotify..."
61592,the app is good hard to navigate and won't jus...,2,1,,0,"[app, good, hard, navigate, let, play, song, c..."


In [159]:
spotify_df[spotify_df.review.str.contains('diesnt')].index

Int64Index([21827, 61593], dtype='int64')

In [153]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [160]:
ninth_review = spotify_df.review[21827]
ninth_review

"app keeps letting songs play for 10 seconds when try to select and then stops. it diesnt recognise it's in playback mode (best way i can describe). you have to turn phone off and then on to reset and make it work again. such a pain as you cant even just close and re open app to reset. incredibly infuriating."

In [156]:
sent = SentimentIntensityAnalyzer()

In [161]:
sent.polarity_scores(ninth_review)

{'neg': 0.132, 'neu': 0.831, 'pos': 0.037, 'compound': -0.7346}

In [175]:
corrected_review = "app keeps letting songs play for 10 seconds when I try to select and then stops. it doesn recognise it's in playback mode (best way i can describe). you have to turn phone off and then on to rest and make it work again. such a pain as you can even just close and re open app to rest. incredibly infuriating."

In [176]:
sent.polarity_scores(corrected_review)

{'neg': 0.132, 'neu': 0.831, 'pos': 0.037, 'compound': -0.7346}

## VADER Sentiment Analysis

Using VADER to calculate the compound scores for each review so as to create labels for positive, negative, and neutral reviews

In [198]:
sent.polarity_scores(spotify_df.iloc[3][0])

{'neg': 0.296, 'neu': 0.704, 'pos': 0.0, 'compound': -0.5209}

In [199]:
score_comp = []
score_pos = []
score_neg = []
sent = SentimentIntensityAnalyzer()
for i in range(0,spotify_df.shape[0]):
    score = sent.polarity_scores(spotify_df.iloc[i][0])
    compound_score = score['compound']
    score_comp.append(compound_score)
    # extracting positive sentiment 
    pos_score = score['pos']
    score_pos.append(pos_score)
    # extracting negative sentiment 
    neg_score = score['neg']
    score_neg.append(neg_score)

In [200]:
spotify_df['comp_score'] = score_comp
spotify_df['pos_score'] = score_pos
spotify_df['neg_score'] = score_neg

In [204]:
spotify_df = spotify_df.drop(columns='reply')

In [242]:
spotify_df

Unnamed: 0,review,rating,total_thumbs_up,replied,process_tok,comp_score,pos_score,neg_score
0,"great music service, the audio is high quality...",5,2,0,"[great, music, service, audio, high, quality, ...",0.9211,0.436,0.000
1,please ignore previous negative rating. this a...,5,1,0,"[please, ignore, previous, negative, rating, a...",0.6249,0.404,0.243
2,"this pop-up ""get the best spotify experience o...",4,0,0,"[get, best, spotify, experience, android, anno...",0.5859,0.258,0.107
3,really buggy and terrible to use as of recently,1,1,0,"[really, buggy, terrible, use, recently]",-0.5209,0.000,0.296
4,dear spotify why do i get songs that i didn't ...,1,1,0,"[dear, spotify, get, song, put, playlist, shuf...",0.7149,0.260,0.000
...,...,...,...,...,...,...,...,...
61589,even though it was communicated that lyrics fe...,1,6,0,"[even, communicate, lyric, feature, available,...",-0.2960,0.047,0.072
61590,"use to be sooo good back when i had it, and wh...",1,0,0,"[use, sooo, good, back, download, free, versio...",0.8481,0.227,0.069
61591,this app would be good if not for it taking ov...,2,10,0,"[app, good, take, device, start, comp, spotify...",-0.9611,0.046,0.225
61592,the app is good hard to navigate and won't jus...,2,1,0,"[app, good, hard, navigate, let, play, song, c...",0.8074,0.159,0.023


In [262]:
# creating different boolean masks using comp_scores 
pos_mask = spotify_df.loc[:, 'comp_score'] >= 0.05
neg_mask = spotify_df.loc[:, 'comp_score'] <= -0.05
neutral_mask = (spotify_df.loc[:,'comp_score'] <.05) & (spotify_df.loc[:,'comp_score'] > -0.05)

In [267]:
print(f'Number of Positive Reviews:')
print(len(spotify_df[pos_mask]))
print('\n')
print(f'Number of Negative Reviews:')
print(len(spotify_df[neg_mask]))
print('\n')
print(f'Number of Neutral Reviews:')
print(len(spotify_df[neutral_mask]))

Number of Positive Reviews:
39589


Number of Negative Reviews:
16992


Number of Neutral Reviews:
5013


In [270]:
spotify_df.comp_score

0        0.9211
1        0.6249
2        0.5859
3       -0.5209
4        0.7149
          ...  
61589   -0.2960
61590    0.8481
61591   -0.9611
61592    0.8074
61593    0.8641
Name: comp_score, Length: 61594, dtype: float64

In [271]:
def sentiment_labels(score):
    if score >= 0.05: 
        return 'pos'
    elif score <= -0.05:
        return 'neg'
    else: 
        return 'neutral'

In [272]:
spotify_df['sentiment'] = spotify_df.comp_score.apply(sentiment_labels)

In [351]:
spotify_df[spotify_df.review.str.contains('😕')].tail().index

Int64Index([58206, 58325, 59033, 59203, 60968], dtype='int64')

In [361]:
spotify_df.to_csv('data/preprocessed-reviews.csv',index=False)

## Exporting Processed Data

In [192]:
fnc_output = spotify_df.process_tok.apply(
    " ".join)
fnc_output.to_csv('data/spotify-reviews.csv',index=False)

In [193]:
pd.read_csv('data/spotify-reviews.csv')

Unnamed: 0,process_tok
0,great music service audio high quality app eas...
1,please ignore previous negative rating app sup...
2,get best spotify experience android annoy plea...
3,really buggy terrible use recently
4,dear spotify get song put playlist shuffle play
...,...
61589,even communicate lyric feature available user ...
61590,use sooo good back download free version pick ...
61591,app good take device start comp spotify start ...
61592,app good hard navigate let play song click pla...
