In [2]:
# imports
import pandas as pd
import numpy as np

In [3]:
# Read only the four columns this notebook uses, then give them snake_case names.
spotify_df = (
    pd.read_csv(
        'data/reviews.csv',
        usecols=['Review', 'Rating', 'Total_thumbsup', 'Reply'],
    )
    .rename(
        columns={
            'Review': 'review',
            'Rating': 'rating',
            'Total_thumbsup': 'total_thumbs_up',
            'Reply': 'reply',
        }
    )
)

In [4]:
# Preview the first rows to confirm the column selection and renaming worked.
spotify_df.head()

Unnamed: 0,review,rating,total_thumbs_up,reply
0,"Great music service, the audio is high quality...",5,2,
1,Please ignore previous negative rating. This a...,5,1,
2,"This pop-up ""Get the best Spotify experience o...",4,0,
3,Really buggy and terrible to use as of recently,1,1,
4,Dear Spotify why do I get songs that I didn't ...,1,1,


In [5]:
# Per-column dtypes and non-null counts; used here to check for missing values.
spotify_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61594 entries, 0 to 61593
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   review           61594 non-null  object
 1   rating           61594 non-null  int64 
 2   total_thumbs_up  61594 non-null  int64 
 3   reply            216 non-null    object
dtypes: int64(2), object(2)
memory usage: 1.9+ MB


The `review` column has no NaN values (61,594 non-null entries out of 61,594 rows), whereas `reply` is populated for only 216 rows.

In [6]:
# Binary flag recording whether the review has a non-null `reply`:
#   0 = did not reply
#   1 = replied
# (Written as comments rather than a bare string literal so the legend is not
# echoed back as the cell's output.)
spotify_df['replied'] = spotify_df['reply'].notna().astype(int)

'\n0 = did not reply \n1 = replied\n'

In [7]:
# Show only the reviews that received a reply, to check the new `replied` flag.
spotify_df.loc[spotify_df['replied'] == 1]

Unnamed: 0,review,rating,total_thumbs_up,reply,replied
73,"Great song selection, amazing audio quality. H...",3,8,"Hey, thanks for the feedback! We'd love to hel...",1
699,Seems like the random stopping of music has be...,4,4,"Hey, thanks for the report. If you haven't tri...",1
1151,They would add their songs to MY PLAYLIST! I c...,1,1,"Hey, thanks for the feedback! You can always a...",1
1277,I paid last month for Premium Family & my wife...,5,0,Hi James! We’re sorry to hear that. We’d love ...,1
1668,I really don't care about the random pauses in...,3,0,"Hi again. If you haven't tried Premium yet, we...",1
...,...,...,...,...,...
61116,Why can't you skip back without premium? Spoti...,4,3,Hey! Have you tried restarting your device or ...,1
61257,"Ads are fine, if anything they're a necessity....",1,5,"Hey, thanks for the feedback. Suggested tracks...",1
61374,Considering stopping my subscription due to th...,1,0,Hey! We'd love to help get this fixed. If you ...,1
61397,I am 100% sure this is the worst music app I e...,3,0,Hey! We're sorry to hear you feel that way abo...,1


In [8]:
# Lowercase all review text (bracket access makes the column write explicit).
spotify_df['review'] = spotify_df['review'].str.lower()
spotify_df['review']

0        great music service, the audio is high quality...
1        please ignore previous negative rating. this a...
2        this pop-up "get the best spotify experience o...
3          really buggy and terrible to use as of recently
4        dear spotify why do i get songs that i didn't ...
                               ...                        
61589    even though it was communicated that lyrics fe...
61590    use to be sooo good back when i had it, and wh...
61591    this app would be good if not for it taking ov...
61592    the app is good hard to navigate and won't jus...
61593    its good but sometimes it doesnt load the musi...
Name: review, Length: 61594, dtype: object

In [9]:
import nltk
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.tokenize import regexp_tokenize, word_tokenize, RegexpTokenizer
import matplotlib.pyplot as plt
import string
import re
import numpy as np

In [10]:
from nltk.tokenize import word_tokenize # nltk's gold standard word tokenizer
from nltk.tokenize import sent_tokenize # nltk's sentence tokenizer

In [11]:
# Keep the first (already lowercased) review as a small example for the tokenizer demos.
first_review = spotify_df['review'].iloc[0]
first_review

'great music service, the audio is high quality and the app is easy to use. also very quick and friendly support.'

In [12]:
# Compare NLTK's word-level and sentence-level tokenizers on the same review.
print(
    'word tokenized method:',
    word_tokenize(first_review, language='english'),
    '\n',
    'sentence tokenizer method:',
    sent_tokenize(first_review, language='english'),
    sep='\n',
)

word tokenized method:
['great', 'music', 'service', ',', 'the', 'audio', 'is', 'high', 'quality', 'and', 'the', 'app', 'is', 'easy', 'to', 'use', '.', 'also', 'very', 'quick', 'and', 'friendly', 'support', '.']


sentence tokenizer method:
['great music service, the audio is high quality and the app is easy to use.', 'also very quick and friendly support.']


In [13]:
# Split into sentences first, then word-tokenize each sentence separately.
print(list(map(word_tokenize, sent_tokenize(first_review))))

[['great', 'music', 'service', ',', 'the', 'audio', 'is', 'high', 'quality', 'and', 'the', 'app', 'is', 'easy', 'to', 'use', '.'], ['also', 'very', 'quick', 'and', 'friendly', 'support', '.']]


In [14]:
# Word-tokenize every review; `corpus` is a list holding one token list per review.
corpus = list(map(word_tokenize, spotify_df['review']))
print(corpus[:4])

[['great', 'music', 'service', ',', 'the', 'audio', 'is', 'high', 'quality', 'and', 'the', 'app', 'is', 'easy', 'to', 'use', '.', 'also', 'very', 'quick', 'and', 'friendly', 'support', '.'], ['please', 'ignore', 'previous', 'negative', 'rating', '.', 'this', 'app', 'is', 'super', 'great', '.', 'i', 'give', 'it', 'five', 'stars+'], ['this', 'pop-up', '``', 'get', 'the', 'best', 'spotify', 'experience', 'on', 'android', '12', "''", 'is', 'too', 'annoying', '.', 'please', 'let', "'s", 'get', 'rid', 'of', 'this', '.'], ['really', 'buggy', 'and', 'terrible', 'to', 'use', 'as', 'of', 'recently']]


In [15]:
import itertools

In [16]:
# Flatten the per-review token lists into one long Series of raw tokens.
# `chain.from_iterable` walks `corpus` lazily instead of unpacking every
# review's token list as a separate positional argument.
flattenedcorpus_tokens = pd.Series(list(itertools.chain.from_iterable(corpus)))
print(flattenedcorpus_tokens.shape)

(2148576,)


In [17]:
# Vocabulary of the raw corpus: one entry per distinct token.
dictionary = pd.Series(flattenedcorpus_tokens.unique())
print(dictionary.size)

31505


In [18]:
# 20 most frequent raw tokens; punctuation and stop words are still present at this stage.
flattenedcorpus_tokens.value_counts().head(20)

.          93449
i          79943
the        72193
to         67068
it         57093
and        48498
,          45867
app        35799
a          33034
is         30360
music      25553
n't        24349
my         24018
for        22766
of         21841
you        21242
this       21076
but        20646
!          19338
spotify    18985
dtype: int64

In [19]:
# 20 least frequent raw tokens (the bottom of the frequency table).
flattenedcorpus_tokens.value_counts().tail(20)

aapi                  1
clickinv              1
dipped                1
'variety              1
subsciption           1
respot                1
resson                1
prob😕                 1
shins                 1
optionally            1
algorithm-inserted    1
noteworthy            1
guees                 1
similatso             1
abouts                1
studied               1
good.but              1
setings               1
happenekg             1
wifi/internet         1
dtype: int64

In [20]:
# Number of distinct tokens that occur fewer than 5 times in the raw corpus.
flattenedcorpus_tokens.value_counts().lt(5).sum()

24953

In [21]:
# Vocabulary entries made up solely of numeric characters.
dictionary.loc[dictionary.str.isnumeric()]

40         12
171         6
292         3
295         1
386       100
         ... 
30834     105
30942     967
31123    1080
31345    6000
31371    2003
Length: 244, dtype: object

In [22]:
from nltk.corpus import stopwords

In [23]:
# NLTK's English stop-word list; first_step_normalizer filters tokens against it.
stop_words = stopwords.words('english')

In [24]:
def first_step_normalizer(doc):
    """Tokenize a document and keep only lowercased, alphabetic, non-stop-word tokens.

    Parameters
    ----------
    doc : str
        Raw text of one document (one review in this notebook).

    Returns
    -------
    list of str
        Tokens from ``word_tokenize(doc)``, lowercased, in their original
        order. Tokens that are not purely alphabetic (punctuation, numbers,
        mixed tokens such as ``"n't"``) and tokens whose lowercase form is in
        the module-level ``stop_words`` are dropped.
    """
    norm_text = []
    for token in word_tokenize(doc):
        # Reject non-alphabetic tokens first so they never reach the
        # stop-word lookup (logical short-circuit instead of bitwise `&`).
        if not token.isalpha():
            continue
        lowered = token.lower()  # lowercase once and reuse it
        if lowered not in stop_words:
            norm_text.append(lowered)
    return norm_text

In [25]:
# Store each review's filtered, lowercased token list in a new `tok_norm` column.
spotify_df['tok_norm'] = spotify_df['review'].apply(first_step_normalizer)
spotify_df.head()

Unnamed: 0,review,rating,total_thumbs_up,reply,replied,tok_norm
0,"great music service, the audio is high quality...",5,2,,0,"[great, music, service, audio, high, quality, ..."
1,please ignore previous negative rating. this a...,5,1,,0,"[please, ignore, previous, negative, rating, a..."
2,"this pop-up ""get the best spotify experience o...",4,0,,0,"[get, best, spotify, experience, android, anno..."
3,really buggy and terrible to use as of recently,1,1,,0,"[really, buggy, terrible, use, recently]"
4,dear spotify why do i get songs that i didn't ...,1,1,,0,"[dear, spotify, get, songs, put, playlist, shu..."


In [26]:
# Spot-check the normalized tokens of the review at index label 311.
spotify_df.loc[311, 'tok_norm']

['never',
 'knew',
 'app',
 'till',
 'friend',
 'introduced',
 'choose',
 'whatever',
 'want',
 'thanks',
 'friend',
 'spotify',
 'best',
 'thing',
 'ever',
 'could',
 'happened']

In [27]:
from nltk import WordNetLemmatizer # lemmatizer using WordNet
from nltk.corpus import wordnet # imports WordNet
from nltk import pos_tag # nltk's native part of speech tagging

In [28]:
def pos_tagger(nltk_tag):
    """Translate an NLTK part-of-speech tag into its WordNet counterpart.

    The decision is made on the first letter of ``nltk_tag``:
    ``J`` -> ``wordnet.ADJ``, ``V`` -> ``wordnet.VERB``,
    ``N`` -> ``wordnet.NOUN``, ``R`` -> ``wordnet.ADV``.
    Any other tag yields ``None``.
    """
    # (tag prefix, name of the matching constant on `wordnet`)
    prefix_table = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, constant_name in prefix_table:
        if nltk_tag.startswith(prefix):
            return getattr(wordnet, constant_name)
    return None

In [29]:
# POS-tag the first review's normalized tokens and convert each tag to WordNet format.
first_review_tok = spotify_df['tok_norm'].iloc[0]
wordnet_tagged = [(token, pos_tagger(tag)) for token, tag in pos_tag(first_review_tok)]
print(wordnet_tagged)

[('great', 'a'), ('music', 'n'), ('service', 'n'), ('audio', 'r'), ('high', 'a'), ('quality', 'n'), ('app', 'v'), ('easy', 'a'), ('use', 'n'), ('also', 'r'), ('quick', 'r'), ('friendly', 'a'), ('support', 'n')]


In [30]:
# Lemmatize the tagged tokens; tokens without a WordNet POS (None) are dropped.
wnl = WordNetLemmatizer()
review_lemmatized = [
    wnl.lemmatize(*tagged_token)
    for tagged_token in wordnet_tagged
    if tagged_token[1] is not None
]
print(review_lemmatized)

['great', 'music', 'service', 'audio', 'high', 'quality', 'app', 'easy', 'use', 'also', 'quick', 'friendly', 'support']


In [31]:
# Tokens before (top) and after (bottom) lemmatization, for a side-by-side check.
print(first_review_tok, '\n', review_lemmatized, sep='\n')

['great', 'music', 'service', 'audio', 'high', 'quality', 'app', 'easy', 'use', 'also', 'quick', 'friendly', 'support']


['great', 'music', 'service', 'audio', 'high', 'quality', 'app', 'easy', 'use', 'also', 'quick', 'friendly', 'support']


In [32]:
def process_doc(doc):
    """Lemmatize an already-tokenized document.

    Parameters
    ----------
    doc : list of str
        Tokens of one document (the notebook passes entries of ``tok_norm``).

    Returns
    -------
    list of str
        WordNet lemmas of the tokens, in their original order. Each token is
        POS-tagged with ``pos_tag``; tags starting with ``J``/``V``/``N``/``R``
        map to the WordNet adjective/verb/noun/adverb constants. Tokens with
        any other tag are dropped from the result.
    """
    # (tag prefix, name of the matching constant on `wordnet`)
    prefix_table = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    lemmatizer = WordNetLemmatizer()

    lemmas = []
    for token, nltk_tag in pos_tag(doc):
        constant_name = next(
            (name for prefix, name in prefix_table if nltk_tag.startswith(prefix)),
            None,
        )
        if constant_name is None:
            # No WordNet equivalent for this tag: the token is left out.
            continue
        lemmas.append(lemmatizer.lemmatize(token, getattr(wordnet, constant_name)))

    return lemmas

In [33]:
# Lemmatize every review's normalized token list.
fully_normalized_corpus = spotify_df['tok_norm'].apply(process_doc)

In [34]:
# Flatten the lemmatized token lists and count the distinct tokens that remain.
# `chain.from_iterable` iterates the Series directly instead of unpacking every
# row as a separate positional argument.
flattened_fully_norm = pd.Series(list(itertools.chain.from_iterable(fully_normalized_corpus)))
len(flattened_fully_norm.unique())

19451

In [35]:
# Size of the raw-token vocabulary, for comparison with the lemmatized vocabulary size.
len(dictionary)

31505

In [36]:
# 20 least frequent tokens after normalization and lemmatization.
flattened_fully_norm.value_counts().tail(20)

reached        1
sop            1
icebreaker     1
mooder         1
iove           1
netweok        1
vrey           1
tgey           1
premeim        1
severally      1
alo            1
gradient       1
preveiw        1
vgood          1
travellling    1
goldan         1
resistant      1
bttr           1
benwfitnit     1
needy          1
dtype: int64

In [37]:
from spellchecker import SpellChecker

In [38]:
from textblob import TextBlob

In [40]:
# Sample token list containing one typo ('subsciption') for trying out the spell checkers.
misspelled_tokens = ['audio', 'quality', 'great', 'app', 'force', 'stop', 'middle', 'song', 'plus', 'tried', 'renew', 'subsciption', 'tried', 'force', 'buy', 'duo', 'make', 'new', 'account', 'hope', 'fixed']

In [41]:
# Ask pyspellchecker which of the sample tokens it does not recognise.
spell = SpellChecker()
misspelled = spell.unknown(misspelled_tokens)
print(misspelled)

{'subsciption'}


In [42]:
# Try TextBlob's spelling correction on the token flagged above.
new_doc = TextBlob('subsciption')
corrected = new_doc.correct()
print(corrected)

subscription


In [43]:
# Find the review that produced the emoji-suffixed token 'prob😕' seen among the rare tokens.
spotify_df.loc[spotify_df['review'].str.contains('prob😕')]

Unnamed: 0,review,rating,total_thumbs_up,reply,replied,tok_norm
23736,the best app for playlist but worst while logi...,1,0,,0,"[best, app, playlist, worst, loging, everytime..."


# packages in environment at C:\Users\Kevin\anaconda3\envs\duolingo-proj:
#
# Name                    Version                   Build  Channel
aiofiles                  22.1.0          py311haa95532_0  
aiosqlite                 0.18.0          py311haa95532_0  
anyio                     3.5.0           py311haa95532_0  
app-store-scraper         0.3.5                    pypi_0    pypi
argon2-cffi               21.3.0             pyhd3eb1b0_0  
argon2-cffi-bindings      21.2.0          py311h2bbff1b_0  
asttokens                 2.0.5              pyhd3eb1b0_0  
attrs                     22.1.0          py311haa95532_0  
babel                     2.11.0          py311haa95532_0  
backcall                  0.2.0              pyhd3eb1b0_0  
beautifulsoup4            4.12.2          py311haa95532_0  
blas                      1.0                         mkl  
bleach                    4.1.0              pyhd3eb1b0_0  
bottleneck                1.3.5           py311h5bb9823_0  
brotlipy   