In [170]:
# imports 
import pandas as pd 
import numpy as np

#gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.corpora import Dictionary

#nltk
from nltk.corpus import stopwords
import itertools

# spacy 
import spacy
#visualization
import pyLDAvis
import pyLDAvis.gensim_models
pyLDAvis.enable_notebook()

In [41]:
# our processed dataset and corpus with all our lemmatized tokens 
spotify_df = pd.read_csv('data/preprocessed-reviews.csv')
corpus = pd.read_csv('data/spotify-reviews.csv')

#our corpus has identical column name, so need to rename
corpus = corpus.rename(columns={'review':'tokens'})
corpus.head()

Unnamed: 0,tokens
0,great music service audio high quality app eas...
1,please ignore previous negative rating app sup...
2,get best spotify experience android annoy plea...
3,really buggy terrible use recently
4,dear spotify get song put playlist shuffle play


In [18]:
# concatenating our two dataframes 
spotify_reviews = pd.concat([spotify_df,corpus], axis=1)

# creating separate dataframes for positive and negative reviews using the sentiment labels 
positive_reviews = spotify_reviews[spotify_reviews.sentiment.str.contains('pos')]
negative_reviews = spotify_reviews[spotify_reviews.sentiment.str.contains('neg')]

## Topic Modeling

In [19]:
stop_words = stopwords.words('english')
stop_words.extend(['app','spotify'])

# for better filtering, adding 'app' and 'spotify' to our stopwords 
print(stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

### Topic Modeling Positive Sentiment Reviews 

In [52]:
data = pd.read_csv('data/review-corpus.csv')

Unnamed: 0,review
0,"['great', 'music', 'service', 'audio', 'high',..."
1,"['please', 'ignore', 'previous', 'negative', '..."
2,"['get', 'best', 'spotify', 'experience', 'andr..."
3,"['really', 'buggy', 'terrible', 'use', 'recent..."
4,"['dear', 'spotify', 'get', 'song', 'put', 'pla..."
...,...
61589,"['even', 'communicate', 'lyric', 'feature', 'a..."
61590,"['use', 'sooo', 'good', 'back', 'download', 'f..."
61591,"['app', 'good', 'take', 'device', 'start', 'co..."
61592,"['app', 'good', 'hard', 'navigate', 'let', 'pl..."


In [111]:
token_str = positive_reviews.tokens.values.tolist()
token_str

['great music service audio high quality app easy use also quick friendly support',
 'please ignore previous negative rating app super great give',
 'get best spotify experience android annoy please let get rid',
 'dear spotify get song put playlist shuffle play',
 'love selection lyric provide song listening',
 'still extremely slow change storage external sd card convince do purpose spotify know issue do nothing solve time change sd card faster read write speed samsung brand please add song never appear search playlist',
 'great app best music app ever use problem play song find songs app wonderful recommend best',
 'delete app follow reason app fail business model stream service consumer want pay music fully ad successively log single song much close app ad number patient way profit already peak left decline',
 'amazon premium music family package good everyone listen liked respective alexas room ask play problem spotify premium family alexa integration poor spotify stop play time p

In [129]:
abc = []
for i in token_str:
    token_list = i.split(" ")
    for i in token_list: 
        abc.append(i)
abc[0:20]

['great',
 'music',
 'service',
 'audio',
 'high',
 'quality',
 'app',
 'easy',
 'use',
 'also',
 'quick',
 'friendly',
 'support',
 'please',
 'ignore',
 'previous',
 'negative',
 'rating',
 'app',
 'super']

In [151]:
b = [r.split(",") for r in abc[0:20]]

In [156]:
test_id2word = Dictionary(b)
test_corpus = [test_id2word.doc2bow(token) for token in b]

In [159]:
[[(test_id2word[i], freq) for i,freq in doc] for doc in corpus[:1]]

[[('great', 1),
  ('music', 1),
  ('service', 1),
  ('audio', 1),
  ('high', 1),
  ('quality', 1),
  ('app', 1),
  ('easy', 1),
  ('use', 1),
  ('also', 1),
  ('quick', 1),
  ('friendly', 1),
  ('support', 1)]]

In [161]:
from gensim.models import LdaModel

In [200]:
lda_model = LdaModel(corpus=test_corpus,
                    id2word=test_id2word,
                    num_topics=4,
                    random_state=42,
                    chunksize=100,
                    alpha='auto',
                    per_word_topics=True)
vis = pyLDAvis.gensim_models.prepare(lda_model,test_corpus,test_id2word)
vis

See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])


In [149]:
#reviews = [i.split(" ") for i in token_str]
#reviews
for i in abc[0:5]:
    reviews = [r.split(",") for r in i]

In [135]:
abc2 = [i.split(',') for i in abc]

In [199]:
id2word = Dictionary(b)
corpus = [id2word.doc2bow(review) for review in reviews]

In [57]:
[[(id2word[i],freq) for i,freq in review] for review in corpus[:1]]

[[('great music service audio high quality app easy use also quick friendly support',
   1)]]

## N-Grams

In [184]:
bigrams_phrases = gensim.models.Phrases(abc,min_count=3,threshold=20)
bigram = gensim.models.phrases.Phraser(bigrams_phrases)

trigram_phrases = gensim.models.Phrases(abc,min_count=3,threshold=20)
trigam = gensim.models.phrases.Phraser(trigram_phrases)

In [187]:
def make_bigrams(texts):
    return(bigram[review] for review in texts)

def make_trigrams(texts):
    return(trigam[bigram[review]] for review in texts)

In [188]:
data_bigrams = make_bigrams(token_str)
data_bigrams_trigrams = make_trigrams(data_bigrams)

In [191]:
print(data_bigrams_trigrams.)

<generator object make_trigrams.<locals>.<genexpr> at 0x000002DEACFDEC00>
