In [1]:
import pandas as pd

In [2]:
# Directory holding the final, feature-engineered data sets
path = "../data/final/"
# Chart-level features (sentiment, readability, counts) keyed by lyricId
DATA = pd.read_csv(f"{path}chart.csv")

In [3]:
# Preview the first five rows (head() defaults to n=5)
DATA.head()

Unnamed: 0,sentiment_comp,sentiment_pos,sentiment_neg,sentiment_neu,f_k_grade,flesch_index,fog_index,difficult_words,num_syllables,num_words,num_lines,num_dup,artist,song,lyricId
0,0.3568,0.164,0.16,0.676,1.7,97.09,3.75,40,541,469,58,0,Marty Robbins,El Paso,1
1,0.9991,0.532,0.031,0.437,-0.4,107.79,2.62,3,175,154,26,4,Frankie Avalon,Why,2
2,0.9982,0.322,0.022,0.657,2.2,90.46,2.52,5,275,220,35,9,Johnny Preston,Running Bear,4
3,0.9924,0.28,0.119,0.601,1.2,103.83,3.92,3,243,216,22,9,Guy Mitchell,Heartaches By The Number,5
4,-0.9698,0.03,0.228,0.742,0.4,105.86,3.52,1,110,101,13,3,Paul Anka,It's Time To Cry,6


In [4]:
# Only the raw lyric text is needed from here on: read one text file per
# lyricId listed in the chart data and collect the results in a DataFrame.
lyIdList = DATA["lyricId"]
lyric_path = "../data/interim/lyrics/lyric_"
text_data = []
for idx in lyIdList:
    # Pass the encoding explicitly so decoding does not depend on the
    # platform's default locale.
    # NOTE(review): assumes the lyric files are UTF-8 encoded -- confirm.
    with open(f"{lyric_path}{idx}.txt", "r", encoding="utf-8") as lyricsFile:
        text_data.append({
            "lyricId": idx,
            "text": lyricsFile.read(),
        })
LYRICS = pd.DataFrame(text_data)

In [5]:
# Preview the first three loaded lyrics
LYRICS.head(n=3)

Unnamed: 0,lyricId,text
0,1,Out in the West Texas town of El Paso\nI fell ...
1,2,I'll never let you go\nWhy? Because I love you...
2,4,"On the bank of the river\nStood Running Bear, ..."


In [6]:
# Load the regular expression library
import re

# Characters stripped from the lyrics: comma, period, exclamation and
# question mark. A raw string is used so no backslash escape is needed
# (the previous '\.' inside a normal string is an invalid escape sequence).
PUNCTUATION_RE = re.compile(r'[,.!?]')

# Remove punctuation, then convert the lyrics to lowercase
LYRICS['text_processed'] = (
    LYRICS['text']
    .str.replace(PUNCTUATION_RE, '', regex=True)
    .str.lower()
)

# Show the first rows of the processed lyrics
LYRICS['text_processed'].head()

0    out in the west texas town of el paso\ni fell ...
1    i'll never let you go\nwhy because i love you\...
2    on the bank of the river\nstood running bear y...
3    heartaches by the number troubles by the score...
4    when somebody leaves you that's the time to cr...
Name: text_processed, dtype: object

In [7]:
import gensim
from gensim.utils import simple_preprocess

def sent_to_words(sentences):
    """Lazily tokenise every item of ``sentences``.

    Each item is converted with ``str()`` and passed to gensim's
    ``simple_preprocess`` with ``deacc=True`` (which strips accent marks
    from the tokens). One token list is yielded per input item.
    """
    tokenise = gensim.utils.simple_preprocess
    yield from (tokenise(str(raw), deacc=True) for raw in sentences)

# One cleaned lyric string per song
data = LYRICS["text_processed"].tolist()
# One list of tokens per song
data_words = [tokens for tokens in sent_to_words(data)]

# Peek at the first 30 tokens of the first song
print(data_words[0][:30])

['out', 'in', 'the', 'west', 'texas', 'town', 'of', 'el', 'paso', 'fell', 'in', 'love', 'with', 'mexican', 'girl', 'nighttime', 'would', 'find', 'me', 'in', 'rosa', 'cantina', 'music', 'would', 'play', 'and', 'feleena', 'would', 'whirl', 'blacker']


In [8]:
# Phrase detection: learn frequent two-word collocations first, then run the
# detector again on the bigram-merged stream to pick up three-word phrases.
# A higher threshold yields fewer phrases.
bigram = gensim.models.Phrases(sentences=data_words, min_count=5, threshold=100)
trigram = gensim.models.Phrases(sentences=bigram[data_words], threshold=100)

# Phraser wrappers around the trained models: a faster way to get a
# sentence clubbed as a bigram/trigram.
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [9]:
# NLTK stop words
import nltk
from nltk.corpus import stopwords

# Make sure the stop-word corpus is available locally
nltk.download('stopwords')

# English stop words plus two extra tokens
# (presumably song-structure labels that appear in the lyric files -- confirm)
stop_words = stopwords.words('english') + ['chorus', 'verse']

[nltk_data] Downloading package stopwords to /home/aps/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
# Download spaCy's small English model; it is loaded later in this notebook
# with spacy.load("en_core_web_sm").
# NOTE(review): `!python` is resolved by the shell and may not be the same
# interpreter as the notebook kernel -- confirm the model lands in the
# kernel's environment.
!python -m spacy download en_core_web_sm

Collecting en_core_web_sm==2.3.1
  Using cached en_core_web_sm-2.3.1-py3-none-any.whl
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


In [11]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    """Drop stop words from every document in ``texts``.

    Each document is converted with ``str()`` and re-tokenised with
    ``simple_preprocess``; tokens found in the module-level ``stop_words``
    collection are removed.

    Args:
        texts: iterable of documents (token lists or strings).

    Returns:
        A list with one list of remaining tokens per input document.
    """
    # Build the lookup set once per call: membership tests against the
    # original list are linear in the number of stop words for every token.
    stopword_set = set(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in stopword_set]
        for doc in texts
    ]

def make_bigrams(texts):
    """Run every document in ``texts`` through ``bigram_mod``.

    Returns a list with one transformed document per input document.
    """
    merged = []
    for tokens in texts:
        merged.append(bigram_mod[tokens])
    return merged

def make_trigrams(texts):
    """Run every document through ``bigram_mod`` and then ``trigram_mod``.

    Returns a list with one transformed document per input document.
    """
    merged = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        merged.append(trigram_mod[with_bigrams])
    return merged

def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatise documents, keeping only tokens with an allowed POS tag.

    Every token list is joined with single spaces and analysed by the
    module-level spaCy pipeline ``nlp`` (which must be loaded before this
    function is called). For the tag names see
    https://spacy.io/api/annotation

    Args:
        texts: iterable of token lists, one per document.
        allowed_postags: coarse POS tags (``token.pos_`` values) to keep.
            The default is an immutable tuple rather than a shared list.

    Returns:
        A list with, per document, the lemmas of the tokens whose
        ``pos_`` is in ``allowed_postags``.
    """
    keep = frozenset(allowed_postags)
    joined = (" ".join(sent) for sent in texts)
    # nlp.pipe processes the documents as a stream in batches instead of
    # making one nlp() call per document.
    return [
        [token.lemma_ for token in doc if token.pos_ in keep]
        for doc in nlp.pipe(joined)
    ]

In [12]:
import spacy

# Load the small English spaCy pipeline with the parser and NER components
# disabled (for efficiency); lemmatization() uses it.
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])

# Stop-word-free token lists
data_words_nostops = remove_stopwords(data_words)

# Token lists passed through the bigram model
data_words_bigrams = make_bigrams(data_words_nostops)

# Token lists passed through the trigram helper
data_words_trigrams = make_trigrams(data_words_nostops)

# Lemmatise, keeping only nouns, adjectives, verbs and adverbs
data_lemmatized = lemmatization(
    data_words_trigrams,
    allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'],
)

# Peek at the first 10 lemmas of the first song
print(data_lemmatized[0][:10])

['fall', 'love', 'mexican', 'girl', 'nighttime', 'would', 'find', 'music', 'would', 'play']


In [13]:

import gensim.corpora as corpora

# Dictionary built from the lemmatised songs
id2word = corpora.Dictionary(data_lemmatized)

# The documents to vectorise are the lemmatised songs themselves
texts = data_lemmatized

# Bag-of-words representation of every document
corpus = list(map(id2word.doc2bow, texts))

# Peek at the first 30 bag-of-words entries of the first song
print(corpus[0][:30])

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 3), (6, 1), (7, 1), (8, 2), (9, 1), (10, 1), (11, 2), (12, 2), (13, 1), (14, 1), (15, 1), (16, 1), (17, 3), (18, 2), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 3), (27, 1), (28, 1), (29, 2)]


In [42]:
# Baseline LDA model: 10 topics with a fixed random seed; alpha and eta are
# not passed, so the library defaults apply.
baseline_lda_kwargs = dict(
    num_topics=10,
    random_state=100,
    chunksize=100,
    passes=10,
    per_word_topics=True,
)
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       **baseline_lda_kwargs)

In [43]:
from pprint import pprint
# Apply the trained model to the whole corpus
doc_lda = lda_model[corpus]
# Print the keywords (with their weights) of the model's topics
pprint(lda_model.print_topics())

[(0,
  '0.149*"take" + 0.069*"light" + 0.067*"turn" + 0.064*"high" + 0.054*"world" '
  '+ 0.048*"fire" + 0.033*"burn" + 0.033*"fly" + 0.029*"touch" + 0.016*"hand"'),
 (1,
  '0.046*"go" + 0.035*"night" + 0.028*"day" + 0.026*"away" + 0.019*"life" + '
  '0.019*"time" + 0.017*"dream" + 0.015*"fall" + 0.014*"still" + '
  '0.013*"never"'),
 (2,
  '0.107*"go" + 0.105*"baby" + 0.104*"get" + 0.057*"want" + 0.045*"know" + '
  '0.021*"let" + 0.020*"keep" + 0.020*"need" + 0.019*"girl" + 0.017*"right"'),
 (3,
  '0.065*"say" + 0.032*"know" + 0.032*"tell" + 0.028*"look" + 0.026*"see" + '
  '0.026*"think" + 0.020*"could" + 0.018*"call" + 0.016*"way" + 0.015*"walk"'),
 (4,
  '0.173*"let" + 0.161*"come" + 0.077*"back" + 0.049*"feel" + 0.034*"song" + '
  '0.026*"bring" + 0.026*"sing" + 0.025*"rain" + 0.019*"music" + 0.019*"play"'),
 (5,
  '0.051*"get" + 0.025*"bitch" + 0.015*"know" + 0.014*"make" + 0.012*"go" + '
  '0.012*"money" + 0.011*"fuck" + 0.010*"shit" + 0.010*"niggas" + 0.010*"put"'),
 (6,
  '0.1

In [44]:
# Baseline coherence score (c_v) of the LDA model trained above.
from gensim.models import CoherenceModel

coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=data_lemmatized,
    dictionary=id2word,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)



Coherence Score:  0.35953852426556737


In [53]:
# Tuning helper for the number of topics, alpha and beta
def compute_coherence_values(corpus, dictionary, k, a, b, texts=None):
    """Train one LDA model and return its c_v coherence score.

    Args:
        corpus: bag-of-words corpus to train on.
        dictionary: id-to-word mapping; used both to train the model and
            to score it.
        k: number of topics.
        a: value passed to LdaMulticore as ``alpha``.
        b: value passed to LdaMulticore as ``eta``.
        texts: tokenised documents used for the coherence computation.
            Defaults to the module-level ``data_lemmatized``.

    Returns:
        The value of ``CoherenceModel.get_coherence()`` for the trained model.
    """
    if texts is None:
        texts = data_lemmatized

    candidate_model = gensim.models.LdaMulticore(corpus=corpus,
                                                 id2word=dictionary,
                                                 num_topics=k,
                                                 random_state=100,
                                                 chunksize=100,
                                                 passes=10,
                                                 alpha=a,
                                                 eta=b)

    # Score with the dictionary that was passed in, not the global id2word,
    # so the function really evaluates the model it was asked to build.
    scorer = CoherenceModel(model=candidate_model,
                            texts=texts,
                            dictionary=dictionary,
                            coherence='c_v')

    return scorer.get_coherence()

In [54]:
import itertools

import numpy as np
import tqdm

grid = {}
grid['Validation_Set'] = {}

# Topics range
min_topics = 3
max_topics = 30
step_size = 3
topics_range = range(min_topics, max_topics, step_size)

# Alpha parameter: a few numeric values plus gensim's named priors
alpha = list(np.arange(0.01, 1, 0.3))
alpha.append('symmetric')
alpha.append('asymmetric')

# Beta parameter (passed to LDA as eta)
beta = list(np.arange(0.01, 1, 0.3))
beta.append('symmetric')

# Validation sets: the first 75% of the corpus and the full corpus.
# ClippedCorpus needs an integer document count, hence the int().
num_of_docs = len(corpus)
corpus_sets = [gensim.utils.ClippedCorpus(corpus, int(num_of_docs*0.75)),
               corpus]
corpus_title = ['75% Corpus', '100% Corpus']

RESULT_COLUMNS = ['Validation_Set', 'Topics', 'Alpha', 'Beta', 'Coherence']
RESULTS_PATH = '../data/interim/lda_tuning_results.csv'
# Set to False to skip the search
RUN_GRID_SEARCH = True

model_results = {column: [] for column in RESULT_COLUMNS}

# Can take a long time to run
if RUN_GRID_SEARCH:
    # Same iteration order as nesting the loops corpus -> topics -> alpha -> beta
    search_space = list(itertools.product(range(len(corpus_sets)),
                                          topics_range, alpha, beta))
    result_rows = []
    try:
        # The total is derived from the grid so the bar stays correct when
        # the ranges change; the context manager closes the bar on errors too.
        with tqdm.tqdm(total=len(search_space)) as pbar:
            for i, k, a, b in search_space:
                # get the coherence score for the given parameters
                cv = compute_coherence_values(corpus=corpus_sets[i], dictionary=id2word,
                                              k=k, a=a, b=b)
                # Save the model results (one complete row at a time)
                result_rows.append({'Validation_Set': corpus_title[i],
                                    'Topics': k,
                                    'Alpha': a,
                                    'Beta': b,
                                    'Coherence': cv})
                pbar.update(1)
    finally:
        # Persist whatever has been computed so far, so an interrupted or
        # failed run does not throw away the finished combinations. Nothing
        # is written when no combination finished, to avoid clobbering an
        # earlier results file with an empty one.
        if result_rows:
            results_frame = pd.DataFrame(result_rows, columns=RESULT_COLUMNS)
            model_results = results_frame.to_dict(orient='list')
            results_frame.to_csv(RESULTS_PATH, index=False)





  0%|          | 0/540 [11:34<?, ?it/s]
  0%|          | 0/540 [01:39<?, ?it/s]
  0%|          | 0/540 [13:35<?, ?it/s]
  0%|          | 0/540 [00:18<?, ?it/s]
  0%|          | 0/540 [13:36<?, ?it/s]
  0%|          | 0/540 [00:18<?, ?it/s]
  0%|          | 0/540 [13:36<?, ?it/s]
  0%|          | 0/540 [00:18<?, ?it/s]




  0%|          | 0/540 [16:01<?, ?it/s] 145.51s/it][A[A[A[A
  0%|          | 0/540 [02:43<?, ?it/s]
  0%|          | 0/540 [16:01<?, ?it/s]
  0%|          | 0/540 [02:44<?, ?it/s]
  0%|          | 0/540 [16:02<?, ?it/s]
  0%|          | 0/540 [02:44<?, ?it/s]




  0%|          | 0/540 [18:40<?, ?it/s] 149.79s/it][A[A[A[A
  0%|          | 0/540 [05:23<?, ?it/s]
  0%|          | 0/540 [18:40<?, ?it/s]
  0%|          | 0/540 [05:23<?, ?it/s]
  0%|          | 0/540 [18:41<?, ?it/s]
  0%|          | 0/540 [05:23<?, ?it/s]
  0%|          | 0/540 [21:22<?, ?it/s]
  0%|          | 0/540 [08:05<?, ?it/s]




  1%|          | 3/540 [07:47<22:54:04, 153.53s/it][A[

 41%|████      | 222/540 [9:49:00<18:37:54, 210.93s/it][A[A[A[A



 41%|████▏     | 223/540 [9:51:48<17:26:01, 197.99s/it][A[A[A[A



 41%|████▏     | 224/540 [9:54:16<16:04:16, 183.09s/it][A[A[A[A



 42%|████▏     | 225/540 [9:58:05<17:13:26, 196.84s/it][A[A[A[A



 42%|████▏     | 226/540 [10:02:01<18:11:17, 208.53s/it][A[A[A[A



 42%|████▏     | 227/540 [10:05:40<18:23:55, 211.62s/it][A[A[A[A



 42%|████▏     | 228/540 [10:08:24<17:06:07, 197.33s/it][A[A[A[A



 42%|████▏     | 229/540 [10:10:49<15:41:45, 181.69s/it][A[A[A[A



 43%|████▎     | 230/540 [10:14:40<16:54:34, 196.37s/it][A[A[A[A



 43%|████▎     | 231/540 [10:18:38<17:55:22, 208.81s/it][A[A[A[A



 43%|████▎     | 232/540 [10:21:54<17:33:04, 205.14s/it][A[A[A[A



 43%|████▎     | 233/540 [10:24:35<16:21:14, 191.77s/it][A[A[A[A



 43%|████▎     | 234/540 [10:27:04<15:12:17, 178.88s/it][A[A[A[A



 44%|████▎     | 235/540 [10:30:58<16:34:32, 195.65s/it][A[A[A[A

 83%|████████▎ | 450/540 [22:51:46<5:43:27, 228.97s/it][A[A[A[A



 84%|████████▎ | 451/540 [22:56:14<5:56:53, 240.60s/it][A[A[A[A



 84%|████████▎ | 452/540 [23:00:15<5:53:01, 240.70s/it][A[A[A[A



 84%|████████▍ | 453/540 [23:03:47<5:36:32, 232.10s/it][A[A[A[A



 84%|████████▍ | 454/540 [23:07:04<5:17:40, 221.64s/it][A[A[A[A



 84%|████████▍ | 455/540 [23:11:37<5:35:29, 236.82s/it][A[A[A[A



 84%|████████▍ | 456/540 [23:16:14<5:48:29, 248.92s/it][A[A[A[A



 85%|████████▍ | 457/540 [23:20:29<5:46:51, 250.74s/it][A[A[A[A



 85%|████████▍ | 458/540 [23:24:10<5:30:30, 241.83s/it][A[A[A[A



 85%|████████▌ | 459/540 [23:27:36<5:12:05, 231.18s/it][A[A[A[A



 85%|████████▌ | 460/540 [23:32:04<5:23:08, 242.35s/it][A[A[A[A



 85%|████████▌ | 461/540 [23:36:21<5:24:42, 246.61s/it][A[A[A[A



 86%|████████▌ | 462/540 [23:40:32<5:22:09, 247.82s/it][A[A[A[A



 86%|████████▌ | 463/540 [23:44:17<5:09:15, 240.98s/it][A[A[A[A



 86%|█

In [14]:
# Final model; parameter values taken from "2.1-hyper parameter search".
tuned_lda_kwargs = dict(
    num_topics=15,
    random_state=100,
    chunksize=100,
    passes=10,
    alpha=0.01,
    eta=0.9,
)
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       **tuned_lda_kwargs)

In [15]:
from gensim.models import CoherenceModel
# Coherence score (c_v) of the current lda_model
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=data_lemmatized,
    dictionary=id2word,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.420057388514231


In [18]:
import pyLDAvis.gensim_models as gensimvis
import pickle 
import pyLDAvis
# Interactive topic visualisation, rendered inline in the notebook
pyLDAvis.enable_notebook()
LDAvis_prepared = gensimvis.prepare(
    topic_model=lda_model,
    corpus=corpus,
    dictionary=id2word,
)
LDAvis_prepared

  and should_run_async(code)
