## LDA

Exploring song lyrics with LDA

In [1]:
import pandas as pd
from pymongo import  MongoClient

### connect to mongodb

In [2]:
# connect to mongodb
client = MongoClient('localhost', 27017)
db = client.lyrics
coll = db.yearly

### load data into pandas

In [8]:
# load data from mongodb into pandas
data = coll.find()
song_lyrics = pd.DataFrame(list(data))
song_lyrics.head()

Unnamed: 0,_id,artist,lyrics,title,track_id,year
0,5827dca77aa2eb0ad91b8fdc,Bukka White,I was over in Aberdeen\nOn my way to New Orlea...,Aberdeen Mississippi Blues,TRHRKYP128F4280BB1,1940
1,5827dca77aa2eb0ad91b8fdd,Bukka White,When a man gets trouble in his mind\nHe wanna ...,Sleepy Man Blues,TRCAHZD128F4280BC1,1940
2,5827dca77aa2eb0ad91b8fde,Bessie Smith,Woke up this mornin' when chickens was crowin'...,Young Woman's Blues,TRJBDVE128F9306FDB,1940
3,5827dca77aa2eb0ad91b8fdf,Bukka White,I'm taken down with the fever and it won't let...,High Fever Blues,TRRRGCS128F4280BB6,1940
4,5827dca77aa2eb0ad91b8fe2,Bukka White,"Hey-eee, come on you women\nLet's a do the the...",Bukka's Jitterbug Swing,TRXZHEC128F4280BB2,1940


In [11]:
# count records
song_lyrics._id.count()

12664

### inspect & explore

In [61]:
first_song30 = song_lyrics.ix[30]['lyrics']

In [30]:
print(first_song30)

I need a taste
Of what you keep hidden
Got to give it up
For the working man
The sweetest fruit
Is the fruit forbidden
Something in your smile
Says you understand
You can pump me up
You can let me down
You can sidestep
But you know what i need

I need a taste of things to come
I need a taste of things to come
(make me happy)
I need a taste on the tip of my tongue
I need a taste of things to come

I need a taste
Of what i've been missing
A little bit of love
Just to get me through
I'm on my knees
But you keep me wishing
There's a part of me
That needs a part of you
You can pump me up
You can let me down
You can sidestep
But you know what i need..

I need a taste of things to come
I need a taste of things to come
(make me happy)
I need a taste on the tip of my tongue
I need a taste of things to come

You've been holding out
I've been holding on
'cause i've got you in my sights
I don't want to let go
But i can't wait another day
It's got to be tonight girl

You know what i need
I need a t

### nlp pipeline - Steps

In [38]:
#1. split into sentences
#2. tokenize
#3. remove punctuation and stopwords
#4. lowercase
#5. stemming (lemmas)
#6. vectorize (tf-idf, count vectors)

# we can use spacy (only supports English & German currently), it provides the following fundamentals
#1.text normalization 
#2.sentence detection 
#3.tokenization
#4.part-of-speech tagging
#5.named-entity recognition 
#6.word vectors
import spacy
nlp_pipeline = spacy.load('en')

In [46]:
%%time
parsed_song = nlp_pipeline(first_song30)

CPU times: user 14.8 ms, sys: 0 ns, total: 14.8 ms
Wall time: 13.1 ms


In [47]:
parsed_song

I need a taste
Of what you keep hidden
Got to give it up
For the working man
The sweetest fruit
Is the fruit forbidden
Something in your smile
Says you understand
You can pump me up
You can let me down
You can sidestep
But you know what i need

I need a taste of things to come
I need a taste of things to come
(make me happy)
I need a taste on the tip of my tongue
I need a taste of things to come

I need a taste
Of what i've been missing
A little bit of love
Just to get me through
I'm on my knees
But you keep me wishing
There's a part of me
That needs a part of you
You can pump me up
You can let me down
You can sidestep
But you know what i need..

I need a taste of things to come
I need a taste of things to come
(make me happy)
I need a taste on the tip of my tongue
I need a taste of things to come

You've been holding out
I've been holding on
'cause i've got you in my sights
I don't want to let go
But i can't wait another day
It's got to be tonight girl

You know what i need
I need a t

In [49]:
for num, sentence in enumerate(parsed_song.sents):
    print('Sentence {}:'.format(num + 1))
    print(sentence)
    print('')

Sentence 1:
I need a taste
Of what you keep hidden
Got to give it up


Sentence 2:
For the working man
The sweetest fruit
Is the fruit forbidden
Something in your smile
Says you understand
You can pump me up
You can let me down
You can sidestep


Sentence 3:
But you know what i need

I need a taste of things to come
I need a taste of things to come
(make me happy)
I need a taste on the tip of my tongue
I need a taste of things to come

I need a taste
Of what i've been missing
A little bit of love
Just to get me through
I'm on my knees


Sentence 4:
But you keep me wishing


Sentence 5:
There's a part of me
That needs a part of you
You can pump me up
You can let me down
You can sidestep


Sentence 6:
But you know what i need..

I need a taste of things to come
I need a taste of things to come
(make me happy)


Sentence 7:
I need a taste on the tip of my tongue
I need a taste of things to come

You've been holding out
I've been holding on
'cause i've got you in my sights
I don't want to 

__Note__: Upon inspection, Spacy gets the sentence segmentation completely wrong. We must do the segmentation ourselves.

### build lda model

In [195]:
#7. lda
from gensim.models import Phrases
from gensim.models.word2vec import LineSentence
import nltk
from nltk import sent_tokenize
from nltk.tokenize import LineTokenizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation


### parse sentence

In [196]:
sentences = sent_tokenize(first_song30)
sentences[0]

"I need a taste\nOf what you keep hidden\nGot to give it up\nFor the working man\nThe sweetest fruit\nIs the fruit forbidden\nSomething in your smile\nSays you understand\nYou can pump me up\nYou can let me down\nYou can sidestep\nBut you know what i need\n\nI need a taste of things to come\nI need a taste of things to come\n(make me happy)\nI need a taste on the tip of my tongue\nI need a taste of things to come\n\nI need a taste\nOf what i've been missing\nA little bit of love\nJust to get me through\nI'm on my knees\nBut you keep me wishing\nThere's a part of me\nThat needs a part of you\nYou can pump me up\nYou can let me down\nYou can sidestep\nBut you know what i need.."

In [197]:
sentences = LineTokenizer(blanklines='discard').tokenize(first_song30)
sentences

['I need a taste',
 'Of what you keep hidden',
 'Got to give it up',
 'For the working man',
 'The sweetest fruit',
 'Is the fruit forbidden',
 'Something in your smile',
 'Says you understand',
 'You can pump me up',
 'You can let me down',
 'You can sidestep',
 'But you know what i need',
 'I need a taste of things to come',
 'I need a taste of things to come',
 '(make me happy)',
 'I need a taste on the tip of my tongue',
 'I need a taste of things to come',
 'I need a taste',
 "Of what i've been missing",
 'A little bit of love',
 'Just to get me through',
 "I'm on my knees",
 'But you keep me wishing',
 "There's a part of me",
 'That needs a part of you',
 'You can pump me up',
 'You can let me down',
 'You can sidestep',
 'But you know what i need..',
 'I need a taste of things to come',
 'I need a taste of things to come',
 '(make me happy)',
 'I need a taste on the tip of my tongue',
 'I need a taste of things to come',
 "You've been holding out",
 "I've been holding on",
 "'ca

In [198]:
list(punctuation)

['!',
 '"',
 '#',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '/',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 '@',
 '[',
 '\\',
 ']',
 '^',
 '_',
 '`',
 '{',
 '|',
 '}',
 '~']

In [199]:
# stop words and punctuation
stopwords_punctuation = set(stopwords.words('english') + list(punctuation) + ['i', 'be', 'to', 'if', 'the', 'on', 'a', 'it', 'of', 'in', 'and'] )

In [200]:
# process each song
def lemmatized_corpus(song):
    lemmatized_song = ""
    parsed_song = nlp_pipeline(song)
    for sent in parsed_song.sents:
        lemmatized_song += " ".join([token.lemma_ for token in sent if token not in stopwords_punctuation])
    return lemmatized_song

def processed_df(song_lyrics_df):
    all_songs = ""
    for index, row in song_lyrics_df.iterrows():
        song_lemmas = lemmatized_corpus(row['lyrics'])
        all_songs += song_lemmas
        
    processed_df = pd.DataFrame([all_songs], columns=['title'])
    return processed_df
        

In [201]:
new_df = processed_df(song_lyrics)

In [202]:
pd.set_option('display.max_colwidth', 1000)
new_df[['title']]

Unnamed: 0,title
0,"i be over in aberdeen \non my way to new orlean \n i be over in aberdeen \n on my way to new orlean \n them aberdeen woman tell me \n will buy my gasoline \n\n hey , two little woman \n that i be not ever see \nthey have two little woman \n that i be not never see \nthese two little woman \n just from new orlean \n\n ooh , sittin ' down in aberdeen \n with new orlean on my mind \n i be sittin ' down in aberdeen \n with new orlean on my mind \nwell , i believe them aberdeen woman \n gonna make me lose my mind , yeah \n\n aber - deen be my home \n but the men do not want me around \n aberdeen be my home \n but the man do not want me around \n they know i will take these woman \n an take them outta town \n\n listen , you aberdeen woman \n you know i be not get no dime \noh - oh listenyou woman \n you know'd i be not get no dime \n they be have the po ' boy \n all up and downwhen a man get trouble in his mind \n he wanna sleep all the time \n when a man get trouble in mind \n he wanna ..."


In [203]:
# save to a file
new_df.to_csv(r'processed_song_lyrics.txt', header=None, index=None, sep=' ', mode='a')

In [204]:
from gensim.corpora import Dictionary, MmCorpus
from gensim.models.ldamulticore import LdaMulticore

import pickle
import pyLDAvis
import pyLDAvis.gensim
import warnings

In [205]:
!pwd

/home/ubuntu/SongLyricsProject/notebooks


In [206]:
#learn the dictionary
processed_lyrics = LineSentence('/home/ubuntu/SongLyricsProject/notebooks/processed_song_lyrics.txt')
processed_ly = Dictionary(processed_lyrics)

In [207]:
processed_ly.filter_extremes(no_below=10, no_above=0.4)
processed_ly.compactify()
processed_ly.save('processed_songs_dictionary.dict')

In [208]:
def bow_generator(filepath):
    """
    generator function to read reviews from a file
    and yield a bag-of-words representation
    """
    
    for song_lyrics in LineSentence(filepath):
        yield processed_ly.doc2bow(song_lyrics)



In [209]:
MmCorpus.serialize('song_list_bow.mm',
                       bow_generator('/home/ubuntu/SongLyricsProject/notebooks/processed_song_lyrics.txt'))

In [210]:
# load the finished bag-of-words corpus from disk
bow_corpus = MmCorpus('/home/ubuntu/SongLyricsProject/notebooks/song_list_bow.mm')

In [211]:
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    lda = LdaMulticore(bow_corpus, num_topics=50, id2word=processed_ly, workers=3)
    lda.save('songs_lda_model')

In [212]:
def explore_topic(topic_number, topn=25):
        
    print(u'{:20} {}'.format(u'term', u'frequency') + u'\n')

    for term, frequency in lda.show_topic(topic_number, topn=25):
        print(u'{:20} {:.3f}'.format(term, round(frequency, 3)))

In [213]:
explore_topic(topic_number=19)

term                 frequency

the                  0.087
thing                0.078
only                 0.065
die                  0.047
light                0.043
mine                 0.038
be                   0.035
to                   0.029
of                   0.029
and                  0.026
these                0.025
god                  0.024
seem                 0.019
that                 0.018
,                    0.015
you                  0.014
write                0.014
in                   0.014
all                  0.013
a                    0.012
lead                 0.011
i                    0.011
for                  0.010
pray                 0.010
.                    0.010


In [214]:
lda = LdaMulticore.load('/home/ubuntu/SongLyricsProject/notebooks/songs_lda_model')

In [215]:

LDAvis_prepared = pyLDAvis.gensim.prepare(lda, bow_corpus,
                                              processed_ly)
with open('/home/ubuntu/SongLyricsProject/notebooks/lda_vis_prepared', 'wb') as f:
        pickle.dump(LDAvis_prepared, f)

In [216]:
with open('/home/ubuntu/SongLyricsProject/notebooks/lda_vis_prepared', 'rb') as f:
    LDAvis_prepared = pickle.load(f)

In [217]:
pyLDAvis.display(LDAvis_prepared)

In [79]:
# # "sentence" tokenization
# def song_lines(song):
#     """returns a line of lyric"""a
#     song_lines = LineTokenizer(blanklines='discard').tokenize(song)
#     return song_lines

# # tokenization
# def song_words(song_line):
#     """returns and array of tokens"""
#     return word_tokenize(song_line)
        


# # stop word & punctuation removal

# # stemming/lemmatization

# # pos tagging

# def parsed_sentence(txt):
#     """ use spacy to parse the songs, lemmatize the text and yield a song line or sentence"""
#     pass