In [62]:
import pandas as pd
from pymongo import  MongoClient

### setup connections

In [63]:
# Connect to the MongoDB server on localhost and select the `yearly`
# collection of the `lyrics` database.
client = MongoClient(host='localhost', port=27017)
db = client['lyrics']
coll = db['yearly']

### load data

In [64]:
# Load data from MongoDB into pandas: `find()` is called with no filter, the
# resulting cursor is drained into a list, and that list becomes a DataFrame.
data = coll.find()
song_lyrics = pd.DataFrame(list(data))
song_lyrics.head()  # preview the first rows

Unnamed: 0,_id,artist,lyrics,title,track_id,year
0,5827dca77aa2eb0ad91b8fdc,Bukka White,I was over in Aberdeen\nOn my way to New Orlea...,Aberdeen Mississippi Blues,TRHRKYP128F4280BB1,1940
1,5827dca77aa2eb0ad91b8fdd,Bukka White,When a man gets trouble in his mind\nHe wanna ...,Sleepy Man Blues,TRCAHZD128F4280BC1,1940
2,5827dca77aa2eb0ad91b8fde,Bessie Smith,Woke up this mornin' when chickens was crowin'...,Young Woman's Blues,TRJBDVE128F9306FDB,1940
3,5827dca77aa2eb0ad91b8fdf,Bukka White,I'm taken down with the fever and it won't let...,High Fever Blues,TRRRGCS128F4280BB6,1940
4,5827dca77aa2eb0ad91b8fe2,Bukka White,"Hey-eee, come on you women\nLet's a do the the...",Bukka's Jitterbug Swing,TRXZHEC128F4280BB2,1940


In [57]:
# The `lyrics` column as a pandas Series; this is the corpus passed to the
# vectorizer further down.
docs = song_lyrics['lyrics'] # a series

In [42]:
# Take a look at the first lyric. The Series is named `docs`; positional
# access via .iloc does not depend on the index labels.
print(docs.iloc[0])

I was over in Aberdeen
On my way to New Orlean
I was over in Aberdeen
On my way to New Orlean
Them Aberdeen women told me
Will buy my gasoline

Hey, two little women
That I ain't ever seen
They has two little women
That I ain't never seen
These two little women
Just from New Orlean

Ooh, sittin' down in Aberdeen
With New Orlean on my mind
I'm sittin' down in Aberdeen
With New Orlean on my mind
Well, I believe them Aberdeen women
Gonna make me lose my mind, yeah

Aber-deen is my home
But the mens don't want me around
Aberdeen is my home
But the men don't want me around
They know I will take these women
An take them outta town

Listen, you Aberdeen women
You know I ain't got no dime
Oh-oh listen you women
You know'd I ain't got no dime
They been had the po' boy
All up and down


In [72]:
# Pre-processing of song lyrics (intended result: song_lyrics_processed).
# Running the LDA algorithm without the usual pre-processing seems to work
# fine for now, but the following pipeline should be added in the future:
# 0. tokenization
# 1. remove stopwords and punctuation
# 2. lowercase
# 3. expansions/replacements
# 4. (stemming, phrase processing, POS tagging)
# These can all be done in one pass using spaCy.

# spacy processing
def spacy_processing(lyric):
    """Placeholder for a spaCy-based pre-processing step.

    Not implemented yet: the argument is ignored and ``None`` is returned.
    """
    return None

# nltk processing
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from string import punctuation
import re
# Contraction -> expansion table. Keys are matched as whole words against
# lower-cased text.
_CONTRACTIONS = {
    "don't": "do not",
    "doesn't": "does not",
    "didn't": "did not",
    "hasn't": "has not",
    "haven't": "have not",
    "hadn't": "had not",
    "won't": "will not",
    "wouldn't": "would not",
    "can't": "can not",
    "cannot": "can not",
}

# One precompiled alternation instead of ten separate re.sub passes. None of
# the replacement strings contains a contraction, so a single pass yields the
# same text as applying the patterns one after another.
_CONTRACTION_RE = re.compile(
    r"\b(?:" + "|".join(re.escape(key) for key in _CONTRACTIONS) + r")\b"
)

# Filled on first use so the NLTK corpus is read once, not once per lyric.
_STOP_WORD_CACHE = {}


def _english_stop_words():
    """Return the English stop words plus ASCII punctuation as a frozenset."""
    if 'english' not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE['english'] = (
            frozenset(stopwords.words('english')) | frozenset(punctuation)
        )
    return _STOP_WORD_CACHE['english']


def lyric_preprocessing(lyric, stem=1):
    """Lower-case, expand contractions, tokenize and drop stop words.

    Parameters
    ----------
    lyric : str
        Raw lyric text.
    stem : int, optional
        Currently unused; kept so existing callers that pass it keep working.
        No stemming is performed regardless of its value.

    Returns
    -------
    list of str
        Tokens produced by ``word_tokenize`` on the lower-cased, expanded
        text, excluding English stop words and single punctuation characters.
    """
    lowered = lyric.lower()
    expanded = _CONTRACTION_RE.sub(
        lambda match: _CONTRACTIONS[match.group(0)], lowered
    )
    stop_words = _english_stop_words()
    return [token for token in word_tokenize(expanded) if token not in stop_words]
    

In [73]:
# testing pre-processing on the first lyric (the Series is named `docs`)
lyric_preprocessing(docs.iloc[0])

AttributeError: 'dict' object has no attribute 'iteritems'

In [58]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from pyLDAvis import prepare
import pyLDAvis

In [59]:
def from_sklearn(docs, vect, lda, **kwargs):
    """Fit a vectorizer and an LDA model on `docs` and build pyLDAvis input.

    Parameters
    ----------
    docs : pandas Series or other iterable of str
        Raw documents, passed to ``vect.fit_transform``.
    vect : scikit-learn vectorizer (e.g. CountVectorizer)
        Turns the documents into a document-term matrix. Fitted by this call.
    lda : sklearn.decomposition.LatentDirichletAllocation
        Latent Dirichlet Allocation model. Fitted by this call on the
        document-term matrix.
    **kwargs
        Keyword arguments forwarded unchanged to ``pyLDAvis.prepare()``.

    Returns
    -------
    prepared : object returned by ``pyLDAvis.prepare``
    lda : the fitted LDA model (same object that was passed in)
    vect : the fitted vectorizer (same object that was passed in)
    """
    # numpy is imported locally because this file does not import it at the top.
    import numpy as np

    def _rows_to_distributions(matrix):
        # Scale every row so it sums to 1.
        matrix = np.asarray(matrix, dtype=float)
        return matrix / matrix.sum(axis=1, keepdims=True)

    vected = vect.fit_transform(docs)
    doc_topic_dists = _rows_to_distributions(lda.fit_transform(vected))

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back for older releases.
    if hasattr(vect, "get_feature_names_out"):
        vocab = list(vect.get_feature_names_out())
    else:
        vocab = list(vect.get_feature_names())

    prepared = prepare(
        # Document length is the number of counted tokens per document (row
        # sums of the document-term matrix), not the character length of the
        # raw text.
        doc_lengths=np.asarray(vected.sum(axis=1)).ravel(),
        vocab=vocab,
        term_frequency=np.asarray(vected.sum(axis=0)).ravel().tolist(),
        topic_term_dists=_rows_to_distributions(lda.components_),
        doc_topic_dists=doc_topic_dists,
        **kwargs)

    return prepared, lda, vect

#source: https://gist.github.com/napjon/ef842b41e0048d132607

In [60]:
# Alternative configuration kept for reference. Note that `n_topics` was
# renamed to `n_components` in newer scikit-learn releases:
# lda = LatentDirichletAllocation(n_components=n_topics, max_iter=5,
#                                 learning_method='online',
#                                 learning_offset=50.,
#                                 random_state=0)

vect = CountVectorizer()
# Library defaults, except that the seed is pinned: LDA fitting is stochastic,
# so without a fixed random_state the topics change on every re-run.
lda = LatentDirichletAllocation(random_state=0)

In [27]:
#pyLDAvis.enable_notebook()

In [61]:
# Fit the vectorizer and the LDA model on the lyrics and build the pyLDAvis
# input; the return order is (prepared data, lda model, vectorizer).
prepared, lda, vector = from_sklearn(docs, vect, lda)

In [49]:
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted terms of every topic in `model`.

    Parameters
    ----------
    model : object exposing ``components_``
        Each row of ``components_`` holds one topic's weights over the terms.
    feature_names : sequence of str
        Term labels, indexed by column position in ``components_``.
    n_top_words : int
        Number of terms to print per topic, strongest first.
    """
    for number, weights in enumerate(model.components_):
        print("Topic #%d:" % number)
        # argsort is ascending: reverse it and keep the leading entries.
        strongest = weights.argsort()[::-1][:n_top_words]
        labels = [feature_names[position] for position in strongest]
        print(" - ".join(labels))
        print('\n')
    print('\n\n')

In [50]:
# look at the top words
# get_feature_names() was removed in scikit-learn 1.2; use the replacement
# when it exists and fall back to the old name on older releases.
feature_names = (
    vector.get_feature_names_out()
    if hasattr(vector, "get_feature_names_out")
    else vector.get_feature_names()
)
print_top_words(lda, feature_names, 20)

Topic #0:
da - ha - ride - dum - hang - easy - bamba - lee - ghetto - celebrate - dance - yeh - dig - dat - weep - hitch - thy - movies - ye - dove


Topic #1:
christmas - ho - oo - bop - on - ja - bird - santa - jingle - san - boogety - shoo - hee - pata - mambo - merry - ooo - sloopy - sleigh - ei


Topic #2:
yeah - oh - ooh - ya - ah - get - it - whoa - uh - huh - up - man - boogie - yo - shit - beat - fuck - boy - hoo - funk


Topic #3:
hey - bad - party - new - gimme - woo - big - bout - bang - york - talkin - well - square - gal - day - texas - jones - oh - gas - hi


Topic #4:
na - doo - ba - que - da - eu - de - um - do - dee - não - ron - och - se - pa - meu - en - me - bom - og


Topic #5:
revolution - mercy - doctor - doll - doot - maria - fever - in - marie - bim - limbo - slippin - definition - lollipop - motor - wimoweh - lard - of - grace - lolli


Topic #6:
the - and - in - to - of - she - on - he - my - we - is - that - they - it - all - her - was - for - with - down



In [29]:
# Show the prepared topic-model data with pyLDAvis as this cell's output.
pyLDAvis.display(prepared)