In [1]:
import pandas as pd
from ast import literal_eval
from gensim import corpora, models

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation


### Topic modeling

Import the lyrics csv which have already been tokenized

In [34]:
# import the lyrics as dataframe
country_lyrics_df = pd.read_csv('Data\df_cty_lyrics.csv', converters={'words':literal_eval,'segments':literal_eval})
country_lyrics_df = country_lyrics_df.drop_duplicates('track')
country_lyrics_df.head()

rock_lyrics_df = pd.read_csv('Data\df_rock_lyrics.csv', converters={'words':literal_eval,'segments':literal_eval})
rock_lyrics_df = rock_lyrics_df.drop_duplicates('track')
rock_lyrics_df.head()

Unnamed: 0.1,Unnamed: 0,track,artist,lyrics,words,segments
0,0,Planet Zero,Shinedown,Planet Zero Lyrics[Verse 1]\r\nDown here on pl...,"[planet, zero, lyric, planet, zero, swing, gav...","[([Verse 1], Down here on planet zero They sw..."
1,1,Black Summer,Red Hot Chili Peppers,Black Summer Lyrics[Verse 1]\r\nA lazy rain am...,"[black, summer, lyric, lazy, rain, sky, refuse...","[([Verse 1], A lazy rain am I, the skies refu..."
2,2,Love Dies Young,Foo Fighters,Love Dies Young Lyrics[Verse 1]\r\nLove dies y...,"[love, dy, young, lyric, love, dy, young, resu...","[([Verse 1], Love dies young and there's no r..."
3,3,So Called Life,Three Days Grace,"So Called Life Lyrics[Verse 1]\r\nCan't laugh,...","[called, life, lyric, laugh, cry, live, die, a...","[([Verse 1], Can't laugh, can't cry, can't li..."
5,5,For The Glory (feat. Hollywood Undead),All Good Things,For the Glory Lyrics[Verse 1]\r\nBetter back d...,"[glory, lyric, better, back, domain, got, whol...","[([Verse 1], Better back down, you're in my d..."


In [35]:
# assign the tokenzied words to tokenized_lyrics
tokenized_lyrics = country_lyrics_df[['words']]
tokenized_lyrics

Unnamed: 0,words
0,"[buy, dirt, lyric, day, turned, 80, sitting, b..."
1,"[fancy, like, lyric, ayy, girl, bangin, low, m..."
2,"[sand, boot, lyric, asked, said, somewhere, ne..."
3,"[didnt, love, lyric, mind, bein, alone, keep, ..."
8,"[livin, dream, lyric, mama, pray, success, any..."
...,...
993,"[didnt, much, lyric, tonka, truck, gi, joes, j..."
994,"[makin, plan, lyric, ever, left, town, never, ..."
995,"[god, speed, album, version, lyric, one, go, w..."
996,"[mountain, lyric, got, spell, draggin, heart, ..."


#### Feature engineering of lyrics tokens into tf and tfidf vectors

In [90]:
# dummy function for converting list of tokens into objects
# so we don't have to rejoin the tokens into strings as inputs into the vectorizers
def dummy_func(lyric):
    return lyric

tfidf_vectorizer = TfidfVectorizer(tokenizer=dummy_func, max_df=0.95, min_df=2, lowercase=False)
tfidf = tfidf_vectorizer.fit_transform(list(tokenized_lyrics.words.values))

count_vectorizer = CountVectorizer(tokenizer=dummy_func, max_df=0.95, min_df=2, lowercase=False)
tf = count_vectorizer.fit_transform(list(tokenized_lyrics.words.values))

In [91]:
def display_topics(model, feature_names, n_top_words):
    '''
    Displays the top words for each topic.
    Parameters:
        model -> topic model from sklearn
        feature_names -> list of str
        n_top_words -> number of top words to show for each topic
    '''
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % (topic_idx))
        print(" ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))

In [92]:
# Initializing parameters

# no. of topics
n_topics = 5
# no. of top words to show for each topic
n_top_words = 10

#### LDA

In [93]:
lda = LatentDirichletAllocation(n_components=n_topics, random_state=42, max_iter=10).fit(tf)

In [94]:
display_topics(lda, count_vectorizer.get_feature_names(), n_top_words)

Topic 0:
know like yeah got love one get girl time way
Topic 1:
bloom one woman hand na man time would stephen new
Topic 2:
like oh love ooh gonna little wanna yeah time get
Topic 3:
let love like got night go back one long get
Topic 4:
nigga bitch yeah got like get fuck shit gon as




#### EMF

In [95]:
nmf = NMF(n_components=n_topics, random_state=1, 
            alpha=.1, 
            # l1_ratio=.5, 
            init='nndsvd').fit(tfidf)



In [96]:
display_topics(nmf, tfidf_vectorizer.get_feature_names(), n_top_words)

Topic 0:
love know never could say let one heart would want
Topic 1:
nigga bitch fuck got shit gon yeah get real ayy
Topic 2:
ooh oohooh oh ohoh know yeah yeahyeahyeah runnin care high
Topic 3:
yeah like got girl little good get back night country
Topic 4:
wanna baby make like take know feel kiss gonna stay




### Gensim models

In [75]:
# create a BoW dictionary
dictionary = corpora.Dictionary(tokenized_lyrics.words)
dictionary.filter_extremes(no_below=10, keep_n=100000)

In [76]:
def top_freq_words(dictionary, n_top_words):
    '''
    Return the top words based on freq in the BoW dictionary.
    Parameters:
        dictionary -> bag-of-word count of word frequency
        n_top_words -> n most frequent words in the dictionary
    '''
    for k, v in dictionary.iteritems():
        print(k, v)
        if k == n_top_words:
            break

In [77]:
top_freq_words(dictionary, n_top_words)

0 add
1 around
2 back
3 buy
4 call
5 caught
6 chasing
7 church
8 coffee
9 count
10 day


In [81]:
bow_corpus = [dictionary.doc2bow(lyric) for lyric in tokenized_lyrics.words]

In [82]:
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

In [83]:
lda_model = models.LdaMulticore(bow_corpus, num_topics=n_topics, id2word=dictionary, passes=2, workers=2)

In [84]:
for idx, topic in lda_model.print_topics(num_topics=-1, num_words=n_top_words):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.015*"one" + 0.011*"go" + 0.011*"time" + 0.011*"little" + 0.009*"let" + 0.008*"night" + 0.008*"back" + 0.008*"every" + 0.007*"right" + 0.007*"never"
Topic: 1 
Words: 0.014*"baby" + 0.012*"back" + 0.011*"need" + 0.010*"nigga" + 0.009*"time" + 0.009*"want" + 0.009*"go" + 0.008*"come" + 0.008*"one" + 0.007*"bitch"
Topic: 2 
Words: 0.013*"wanna" + 0.011*"baby" + 0.011*"gonna" + 0.010*"back" + 0.010*"night" + 0.010*"never" + 0.010*"way" + 0.009*"make" + 0.009*"long" + 0.008*"time"
Topic: 3 
Words: 0.016*"one" + 0.013*"oh" + 0.012*"let" + 0.011*"way" + 0.008*"time" + 0.008*"go" + 0.008*"girl" + 0.008*"gonna" + 0.008*"make" + 0.007*"say"
Topic: 4 
Words: 0.017*"girl" + 0.010*"say" + 0.010*"good" + 0.010*"time" + 0.010*"go" + 0.009*"right" + 0.009*"ooh" + 0.009*"one" + 0.009*"make" + 0.009*"heart"


In [86]:
lda_model_tfidf = models.LdaMulticore(corpus_tfidf, num_topics=n_topics, id2word=dictionary, passes=2, workers=4)
for idx, topic in lda_model_tfidf.print_topics(num_topics=-1, num_words=n_top_words):
    print('Topic: {} \nWord: {}'.format(idx, topic))

Topic: 0 
Word: 0.007*"bitch" + 0.004*"nigga" + 0.004*"fuck" + 0.004*"let" + 0.003*"want" + 0.003*"oh" + 0.003*"good" + 0.003*"goodbye" + 0.003*"baby" + 0.003*"nasty"
Topic: 1 
Word: 0.004*"wanna" + 0.004*"baby" + 0.004*"make" + 0.003*"country" + 0.003*"never" + 0.003*"stand" + 0.003*"la" + 0.003*"oh" + 0.003*"life" + 0.003*"gonna"
Topic: 2 
Word: 0.005*"girl" + 0.004*"wanna" + 0.004*"oh" + 0.004*"gonna" + 0.003*"heart" + 0.003*"little" + 0.003*"could" + 0.003*"one" + 0.003*"home" + 0.003*"make"
Topic: 3 
Word: 0.004*"still" + 0.003*"need" + 0.003*"oh" + 0.003*"baby" + 0.003*"let" + 0.003*"heart" + 0.003*"every" + 0.003*"time" + 0.003*"call" + 0.003*"go"
Topic: 4 
Word: 0.004*"baby" + 0.004*"want" + 0.004*"wanna" + 0.004*"always" + 0.004*"never" + 0.003*"gonna" + 0.003*"would" + 0.003*"girl" + 0.003*"say" + 0.003*"little"
