In [31]:
from gensim.utils import simple_preprocess, ClippedCorpus
from gensim.corpora import Dictionary, MmCorpus
from gensim.parsing.preprocessing import STOPWORDS
from gensim.models import LdaModel
import pandas as pd
import numpy as np

In [26]:
# need to download song lyric dataset from https://www.kaggle.com/mousehead/songlyrics
# and unzip it to ./data
songs = pd.read_csv('./data/songdata.csv')
songs_lyrics = songs.text.apply(lambda x: x.replace('\n', ''))
print('Number of song lyrics in dataset: {}'.format(len(songs_lyrics)))

Number of song lyrics in dataset: 57650


57650

In [14]:
def tokenize(text):
    return [token for token in simple_preprocess(text) if token not in STOPWORDS]
    
# create iterator over the documents (song lyrics)
def corpus_iter():
    for song in songs_lyrics:
        yield tokenize(song)

In [15]:
# create dictionary from corpus
%time song_dictionary = Dictionary(corpus_iter())
print(song_dictionary)

CPU times: user 25 s, sys: 157 ms, total: 25.1 s
Wall time: 25.4 s
Dictionary(85449 unique tokens: ['believe', 'blue', 'face', 'feel', 'fellow']...)


In [17]:
# sample sentence
doc = "Phenotypic characterization of the SIRC (Statens Seruminstitut Rabbit Cornea) cell line reveals a mixed epithelial and fibroblastic nature"
bow = song_dictionary.doc2bow(tokenize(doc))
print('Word IDs present in dictionary:')
print(bow)
print([song_dictionary[i[0]] for i in bow])

Word IDs present in dictionary:
[(808, 1), (2847, 1), (3412, 1), (5648, 1), (8740, 1), (12675, 1), (21085, 1)]
['line', 'nature', 'rabbit', 'cell', 'mixed', 'reveals', 'cornea']


In [18]:
# create a stream of bag-of-words vectors
class BOWCorpus(object):
    def __init__(self, corpus_iter, dictionary):
        self.corpus_iter = corpus_iter
        self.dictionary = dictionary
    
    def __iter__(self):
        for tokens in self.corpus_iter():
            yield self.dictionary.doc2bow(tokens)

song_corpus = BOWCorpus(corpus_iter, song_dictionary)

In [19]:
# store BOW-corpus into a file
%time MmCorpus.serialize('./data/song_corpus_bow.mm', song_corpus)

CPU times: user 29 s, sys: 401 ms, total: 29.4 s
Wall time: 29.6 s


In [21]:
# load BOW-corpus
mm_corpus = MmCorpus('./data/song_corpus_bow.mm')
print(mm_corpus)

MmCorpus(57650 documents, 85449 features, 2982186 non-zero entries)


In [29]:
# use fewer documents during training, LDA is slow
clipped_corpus = ClippedCorpus(mm_corpus, 10000)  

In [33]:
# fit LDA model
%time lda_model = LdaModel(clipped_corpus, num_topics=20, id2word=song_dictionary, passes=4)

CPU times: user 39.5 s, sys: 662 ms, total: 40.2 s
Wall time: 40.7 s


In [41]:
lda_model.print_topics(20)

[(0,
  '0.018*"em" + 0.014*"mother" + 0.011*"roll" + 0.010*"brother" + 0.009*"doo" + 0.007*"father" + 0.007*"death" + 0.007*"ooo" + 0.007*"young" + 0.007*"morning"'),
 (1,
  '0.046*"wanna" + 0.041*"christmas" + 0.022*"ah" + 0.018*"ha" + 0.015*"round" + 0.012*"year" + 0.010*"merry" + 0.009*"burn" + 0.009*"drink" + 0.008*"shout"'),
 (2,
  '0.040*"know" + 0.029*"time" + 0.023*"ve" + 0.019*"like" + 0.017*"life" + 0.015*"love" + 0.013*"got" + 0.012*"oh" + 0.012*"ll" + 0.011*"way"'),
 (3,
  '0.070*"want" + 0.036*"little" + 0.014*"hell" + 0.011*"fool" + 0.011*"like" + 0.010*"knock" + 0.009*"bit" + 0.007*"break" + 0.006*"white" + 0.006*"try"'),
 (4,
  '0.074*"home" + 0.065*"da" + 0.013*"ba" + 0.011*"going" + 0.006*"dee" + 0.006*"wah" + 0.006*"chickens" + 0.005*"houston" + 0.005*"dan" + 0.004*"thou"'),
 (5,
  '0.138*"la" + 0.065*"na" + 0.021*"pum" + 0.013*"bang" + 0.012*"pa" + 0.012*"sa" + 0.011*"ang" + 0.010*"di" + 0.009*"ng" + 0.008*"rum"'),
 (6,
  '0.077*"baby" + 0.071*"oh" + 0.046*"yeah" + 

In [43]:
lda_model.show_topic(17, topn=50)

[('lord', 0.0431258),
 ('god', 0.038804255),
 ('jesus', 0.029804338),
 ('glory', 0.018821897),
 ('holy', 0.015902156),
 ('uh', 0.01526181),
 ('king', 0.0107914265),
 ('sing', 0.009564806),
 ('born', 0.008407606),
 ('hear', 0.008236479),
 ('hallelujah', 0.008143266),
 ('freedom', 0.008082764),
 ('mighty', 0.007882832),
 ('earth', 0.007433106),
 ('worship', 0.0073015536),
 ('heaven', 0.0068215164),
 ('shall', 0.0068201623),
 ('war', 0.006256063),
 ('huh', 0.0061148163),
 ('peace', 0.0059593124),
 ('soul', 0.0056202076),
 ('christ', 0.0055074594),
 ('angels', 0.0050303647),
 ('free', 0.0048797955),
 ('men', 0.004693255),
 ('thy', 0.0045556896),
 ('chorus', 0.0044774497),
 ('stand', 0.0044649006),
 ('spirit', 0.00418655),
 ('thee', 0.0038032983),
 ('pray', 0.003653935),
 ('power', 0.0035389818),
 ('children', 0.0034555),
 ('bum', 0.003356569),
 ('nah', 0.00334771),
 ('great', 0.003335568),
 ('calling', 0.0032657867),
 ('battle', 0.0032543887),
 ('mercy', 0.0032310637),
 ('son', 0.003163857