## Topic Modeling of Lyrics with LDA

Author: Miles Mezaki

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('data/labeled_lyrics_cleaned.csv')
df

Unnamed: 0.1,Unnamed: 0,artist,seq,song,label
0,0,Elijah Blake,"No, no\r\nI ain't ever trapped out the bando\r...",Everyday,0.626
1,1,Elijah Blake,"The drinks go down and smoke goes up, I feel m...",Live Till We Die,0.630
2,2,Elijah Blake,She don't live on planet Earth no more\r\nShe ...,The Otherside,0.240
3,3,Elijah Blake,"Trippin' off that Grigio, mobbin', lights low\...",Pinot,0.536
4,4,Elijah Blake,"I see a midnight panther, so gallant and so br...",Shadows & Diamonds,0.371
...,...,...,...,...,...
158348,158348,Adam Green,"And we live on borrowed time,\r\nBut this head...",Friends of Mine,0.737
158349,158349,Adam Green,Frozin in time forever\r\nCarrying that torch ...,Frozen in Time,0.482
158350,158350,Adam Green,Hard to be a girl. \r\nSo nice to be a boy. \r...,Hard to Be a Girl,0.733
158351,158351,Adam Green,"I want to chose to die,\r\nAnd be buried with ...",I Wanna Die,0.361


In [12]:
add_on = pd.read_csv('songs_with_lyrics.csv')
add_on['song'] = add_on['track_name']
add_on['seq'] = add_on['lyrics']
add_on

Unnamed: 0,file_name,track_name,artist,lyrics,song,seq
0,share_video_7329274198096973099_.mp3,Montagem Mysterious Game,LXNGVX,This song is an instrumental,Montagem Mysterious Game,This song is an instrumental
1,share_video_7339685917759769899_.mp3,Dumptruck,Kinfolk Thugs,[Intro]\r\nAh-where they at?\r\n(K-K-K-Kin Fol...,Dumptruck,[Intro]\r\nAh-where they at?\r\n(K-K-K-Kin Fol...
2,share_video_7324083003284573482_.mp3,Little Life,Cordelia,[Verse 1]\r\nHow would you have me described?\...,Little Life,[Verse 1]\r\nHow would you have me described?\...
3,share_video_7303373812387679490_.mp3,A Sky Full of Stars (Live at the Royal Albert ...,Coldplay,"[Intro]\r\nOh, turn the lights on, turn the li...",A Sky Full of Stars (Live at the Royal Albert ...,"[Intro]\r\nOh, turn the lights on, turn the li..."
4,share_video_7306351228974288159_.mp3,How To Never Stop Being Sad,dandelion hands,Repeat to yourself that they're not really gon...,How To Never Stop Being Sad,Repeat to yourself that they're not really gon...
...,...,...,...,...,...,...
349,share_video_7339632851790384427_.mp3,Enamorado De Ti,los temerario,Que Deficil es sin ti tengo miedo eres tu mi l...,Enamorado De Ti,Que Deficil es sin ti tengo miedo eres tu mi l...
350,share_video_7331565050198543659_.mp3,Show Me How,Men I Trust,[Verse 1]\r\nShow me how you care\r\nTell me h...,Show Me How,[Verse 1]\r\nShow me how you care\r\nTell me h...
351,share_video_7341274575193378090_.mp3,"yes, and",Ariana Grande,[Verse 1]\r\nIn case you haven't noticed\r\nWe...,"yes, and",[Verse 1]\r\nIn case you haven't noticed\r\nWe...
352,share_video_7322305748975848705_.mp3,Funny,Gold-Tiger,This song is an instrumental,Funny,This song is an instrumental


In [33]:
merged_df = pd.merge(df, add_on, on=['artist','seq','song'], how='left')
merged_df
seen = set()
to_drop = []
for i,row in df.iterrows():
    if (row['artist'], row['song']) in seen:
        to_drop.append(i)
    seen.add((row['artist'], row['song']))

In [34]:
print(len(merged_df))
df = merged_df.drop(to_drop)
print(len(df))

158353
158312


Define code to expand contracted words, which are prominent in music:

In [4]:
def expand_contractions(s):
    res = ""
    contractions = { 
        "ain't": "aint", # Could be many tenses, but is not is probably the best mapping for all tenses
        "aren't": "arent",
        "can't": "cant",
        "can't've": "cannot have",
        "'cause": "because",
        "could've": "couldve",
        "couldn't": "couldnt",
        "couldn't've": "couldnt have",
        "didn't": "didnt",
        "doesn't": "doesnt",
        "don't": "dont",
        "hadn't": "hadnt",
        "hadn't've": "had not have",
        "hasn't": "hasnt",
        "haven't": "havent",
        "he'd": "hed",
        "he'd've": "he wouldve",
        "he'll": "he will",
        "he'll've": "he will have",
        "he's": "hes",
        "how'd": "howd",
        "how'd'y": "howd you",
        "how'll": "how will",
        "how's": "hows",
        "I'd": "Id",
        "I'd've": "I wouldve",
        "I'll": "I will",
        "I'll've": "I will have",
        "I'm": "Im",
        "I've": "Ive",
        "isn't": "isnt",
        "it'd": "it would",
        "it'd've": "it wouldve",
        "it'll": "it will",
        "it'll've": "it will have",
        "it's": "its",
        "let's": "lets",
        "ma'am": "madam",
        "mayn't": "may not",
        "might've": "mightve",
        "mightn't": "might not",
        "mightn't've": "might not have",
        "must've": "mustve",
        "musta" : "mustve",
        "mustn't": "must not",
        "mustn't've": "must not have",
        "needn't": "need not",
        "needn't've": "need not have",
        "oughtn't": "ought not",
        "oughtn't've": "ought not have",
        "shan't": "shall not",
        "sha'n't": "shall not",
        "shan't've": "shall not have",
        "she'd": "shed",
        "she'd've": "she would have",
        "she'll": "shell",
        "she'll've": "she will have",
        "she's": "shes",
        "should've": "shouldve",
        "shouldn't": "shouldnt",
        "shouldn't've": "shouldnt have",
        "so've": "sove",
        "so's": "sos",
        "that'd": "thatd",
        "that'd've": "thatd have",
        "that's": "thats",
        "there'd": "thered",
        "there'd've": "thered have",
        "there's": "theres",
        "they'd": "theyd",
        "they'd've": "theyd have",
        "they'll": "theyll",
        "they'll've": "theyll have",
        "they're": "theyre",
        "they've": "theyve",
        "to've": "to have",
        "wasn't": "wasnt",
        "we'd": "wed",
        "we'd've": "we wouldve",
        "we'll": "we will",
        "we'll've": "we will have",
        "we're": "we are",
        "we've": "weve",
        "weren't": "werent",
        "what'll": "whatll",
        "what'll've": "whatll have",
        "what're": "whatre",
        "what's": "whats",
        "what've": "whatve",
        "when's": "whens",
        "when've": "whenve",
        "where'd": "whered",
        "where's": "wheres",
        "where've": "whereve",
        "who'll": "wholl",
        "who'll've": "wholl have",
        "who's": "whos",
        "who've": "whove",
        "why's": "whys",
        "why've": "whyve",
        "will've": "will have",
        "won't": "wont",
        "won't've": "wont have",
        "would've": "wouldve",
        "wouldn't": "wouldnt",
        "wouldn't've": "wouldnt have",
        "y'all": "yall",
        "y'all'd": "yall would",
        "y'all'd've": "yall would have",
        "y'all're": "yall are",
        "y'all've": "yall have",
        "you'd": "youd",
        "you'd've": "youd have",
        "you'll": "youll",
        "you'll've": "youll have",
        "you're": "youre",
        "you've": "youve"
        }
    for word in s.split():
        if word in contractions:
            res += contractions[word] + " "
        else:
            res += word + " "
    return res

print(df['seq'][0])

No, no
I ain't ever trapped out the bando
But oh Lord, don't get me wrong
I know a couple niggas that do
I'm from a place where everybody knows your name
They say I gotta watch my attitude
When they see money, man they all start actin' strange
So fuck with the ones that fuck with you
They can never say I'm brand new

It's everyday, everyday
Everyday, everyday, everyday
Everyday, everyday
Everyday, everyday
I've been talkin' my shit, nigga that's regular
I've been lovin' 'em thick, life is spectacular
I spend like I'ma die rich, nigga I'm flexin', yeah
Everyday, that's everyday
That's everyday
That's everyday
That's everyday, everyday

I see all of these wanna-be hot R&B singers
I swear you all sound the same
They start from the bottom, so far from the motto
You niggas'll never be Drake
Shout out to OVO
Most of them prolly don't know me though
I stay in the cut, I don't fuck with no
Body but I D, that's a pun on No I.D
When nobody know my name
Runnin' for my dream wasn't hard to do
You 

In [5]:
expand_contractions(df['seq'][0])

"No, no I aint ever trapped out the bando But oh Lord, dont get me wrong I know a couple niggas that do Im from a place where everybody knows your name They say I gotta watch my attitude When they see money, man they all start actin' strange So fuck with the ones that fuck with you They can never say Im brand new It's everyday, everyday Everyday, everyday, everyday Everyday, everyday Everyday, everyday Ive been talkin' my shit, nigga thats regular Ive been lovin' 'em thick, life is spectacular I spend like I'ma die rich, nigga Im flexin', yeah Everyday, thats everyday That's everyday That's everyday That's everyday, everyday I see all of these wanna-be hot R&B singers I swear you all sound the same They start from the bottom, so far from the motto You niggas'll never be Drake Shout out to OVO Most of them prolly dont know me though I stay in the cut, I dont fuck with no Body but I D, thats a pun on No I.D When nobody know my name Runnin' for my dream wasnt hard to do You break bread, I

Next define code that will match expressions missing the trailing g in their progressive tense and replace the apostrophe with a g. (e.g. trippin' -> tripping)

In [6]:
import re

pattern = r"(.)in'"
replacement =  r"\1ing"
def expand_verbs(s: str, pattern, replacement):
    """
    Assume s is the lyrics string
    """
    res, _ = re.subn(pattern, replacement, s)
    return res

expand_verbs("trippin', grippin', dribblin'.", pattern, replacement)

'tripping, gripping, dribbling.'

In [7]:
def expand_verbs_and_contractions(s, pattern=r"(.)in'", replacement=r"\1ing"):
    """
    s: string representing lyrics
    pattern: regular expression representing the pattern to be replaced
    replacement: regular expression to replace pattern

    Returns:
        res: string ready for tokenization
    """
    return expand_verbs(expand_contractions(s.lower()), pattern, replacement)
    
expand_verbs_and_contractions(df['seq'][0])

"no, no i aint ever trapped out the bando but oh lord, dont get me wrong i know a couple niggas that do i'm from a place where everybody knows your name they say i gotta watch my attitude when they see money, man they all start acting strange so fuck with the ones that fuck with you they can never say i'm brand new its everyday, everyday everyday, everyday, everyday everyday, everyday everyday, everyday i've been talking my shit, nigga thats regular i've been loving 'em thick, life is spectacular i spend like i'ma die rich, nigga i'm flexing, yeah everyday, thats everyday thats everyday thats everyday thats everyday, everyday i see all of these wanna-be hot r&b singers i swear you all sound the same they start from the bottom, so far from the motto you niggas'll never be drake shout out to ovo most of them prolly dont know me though i stay in the cut, i dont fuck with no body but i d, thats a pun on no i.d when nobody know my name running for my dream wasnt hard to do you break bread, 

In [8]:
df['seq'] = df['seq'].apply(expand_verbs_and_contractions)

In [40]:
df.head()

Unnamed: 0.1,Unnamed: 0,artist,seq,song,label,file_name,track_name,lyrics
0,0,Elijah Blake,"no, no i aint ever trapped out the bando but o...",Everyday,0.626,,,
1,1,Elijah Blake,"the drinks go down and smoke goes up, i feel m...",Live Till We Die,0.63,,,
2,2,Elijah Blake,she dont live on planet earth no more she foun...,The Otherside,0.24,,,
3,3,Elijah Blake,"tripping off that grigio, mobbing, lights low ...",Pinot,0.536,,,
4,4,Elijah Blake,"i see a midnight panther, so gallant and so br...",Shadows & Diamonds,0.371,,,


Next, we want to tokenize and stem the data. We will not remove stop words since the people are generally important in lyrics and the verbs are often stopwords.

In [41]:
# from nltk.tokenize import word_tokenize 

# text = df['seq'][0]
# # tokenize
# result = word_tokenize(text)
# result

In [42]:
# from nltk.stem import PorterStemmer
 
# ps = PorterStemmer()
 
# # choose some words to be stemmed
# words = df['seq'][3]
 
# for w in words.split():
#     print(w, " : ", ps.stem(w))

In [43]:
# def token_stem(s):
#     """
#     Given a string s, tokenize the string, then stem it.
#     """
#     ps = PorterStemmer()
#     word_tokenize(s)
#     return " ".join([ps.stem(w) for w in word_tokenize(s)])

# token_stem("converts the words in word_tokens to lower case and then checks whether ")

Now we apply the tokenization and stemming to our lyrics.

In [44]:
# df['seq'] = df['seq'].apply(token_stem)

Now our lyrics text is ready for post-processing.

In [9]:
df['seq']

0         no, no i aint ever trapped out the bando but o...
1         the drinks go down and smoke goes up, i feel m...
2         she dont live on planet earth no more she foun...
3         tripping off that grigio, mobbing, lights low ...
4         i see a midnight panther, so gallant and so br...
                                ...                        
158348    and we live on borrowed time, but this headsho...
158349    frozin in time forever carrying that torch for...
158350    hard to be a girl. so nice to be a boy. in my ...
158351    i want to chose to die, and be buried with a r...
158352    musical ladders leaning on mountains bathed in...
Name: seq, Length: 158353, dtype: object

In [10]:
from sklearn.feature_extraction.text import CountVectorizer

In [11]:
# Initialize the vectorizer
vectorizer = CountVectorizer(
    strip_accents='unicode',
    stop_words='english',
    lowercase=True,
    token_pattern=r'\b[a-zA-Z]{3,}\b', # we want only words that contain letters and are 3 or more characters long
)

# Transform our data into the document-term matrix
dtm = vectorizer.fit_transform(df['seq'])
dtm

<158353x132486 sparse matrix of type '<class 'numpy.int64'>'
	with 8424215 stored elements in Compressed Sparse Row format>

In [12]:
feature_names = vectorizer.get_feature_names_out()
feature_names

array(['aaa', 'aaaa', 'aaaaa', ..., 'zzzz', 'zzzzs', 'zzzzzombieee'],
      dtype=object)

In [13]:
feature_names.shape

(132486,)

In [14]:
feature_names[300:350]

array(['abductee', 'abductees', 'abducting', 'abduction', 'abductor',
       'abducts', 'abdul', 'abdullah', 'abe', 'abeam', 'abebe',
       'abecedario', 'abed', 'abednago', 'abednego', 'abeds', 'abeg',
       'abeille', 'abeja', 'abejita', 'abel', 'abelard', 'abelene',
       'abeline', 'abell', 'abend', 'abende', 'abendigo', 'abends',
       'abenteuer', 'aber', 'abercrombie', 'aberdeen', 'abernathy',
       'aberrant', 'aberration', 'aberrations', 'aberrettes', 'aberta',
       'abetter', 'abetting', 'abeyance', 'abgefahren', 'abgefickt',
       'abgehn', 'abgesaugt', 'abgestellt', 'abgrund', 'abgvll', 'abh'],
      dtype=object)

In [15]:
doc1 = dtm[0]
doc1

<1x132486 sparse matrix of type '<class 'numpy.int64'>'
	with 78 stored elements in Compressed Sparse Row format>

In [16]:
row_index = 0
doc_vec = dtm.getrow(row_index).toarray()

non_zero_indices = doc_vec.nonzero()[1]
dtm_scores = doc_vec[0, non_zero_indices] # goes and retrieves the values corresponding to the non_zero_indices
words = [feature_names[i] for i in non_zero_indices]

for word, score in zip(words, dtm_scores):
    print(f"{word}: {score}")

acting: 2
aint: 2
attitude: 1
bando: 1
body: 1
brand: 1
bread: 1
break: 1
change: 1
changed: 1
couple: 1
crew: 1
cut: 1
die: 3
dont: 3
drake: 1
dream: 1
dress: 1
eat: 1
everybody: 1
everyday: 45
far: 1
flexing: 3
fuck: 3
funny: 1
gets: 1
got: 2
gotta: 1
hard: 1
heard: 1
hobby: 1
hot: 1
know: 3
knows: 1
life: 3
like: 3
lonely: 1
lord: 1
lose: 1
loving: 4
man: 1
money: 2
motto: 1
new: 1
nigga: 6
niggas: 2
ones: 2
ovo: 1
place: 1
plate: 1
prolly: 1
pull: 1
pun: 1
regular: 3
rich: 3
running: 1
say: 2
shit: 3
shout: 1
singers: 1
sound: 1
spectacular: 3
spend: 3
start: 2
starved: 1
stay: 1
strange: 1
swear: 2
talking: 3
thats: 17
trapped: 1
wanna: 1
wasnt: 1
watch: 1
winning: 1
wrong: 1
yeah: 3
youre: 1


In [17]:
non_zero_indices

array([  1069,   2333,   6707,   8438,  12828,  14106,  14263,  14282,
        19060,  19063,  25167,  25818,  26989,  30897,  33063,  33699,
        33813,  33910,  35122,  38559,  38565,  40122,  42251,  44395,
        44726,  46309,  47995,  48021,  50768,  51560,  53286,  54213,
        63169,  63213,  66281,  66438,  67490,  67719,  67771,  67989,
        69652,  74727,  75580,  78258,  78579,  78599,  81256,  82955,
        87343,  87525,  90453,  91240,  91325,  94643,  96540,  98629,
       100227, 103870, 104241, 105166, 108234, 108695, 108787, 110190,
       110222, 110324, 111236, 113612, 114519, 116389, 119430, 127123,
       127365, 127441, 129360, 130409, 130976, 131587], dtype=int64)

In [18]:
dtm.getcol(2327).toarray().T # get the column, turn it into an array format, then transpose it to be a row

array([[0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [19]:
dtm.getcol(44592).toarray().T # Fuck, which should appear often in songs

array([[0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [20]:
np.count_nonzero(dtm.getcol(44592).toarray().T)

1

Back to DF

In [21]:
def matrix2Doc(dtMatrix, features, index):
    """Turns each row of the document-term matrix into a list of terms"""
    row = dtMatrix.getrow(index).toarray()
    non_zero_indices = row.nonzero()[1]
    words = [features[idx] for idx in non_zero_indices]
    return words

In [22]:
allDocsAsTerms = [matrix2Doc(dtm, feature_names, i) for i in range(dtm.shape[0])]

In [23]:
len(allDocsAsTerms)

158353

In [24]:
df['terms'] = allDocsAsTerms
df.head()

Unnamed: 0.1,Unnamed: 0,artist,seq,song,label,terms
0,0,Elijah Blake,"no, no i aint ever trapped out the bando but o...",Everyday,0.626,"[acting, aint, attitude, bando, body, brand, b..."
1,1,Elijah Blake,"the drinks go down and smoke goes up, i feel m...",Live Till We Die,0.63,"[ace, aint, away, band, bite, blow, cares, cha..."
2,2,Elijah Blake,she dont live on planet earth no more she foun...,The Otherside,0.24,"[bad, bags, broke, broken, called, calling, ca..."
3,3,Elijah Blake,"tripping off that grigio, mobbing, lights low ...",Pinot,0.536,"[aint, baby, beginnings, blow, boy, calling, c..."
4,4,Elijah Blake,"i see a midnight panther, so gallant and so br...",Shadows & Diamonds,0.371,"[answers, believe, brave, broke, coal, coffee,..."


In [26]:
from sklearn.decomposition import LatentDirichletAllocation

# Step 1: Initialize the model

lda = LatentDirichletAllocation(n_components=5, # based on GridSearch, we discover that 5 is a good number of topics for LDA.
                                random_state=0)

# Step 2: Fit the model
lda.fit(dtm)

In [27]:
lda.components_

array([[13.07683477,  0.97586778,  0.20168688, ...,  1.1994976 ,
         1.19890976,  0.56857149],
       [ 0.20199086,  0.20376344,  0.20000019, ...,  0.20047703,
         0.20108918,  0.80166874],
       [24.7528757 ,  0.20315819,  4.19831148, ...,  0.20002281,
         0.20000027,  0.21335326],
       [ 0.37155214,  6.41423684,  0.20000113, ...,  0.20000201,
         0.20000061,  0.20000927],
       [ 9.59674653,  0.20297375,  0.20000032, ...,  0.20000054,
         0.20000017,  0.21639724]])

In [28]:
lda.components_.shape

(5, 132486)

In [29]:
def display_topics(model, features, no_top_words):
    """Helper function to show the top words of a model"""
    for topic_idx, topic in enumerate(model.components_):
        print(f"Topic {topic_idx}:")
        print(" ".join([features[i]
                        for i in topic.argsort()[:-no_top_words-1:-1]])) # syntax for reversing a list [::-1]

display_topics(lda, feature_names, 10)

Topic 0:
like got dont aint know man nigga shit just fuck
Topic 1:
love dont know just youre time want way say like
Topic 2:
baby yeah girl got like hey gonna little just shes
Topic 3:
old said home man bye pum train got que daddy
Topic 4:
love like night eyes heart world come away light life


In [30]:
doc_topic_dist = lda.transform(dtm)
doc_topic_dist 

array([[0.63345465, 0.36324054, 0.00110941, 0.00109572, 0.00109967],
       [0.2103766 , 0.40532117, 0.05992099, 0.00088204, 0.3234992 ],
       [0.13362025, 0.59991525, 0.26314157, 0.00165476, 0.00166817],
       ...,
       [0.29697931, 0.16173243, 0.39639511, 0.13942341, 0.00546974],
       [0.49958253, 0.00386002, 0.00384568, 0.00380699, 0.48890478],
       [0.21544335, 0.00379261, 0.28925932, 0.00381191, 0.48769281]])

In [31]:
doc_topic_dist.shape

(158353, 5)

In [32]:
def displayHeader(model, features, no_top_words):
    """Helper function to show the top words of a model"""
    topicNames = []
    for topic_idx, topic in enumerate(model.components_):
        topicNames.append(f"Topic {topic_idx}: " + (", ".join([features[i]
                             for i in topic.argsort()[:-no_top_words-1:-1]])))
    return topicNames

In [33]:
# column names
topicnames = displayHeader(lda, feature_names, 10)

# index names
docnames = df.index.tolist() # We will use the original names of the documents

# Make the pandas dataframe
df_document_topic = pd.DataFrame(np.round(doc_topic_dist, 3), 
                                 columns=topicnames, 
                                 index=docnames)

# Get dominant topic for each document
dominant_topic = np.argmax(df_document_topic.values, axis=1) # finds the maximum argument
df_document_topic['dominant_topic'] = dominant_topic

df_document_topic.head()

Unnamed: 0,"Topic 0: like, got, dont, aint, know, man, nigga, shit, just, fuck","Topic 1: love, dont, know, just, youre, time, want, way, say, like","Topic 2: baby, yeah, girl, got, like, hey, gonna, little, just, shes","Topic 3: old, said, home, man, bye, pum, train, got, que, daddy","Topic 4: love, like, night, eyes, heart, world, come, away, light, life",dominant_topic
0,0.633,0.363,0.001,0.001,0.001,0
1,0.21,0.405,0.06,0.001,0.323,1
2,0.134,0.6,0.263,0.002,0.002,1
3,0.635,0.192,0.171,0.001,0.001,0
4,0.269,0.553,0.003,0.07,0.105,1


In [34]:
df_document_topic[76:86]

Unnamed: 0,"Topic 0: like, got, dont, aint, know, man, nigga, shit, just, fuck","Topic 1: love, dont, know, just, youre, time, want, way, say, like","Topic 2: baby, yeah, girl, got, like, hey, gonna, little, just, shes","Topic 3: old, said, home, man, bye, pum, train, got, que, daddy","Topic 4: love, like, night, eyes, heart, world, come, away, light, life",dominant_topic
76,0.439,0.278,0.129,0.002,0.153,0
77,0.002,0.744,0.101,0.152,0.002,1
78,0.866,0.126,0.003,0.003,0.003,0
79,0.002,0.914,0.002,0.002,0.079,1
80,0.144,0.728,0.002,0.002,0.125,1
81,0.506,0.271,0.22,0.001,0.001,0
82,0.484,0.109,0.22,0.06,0.127,0
83,0.072,0.277,0.193,0.22,0.238,1
84,0.415,0.58,0.002,0.002,0.002,1
85,0.002,0.992,0.002,0.002,0.002,1


In [35]:
df_topic_distribution = df_document_topic['dominant_topic'].value_counts().reset_index(name="Num Documents")
df_topic_distribution.columns = ['Topic Num', 'Num Documents']
df_topic_distribution

Unnamed: 0,Topic Num,Num Documents
0,1,72503
1,4,43584
2,2,18219
3,0,17036
4,3,7011


In [36]:
df_document_topic.to_csv('../lda-results/document_topic_without_lyrics.csv')
df_topic_distribution.to_csv('../lda-results/topic_distribution_without_lyrics.csv')

In [37]:
from sklearn.model_selection import GridSearchCV

# We are going to test multiple values for the number of topics
search_params = {'n_components': [3, 4, 5]}

# Initialize the LDA model
lda = LatentDirichletAllocation()

# Initialize a Grid Search with cross-validation instance
grid = GridSearchCV(lda, param_grid=search_params)

# Do the Grid Search
grid.fit(dtm)

In [38]:
grid.cv_results_

{'mean_fit_time': array([322.20832949, 342.09373679, 344.99867043]),
 'std_fit_time': array([22.70609384, 15.30608217,  4.5620314 ]),
 'mean_score_time': array([5.43091664, 6.09414873, 6.06560345]),
 'std_score_time': array([0.43705359, 0.36575259, 0.13991072]),
 'param_n_components': masked_array(data=[3, 4, 5],
              mask=[False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'n_components': 3}, {'n_components': 4}, {'n_components': 5}],
 'split0_test_score': array([-23046081.20111998, -23050772.35207721, -23012318.59184919]),
 'split1_test_score': array([-23663310.51873978, -23555950.55534031, -23633661.00556118]),
 'split2_test_score': array([-23765093.48499016, -23865413.89566913, -23782554.40758479]),
 'split3_test_score': array([-24209871.31897045, -24168308.25315203, -24168346.24125382]),
 'split4_test_score': array([-23689125.6136878 , -23637105.46491072, -23640879.92009572]),
 'mean_test_score': array([-23674696.42750163, -23655510.104

In [39]:
# Best Model
best_lda_model = grid.best_estimator_

# Model Parameters
print("Best Model's Params: ", grid.best_params_)

# Log Likelihood Score
print("Best Log Likelihood Score: ", grid.best_score_)

# Perplexity
print("Model Perplexity: ", best_lda_model.perplexity(dtm))

Best Model's Params:  {'n_components': 5}
Best Log Likelihood Score:  -23647552.03326894
Model Perplexity:  1956.6398390262623
