## Topic Modeling of Lyrics with LDA

Author: Miles Mezaki

In [2]:
import pandas as pd
import numpy as np

In [4]:
# Load the labeled lyrics dataset. The `seq` column holds the lyrics text and
# is the corpus that gets vectorized below.
df = pd.read_csv('data/labeled_lyrics_cleaned.csv')
df

Unnamed: 0.1,Unnamed: 0,artist,seq,song,label
0,0,Elijah Blake,"No, no\r\nI ain't ever trapped out the bando\r...",Everyday,0.626
1,1,Elijah Blake,"The drinks go down and smoke goes up, I feel m...",Live Till We Die,0.630
2,2,Elijah Blake,She don't live on planet Earth no more\r\nShe ...,The Otherside,0.240
3,3,Elijah Blake,"Trippin' off that Grigio, mobbin', lights low\...",Pinot,0.536
4,4,Elijah Blake,"I see a midnight panther, so gallant and so br...",Shadows & Diamonds,0.371
...,...,...,...,...,...
158348,158348,Adam Green,"And we live on borrowed time,\r\nBut this head...",Friends of Mine,0.737
158349,158349,Adam Green,Frozin in time forever\r\nCarrying that torch ...,Frozen in Time,0.482
158350,158350,Adam Green,Hard to be a girl. \r\nSo nice to be a boy. \r...,Hard to Be a Girl,0.733
158351,158351,Adam Green,"I want to chose to die,\r\nAnd be buried with ...",I Wanna Die,0.361


In [5]:
from sklearn.feature_extraction.text import CountVectorizer

In [7]:
# Initialize the vectorizer (bag-of-words term counts).
# NOTE: the token pattern splits contractions at the apostrophe, so fragments
# such as "ain", "don" and "wasn" end up in the vocabulary as terms.
vectorizer = CountVectorizer(
    strip_accents='unicode',
    stop_words='english',
    lowercase=True,
    token_pattern=r'\b[a-zA-Z]{3,}\b', # we want only words that contain letters and are 3 or more characters long
)

# Transform our data into the document-term matrix
# (sparse; one row per lyric, one column per vocabulary term)
dtm = vectorizer.fit_transform(df['seq'])
dtm

<158353x133009 sparse matrix of type '<class 'numpy.int64'>'
	with 8249146 stored elements in Compressed Sparse Row format>

In [8]:
# Vocabulary terms as an array: position i is the term counted in column i of dtm
feature_names = vectorizer.get_feature_names_out()
feature_names

array(['aaa', 'aaaa', 'aaaaa', ..., 'zzzz', 'zzzzs', 'zzzzzombieee'],
      dtype=object)

In [9]:
# Vocabulary size; should equal the number of columns of dtm
feature_names.shape

(133009,)

In [10]:
# Peek at an arbitrary 50-term slice of the vocabulary
feature_names[300:350]

array(['abductee', 'abductees', 'abducting', 'abduction', 'abductor',
       'abducts', 'abdul', 'abdullah', 'abe', 'abeam', 'abebe',
       'abecedario', 'abed', 'abednago', 'abednego', 'abeds', 'abeg',
       'abeille', 'abeja', 'abejita', 'abel', 'abelard', 'abelene',
       'abeline', 'abell', 'abend', 'abende', 'abendigo', 'abends',
       'abenteuer', 'aber', 'abercrombie', 'aberdeen', 'abernathy',
       'aberrant', 'aberration', 'aberrations', 'aberrettes', 'aberta',
       'abetter', 'abetting', 'abeyance', 'abgefahren', 'abgefickt',
       'abgehn', 'abgesaugt', 'abgestellt', 'abgrund', 'abgvll', 'abh'],
      dtype=object)

In [11]:
# First document (row 0 of the document-term matrix), still in sparse form
doc1 = dtm[0]
doc1

<1x133009 sparse matrix of type '<class 'numpy.int64'>'
	with 76 stored elements in Compressed Sparse Row format>

In [14]:
row_index = 0
# Expand only this one row into a dense (1, n_terms) array of counts
doc_vec = dtm.getrow(row_index).toarray()

# Columns with a non-zero count, i.e. the terms that occur in this document
non_zero_indices = doc_vec.nonzero()[1]
words = [feature_names[col] for col in non_zero_indices]
dtm_scores = doc_vec[0, non_zero_indices] # the counts stored at those columns

# One "term: count" line for every term present in the document
for position, word in enumerate(words):
    print(f"{word}: {dtm_scores[position]}")

actin: 2
ain: 2
attitude: 1
bando: 1
body: 1
brand: 1
bread: 1
break: 1
change: 1
changed: 1
couple: 1
crew: 1
cut: 1
die: 3
don: 3
drake: 1
dream: 1
dress: 1
eat: 1
everybody: 1
everyday: 45
far: 1
flexin: 3
fuck: 3
funny: 1
gets: 1
got: 2
gotta: 1
hard: 1
heard: 1
hobby: 1
hot: 1
know: 3
knows: 1
life: 3
like: 3
lonely: 1
lord: 1
lose: 1
lovin: 4
man: 1
money: 2
motto: 1
new: 1
nigga: 6
niggas: 2
ones: 2
ovo: 1
place: 1
plate: 1
prolly: 1
pull: 1
pun: 1
regular: 3
rich: 3
runnin: 1
say: 2
shit: 3
shout: 1
singers: 1
sound: 1
spectacular: 3
spend: 3
start: 2
starved: 1
stay: 1
strange: 1
swear: 2
talkin: 3
trapped: 1
wanna: 1
wasn: 1
watch: 1
winnin: 1
wrong: 1
yeah: 3


In [15]:
# Column indices (into dtm / feature_names) of the terms present in document `row_index`
non_zero_indices

array([  1074,   2327,   6725,   8460,  12878,  14166,  14323,  14342,
        19131,  19134,  25264,  25921,  27105,  31036,  33128,  33848,
        33964,  34061,  35274,  38733,  38739,  40300,  42437,  44592,
        44926,  46511,  48200,  48226,  50994,  51787,  53517,  54449,
        63433,  63476,  66546,  66705,  67758,  67988,  68040,  68256,
        69917,  75014,  75873,  78550,  78873,  78893,  81548,  83261,
        87662,  87843,  90781,  91574,  91659,  94995,  96916,  98999,
       100602, 104260, 104633, 105565, 108659, 109124, 109217, 110631,
       110664, 110766, 111691, 114081, 114991, 119914, 127631, 127871,
       127948, 129870, 130928, 131498])

In [16]:
# Look the column up by term rather than by a hard-coded index, so the cell
# keeps pointing at the same word if the vocabulary ever changes.
# Then: get the column, turn it into an array format, and transpose it to be a row
# (one count per document).
dtm.getcol(vectorizer.vocabulary_['ain']).toarray().T

array([[2, 1, 0, ..., 0, 0, 0]])

In [18]:
# Per-document counts of the term 'fuck', which should appear often in songs.
# The column is looked up by term instead of a hard-coded index so it stays
# correct if the vocabulary changes.
dtm.getcol(vectorizer.vocabulary_['fuck']).toarray().T

array([[3, 0, 0, ..., 0, 0, 0]])

In [19]:
# Number of documents that contain the term 'fuck' at least once.
# The column is looked up by term instead of a hard-coded index so it stays
# correct if the vocabulary changes.
np.count_nonzero(dtm.getcol(vectorizer.vocabulary_['fuck']).toarray().T)

5757

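Back to the DataFrame: turn each row of the document-term matrix into its list of terms and store that list alongside the lyrics.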

In [20]:
def matrix2Doc(dtMatrix, features, index):
    """Return the vocabulary terms that occur in one document.

    Parameters
    ----------
    dtMatrix : sparse document-term matrix (anything offering ``getrow``)
    features : sequence mapping a column index to its term
    index : row number of the document to convert

    Returns
    -------
    list
        The terms whose count in row ``index`` is non-zero, ordered by
        ascending column index. Empty if the document has no terms.
    """
    # Ask the sparse row for its non-zero columns instead of expanding it into
    # a dense vocabulary-sized vector; this function is called once per
    # document, so the dense copy dominated the runtime.
    _, cols = dtMatrix.getrow(index).nonzero()
    # sorted(set(...)) keeps the ascending, duplicate-free order that the
    # dense implementation produced, whatever the storage order is.
    return [features[idx] for idx in sorted(set(cols.tolist()))]

In [23]:
# Convert every row of dtm into its list of terms (one list per document, in row order)
allDocsAsTerms = [matrix2Doc(dtm, feature_names, i) for i in range(dtm.shape[0])]

In [24]:
# Sanity check: there should be one term list per document
len(allDocsAsTerms)

158353

In [25]:
# Attach the term lists to the lyrics. NOTE: this adds a column to `df` in place,
# so `df` no longer matches the frame displayed when it was loaded.
df['terms'] = allDocsAsTerms
df.head()

Unnamed: 0.1,Unnamed: 0,artist,seq,song,label,terms
0,0,Elijah Blake,"No, no\r\nI ain't ever trapped out the bando\r...",Everyday,0.626,"[actin, ain, attitude, bando, body, brand, bre..."
1,1,Elijah Blake,"The drinks go down and smoke goes up, I feel m...",Live Till We Die,0.63,"[ace, ain, away, band, bite, blow, cares, chas..."
2,2,Elijah Blake,She don't live on planet Earth no more\r\nShe ...,The Otherside,0.24,"[bad, bags, broke, broken, called, calling, ca..."
3,3,Elijah Blake,"Trippin' off that Grigio, mobbin', lights low\...",Pinot,0.536,"[ain, baby, beginnings, blow, boy, calling, ch..."
4,4,Elijah Blake,"I see a midnight panther, so gallant and so br...",Shadows & Diamonds,0.371,"[answers, believe, brave, broke, coal, coffee,..."


In [26]:
from sklearn.decomposition import LatentDirichletAllocation

# Step 1: Initialize the model
# (random_state is fixed so that refitting gives reproducible results)

lda = LatentDirichletAllocation(n_components=15, # the number of topics is picked arbitrarily for now
                                random_state=0)

# Step 2: Fit the model on the document-term count matrix
lda.fit(dtm)

In [27]:
# Topic-term weights of the fitted model: one row per topic, one column per vocabulary term
lda.components_

array([[0.06666673, 1.76866155, 0.0666667 , ..., 0.06666671, 0.0666667 ,
        0.06666667],
       [0.06666683, 0.06666671, 0.06666667, ..., 0.06666667, 0.06666721,
        0.06666667],
       [1.06305535, 0.07322839, 4.06666644, ..., 0.06666667, 0.06666667,
        0.0666667 ],
       ...,
       [0.06666674, 0.06666671, 0.06666667, ..., 0.06666667, 0.06666667,
        0.06666671],
       [0.06666668, 0.06666671, 0.06666667, ..., 0.06666667, 0.06666667,
        0.06666673],
       [0.06666718, 0.06666668, 0.06666667, ..., 0.06666667, 0.06666667,
        0.06666667]])

In [28]:
# Expect (number of topics, vocabulary size)
lda.components_.shape

(15, 133009)

In [38]:
def display_topics(model, features, no_top_words):
    """Print the highest-weighted terms of every topic in a fitted model.

    For each row of ``model.components_`` a "Topic <k>:" line is printed,
    followed by a line with the ``no_top_words`` terms of largest weight,
    heaviest first and separated by spaces. ``features`` maps a column
    index to its term. Nothing is returned.
    """
    for topic_idx, weights in enumerate(model.components_):
        print(f"Topic {topic_idx}:")
        # argsort is ascending, so reverse it to rank the heaviest terms first
        heaviest_first = weights.argsort()[::-1]
        top_terms = [features[col] for col in heaviest_first[:no_top_words]]
        print(" ".join(top_terms))

# Show the 10 highest-weighted terms of each topic of the fitted model
display_topics(lda, feature_names, 10)

Topic 0:
life world live die like end lies people living inside
Topic 1:
don know baby want got just wanna like cause say
Topic 2:
heart tonight dream love dance night song sweet kiss arms
Topic 3:
just time away like way know said gone day long
Topic 4:
run better home right town new round got big come
Topic 5:
like got ain nigga shit fuck know cause don niggas
Topic 6:
night gotta alright day high turn bye wait waiting goes
Topic 7:
man like got woman roll girls let sing rock hot
Topic 8:
love know let just feel say don baby heart make
Topic 9:
yeah hey girl little help got pretty miss bit midnight
Topic 10:
gonna like ain good day time bad got make way
Topic 11:
god lord soul heaven jesus born king earth holy world
Topic 12:
come let light feel like night dead sun blood dark
Topic 13:
blue sky beautiful christmas day fly happy little old sun
Topic 14:
rock que bop jingle devil boogie blue come bell lady


In [39]:
# Topic distribution of every document: one row per document, one column per topic
doc_topic_dist = lda.transform(dtm)
doc_topic_dist 

array([[4.01607591e-04, 4.01607549e-04, 4.01607287e-04, ...,
        4.01606930e-04, 4.01606954e-04, 4.01606532e-04],
       [2.94727229e-01, 2.88601126e-04, 2.88051603e-01, ...,
        8.00809387e-02, 2.88600796e-04, 2.88600474e-04],
       [6.53596499e-04, 6.53596470e-04, 6.53596240e-04, ...,
        6.53596324e-04, 6.53595936e-04, 6.53594853e-04],
       ...,
       [1.75439680e-03, 3.60019451e-01, 1.75439012e-03, ...,
        1.75439040e-03, 1.75438850e-03, 1.75438658e-03],
       [1.25786607e-03, 1.25786651e-03, 1.25786806e-03, ...,
        4.31300374e-01, 4.09091574e-02, 1.25786327e-03],
       [1.23457070e-03, 1.23457224e-03, 6.86848847e-02, ...,
        2.86846205e-01, 2.07152953e-01, 1.23456945e-03]])

In [40]:
# Expect (number of documents, number of topics)
doc_topic_dist.shape

(158353, 15)

In [41]:
def displayHeader(model, features, no_top_words):
    """Build one descriptive header string per topic of a fitted model.

    Each header reads "Topic <k>: term1, term2, ..." and lists the
    ``no_top_words`` terms of largest weight in row ``k`` of
    ``model.components_``, heaviest first. ``features`` maps a column index
    to its term.

    Returns the list of headers, in topic order. Nothing is printed.
    """
    topicNames = []
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending, so reverse it to rank the heaviest terms first
        heaviest_first = weights.argsort()[::-1][:no_top_words]
        label = ", ".join(features[col] for col in heaviest_first)
        topicNames.append(f"Topic {topic_idx}: " + label)
    return topicNames

In [43]:
# column names: one "Topic k: ..." header per topic, built from its 5 top terms
topicnames = displayHeader(lda, feature_names, 5)

# index names
docnames = df.index.tolist() # reuse the row labels of the lyrics DataFrame

# Make the pandas dataframe (probabilities rounded to 3 decimals for display)
df_document_topic = pd.DataFrame(np.round(doc_topic_dist, 3), 
                                 columns=topicnames, 
                                 index=docnames)

# Get dominant topic for each document.
# The argmax is taken over the unrounded distribution: after rounding to 3
# decimals, two topics that differ by less than 0.0005 become tied and argmax
# would return the lower-numbered one instead of the more probable one.
dominant_topic = np.argmax(doc_topic_dist, axis=1)
df_document_topic['dominant_topic'] = dominant_topic

df_document_topic.head()

Unnamed: 0,"Topic 0: life, world, live, die, like","Topic 1: don, know, baby, want, got","Topic 2: heart, tonight, dream, love, dance","Topic 3: just, time, away, like, way","Topic 4: run, better, home, right, town","Topic 5: like, got, ain, nigga, shit","Topic 6: night, gotta, alright, day, high","Topic 7: man, like, got, woman, roll","Topic 8: love, know, let, just, feel","Topic 9: yeah, hey, girl, little, help","Topic 10: gonna, like, ain, good, day","Topic 11: god, lord, soul, heaven, jesus","Topic 12: come, let, light, feel, like","Topic 13: blue, sky, beautiful, christmas, day","Topic 14: rock, que, bop, jingle, devil",dominant_topic
0,0.0,0.0,0.0,0.0,0.0,0.662,0.0,0.0,0.333,0.0,0.0,0.0,0.0,0.0,0.0,5
1,0.295,0.0,0.288,0.0,0.0,0.161,0.0,0.0,0.0,0.024,0.149,0.0,0.08,0.0,0.0,0
2,0.001,0.001,0.001,0.428,0.001,0.537,0.001,0.001,0.001,0.028,0.001,0.001,0.001,0.001,0.001,5
3,0.0,0.241,0.0,0.062,0.0,0.322,0.0,0.309,0.062,0.0,0.0,0.0,0.0,0.0,0.0,5
4,0.243,0.001,0.001,0.001,0.001,0.001,0.001,0.249,0.333,0.031,0.066,0.001,0.067,0.001,0.001,8


In [44]:
# Rows at positions 76-85 of the document-topic table
df_document_topic.iloc[76:86]

Unnamed: 0,"Topic 0: life, world, live, die, like","Topic 1: don, know, baby, want, got","Topic 2: heart, tonight, dream, love, dance","Topic 3: just, time, away, like, way","Topic 4: run, better, home, right, town","Topic 5: like, got, ain, nigga, shit","Topic 6: night, gotta, alright, day, high","Topic 7: man, like, got, woman, roll","Topic 8: love, know, let, just, feel","Topic 9: yeah, hey, girl, little, help","Topic 10: gonna, like, ain, good, day","Topic 11: god, lord, soul, heaven, jesus","Topic 12: come, let, light, feel, like","Topic 13: blue, sky, beautiful, christmas, day","Topic 14: rock, que, bop, jingle, devil",dominant_topic
76,0.001,0.126,0.001,0.333,0.089,0.097,0.001,0.248,0.001,0.001,0.001,0.001,0.001,0.102,0.001,3
77,0.001,0.605,0.11,0.227,0.051,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,1
78,0.001,0.548,0.001,0.001,0.001,0.378,0.001,0.001,0.001,0.001,0.001,0.001,0.064,0.001,0.001,1
79,0.001,0.434,0.18,0.267,0.018,0.001,0.093,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,1
80,0.025,0.295,0.001,0.509,0.04,0.001,0.001,0.001,0.001,0.001,0.001,0.024,0.05,0.001,0.053,3
81,0.092,0.0,0.0,0.0,0.235,0.283,0.063,0.067,0.204,0.053,0.0,0.0,0.0,0.0,0.0,5
82,0.0,0.0,0.0,0.299,0.067,0.211,0.0,0.339,0.0,0.081,0.0,0.0,0.0,0.0,0.0,7
83,0.0,0.114,0.0,0.195,0.264,0.0,0.0,0.081,0.0,0.0,0.103,0.124,0.116,0.0,0.0,4
84,0.09,0.619,0.073,0.065,0.001,0.132,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.016,1
85,0.116,0.001,0.167,0.001,0.001,0.001,0.001,0.001,0.603,0.001,0.107,0.001,0.001,0.001,0.001,8


In [45]:
# Count how many documents have each topic as their dominant one, most
# frequent topic first, and lay the counts out as a two-column table.
topic_counts = df_document_topic['dominant_topic'].value_counts()
df_topic_distribution = (
    topic_counts
    .rename_axis('Topic Num')
    .reset_index(name='Num Documents')
)
df_topic_distribution

Unnamed: 0,Topic Num,Num Documents
0,3,34801
1,8,23198
2,1,20459
3,0,12169
4,12,11463
5,5,10366
6,13,8708
7,2,7086
8,7,6412
9,4,6199


In [46]:
from sklearn.model_selection import GridSearchCV

# We are going to test multiple values for the number of topics
search_params = {'n_components': [5, 10, 15, 20, 25, 30, 35]}

# Initialize a Grid Search with cross-validation instance.
# - The estimator is created inline rather than assigned to `lda`, so the
#   fitted 15-topic model from above is not replaced by an unfitted one.
# - random_state is fixed so the candidate models are reproducible.
# - n_jobs=-1 fits the candidate/fold combinations in parallel on all cores;
#   the search is expensive on the full matrix.
grid = GridSearchCV(LatentDirichletAllocation(random_state=0),
                    param_grid=search_params,
                    n_jobs=-1)

# Do the Grid Search
grid.fit(dtm)

KeyboardInterrupt: 

In [None]:
# Per-candidate cross-validation results (only available once the grid search has finished)
grid.cv_results_

In [None]:
# Best Model (the estimator selected by the grid search; requires a finished search)
best_lda_model = grid.best_estimator_

# Model Parameters
print("Best Model's Params: ", grid.best_params_)

# Log Likelihood Score (cross-validated score of the best parameter setting)
print("Best Log Likelihood Score: ", grid.best_score_)

# Perplexity of the best model, evaluated on the full document-term matrix
print("Model Perplexity: ", best_lda_model.perplexity(dtm))

In [None]:
# Search again, this time over a small range of topic counts
search_params = {'n_components': [1,2,3,4,5,6]}

# The estimator is created inline rather than assigned to `lda`, so the fitted
# 15-topic model is not replaced by an unfitted one. random_state is fixed so
# the candidate models are reproducible, and n_jobs=-1 fits the candidate/fold
# combinations in parallel on all cores.
grid = GridSearchCV(LatentDirichletAllocation(random_state=0),
                    param_grid=search_params,
                    n_jobs=-1)

grid.fit(dtm)

# Best Model (the estimator selected by the grid search)
best_lda_model = grid.best_estimator_

# Model Parameters
print("Best Model's Params: ", grid.best_params_)

# Log Likelihood Score (cross-validated score of the best parameter setting)
print("Best Log Likelihood Score: ", grid.best_score_)

# Perplexity of the best model, evaluated on the full document-term matrix
print("Model Perplexity: ", best_lda_model.perplexity(dtm))

In [None]:
# Show the 40 highest-weighted terms of each topic of the best model found by the grid search
display_topics(best_lda_model, feature_names, 40)