In [30]:
import pandas as pd

song_theme = pd.read_json('./Dataset/top_songs_lyrics.json')
song_theme.columns

Index(['year', 'artist', 'song', 'lyrics'], dtype='object')

# Latent Dirichlet Allocation (LDA) - 165 songs

**Three major ideas**
- Translate songs in native language to english
- pre-processing: Clean and tokenize the translated lyrics, remove stopwords (common and custom)
- Topic Modelling: Create an LDA model to identify topics within the song lyrics

**Detailed Steps**
1) Importing libraries
    - googletrans: handle text transaltion
    - string: provides constants for string manipulation 
    - nltk: A toolkit for natural language processing (NLP)
    - gensim: Used for topic modelling

2) Initialize Translator

3) Define Translation Function

4) Translate Lyrics

5) Define Stopwords
    - default_stopwords: default set of english stopwords from NLTK
    - custom_stopwords: I manually added them.

6) Pre-process Lyrics Function
    - Remove punctuation
    - convert lyrics to lowercase
    - tokenize lyrics into words
    - filter out non-alphabetic words and stopwords

7) Apply pre-preprocessing
    
8) Perform Topic Modelling with LDA
    - dictionary: maps each unqiue word to an ID
    - corpus: transform the tokens into a bag-of-words format
    - lda_model: creates an LDA model with 10 topics and trains it on the corpus for 15 passes 
    - extracts and prints the top 5 words for each topic found by the LDA model


In [31]:
from googletrans import Translator
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim import corpora, models

# #download stopwords
# nltk.download('stopwords')
# nltk.download('punkt')

#initilaize translator 
translator = Translator()

#define function to translate lyrics to english
def translate_lyrics(lyrics):
    try:
        return translator.translate(lyrics, dest='en').text
    except Exception as e:
        print (f'Error in Transaltion: {e}')
        return lyrics

#translate lyrics to english
song_theme['translated_lyrics'] = song_theme['lyrics'].apply(translate_lyrics)

# get the list of stopwords - default & custom
default_stopwords = set(stopwords.words('english'))
custom_stopwords = default_stopwords.union(
    {'verse', 'chorus', '–', 'im', 'oh', 'na', 'yeah', 'got', 'ooh', 'wan', 'cause', 'ill', 'youre', '2', '1', 
     'ft', 'like', 'one', 'prechorus','la', 'aint', 'low', 'two', 'woah', 'gettin', 'postchorus', 'mmm', 'mxrxgxa', 'da', 'thank',
     'ah', 'gon', 'ya', "’", 'thats', 'another', 'outro', 'ive', 'hey', 'montenero', 'whats', 'gioielli', 'clap', 'del', 'blue', 'zyrtck',
     'bridge', 'dj', 'would', 'way', 'ta', 'em', 'yes', 'youd', 'didnt', 'nothin', 'nothing', 'ayy',
     'maybe', 'redrum', 'ohoh', 'ariana', 'vacca', 'bout', 'grande', '21', 'youve', 'youll', 'somethin', 'beyoncé',
     'id', 'smack', 'yo', 'freestyle', 'without', 'di', 'intro', 'woo', 'might', 'il', 'non', 'uh', 'knew', 'mm',
     'anybody', 'hes', 'ima', 'e', 'x', 'itll', 'refrain', 'could', 'ee', 'comin', 'lil', 'shes', '3', 'halo', 'whats', 
     'gionni', 'sometimes', 'gunz', 'ride', 'blow', 'black','mutha', 'uhoh', 'pai', 'zeno', 'wouldnt', 'air',
     'egreen', 'toni', 'georgia', 'throw', 'nah', 'cease', 'goes', 'dust', 'bet', 'bum', 'apart', 'doesnt', 'cassel', 'oohoohoohooh', 
     'huh', 'closet', 'goin', 'cleanin', 'claver', 'closet', 'nex', 'lot', 'michaels', 'less', 'things', 'per', 'second', 'line',
     'bites', 'shall', 'everybody', 'ele', 'fit', 'ho', 'jp', 'joe', 'smokestackstudio', 'getting', 'aleaka', 'three',
     })

#pre-process the lyrics function
def preprocess_lyrics(lyrics):
    translator = str.maketrans('', '', string.punctuation)
    tokens = word_tokenize(lyrics.lower())
    tokens = [word for word in tokens if word.isalpha() and word not in custom_stopwords]
    return tokens

#apply pre-processing using the TRANSLATED lyrics
song_theme['tokens'] = song_theme['translated_lyrics'].apply(preprocess_lyrics)

#PERFORM TOPIC MODELLING

#create dictionary and corpus for LDA
dictionary = corpora.Dictionary(song_theme['tokens'])
corpus = [dictionary.doc2bow(text) for text in song_theme['tokens']]

#build LDA model
lda_model = models.LdaModel(corpus, num_topics=10, id2word=dictionary, passes=15)

#print topics 
topics = lda_model.print_topics(num_words=5)
for topic in topics:
    print(topic)

(0, '0.038*"star" + 0.015*"dancing" + 0.014*"make" + 0.014*"know" + 0.012*"money"')
(1, '0.026*"baby" + 0.021*"know" + 0.015*"side" + 0.014*"rain" + 0.014*"come"')
(2, '0.017*"know" + 0.016*"wake" + 0.012*"middle" + 0.011*"good" + 0.011*"let"')
(3, '0.039*"try" + 0.024*"never" + 0.023*"get" + 0.017*"fall" + 0.016*"meant"')
(4, '0.052*"love" + 0.018*"know" + 0.015*"body" + 0.013*"touch" + 0.010*"used"')
(5, '0.054*"somebody" + 0.053*"love" + 0.046*"fuck" + 0.039*"find" + 0.020*"come"')
(6, '0.070*"work" + 0.012*"lose" + 0.009*"night" + 0.008*"kiss" + 0.008*"always"')
(7, '0.030*"every" + 0.028*"love" + 0.024*"day" + 0.022*"baby" + 0.020*"want"')
(8, '0.023*"go" + 0.022*"let" + 0.015*"want" + 0.015*"need" + 0.012*"never"')
(9, '0.050*"girls" + 0.023*"run" + 0.022*"world" + 0.020*"know" + 0.016*"little"')


# Visualize distribution of Topics from LDA model

1) Visualize topic distibution per document
    - a document refers to individual unit of text that's being analzyed.
    - i have 165 songs in my data frame, it means i have 165 documents
    
2) Visualize topic distribution using pyLDAvis

# Extract themes for each song using LDA

In [32]:
#Function to get the dominant topic for a single document
def get_dominant_topic(lda_model, corpus_document):
    topic_distribution = lda_model.get_document_topics(corpus_document)
    dominant_topic = max(topic_distribution, key=lambda x:x[1])
    return dominant_topic[0], dominant_topic[1]

#apply the function to each doument in the corpus
dominant_topics = [get_dominant_topic(lda_model, doc) for doc in corpus]

#create a dataframe with the results
theme_df = pd.DataFrame(dominant_topics, columns=['Dominant_Topic', 'Topic_Probability'])

#add theme df to original df
song_theme = pd.concat([song_theme, theme_df], axis=1)

#function to get the top N words for a given topic
def get_topic_words(lda_model, topic_id, n_words=5):
    return [word for word, _ in lda_model.show_topic(topic_id, topn=n_words)]

# add a column with the top words for each song's 'dominatn topic
song_theme['Topic_Words'] = song_theme['Dominant_Topic'].apply(lambda x:get_topic_words(lda_model, x))


song_theme[['artist','song', 'translated_lyrics', 'Dominant_Topic', 'Topic_Probability', 'Topic_Words']]



Unnamed: 0,artist,song,translated_lyrics,Dominant_Topic,Topic_Probability,Topic_Words
0,Ed Sheeran,Thinking out Loud,[Verse 1]\nWhen your legs don't work like they...,4,0.993331,"[love, know, body, touch, used]"
1,Norah Jones,Come Away With Me,[Verse 1]\nCome away with me in the night\nCom...,7,0.983014,"[every, love, day, baby, want]"
2,Troye Sivan,Happy Little Pill,"[Verse 1]\nIn the crowd, alone\nAnd every seco...",9,0.993959,"[girls, run, world, know, little]"
3,The Script,Superheroes,"[Verse 1]\nAll her life, she has seen\nOh the ...",7,0.995754,"[every, love, day, baby, want]"
4,Ellie Goulding,How Long Will I Love You,[Intro]\nMm\nMm\n[Verse 1]\nHow long will I lo...,5,0.981246,"[somebody, love, fuck, find, come]"
...,...,...,...,...,...,...
160,Rossa,Takkan Berpaling DariMu,"[Lyrics ""Takkan Turning from You""]\n[VERSE]\nA...",1,0.725387,"[baby, know, side, rain, come]"
161,Hozier,Too Sweet,[Verse 1]\nIt can't be said I'm an early bird\...,1,0.981141,"[baby, know, side, rain, come]"
162,Hozier,Work Song,"[Intro]\nMmm, mmm, mmm, mmm\nMmm, mmm, mmm, mm...",1,0.993428,"[baby, know, side, rain, come]"
163,Coi Leray,Players,"[Chorus]\nYeah, 'cause girls is players too\nU...",8,0.825295,"[go, let, want, need, never]"


In [34]:
# #save df to a csv file 
# csv_path = './Dataset/LDA_sentiment.csv'
# song_theme.to_csv(csv_path)