In [16]:
import pandas as pd

song_theme = pd.read_json('./Dataset/top_songs_lyrics.json')
song_theme.columns

Index(['year', 'artist', 'song', 'lyrics'], dtype='object')

# Latent Dirichlet Allocation (LDA) - 165 songs

**Three major ideas**
- Translate songs in native language to english
- pre-processing: Clean and tokenize the translated lyrics, remove stopwords (common and custom)
- Topic Modelling: Create an LDA model to identify topics within the song lyrics

**Detailed Steps**
1) Importing libraries
    - googletrans: handle text transaltion
    - string: provides constants for string manipulation 
    - nltk: A toolkit for natural language processing (NLP)
    - gensim: Used for topic modelling

2) Initialize Translator

3) Define Translation Function

4) Translate Lyrics

5) Define Stopwords
    - default_stopwords: default set of english stopwords from NLTK
    - custom_stopwords: I manually added them.

6) Pre-process Lyrics Function
    - Remove punctuation
    - convert lyrics to lowercase
    - tokenize lyrics into words
    - filter out non-alphabetic words and stopwords

7) Apply pre-preprocessing
    
8) Perform Topic Modelling with LDA
    - dictionary: maps each unqiue word to an ID
    - corpus: transform the tokens into a bag-of-words format
    - lda_model: creates an LDA model with 10 topics and trains it on the corpus for 15 passes 
    - extracts and prints the top 5 words for each topic found by the LDA model


In [139]:
from deep_translator import GoogleTranslator
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim import corpora, models

# #download stopwords
# nltk.download('stopwords')
# nltk.download('punkt')

#initilaize translator 
translator = GoogleTranslator(source='auto', target='en')

#define function to translate lyrics to english
def translate_lyrics(lyrics):
    try:
        return translator.translate(lyrics)
    except Exception as e:
        print (f'Error in Transaltion: {e}')
        return lyrics

#translate lyrics to english
song_theme['translated_lyrics'] = song_theme['lyrics'].apply(translate_lyrics)

# get the list of stopwords - default & custom
default_stopwords = set(stopwords.words('english'))
custom_stopwords = default_stopwords.union(
    {'verse', 'chorus', '–', 'im', 'oh', 'na', 'yeah','ooh', 'wan', 'ill', 'youre', '2', '1', 
     'ft', 'like', 'prechorus','la',  'woah', 'gettin', 'postchorus', 'mmm', 'mxrxgxa', 'da',
     'ah', 'gon', 'ya', "’", 'thats', 'outro', 'ive', 'hey', 'montenero','gioielli','del', 'zyrtck',
     'bridge', 'dj', 'ta', 'em', 'youd', 'didnt', 'nothin', 'ayy','redrum', 'ohoh', 'ariana', 'vacca', 'bout', 'grande', 
     '21', 'youve', 'youll', 'somethin', 'beyoncé', 'id', 'smack', 'yo', 'freestyle','di', 'intro', 'woo', 'might', 'il', 'non', 'uh', 'mm',
     'hes', 'ima', 'e', 'x', 'itll','could', 'ee', 'comin', 'lil', 'shes', '3', 'halo', 'whats', 'thank',
     'gionni', 'gunz', 'ride', 'blow', 'black','mutha', 'uhoh', 'pai', 'zeno', 'wouldnt', 'air',
     'egreen', 'toni', 'georgia', 'throw', 'nah','doesnt', 'cassel', 'oohoohoohooh', 'huh','goin', 'cleanin', 'claver', 'closet', 
     'nex', 'michaels', 'per', 'line','shall', 'ele', 'fit', 'ho', 'jp', 'joe', 'smokestackstudio', 'getting', 'aleaka',
     })

#pre-process the lyrics function
def preprocess_lyrics(lyrics):
    translator = str.maketrans('', '', string.punctuation)
    tokens = word_tokenize(lyrics.lower())
    tokens = [word for word in tokens if word.isalpha() and word not in custom_stopwords]
    return tokens

#apply pre-processing using the TRANSLATED lyrics
song_theme['tokens'] = song_theme['translated_lyrics'].apply(preprocess_lyrics)

#PERFORM TOPIC MODELLING

#create dictionary and corpus for LDA
dictionary = corpora.Dictionary(song_theme['tokens'])
corpus = [dictionary.doc2bow(text) for text in song_theme['tokens']]

#build LDA model
lda_model = models.LdaModel(corpus, num_topics=5, id2word=dictionary, passes=15, random_state=42)

#print topics 
topics = lda_model.print_topics(num_words=5)
for topic in topics:
    print(topic)

(0, '0.019*"star" + 0.018*"love" + 0.016*"home" + 0.013*"next" + 0.011*"thank"')
(1, '0.019*"know" + 0.013*"side" + 0.012*"want" + 0.011*"let" + 0.011*"got"')
(2, '0.025*"low" + 0.022*"love" + 0.018*"try" + 0.017*"girls" + 0.014*"know"')
(3, '0.022*"love" + 0.022*"got" + 0.015*"one" + 0.014*"somebody" + 0.013*"baby"')
(4, '0.017*"every" + 0.015*"know" + 0.013*"let" + 0.013*"love" + 0.013*"baby"')


# Visualize distribution of Topics from LDA model

1) Visualize topic distibution per document
    - a document refers to individual unit of text that's being analzyed.
    - i have 165 songs in my data frame, it means i have 165 documents
    - aka 'Corpus'
    
2) Visualize topic distribution using pyLDAvis

In [133]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

def plot_topics_plotly(lda_model, num_topics=5, num_words=5):
    # Create a subplot grid: 1 row, 5 columns (for 5 topics)
    fig = make_subplots(
        rows=1, cols=num_topics,  # Adjusted for 1 row and 5 columns
        subplot_titles=[f'Topic {i+1}' for i in range(num_topics)]  # Titles adjusted to start from 1
    )

    for idx in range(num_topics):
        topic = lda_model.print_topics(num_words=num_words)[idx][1]
        words = topic.split(" + ")
        words = [word.split("*")[1].strip().replace('"', '') for word in words]
        weights = [float(word.split("*")[0]) for word in topic.split(" + ")]

        # Add a bar trace for each topic
        fig.add_trace(
            go.Bar(x=weights, y=words, orientation='h', name=f'Topic {idx + 1}'),
            row=1, col=idx + 1  # Update row/column to fit 1 row layout
        )

    # Update layout
    fig.update_layout(
        title_text='LDA Topic Modeling Results', height=300,  # Adjusted height for better visualization
        template='plotly_dark',
        font=dict(color='white'),  # Change font color to white for readability
        xaxis=dict(showgrid=False, zeroline=False, title_font=dict(color='white')), 
        yaxis=dict(showgrid=False, zeroline=False, title_font=dict(color='white')),
        showlegend=False  # Hide legend since it's unnecessary for this plot
    )

    # Save the figure as an HTML file
    # fig.write_html("./static/charts/lda_topics.html")

    fig.show()

# Call the function with 5 topics
plot_topics_plotly(lda_model, num_topics=5)
