In [9]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords

# Load the dataset
file_path = '/Users/aliahmed/Downloads/archive/spotify_songs.csv'
data = pd.read_csv(file_path)

# Display the first few rows and the columns of the dataset
print(data.info())
print(data.head())

# Assuming the dataset has a 'track_name' column containing the titles of the songs
# Preprocess the titles
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    if isinstance(text, str):
        text = text.lower()
        text = re.sub(r'http\S+', '', text)  # Remove URLs
        text = re.sub(r'@\w+', '', text)  # Remove mentions
        text = re.sub(r'#\w+', '', text)  # Remove hashtags
        text = re.sub(r'\d+', '', text)  # Remove digits
        text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
        text = re.sub(r'\s+', ' ', text).strip()  # Replace multiple spaces with a single space
        text = ' '.join([word for word in text.split() if word not in stop_words])
    else:
        text = ""
    return text

data['cleaned_titles'] = data['track_name'].apply(preprocess_text)

# Check the cleaned titles
print(data[['track_name', 'cleaned_titles']].head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32833 entries, 0 to 32832
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   track_id                  32833 non-null  object 
 1   track_name                32828 non-null  object 
 2   track_artist              32828 non-null  object 
 3   track_popularity          32833 non-null  int64  
 4   track_album_id            32833 non-null  object 
 5   track_album_name          32828 non-null  object 
 6   track_album_release_date  32833 non-null  object 
 7   playlist_name             32833 non-null  object 
 8   playlist_id               32833 non-null  object 
 9   playlist_genre            32833 non-null  object 
 10  playlist_subgenre         32833 non-null  object 
 11  danceability              32833 non-null  float64
 12  energy                    32833 non-null  float64
 13  key                       32833 non-null  int64  
 14  loudne

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/aliahmed/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
import gensim
import gensim.corpora as corpora
from pprint import pprint

# Tokenize the titles
def tokenize(text):
    return text.split()

data['tokens'] = data['cleaned_titles'].apply(tokenize)

# Create Dictionary
id2word = corpora.Dictionary(data['tokens'])

# Create Corpus
texts = data['tokens']

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])


[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1)]]


In [13]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                            id2word=id2word,
                                            num_topics=10, 
                                            random_state=100,
                                            update_every=1,
                                            chunksize=100,
                                            passes=10,
                                            alpha='auto',
                                            per_word_topics=True)

# Print the topics
pprint(lda_model.print_topics())


[(0,
  '0.050*"good" + 0.039*"feel" + 0.034*"better" + 0.034*"party" + 0.030*"sun" '
  '+ 0.029*"day" + 0.028*"im" + 0.026*"big" + 0.022*"knows" + 0.021*"without"'),
 (1,
  '0.169*"remix" + 0.093*"mix" + 0.061*"original" + 0.012*"night" + '
  '0.012*"one" + 0.012*"club" + 0.010*"go" + 0.008*"extended" + 0.007*"vs" + '
  '0.007*"take"'),
 (2,
  '0.048*"bad" + 0.034*"dawn" + 0.026*"war" + 0.021*"hours" + 0.020*"el" + '
  '0.017*"five" + 0.017*"cold" + 0.013*"bebe" + 0.013*"rexha" + '
  '0.012*"justin"'),
 (3,
  '0.098*"dont" + 0.071*"life" + 0.036*"dance" + 0.025*"sweet" + 0.023*"lose" '
  '+ 0.023*"control" + 0.023*"make" + 0.021*"wanna" + 0.020*"hands" + '
  '0.019*"nothing"'),
 (4,
  '0.227*"love" + 0.051*"get" + 0.028*"got" + 0.024*"beautiful" + '
  '0.018*"found" + 0.018*"tonight" + 0.018*"could" + 0.017*"higher" + '
  '0.017*"klaas" + 0.016*"kill"'),
 (5,
  '0.303*"feat" + 0.025*"back" + 0.022*"way" + 0.018*"u" + 0.012*"steve" + '
  '0.012*"know" + 0.012*"alive" + 0.011*"say" + 0.0

In [19]:
pip install pyLDAvis


Collecting pyLDAvis
  Downloading pyLDAvis-3.4.1-py3-none-any.whl.metadata (4.2 kB)
Collecting numpy>=1.24.2 (from pyLDAvis)
  Downloading numpy-2.0.1-cp311-cp311-macosx_14_0_arm64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.9/60.9 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
Collecting funcy (from pyLDAvis)
  Downloading funcy-2.0-py2.py3-none-any.whl.metadata (5.9 kB)
Collecting numpy>=1.24.2 (from pyLDAvis)
  Using cached numpy-1.26.4-cp311-cp311-macosx_11_0_arm64.whl.metadata (114 kB)
Downloading pyLDAvis-3.4.1-py3-none-any.whl (2.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m29.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hUsing cached numpy-1.26.4-cp311-cp311-macosx_11_0_arm64.whl (14.0 MB)
Downloading funcy-2.0-py2.py3-none-any.whl (30 kB)
Installing collected packages: funcy, numpy, pyLDAvis
  Attempting uninstall: numpy
    Found existing installation: numpy 1.23.5
    Uninstalling

In [21]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Visualize the topics
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model, corpus, id2word)
vis


In [22]:
# Save the visualization as an HTML file
pyLDAvis.save_html(vis, 'lda_vis.html')
print("LDA visualization saved as lda_vis.html")


LDA visualization saved as lda_vis.html
