In [3]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords

# Load the dataset
file_path = '/Users/aliahmed/Downloads/archive/spotify_songs.csv'
data = pd.read_csv(file_path)

# Display the first few rows and the columns of the dataset
print(data.info())
print(data.head())

# Preprocess the text data (track_name and track_artist)
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    if isinstance(text, str):
        text = text.lower()
        text = re.sub(r'http\S+', '', text)  # Remove URLs
        text = re.sub(r'@\w+', '', text)  # Remove mentions
        text = re.sub(r'#\w+', '', text)  # Remove hashtags
        text = re.sub(r'\d+', '', text)  # Remove digits
        text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
        text = re.sub(r'\s+', ' ', text).strip()  # Replace multiple spaces with a single space
        text = ' '.join([word for word in text.split() if word not in stop_words])
    else:
        text = ""
    return text

data['combined_text'] = data['track_name'] + ' ' + data['track_artist']
data['cleaned_text'] = data['combined_text'].apply(preprocess_text)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32833 entries, 0 to 32832
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   track_id                  32833 non-null  object 
 1   track_name                32828 non-null  object 
 2   track_artist              32828 non-null  object 
 3   track_popularity          32833 non-null  int64  
 4   track_album_id            32833 non-null  object 
 5   track_album_name          32828 non-null  object 
 6   track_album_release_date  32833 non-null  object 
 7   playlist_name             32833 non-null  object 
 8   playlist_id               32833 non-null  object 
 9   playlist_genre            32833 non-null  object 
 10  playlist_subgenre         32833 non-null  object 
 11  danceability              32833 non-null  float64
 12  energy                    32833 non-null  float64
 13  key                       32833 non-null  int64  
 14  loudne

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/aliahmed/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
import gensim
import gensim.corpora as corpora
from pprint import pprint

# Tokenize the text
def tokenize(text):
    return text.split()

data['tokens'] = data['cleaned_text'].apply(tokenize)

# Create Dictionary
id2word = corpora.Dictionary(data['tokens'])

# Create Corpus
texts = data['tokens']

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])


[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1)]]


In [7]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                            id2word=id2word,
                                            num_topics=10, 
                                            random_state=100,
                                            update_every=1,
                                            chunksize=100,
                                            passes=10,
                                            alpha='auto',
                                            per_word_topics=True)

# Print the topics
pprint(lda_model.print_topics())


[(0,
  '0.052*"dont" + 0.036*"chris" + 0.028*"time" + 0.025*"good" + 0.024*"let" + '
  '0.023*"party" + 0.018*"stop" + 0.015*"high" + 0.015*"big" + 0.015*"new"'),
 (1,
  '0.164*"original" + 0.094*"like" + 0.052*"dimitri" + 0.050*"vegas" + '
  '0.024*"chainsmokers" + 0.019*"alesso" + 0.015*"ww" + 0.014*"walker" + '
  '0.012*"better" + 0.011*"axwell"'),
 (2,
  '0.059*"steve" + 0.039*"aoki" + 0.026*"thomas" + 0.020*"julian" + '
  '0.019*"cant" + 0.015*"light" + 0.015*"know" + 0.014*"hold" + 0.014*"music" '
  '+ 0.014*"da"'),
 (3,
  '0.051*"night" + 0.042*"dj" + 0.038*"lights" + 0.038*"get" + 0.035*"zedd" + '
  '0.020*"dirty" + 0.015*"body" + 0.014*"city" + 0.013*"cole" + 0.012*"snake"'),
 (4,
  '0.038*"john" + 0.032*"young" + 0.020*"beautiful" + 0.018*"vivo" + '
  '0.018*"mars" + 0.017*"de" + 0.016*"silva" + 0.015*"hey" + 0.015*"legend" + '
  '0.015*"wont"'),
 (5,
  '0.313*"remix" + 0.024*"one" + 0.023*"rhab" + 0.016*"world" + 0.014*"vs" + '
  '0.011*"klaas" + 0.011*"anthem" + 0.010*"kask

In [8]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Visualize the topics
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model, corpus, id2word)
vis


In [9]:
# Save the visualization as an HTML file
pyLDAvis.save_html(vis, 'lda_vis.html')
print("LDA visualization saved as lda_vis.html")


LDA visualization saved as lda_vis.html
