In [1]:
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
from nltk.sentiment import SentimentIntensityAnalyzer
import spacy
import seaborn as sns
import matplotlib.pyplot as plt
import re
import string
from nltk import download

# Download required NLTK resources
download('stopwords')
download('vader_lexicon')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shruti14/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/shruti14/nltk_data...


True

In [2]:
spotify_songs = pd.read_csv('/Users/arushimunjal/Desktop/Group145-FA24/spotify_songs.csv')
songs_popularity_genres = pd.read_csv('/Users/arushimunjal/Desktop/Group145-FA24/songs_normalize.csv')
songs_lyrics = pd.read_csv('/Users/arushimunjal/Desktop/Group145-FA24/labeled_Lyrics_Cleaned.csv')

# Clean Spotify Songs dataset
spotify_songs['track_album_release_date'] = pd.to_datetime(
    spotify_songs['track_album_release_date'], errors="coerce"
)

# Extract year from date
spotify_songs['year_released'] = spotify_songs['track_album_release_date'].dt.year
spotify_songs = spotify_songs.dropna(subset=['year_released'])
spotify_songs['year_released'] = spotify_songs['year_released'].astype(int)
spotify_songs = spotify_songs[
    (spotify_songs['year_released'] >= 2000) & (spotify_songs['year_released'] <= 2023)
]
spotify_songs = spotify_songs[spotify_songs['language'] == 'en']

# Drop unnecessary columns
spotify_songs = spotify_songs.drop(
    columns=[
        'track_album_release_date',
        'track_artist',
        'track_album_id',
        'track_album_name',
        'playlist_name',
        'playlist_id',
        'playlist_subgenre',
        'duration_ms',
        'language'
    ]
)

spotify_songs = spotify_songs.rename(columns={'track_name': 'song'})

# Merge songs_popularity_genres and songs_lyrics on 'song'
merged_df = pd.merge(songs_popularity_genres, songs_lyrics, on='song', how='inner')

# Select and rename relevant columns
cleaned_df = merged_df[['song', 'popularity', 'year', 'genre', 'seq']].rename(
    columns={
        'seq': 'lyrics',
        'popularity': 'track_popularity',
        'year': 'year_released',
        'genre': 'playlist_genre'
    }
)

# Filter by year of release
filtered_df = cleaned_df[
    (cleaned_df['year_released'] >= 2000) & (cleaned_df['year_released'] <= 2023)
]

# Remove duplicate songs
unique_songs_df = filtered_df.drop_duplicates(subset='song', keep='first')

# Define target genres
unique_genres = ['pop', 'rap', 'rock', 'r&b']

# Function to clean and simplify genres
def clean_genre(genre):
    first_genre = genre.split(',')[0].strip().lower()
    return first_genre if first_genre in unique_genres else 'other'

# Apply the cleaning function to the 'playlist_genre' column
unique_songs_df['playlist_genre'] = unique_songs_df['playlist_genre'].apply(clean_genre)

# Filter for only target genres
unique_songs_df = unique_songs_df[unique_songs_df['playlist_genre'].isin(unique_genres)]

# Combine with Spotify Songs dataset
final_df = pd.concat([spotify_songs, unique_songs_df], ignore_index=True)

# Filter for only target genres
final_df = final_df[final_df['playlist_genre'].isin(unique_genres)]

# Display the cleaned dataset
final_df.head()

FileNotFoundError: [Errno 2] No such file or directory: '/Users/arushimunjal/Desktop/Group145-FA24/spotify_songs.csv'

In [None]:
# Find the minimum number of songs across genres
min_songs_per_genre = final_df.groupby('playlist_genre').size().min()

# Select top songs for each genre
top_songs = final_df.groupby('playlist_genre').apply(
    lambda x: x.nlargest(min_songs_per_genre, 'track_popularity')
).reset_index(drop=True)

# Check the balance
top_songs['playlist_genre'].value_counts()

In [None]:
# Preprocessing function
def preprocess_lyrics_simple(lyrics):
    custom_stopwords = set(stopwords.words('english')).union({
        'oh', 'yeah', 'na', 'll', 'baby', 'let', 'ca', 'wan', 've', 'ai', 'way',
        'come', 'ooh', 'gon', 'say', 'like', 'know', 'got', 'cause', 'im'
    })
    lyrics = lyrics.lower()
    lyrics = re.sub(r"'ll|n't|'ve|'re|'m|'d|'s", '', lyrics)
    lyrics = re.sub(r"'", '', lyrics)
    lyrics = lyrics.translate(str.maketrans('', '', string.punctuation))
    words = lyrics.split()
    words = [word for word in words if word not in custom_stopwords]
    return ' '.join(words)

# Apply preprocessing to the lyrics
top_songs['lyrics_cleaned'] = top_songs['lyrics'].fillna('').apply(preprocess_lyrics_simple)

# Display cleaned lyrics
top_songs[['song', 'lyrics_cleaned']].head()

In [None]:
# Load spaCy model
nlp = spacy.load("en_core_web_md")

# Define seed words for themes
seed_words = {
    'love and heartbreak': [
        'love', 'heartbreak', 'romance', 'passion', 'relationship', 
        'loss', 'desire', 'longing', 'betrayal', 'yearning', 
        'affection', 'connection', 'devotion', 'emotion', 
        'rejection', 'heartache', 'jealousy', 'care', 'adoration', 'regret'
    ],
    'coming of age': [
        'growth', 'identity', 'self-discovery', 'maturity', 
        'change', 'adventure', 'youth', 'independence', 'innocence', 
        'journey', 'transition', 'future', 'dream', 'freedom', 
        'hope', 'challenge', 'learning', 'experience', 'reality', 'choices'
    ],
    'struggle and hardship': [
        'pain', 'hardship', 'struggle', 'tears', 
        'suffering', 'challenge', 'perseverance', 'poverty', 'inequality', 
        'difficulty', 'anguish', 'defeat', 'adversity', 'survival', 
        'trial', 'despair', 'oppression', 'loss', 'battle', 'sacrifice'
    ],
    'social commentary': [
        'society', 'justice', 'equality', 'freedom', 
        'oppression', 'protest', 'activism', 'truth', 'change', 
        'community', 'voice', 'reform', 'power', 'system', 
        'revolution', 'awareness', 'humanity', 'rights', 'corruption', 'struggle'
    ],
    'introspection': [
        'thought', 'feeling', 'self', 'soul', 
        'reflection', 'emotion', 'mind', 'contemplation', 'question', 
        'purpose', 'existence', 'doubt', 'inner', 'dream', 
        'search', 'consciousness', 'journey', 'truth', 'philosophy', 'memory'
    ],
    'nostalgia': [
        'memory', 'past', 'dream', 'home', 
        'time', 'childhood', 'reflection', 'reminisce', 'history', 
        'remember', 'yesterday', 'longing', 'old', 'life', 
        'family', 'roots', 'comfort', 'tradition', 'place', 'moments'
    ],
    'rebellion': [
        'revolt', 'freedom', 'power', 'protest', 
        'revolution', 'defiance', 'independence', 'truth', 'rights', 
        'challenge', 'fight', 'stand', 'change', 'break', 
        'system', 'movement', 'strength', 'justice', 'individual', 'conflict'
    ]
}




# Expand themes using spaCy
def expand_theme_with_spacy(seed_words, nlp, top_n=10):
    expanded_themes = {}
    for theme, seeds in seed_words.items():
        expanded_themes[theme] = seeds.copy()
        for seed in seeds:
            if seed in nlp.vocab:
                similar_words = [w.text for w in nlp(seed).vector.most_similar(n=top_n)]
                expanded_themes[theme].extend(similar_words)
        expanded_themes[theme] = list(set(expanded_themes[theme]))
    return expanded_themes

expanded_themes = expand_theme_with_spacy(seed_words, nlp)

In [None]:
def assign_theme_weighted(lyrics, theme_dict):
    """
    Assign the theme with the highest word overlap in the lyrics.
    If no matches are found, assign the theme with the least overlap.
    """
    word_counts = {theme: 0 for theme in theme_dict}
    words = lyrics.split() if isinstance(lyrics, str) else []  # Ensure lyrics is a string

    # Count occurrences of each theme's words in the lyrics
    for theme, theme_words in theme_dict.items():
        for word in theme_words:
            word_counts[theme] += words.count(word)

    # Always return a valid theme, even if no words match
    return max(word_counts, key=word_counts.get)

# Apply function to assign themes
top_songs['primary_theme'] = top_songs['lyrics_cleaned'].apply(
    lambda x: assign_theme_weighted(x, expanded_themes)
)
# Verify that 'None' theme is removed
top_songs['primary_theme'].value_counts()

In [None]:
theme_counts = top_songs.groupby(['playlist_genre', 'primary_theme']).size().unstack(fill_value=0)
theme_counts_normalized = theme_counts.div(theme_counts.sum(axis=1), axis=0)

plt.figure(figsize=(12, 8))
sns.heatmap(theme_counts_normalized, cmap="Blues", annot=True, fmt=".2f", cbar=True)
plt.title('Heatmap of Themes Across Genres', fontsize=16)
plt.xlabel('Themes', fontsize=12)
plt.ylabel('Genres', fontsize=12)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()