In [2]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import pandas as pd
import numpy as np
import os 
from dotenv import load_dotenv
from datetime import datetime

In [3]:
# Read the Spotify API credentials from the environment.
# load_dotenv() runs first so that values kept in a .env file are visible to os.getenv.
load_dotenv()
client_id, client_secret = (os.getenv(key) for key in ("client_id", "client_secret"))


In [4]:
def setup_spotify_client(client_id, client_secret):
    """
    Build a Spotify API client from an application's credentials.

    Args:
        client_id: Spotify application client ID.
        client_secret: Spotify application client secret.

    Returns:
        A ``spotipy.Spotify`` instance that uses a ``SpotifyClientCredentials``
        manager created from the two values.
    """
    credentials = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
    return spotipy.Spotify(client_credentials_manager=credentials)

In [5]:
def get_playlist_tracks(sp, playlist_id, limit=100):
    """
    Collect tracks (with primary-artist info) from a playlist.

    Pages through the playlist until `limit` usable tracks have been gathered
    or there are no more pages.

    Args:
        sp: Client object exposing ``playlist_tracks(playlist_id)`` and
            ``next(page)``.
        playlist_id: ID of the playlist to read.
        limit: Maximum number of rows to return. ``None`` returns every
            usable track in the playlist.

    Returns:
        DataFrame with the columns ``title``, ``artist``, ``artist_id``,
        ``popularity`` and ``track_id``. The columns are present even when no
        track was found, so callers can always select ``track_id``.
    """
    columns = ['title', 'artist', 'artist_id', 'popularity', 'track_id']
    tracks = []
    results = sp.playlist_tracks(playlist_id)

    while results:
        for item in results['items']:
            track = item.get('track') if item else None
            if not track:  # Entry without a track object
                continue

            artists = track.get('artists') or []
            # Rows without a track ID or without any artist cannot be used for
            # the later ID-based lookups (audio features, artist genres).
            if not track.get('id') or not artists:
                continue

            tracks.append({
                'title': track['name'],
                'artist': artists[0]['name'],
                'artist_id': artists[0]['id'],
                'popularity': track.get('popularity'),
                'track_id': track['id']
            })

        limit_reached = limit is not None and len(tracks) >= limit
        if results['next'] and not limit_reached:
            results = sp.next(results)
        else:
            break

    # A page can overshoot the limit, so trim to make `limit` a real cap.
    if limit is not None:
        tracks = tracks[:limit]

    return pd.DataFrame(tracks, columns=columns)

In [6]:
def get_artist_genres(sp, artist_ids):
    """
    Look up the genres of many artists.

    Artists are requested in chunks of 50 IDs. A chunk whose request fails is
    reported on stdout and skipped; the remaining chunks are still processed.

    Args:
        sp: Client object exposing ``artists(ids)``.
        artist_ids: Sliceable sequence of artist IDs.

    Returns:
        dict mapping artist ID to its genres joined with ``'; '``, or to
        ``'unknown'`` when the artist has no genres.
    """
    chunk_size = 50  # Spotify API limit per request
    genre_lookup = {}
    start = 0
    total = len(artist_ids)

    while start < total:
        chunk = artist_ids[start:start + chunk_size]
        start += chunk_size
        try:
            for info in sp.artists(chunk)['artists']:
                if not (info and info['id']):
                    continue
                names = info['genres']
                # Several genres are stored as one semicolon-separated string
                genre_lookup[info['id']] = '; '.join(names) if names else 'unknown'
        except Exception as err:
            print(f"Error fetching genres for batch: {str(err)}")

    return genre_lookup

In [7]:


def get_featured_playlists(sp):
    """
    Return the IDs of up to 50 featured playlists.

    Args:
        sp: Client object exposing ``featured_playlists(limit=...)``.

    Returns:
        list of playlist IDs, in the order the API returned them.
    """
    response = sp.featured_playlists(limit=50)
    playlist_ids = []
    for entry in response['playlists']['items']:
        playlist_ids.append(entry['id'])
    return playlist_ids



In [8]:
def get_audio_features(sp, track_ids):
    """
    Get audio features for a list of tracks.

    Args:
        sp: Client object exposing ``audio_features(track_ids)``.
        track_ids: Track IDs to look up. An empty collection returns an empty
            frame without calling the API.

    Returns:
        DataFrame with the columns ``track_id``, ``danceability``, ``energy``,
        ``tempo``, ``valence``, ``instrumentalness`` and ``acousticness``.
        Tracks for which the API returned no features are omitted. The columns
        are present even when there are no rows, so the result can always be
        merged on ``track_id``.
    """
    feature_names = [
        'danceability', 'energy', 'tempo', 'valence',
        'instrumentalness', 'acousticness'
    ]
    columns = ['track_id'] + feature_names

    if len(track_ids) == 0:
        return pd.DataFrame(columns=columns)

    # Treat a missing payload like an empty one instead of failing on iteration.
    audio_features = sp.audio_features(track_ids) or []

    features_data = [
        {'track_id': features['id'], **{name: features[name] for name in feature_names}}
        for features in audio_features
        if features  # entries are falsy for tracks without features
    ]

    return pd.DataFrame(features_data, columns=columns)



In [9]:
def get_category_playlists(sp):
    """
    Get playlists from different categories.

    Requests up to 50 categories and up to 10 playlists per category. A
    category whose request fails is reported on stdout and skipped, so one bad
    category does not abort the whole collection.

    Args:
        sp: Client object exposing ``categories(limit=...)`` and
            ``category_playlists(category_id, limit=...)``.

    Returns:
        list of playlist IDs (may contain duplicates across categories).
    """
    categories = sp.categories(limit=50)['categories']['items']
    playlist_ids = []

    for category in categories:
        category_id = None
        try:
            category_id = category['id']
            category_playlists = sp.category_playlists(category_id, limit=10)
            # Build the full list first so a failing category contributes
            # nothing; empty entries in the response are ignored.
            playlist_ids.extend([
                playlist['id']
                for playlist in category_playlists['playlists']['items']
                if playlist
            ])
        except Exception as e:
            # `Exception` (not a bare except) so Ctrl-C / SystemExit still
            # stop the run.
            print(f"Error fetching playlists for category {category_id}: {str(e)}")
            continue

    return playlist_ids

In [10]:
def main(max_tracks=5000):
    """
    Collect tracks, artist genres and audio features from Spotify playlists.

    Steps: build the playlist list (four fixed playlists plus featured and
    category playlists), gather unique tracks playlist by playlist, attach
    the primary artist's genres, fetch audio features in batches of 100 and
    merge everything on ``track_id``.

    Args:
        max_tracks: Stop visiting further playlists once at least this many
            unique tracks have been collected.

    Returns:
        DataFrame with the columns ``title``, ``artist``, ``genres``,
        ``popularity``, ``danceability``, ``energy``, ``tempo``, ``valence``,
        ``instrumentalness``, ``acousticness``, ``track_id`` and
        ``artist_id``; or ``None`` when collection fails (the error is
        printed rather than raised).
    """
    try:
        # Initialize Spotify client
        sp = setup_spotify_client(client_id, client_secret)

        print("Collecting playlist IDs...")
        # Get various playlist IDs
        playlist_ids = [
            '37i9dQZEVXbMDoHDwVN2tF',  # Global Top 50
            '37i9dQZF1DXcBWIGoYBM5M',  # Today's Top Hits
            '37i9dQZF1DX0XUsuxWHRQd',  # RapCaviar
            '37i9dQZF1DX4JAvHpjipBk',  # New Music Friday
        ]

        # Add featured and category playlists
        playlist_ids.extend(get_featured_playlists(sp))
        playlist_ids.extend(get_category_playlists(sp))

        # Remove duplicates while keeping first-seen order. A set would give a
        # different visiting order on each run, and because the loop below
        # stops at `max_tracks`, a different dataset each time.
        playlist_ids = list(dict.fromkeys(playlist_ids))

        print(f"Found {len(playlist_ids)} unique playlists")

        # Collect tracks from all playlists
        all_tracks = []
        track_ids_seen = set()

        for idx, playlist_id in enumerate(playlist_ids, 1):
            try:
                print(f"Processing playlist {idx}/{len(playlist_ids)}...")
                playlist_tracks = get_playlist_tracks(sp, playlist_id)

                if playlist_tracks.empty:
                    continue

                # Drop rows without a track ID (they cannot be looked up
                # later) and repeats inside the same playlist (they would
                # multiply rows in the final merge).
                playlist_tracks = (
                    playlist_tracks
                    .dropna(subset=['track_id'])
                    .drop_duplicates(subset='track_id')
                )

                # Only add new tracks (avoid duplicates across playlists)
                new_tracks = playlist_tracks[~playlist_tracks['track_id'].isin(track_ids_seen)]
                all_tracks.append(new_tracks)
                track_ids_seen.update(new_tracks['track_id'])

                print(f"Total unique tracks so far: {len(track_ids_seen)}")

                if len(track_ids_seen) >= max_tracks:
                    break

            except Exception as e:
                print(f"Error processing playlist {playlist_id}: {str(e)}")
                continue

        if not all_tracks:
            print("An error occurred: no tracks were collected from any playlist")
            return None

        # Combine all tracks
        tracks_df = pd.concat(all_tracks, ignore_index=True)

        # Get genres for all artists
        print("\nFetching artist genres...")
        unique_artist_ids = tracks_df['artist_id'].dropna().unique().tolist()
        genres_dict = get_artist_genres(sp, unique_artist_ids)

        # Add genres to the dataframe (artists missing from the lookup get NaN)
        tracks_df['genres'] = tracks_df['artist_id'].map(genres_dict)

        # Get audio features in batches
        print("\nFetching audio features...")
        all_audio_features = []
        batch_size = 100  # Spotify API limit

        for i in range(0, len(tracks_df), batch_size):
            batch_ids = tracks_df['track_id'][i:i+batch_size].tolist()
            batch_features = get_audio_features(sp, batch_ids)
            all_audio_features.append(batch_features)
            print(f"Processed {min(i+batch_size, len(tracks_df))}/{len(tracks_df)} audio features")

        # Combine audio features
        audio_features_df = pd.concat(all_audio_features, ignore_index=True)

        # Merge tracks with their audio features (tracks without features are dropped)
        complete_df = pd.merge(tracks_df, audio_features_df, on='track_id')

        # Reorder columns
        columns_order = [
            'title', 'artist', 'genres', 'popularity',
            'danceability', 'energy', 'tempo', 'valence',
            'instrumentalness', 'acousticness', 'track_id', 'artist_id'
        ]
        complete_df = complete_df[columns_order]

        print("\nData collection completed successfully!")
        print(f"Total songs collected: {len(complete_df)}")

        return complete_df

    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return None

if __name__ == "__main__":
    df = main()
    # main() reports failures by printing and returning None. Stop here with a
    # clear message instead of letting a later cell fail on `None[...]`.
    if df is None:
        raise RuntimeError("Spotify data collection failed; see the error printed above")



Collecting playlist IDs...
Found 409 unique playlists
Processing playlist 1/409...
Total unique tracks so far: 60
Processing playlist 2/409...
Total unique tracks so far: 110
Processing playlist 3/409...
Total unique tracks so far: 180
Processing playlist 4/409...
Total unique tracks so far: 280
Processing playlist 5/409...
Total unique tracks so far: 330
Processing playlist 6/409...
Total unique tracks so far: 430
Processing playlist 7/409...
Total unique tracks so far: 530
Processing playlist 8/409...
Total unique tracks so far: 610
Processing playlist 9/409...
Total unique tracks so far: 710
Processing playlist 10/409...
Total unique tracks so far: 810
Processing playlist 11/409...
Total unique tracks so far: 905
Processing playlist 12/409...
Total unique tracks so far: 985
Processing playlist 13/409...
Total unique tracks so far: 1109
Processing playlist 14/409...
Total unique tracks so far: 1207
Processing playlist 15/409...
Total unique tracks so far: 1252
Processing playlist 16/

In [11]:
# Primary genre = the text before the first ';' of the joined genre string
df['Primary genre'] = df['genres'].str.split(';', n=1).str.get(0)


In [12]:
# Normalise the primary genre to lowercase
df['Primary genre'] = df.loc[:, 'Primary genre'].str.lower()

In [13]:
def create_genre_clusters():
    """
    Creates a mapping dictionary for genre clustering.

    The table is written as cluster -> genres and inverted into the
    genre -> cluster dictionary that callers use.

    Three genres used to appear twice in the flat dictionary literal
    ('dance rock', 'country rock', 'wu fam'); Python silently kept the last
    value. Each is now listed once, under the cluster that was actually in
    effect ('Rock', 'Rock' and 'Other').
    NOTE(review): 'wu fam' was also listed under 'Hip Hop/Rap', which looks
    like the intended cluster - confirm before changing it.

    Returns:
        dict mapping each lowercase genre name to its cluster label.

    Raises:
        ValueError: If a genre is listed more than once in the table, so a
            duplicate can no longer override another entry unnoticed.
    """
    clusters = {
        'Hip Hop/Rap': [
            'hip hop', 'rap', 'atl hip hop', 'canadian hip hop', 'chicago rap',
            'east coast hip hop', 'conscious hip hop', 'detroit hip hop', 'melodic rap',
            'hip pop', 'dirty south rap', 'baton rouge rap', 'florida rap', 'dfw rap',
            'pop rap', 'gangster rap', 'chicago drill', 'cali rap', 'memphis hip hop',
            'emo rap', 'brooklyn drill', 'chicano rap', 'houston rap', 'bronx hip hop',
            'indie pop rap', 'lgbtq+ hip hop', 'florida drill', 'alternative hip hop',
            'battle rap', 'southern hip hop', 'hardcore hip hop', 'cloud rap', 'comedy rap',
            'g funk', 'crunk', 'chopped and screwed', 'hyphy', 'bronx drill',
        ],
        'Pop': [
            'pop', 'dance pop', 'canadian pop', 'art pop', 'k-pop', 'pop rock',
            'barbadian pop', 'acoustic pop', 'candy pop', 'post-teen pop', 'colombian pop',
            'electropop', 'australian pop', 'puerto rican pop', 'singer-songwriter pop',
            'europop', 'pop r&b', 'alternative pop', 'indie pop', 'country pop', 'pop soul',
            'bedroom pop', 'boy band', 'new wave pop', 'adult standards', 'girl group',
        ],
        'R&B': [
            'contemporary r&b', 'r&b', 'neo soul', 'canadian contemporary r&b',
            'british soul', 'alternative r&b', 'quiet storm', 'classic soul',
            'new jack swing', 'motown', 'bedroom soul', 'post-disco soul', 'retro soul',
            'new jack smooth', 'souldies',
        ],
        'Country': [
            'contemporary country', 'country', 'classic oklahoma country',
            'classic texas country', 'arkansas country', 'modern country pop',
            'country road', 'canadian country', 'country dawn', 'alberta country',
            'classic country pop', 'bakersfield sound', 'western americana', 'bluegrass',
        ],
        'Electronic/Dance': [
            'edm', 'eurodance', 'diva house', 'brostep', 'big room', 'complextro',
            'big beat', 'classic house', 'disco house', 'electro', 'dutch edm',
            'acid house', 'dance', 'house', 'australian dance', 'downtempo',
            'bubblegum dance', 'bass house', 'dutch house', 'chicago house',
            'filter house', 'bouncy house', 'chill house', 'detroit house', 'new beat',
            'german techno', 'hardcore techno', 'new italo disco', 'euphoric hardstyle',
            'classic progressive house', 'bmore', 'belgian dance', 'kids dance party',
            'dream trance',
        ],
        'Rock': [
            'alternative metal', 'alternative rock', 'modern rock', 'glam metal',
            'permanent wave', 'album rock', 'hard rock', 'irish rock', 'dance rock',
            'glam rock', 'classic rock', 'rock drums', 'modern folk rock',
            'australian rock', 'garage rock', 'soft rock', 'grunge', 'rock', 'punk rock',
            'psychedelic rock', 'folk rock', 'progressive rock', 'country rock',
            'heartland rock', 'celtic rock', 'funk metal', 'dance-punk', 'nu metal',
            'industrial', 'cowpunk', 'comedy rock', 'j-division', 'classic canadian rock',
            'canadian rock', 'russian metal', 'art rock', 'beatlesque', 'new wave',
            'british invasion', 'rock-and-roll', 'madchester', 'glam punk',
            'deep new wave', 'experimental guitar', 'vocaloid metal', 'comic metal', 'metal',
        ],
        'Latin': [
            'reggaeton', 'latin pop', 'bachata', 'corrido', 'corridos tumbados',
            'latin hip hop', 'latin arena pop', 'pop reggaeton', 'canadian latin',
            'tejano', 'urbano mexicano', 'urbano latino', 'mambo chileno', 'cubaton',
            'sertanejo', 'dembow',
        ],
        'Alternative/Indie': [
            'neo mellow', 'indietronica', 'modern alternative rock', 'pov: indie',
            'alaska indie', 'indie soul', 'modern indie pop', 'indie rock', 'emo',
            'canadian indie', 'la indie', 'brooklyn indie', 'chicago indie',
            'san marcos tx indie', 'nashville indie', 'boston indie', 'kentucky indie',
            'el paso indie', 'new jersey indie', 'atlanta indie', 'eau claire indie',
            'albuquerque indie', 'cologne indie', 'derby indie', 'bath indie',
            'athens indie', 'alabama indie',
        ],
        'Jazz': [
            'jazz', 'cool jazz', 'contemporary jazz', 'soul jazz', 'jazz funk',
            'jazz trio', 'bebop',
        ],
        'Blues': [
            'blues', 'modern blues', 'modern blues rock', 'classic blues', 'blues rock',
            'delta blues', 'southern soul blues',
        ],
        'Reggae': [
            'reggae', 'reggae fusion', 'dancehall', 'dub', 'ska', 'roots reggae',
            'lovers rock',
        ],
        'Gospel': [
            'canadian ccm', 'praise', 'family gospel', 'christian a cappella',
            'roots worship',
        ],
        'Folk': [
            'american folk revival', 'folk', 'canadian celtic', 'celtic',
            'irish singer-songwriter', 'progressive bluegrass', 'black americana',
        ],
        'Other': [
            'glee club', 'freestyle', 'deep talent show', 'idol', 'alt z', 'lilith',
            'funk', 'ectofolk', 'new romantic', 'hollywood', 'ccm', 'movie tunes',
            'afrofuturism', 'mellow gold', 'hip house', 'piano rock', 'gospel',
            'broadway', 'bounce', 'disco', 'miami bass', 'pluggnb', 'jam band',
            'escape room', 'christian music', 'sad sierreno', 'atlanta bass',
            'afrobeats', 'anime', 'other', 'comic', 'tropical', 'bboy', 'cartoon',
            'talent show', 'hi-nrg', 'lo-fi vgm', 'minneapolis sound', 'novelty',
            'jersey club', 'chamber ensemble', 'white noise', 'clean comedy', 'doo-wop',
            'talentschau', 'orthodox chant', 'healing', 'new orleans funk', 'bossbeat',
            'wu fam', 'pixel', 'swazi traditional', 'mezmur', '432hz', 'electra',
            'acoustic cover', 'chanson', 'ambeat', 'idol kayo',
        ],
    }

    genre_clusters = {}
    for cluster, genres in clusters.items():
        for genre in genres:
            if genre in genre_clusters:
                raise ValueError(
                    f"Genre {genre!r} is assigned to both "
                    f"{genre_clusters[genre]!r} and {cluster!r}"
                )
            genre_clusters[genre] = cluster

    return genre_clusters

def cluster_genres(df, genre_column, case_sensitive=False):
    """
    Map every genre in ``df[genre_column]`` to its cluster label.

    Matching rules, applied per value:
      1. Non-string values (e.g. None/NaN) become 'Unknown'.
      2. The value is stripped of surrounding whitespace and, unless
         `case_sensitive` is True, lowercased.
      3. An exact key of the cluster mapping wins.
      4. Otherwise the first mapping key (in mapping order) contained in the
         value wins; '[' and ']' are ignored for this comparison.
      5. Anything else becomes 'Other'.

    Args:
        df: DataFrame holding the genre column.
        genre_column: Name of the column with the genre strings.
        case_sensitive: When True, values are compared without lowercasing.

    Returns:
        Series of cluster labels aligned with ``df.index``. `df` itself is
        not modified; assign the result to a column to store it.
    """
    # Create genre clusters mapping
    genre_clusters = create_genre_clusters()

    # Bracket-free keys are computed once instead of once per row
    partial_keys = [
        (key.replace('[', '').replace(']', ''), value)
        for key, value in genre_clusters.items()
    ]

    # Function to find matching genre
    def match_genre(genre):
        if not isinstance(genre, str):
            return 'Unknown'

        # Convert to lowercase if case-insensitive
        search_genre = genre.strip()
        if not case_sensitive:
            search_genre = search_genre.lower()

        # Try direct match
        if search_genre in genre_clusters:
            return genre_clusters[search_genre]

        # Try partial match (remove brackets for partial matching)
        search_genre_clean = search_genre.replace('[', '').replace(']', '')
        for key_clean, value in partial_keys:
            if key_clean in search_genre_clean:
                return value

        return 'Other'

    # Apply the matcher and return its result.
    return df[genre_column].apply(match_genre)

In [14]:
# Primary genre = first ';'-separated entry, stripped; missing or empty genre strings become None
df['Primary genre'] = df['genres'].apply(
    lambda genres: genres.partition(';')[0].strip() if isinstance(genres, str) and genres else None
)


In [15]:
# Store whatever cluster_genres returns for the primary-genre column
df['final genre'] = cluster_genres(df, genre_column='Primary genre')

In [16]:
# Look up each primary genre in the cluster mapping, then label everything
# that has no mapping entry (including missing genres) as 'Other'
genre_clusters = create_genre_clusters()
df['final genre'] = df['Primary genre'].map(genre_clusters)
df['final genre'] = df['final genre'].fillna('Other')


In [17]:
# Save the dataset without the index column
df.to_csv(path_or_buf='spotify_songs.csv', index=False)

In [None]:
# Preview the first 50 rows
df.head(n=50)