In [1]:
# Standard libraries
import sys

# Scientific libraries
import pandas as pd

# Spotify
import spotipy
from spotipy.oauth2 import SpotifyOAuth
import spotify_credentials

# Utility functions
from helpers.helpers_spotify import get_tracks_from_playlist, get_tracks_from_id, cleaned_track_data

In [2]:
def clear_spotify_cache():
    """
    Reset the cache kept by the Spotify helper module.

    Thin wrapper around `helpers.helpers_spotify.clear_cache`; what exactly
    gets cleared is defined by that helper. The helper is imported lazily,
    on every call, rather than at notebook start-up.

    Returns:
    - None
    """
    from helpers.helpers_spotify import clear_cache as _purge_cache

    _purge_cache()

# Import Spotify Data

## Authentication

In [3]:
# Authorization scope requested from Spotify. The available scopes are listed at:
# https://developer.spotify.com/documentation/general/guides/authorization/scopes/#playlist-read-private
scope = "user-library-read"

# Authenticated client used by the rest of the notebook. Credentials come from
# the local `spotify_credentials` module.
spotify = spotipy.Spotify(
    auth_manager=SpotifyOAuth(
        client_id=spotify_credentials.client_ID,
        client_secret=spotify_credentials.client_SECRET,
        redirect_uri=spotify_credentials.redirect_url,
        scope=scope,
    ),
    requests_timeout=5,  # seconds to wait for a response before giving up
    retries=1,           # total number of retries allowed
    status_retries=3,    # retries allowed on bad status codes
    backoff_factor=30,   # backoff applied between attempts after the second try
)

In [4]:
# The previous cell already built an authenticated client with exactly this
# scope, these credentials and this timeout/retry configuration. Creating a
# second SpotifyOAuth manager and a second Spotify client here would only
# duplicate that setup (and its HTTP session), so the more descriptive names
# below are bound to the objects that already exist instead.
# https://developer.spotify.com/documentation/general/guides/authorization/scopes/#playlist-read-private
spotify_scope = scope                         # "user-library-read"
spotify_auth_manager = spotify.auth_manager   # the SpotifyOAuth instance given to `spotify`
spotify_api = spotify                         # same client object, descriptive alias

## Scrape saved tracks data

In [5]:
def get_saved_tracks(sp):
    """
    Collects every saved-track entry of the authenticated user, following pagination.
    Based on: https://github.com/alicebarbe/SoundtrackofLife/blob/main/spotify.py

    Parameters:
    - sp: A Spotipy client object; `current_user_saved_tracks` and `next` are called on it.

    Returns:
    - A new list with the `'items'` entries of every page, in the order the pages were returned.
    """
    # First page, requested with a page size of 50 entries
    page = sp.current_user_saved_tracks(limit=50)
    collected = list(page['items'])

    # A truthy 'next' field means there is another page to fetch
    while page['next']:
        page = sp.next(page)
        collected += page['items']

    return collected

In [6]:
def spotify_analyzer(data, spotipy_client):
    """
    Builds a pandas DataFrame with one row of cleaned track data per track.

    Parameters:
    -----------
    data :
        Track metadata, forwarded unchanged to `get_tracks_from_playlist` as
        `playlist_metadata`. In this notebook it is the list returned by
        `get_saved_tracks`.
    spotipy_client : spotipy.Spotify
        A spotipy client object with valid authentication; forwarded to
        `cleaned_track_data` for every track.

    Returns:
    --------
    pd.DataFrame
        One row per record returned by `cleaned_track_data`
        (an empty DataFrame when no track is yielded).
    """

    # Audio features requested for every track. Each feature is listed exactly
    # once; the tuple previously contained "danceability" twice.
    audio_keys = (
        "energy",
        "valence",
        "danceability",
        "acousticness",
        "loudness",
        "speechiness",
        "instrumentalness",
        "liveness",
        "tempo",
        "key",
        "mode",
        "time_signature",
        "duration_ms",
    )

    # Iterable of per-track metadata built from `data`.
    # Alternative when `data` is a list of track ids:
    #   get_tracks_from_id(tracks_id=data, spotipy_client=spotipy_client)
    track_generator = get_tracks_from_playlist(playlist_metadata=data)

    # One cleaned record per track; the DataFrame is built once from the full
    # list instead of being grown row by row.
    music_data = [
        cleaned_track_data(
            track_metadata=track_metadata,
            audio_keys=audio_keys,
            spotipy_client=spotipy_client,
        )
        for track_metadata in track_generator
    ]

    return pd.DataFrame(music_data)

In [7]:
# Fetch every saved-track entry of the authenticated user (all pages)
playlist_metadata = get_saved_tracks(sp=spotify)

In [8]:
# Parallel variant, kept for reference only (it might run into Spotify's API rate limit):
# from joblib import Parallel, delayed
# results = Parallel(n_jobs=-1)(delayed(spotify_analyzer)(data, spotify) for data in playlist_metadata)
# df = pd.concat(results).reset_index(drop=True)

# Sequential run over all saved tracks
df = spotify_analyzer(playlist_metadata, spotify)
df

Could not find related genres for the track: 'Mobiagse - Mixed' (id: '4NHeCNltciImyZexSMNRwc')
Could not find related genres for the track: 'Dancar' (id: '0UwcqaMHoBRbCKFj2NZ2AC')
Could not find related genres for the track: 'House of Illusion' (id: '3XRkMJIgAOGMoB5wlptXaC')
Could not find related genres for the track: 'Forever Walking' (id: '7KbA07FXaTcl6CZ63J9AdI')
Could not find related genres for the track: 'Souls of Sorrow (555)' (id: '3HxQsBASZPsg3FUIpeGdE5')
Could not find related genres for the track: 'Spirit Voices' (id: '1c32e7sA6sxY5GmOxvn7ya')


Unnamed: 0,spotify_id,title,artists,artists_id,genres,popularity,energy,valence,danceability,acousticness,loudness,speechiness,instrumentalness,liveness,tempo,key,mode,time_signature,duration_ms
0,2TU4uc1YAQrWV57fJ7TPk4,Blue Gold,[Far Orange],[1SIt7IjD8Q9RpwEklyMlUO],"[beach house, chill beats, deep deep house, pr...",53,0.586,0.1570,0.727,0.07830,-9.451,0.0427,0.9020,0.1080,110.001,5,0,4,224909
1,3aIv0tjMJqDzONyBFtUa9g,For The First Time,[Slow],[5II01coLXrJeSFThmONDoB],"[chillhop, lo-fi jazzhop]",47,0.709,0.7430,0.807,0.11500,-13.109,0.0456,0.8960,0.2590,110.983,9,0,4,197000
2,1yMiMLEVBWals38tiLThLR,Coquelicot,[OYOANNE],[54SxhcvIgW6e7A4R2UKNrL],"[ambient, background jazz]",51,0.535,0.1820,0.826,0.00641,-9.885,0.1730,0.6980,0.1590,115.024,3,0,4,153511
3,1gr3wnx384k5kcQAs2S1FA,memphis,[Remy Van Kesteren],[5x183GdNFhX9FETwGNpOZE],[classical harp],34,0.694,0.4170,0.466,0.67000,-15.459,0.0440,0.8390,0.1040,199.963,4,0,4,194027
4,4qzIWW9yArNdpuqc9SbtJH,Floating,[Klur],[5Y1YwWzFX7BIxBbdAOXOEJ],[progressive house],60,0.671,0.0763,0.706,0.07900,-12.190,0.0391,0.8720,0.1020,120.002,1,1,4,252552
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2158,4cIDajUp2dMjKiDmDvFDg8,Tarlabasi - Be Svendsen Remix,"[Oceanvs Orientalis, Be Svendsen]","[3gNEIgLeknpwkNViU8WAhg, 4BaLB5aiExO29BEGVUisru]","[deep euro house, ethnotronica, organic electr...",53,0.599,0.2160,0.854,0.20600,-12.259,0.0604,0.8610,0.0899,114.001,4,0,4,545813
2159,2wqyAaRNAZBO9OncYwAQSR,Japan (Instrumental),[Tycho],[5oOhM2DFWab8XhSdQiITry],"[chillwave, downtempo, electronica, indietroni...",22,0.433,0.3400,0.562,0.21900,-12.291,0.0323,0.7980,0.3860,166.036,0,1,4,374569
2160,1jHSmrYPQsr5qGRxLmEQ52,Tuur mang Welten,[Niklas Paschburg],[4dTw5svKFBPnfijbi3H9eI],"[compositional ambient, neo-classical]",0,0.348,0.2610,0.290,0.96600,-16.575,0.0437,0.8560,0.3710,179.919,1,0,3,277597
2161,6Vhtb9RU1thtzBcsy3bI9q,Dance of Kali,[Prem Joshua],[1Bs9FqmJBHrAJN6DLFIPt1],"[indian fusion, kirtan, world fusion]",0,0.697,0.4870,0.614,0.02590,-8.212,0.0322,0.0897,0.0575,154.995,8,1,4,511200


In [9]:
# How many tracks have no genres: total rows minus the rows whose genres
# value is truthy (an empty list is falsy)
len(df) - df['genres'].astype(bool).sum()

6

In [10]:
# Boolean mask of the rows whose genres value is falsy (e.g. an empty list),
# then display exactly those rows
no_genres = ~df['genres'].astype(bool)
df.loc[no_genres]

Unnamed: 0,spotify_id,title,artists,artists_id,genres,popularity,energy,valence,danceability,acousticness,loudness,speechiness,instrumentalness,liveness,tempo,key,mode,time_signature,duration_ms
360,4NHeCNltciImyZexSMNRwc,Mobiagse - Mixed,[Ulf Alexander],[64MJxMlRak8Xgh3N9vePD1],[],0,0.611,0.132,0.847,0.0274,-12.289,0.0996,0.485,0.0979,122.01,4,0,4,424918
1059,0UwcqaMHoBRbCKFj2NZ2AC,Dancar,[REA SOM],[7jgK8I3O10R3QKxxGS9WJ7],[],2,0.598,0.684,0.777,0.739,-12.49,0.0974,7.9e-05,0.0998,111.068,10,0,4,212440
1217,3XRkMJIgAOGMoB5wlptXaC,House of Illusion,[SONNY],[7woO5xoM5KGReQEEqdexGj],[],0,0.754,0.692,0.478,0.0224,-10.807,0.0432,0.532,0.335,181.966,4,0,4,528941
1315,7KbA07FXaTcl6CZ63J9AdI,Forever Walking,[Max Weis],[3MYl2oyRRRieXiROUzhE3z],[],0,0.428,0.228,0.758,0.179,-11.706,0.0793,0.589,0.3,120.001,7,0,4,285161
1411,3HxQsBASZPsg3FUIpeGdE5,Souls of Sorrow (555),[Paya],[21KKHztU2Tpix1EFOg7xBI],[],4,0.711,0.237,0.712,0.204,-8.781,0.0648,0.4,0.165,100.007,7,1,4,355200
1414,1c32e7sA6sxY5GmOxvn7ya,Spirit Voices,[Karam],[6sb4xnx2GsiNxIx4A4kjDm],[],2,0.488,0.385,0.679,0.125,-11.2,0.0497,0.908,0.124,105.0,7,1,4,329875


In [11]:
# True if at least one cell of the table is null/NaN
# (empty genre lists are not nulls and are not counted here)
df.isna().to_numpy().any()

False

In [None]:
# Persist the complete table: CSV for visual inspection, pickle as a backup
df.to_csv(path_or_buf='../Data/csv/01_spotify_data_(all).csv')
df.to_pickle(path='../Data/pkl/01_spotify_data_(all).pkl')


# Store the tracks without genres separately from the complete ones
df.loc[no_genres].to_pickle(path='../Data/pkl/01_spotify_data_(isna).pkl')
df.loc[~no_genres].to_pickle(path='../Data/pkl/01_spotify_data_(notna).pkl')