#### Install Spotipy library to use Spotify Web API & dotenv for the environment variables

In [1]:
pip install spotipy



In [2]:
!pip install colab-env -qU

#### Import OS functionality & load environment variables from the local `.env`

In [3]:
import os # for operating system functionalities
import spotipy # for Spotify Web API
import timeit # for runtime checking
import pandas as pd # for data manipulation & analysis
import numpy as np # for linear algebra
import IPython # for audio previews
import colab_env # for Google Colab environment variables
from spotipy.oauth2 import SpotifyClientCredentials # for Spotify authentication
from tqdm.notebook import trange, tqdm # for progress bars
from IPython.display import Image # for images display


Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


#### Import spotipy library to use Spotify Web API
https://spotipy.readthedocs.io/

## Authentication

In [4]:
client_id = os.getenv('CLIENT_ID')
client_secret = os.getenv('CLIENT_SECRET')

client_credentials_manager = SpotifyClientCredentials(client_id=client_id,
                                                      client_secret=client_secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

## List of playlists

In [5]:
USER_NAME = 'eargasmusic'
user = sp.user(USER_NAME)
playlist_dict = sp.user_playlists(USER_NAME)
playlist_items = playlist_dict['items']

# for item in playlist_items:
#     print(item['id'])
# all_playlists = pd.DataFrame(playlist_items,
#                          columns=['id', 'name', 'external_urls', 'images', 'tracks'])

playlist_details = [[item['id'],
                     item['name'],
                     item['external_urls']['spotify'],
                     item['images'][0]['url'],
                     item['tracks']['total']] for item in playlist_items]

all_playlists = pd.DataFrame(playlist_details,
                             columns=['id', 'name', 'url', 'image', 'tracks'])

# Create boolean filter series to mask named and unnamed playlists
named = all_playlists['name'].str.startswith('eargasm | ')
unnamed = all_playlists['name'].str.startswith('eargasm music ')

# Let's keep these playlists as unlabeled for later experiments
unnamed_playlists = all_playlists[unnamed.values]
# Extract all named playlists
named_playlists = all_playlists[named.values]

#### Save or the basic info dataframe to csv file

In [6]:
named_playlists.to_csv('named_playlists.csv')
unnamed_playlists.to_csv('unnamed_playlists.csv')

#### Load the previously fetched data

In [7]:
# named_playlists = pd.read_csv('named_playlists.csv', index_col=0)
# unnamed_playlists = pd.read_csv('unnamed_playlists.csv', index_col=0)

In [8]:
named_playlists.head()

Unnamed: 0,id,name,url,image,tracks
0,5apHWYcigR3lSZpyzyGKEa,eargasm | breathe easy,https://open.spotify.com/playlist/5apHWYcigR3l...,https://i.scdn.co/image/ab67706c0000bebbd3ccf5...,173
1,3MXM4ca1b3bT198F7mG9ms,eargasm | city walk,https://open.spotify.com/playlist/3MXM4ca1b3bT...,https://i.scdn.co/image/ab67706c0000da84e10d9c...,242
2,2QdM3NBe7lkOzC7OqWXfNI,eargasm | curvatronik,https://open.spotify.com/playlist/2QdM3NBe7lkO...,https://i.scdn.co/image/ab67706c0000bebb2aa390...,213
3,1CwPTyGbQDSda6m7vTys1d,eargasm | decadency,https://open.spotify.com/playlist/1CwPTyGbQDSd...,https://i.scdn.co/image/ab67706c0000da84c70dd4...,68
4,6pGQQZ4PITmFnSC0rTnmXp,eargasm | deep water,https://open.spotify.com/playlist/6pGQQZ4PITmF...,https://i.scdn.co/image/ab67706c0000da846e34ff...,93


In [9]:
unnamed_playlists.head()

Unnamed: 0,id,name,url,image,tracks
29,43754bIdP7b0ygh8tTMenW,eargasm music 2020,https://open.spotify.com/playlist/43754bIdP7b0...,https://i.scdn.co/image/ab67706c0000da84c881c9...,254
30,0MsxZLGhAKJyBMXAfD03db,eargasm music 2019,https://open.spotify.com/playlist/0MsxZLGhAKJy...,https://i.scdn.co/image/ab67706c0000da84eabc26...,815
31,4tFrGBRcTYsrz5BwCGZS8L,eargasm music 2018,https://open.spotify.com/playlist/4tFrGBRcTYsr...,https://i.scdn.co/image/ab67706c0000bebb96c0b2...,826
32,0tNl58CSFwviwg7LxWzdwy,eargasm music 2017,https://open.spotify.com/playlist/0tNl58CSFwvi...,https://i.scdn.co/image/ab67706c0000bebbb7c821...,876
33,2CDNi9K1M0ilAUQn1FTVp4,eargasm music 2016,https://open.spotify.com/playlist/2CDNi9K1M0il...,https://i.scdn.co/image/ab67706c0000bebbde6035...,61


#### Random named playlist info

In [12]:
random_playlist_id = named_playlists['id'][np.random.randint(0, len(named_playlists['id']))]
random_filter = named_playlists['id'] == random_playlist_id
random_playlist_info = named_playlists[random_filter.values].iloc[0]
for key, value in random_playlist_info.iteritems():
    if key == 'image': IMAGE_URL=value
    else: print('{}: {}'.format(key, value))
Image(url = IMAGE_URL, width = 300, height = 300)

id: 1YfxkylLN0ecX19mAquHGy
name: eargasm | dust settling
url: https://open.spotify.com/playlist/1YfxkylLN0ecX19mAquHGy
tracks: 85


## Basic song information

#### Run to retrieve fresh data

In [11]:
start = timeit.default_timer()

track_id = []
track_artists = []
track_name = []
track_duration = []
track_popularity = []
track_releasedate = []
track_preview = []
track_url = []
track_playlist = []


for _, (_id, _name) in tqdm(named_playlists.iterrows()):
    
    # Run in 100-item chunks to bypass Spotify API query limit
    for i in tqdm(range(0,10000,100)):
        
        playlist_items = sp.playlist_items(_id, limit=100, offset=i)['items']        
        for item in playlist_items:
            track_id.append(item['track']['id'])
            track_artists.append(item['track']['artists'][0]['name'])
            track_name.append(item['track']['name'])
            track_duration.append(item['track']['duration_ms'])
            track_popularity.append(item['track']['popularity'])
            track_releasedate.append(item['track']['album']['release_date'])
            track_preview.append(item['track']['preview_url'])
            track_url.append(item['track']['external_urls']['spotify'])
            track_playlist.append(_name)        
    
basic_info_df = pd.DataFrame({'track_id': track_id,
                              'track_artists': track_artists,
                              'track_name': track_name,
                              'track_duration': track_duration,
                              'track_popularity': track_popularity,
                              'track_releasedate': track_releasedate,
                              'track_preview': track_preview,
                              'track_url': track_url,
                              'track_playlist': track_playlist})

stop = timeit.default_timer()
print('Runtime: {} seconds.'.format(stop-start))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

ValueError: ignored

#### Save or the basic info dataframe to csv file

In [None]:
basic_info_df.to_csv('basic_info.csv')

#### Load the previously fetched data

In [None]:
# basic_info_df = pd.read_csv('basic_info.csv', index_col=0)

#### Check the dataframe info

In [None]:
basic_info_df.head()

In [None]:
basic_info_df.shape

In [None]:
basic_info_df.info()

#### Random track basic info

In [None]:
random_track_id = basic_info_df['track_id'][np.random.randint(0, len(basic_info_df['track_id']))]
random_filter = basic_info_df['track_id'] == random_track_id
random_track_info = basic_info_df[random_filter.values].iloc[0]
for key, value in random_track_info.iteritems():
    if key == 'track_preview' and value is not np.nan:
        PREVIEW_URL = value
    print('{}: {}'.format(key, value))
IPython.display.Audio(url=PREVIEW_URL, embed=True)

## Audio features

In [None]:
def audio_features(id):
    all_features = sp.audio_features(id)[0]
    columns_to_keep = ['id',
                       'danceability',
                       'energy',
                       'speechiness',
                       'acousticness',
                       'instrumentalness',
                       'liveness',
                       'valence']
    selected_features = { column: all_features[column] for column in columns_to_keep }
    
    return selected_features

In [None]:
print('Random track'audio_features('0hXzxTABL0Q85WhdzS62Fc'))

In [None]:
start = timeit.default_timer()

audio_features_df = pd.DataFrame()

for track_id in tqdm(labeled_songs['track_id']):
    features = audio_features(track_id)   
    audio_features_df = audio_features_df.append(features, ignore_index=True)

stop = timeit.default_timer()
print('Runtime: {} seconds.'.format(stop-start))

In [None]:
audio_features_df

## Audio analysis

In [None]:
def audio_analysis(id):
    track_features = sp.audio_analysis(id)['track']  
    
    columns_to_keep = ['tempo',
                       'tempo_confidence',
                       'time_signature',
                       'time_signature_confidence',
                       'key',
                       'key_confidence',
                       'mode',
                       'mode_confidence']
    
    selected_analysis = { column: track_features[column] for column in columns_to_keep }
    
    selected_analysis['number_of_sections'] = len(sp.audio_analysis(id)['sections'][0])
    selected_analysis['track_id'] = id
    
    return selected_analysis

audio_analysis('0hXzxTABL0Q85WhdzS62Fc')

In [None]:
start = timeit.default_timer()
audio_analysis_df = pd.DataFrame()


for track_id in tqdm(labeled_songs['track_id']):
    # Continue when encountered an error
    try:
        analysis = audio_analysis(track_id)
        audio_analysis_df = audio_analysis_df.append(analysis, ignore_index=True)
    except:
        audio_analysis_df = audio_analysis_df.append({'tempo': np.nan,
                                                      'tempo_confidence': np.nan,
                                                      'time_signature': np.nan,
                                                      'time_signature_confidence': np.nan,
                                                      'key': np.nan,
                                                      'key_confidence': np.nan,
                                                      'mode': np.nan,
                                                      'mode_confidence': np.nan,
                                                      'number_of_sections': np.nan,
                                                      'track_id': track_id},
                                                      ignore_index=True)

stop = timeit.default_timer()
print('Runtime: {} seconds.'.format(stop-start))

In [None]:
audio_analysis_df

## Merge basic info with audio features & audio analysis

In [None]:
print('Labeled songs:', labeled_songs.shape)
print('Audio features:', audio_features_df.shape)
print('Audio analysis:', audio_analysis_df.shape)

In [None]:
df = pd.merge(labeled_songs, audio_features_df, )
df

In [None]:
labeled_songs.reset_index(inplace=True)
# labeled_songs.set_index(['track_id'], inplace=True)

audio_features_df.reset_index(inplace=True)
# audio_features_df.set_index(['track_id'], inplace=True)

audio_analysis_df.reset_index(inplace=True)
# audio_analysis_df.set_index(['track_id'], inplace=True)

In [None]:
features_df = pd.merge(audio_analysis_df,
                    audio_features_df)
final_df = pd.merge(labeled_songs,
                   features_df)
final_df

## TO DO

- duplicate songs
- similarities between playlists
- new songs recommendations/prediction
- audio features
- add genre
- preview_url
- uri
- external_urls
- add random playlist (with number of songs and background image)

## References
- https://github.com/tgel0/spotify-data
- https://www.kaggle.com/aeryan/spotify-music-analysis
- https://www.kaggle.com/arpita28/analysis-of-spotify-trends
- https://www.kaggle.com/jsongunsw/spotify-datasets
- https://www.kaggle.com/mohitkr05/spotify-data-visualization
- https://www.kaggle.com/karthiknc/spotify-visualisation-analysis
- https://www.kaggle.com/souhardyaganguly/spotify-svm
- https://www.kaggle.com/lowkimhoe/prediction-model-on-spotify-classification
- https://www.kaggle.com/pavansanagapati/birds-sounds-eda-spotify-urban-sound-eda
- https://www.kaggle.com/shadey/spotify
- https://medium.com/swlh/creating-waveforms-out-of-spotify-tracks-b22030dd442b
- https://vsupalov.com/analyze-spotify-music-library-with-jupyter-pandas/
- https://www.youtube.com/watch?v=v-9Mpe7NhkM
- http://harpolea.github.io/2018-05-09-spotipy
- https://medium.com/analytics-vidhya/music-genre-classification-with-python-51bff77adfd6

In [None]:
recommendations(seed_artists=None, seed_genres=None, seed_tracks=None, limit=20, country=None, **kwargs)