In [89]:
# !pip install textblob
# !pip install spotipy
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from spotipy.oauth2 import SpotifyOAuth # this not used in this notebook but it allows username authentication

import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import TextBlob
import re


In [90]:
# Define environment variables
client_id = '0915369c658043269dc91da45f19445e' #replace with your client ID
client_secret = '27431e58338e44f1b74b281c78f0b472'#replace with your client secret
playlist_personal = '6pECpDEcLeymUolYVGSxy6' #customize to your playlists, this is my test playlist
playlist_compare = '5ka88Ozu1JdSOwR4R6GQs6' # Customize to the playlist you want to compare, this is the BB top 200

In [91]:
client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
sp = spotipy.Spotify(client_credentials_manager = client_credentials_manager)

In [92]:
a = sp.audio_features(tracks=[playlist_personal,playlist_compare])

In [93]:
genre_lst = sp.recommendation_genre_seeds()['genres']

In [94]:
genre_lst

['acoustic',
 'afrobeat',
 'alt-rock',
 'alternative',
 'ambient',
 'anime',
 'black-metal',
 'bluegrass',
 'blues',
 'bossanova',
 'brazil',
 'breakbeat',
 'british',
 'cantopop',
 'chicago-house',
 'children',
 'chill',
 'classical',
 'club',
 'comedy',
 'country',
 'dance',
 'dancehall',
 'death-metal',
 'deep-house',
 'detroit-techno',
 'disco',
 'disney',
 'drum-and-bass',
 'dub',
 'dubstep',
 'edm',
 'electro',
 'electronic',
 'emo',
 'folk',
 'forro',
 'french',
 'funk',
 'garage',
 'german',
 'gospel',
 'goth',
 'grindcore',
 'groove',
 'grunge',
 'guitar',
 'happy',
 'hard-rock',
 'hardcore',
 'hardstyle',
 'heavy-metal',
 'hip-hop',
 'holidays',
 'honky-tonk',
 'house',
 'idm',
 'indian',
 'indie',
 'indie-pop',
 'industrial',
 'iranian',
 'j-dance',
 'j-idol',
 'j-pop',
 'j-rock',
 'jazz',
 'k-pop',
 'kids',
 'latin',
 'latino',
 'malay',
 'mandopop',
 'metal',
 'metal-misc',
 'metalcore',
 'minimal-techno',
 'movies',
 'mpb',
 'new-age',
 'new-release',
 'opera',
 'pagode',

In [95]:
len(genre_lst)

126

In [96]:
# Empty lists for desired features
track_name = []
artist_name = []
album_name = []
genre = []
duration_ms = []
popularity = []
explicit = []
track_id = []
artist_id = []

In [97]:
# Iterate through each genre
for g in genre_lst:
    # Requests are limited to 50 units, so we need multiple API requests to get 1000 songs per genre
    for i in range(0,1000,50):
        q = 'genre:'+str(g)
        # Store API request results in a variable for extraction
        genre_results = sp.search(q=q, type='track', limit=50,offset=i)
        # Iterate through tracks and store relevant information in lists
        for i, t in enumerate(genre_results['tracks']['items']):
            track_name.append(t['name'])
            artist_name.append(t['artists'][0]['name'])
            album_name.append(t['album']['name'])
            genre.append(g)
            duration_ms.append(t['duration_ms'])
            popularity.append(t['popularity'])
            explicit.append(t['explicit'])
            track_id.append(t['id'])
            artist_id.append(t['artists'][0]['id'])

In [98]:
# Initialize DataFrame with data
df = pd.DataFrame({'track_name':track_name,'artist_name':artist_name,
                   'album_name':album_name,'genre':genre,'duration_ms':duration_ms,
                   'popularity':popularity,'explicit':explicit,
                   'track_id' : track_id,'artist_id':artist_id})

In [99]:
df.head()

Unnamed: 0,track_name,artist_name,album_name,genre,duration_ms,popularity,explicit,track_id,artist_id
0,All I Want - Acoustic Covers Versions of Popul...,Covers Culture,Covers - Chill Covers Of Popular Songs,acoustic,124010,26,False,5xZ6rxWy8Npyic9pbUM700,5WqAC2eKe7UnQifadzN9r7
1,Wah Heat,Albert Collins,All Time Favorites: Albert Collins,acoustic,648213,2,False,1ZVLO6DiMLrSwX5Ifere1Y,1uFixbBAduJkFAeRKznkvW
2,Teenage Dirtbag - Acoustic Covers Versions,Roostz,Chill Covers of Popular Songs - Acustico Relax...,acoustic,118043,0,False,3dLDsIVjWAeTx8idxQ6ZYL,6YZ0HqYCSfHOR7yCMr1aUk
3,My Favorite Things,YOUMOU TO OHANA,LIVE IN LIVING for Good Night,acoustic,179186,10,False,4k6JuN2UPFfChCzsaeI3qJ,6mQtCRObmxmcYwZr0bUN8e
4,Crazy Ever After,The Rescues,Crazy Ever After,acoustic,249800,22,False,2n4HM5xsEOoWA8mHL65V25,28xUSGD80Bh3CuLlg0GVqf


In [100]:
df.to_csv('../../../data/csv/myplaylist_data.csv')

In [101]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 113000 entries, 0 to 112999
Data columns (total 9 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   track_name   113000 non-null  object
 1   artist_name  113000 non-null  object
 2   album_name   113000 non-null  object
 3   genre        113000 non-null  object
 4   duration_ms  113000 non-null  int64 
 5   popularity   113000 non-null  int64 
 6   explicit     113000 non-null  bool  
 7   track_id     113000 non-null  object
 8   artist_id    113000 non-null  object
dtypes: bool(1), int64(2), object(6)
memory usage: 7.0+ MB


In [103]:
df['genre'].unique()


array(['acoustic', 'afrobeat', 'alt-rock', 'alternative', 'ambient',
       'anime', 'black-metal', 'bluegrass', 'blues', 'brazil',
       'breakbeat', 'british', 'cantopop', 'chicago-house', 'children',
       'chill', 'classical', 'club', 'comedy', 'country', 'dance',
       'dancehall', 'death-metal', 'deep-house', 'detroit-techno',
       'disco', 'drum-and-bass', 'dub', 'dubstep', 'edm', 'electro',
       'electronic', 'emo', 'folk', 'forro', 'french', 'funk', 'garage',
       'german', 'gospel', 'goth', 'grindcore', 'groove', 'grunge',
       'guitar', 'happy', 'hard-rock', 'hardcore', 'hardstyle',
       'heavy-metal', 'hip-hop', 'honky-tonk', 'house', 'idm', 'indian',
       'indie', 'indie-pop', 'industrial', 'iranian', 'j-dance', 'j-idol',
       'j-pop', 'j-rock', 'jazz', 'k-pop', 'kids', 'latin', 'latino',
       'malay', 'mandopop', 'metal', 'metalcore', 'minimal-techno', 'mpb',
       'new-age', 'opera', 'pagode', 'party', 'piano', 'pop', 'pop-film',
       'power-pop', '

In [104]:
# Pull audio features
tracks = df.track_id.to_list()
audio_features = []
batchsize = 100

# Iterate over 100 song batches (due to API limit per request)
for i in tqdm(range(0,len(tracks),batchsize)):
    batch = tracks[i:i+batchsize]
    # Collect features for 100 tracks
    feature_results = sp.audio_features(batch)
    # Store individual track info in list
    for track in feature_results:
        if track is not None:
            audio_features.append(track)

100%|██████████████████████████████████████████████████████████████████████████████| 1130/1130 [03:25<00:00,  5.49it/s]


In [105]:
audio_features[0]


{'danceability': 0.522,
 'energy': 0.196,
 'key': 1,
 'loudness': -10.809,
 'mode': 1,
 'speechiness': 0.0346,
 'acousticness': 0.852,
 'instrumentalness': 0,
 'liveness': 0.235,
 'valence': 0.3,
 'tempo': 119.382,
 'type': 'audio_features',
 'id': '5xZ6rxWy8Npyic9pbUM700',
 'uri': 'spotify:track:5xZ6rxWy8Npyic9pbUM700',
 'track_href': 'https://api.spotify.com/v1/tracks/5xZ6rxWy8Npyic9pbUM700',
 'analysis_url': 'https://api.spotify.com/v1/audio-analysis/5xZ6rxWy8Npyic9pbUM700',
 'duration_ms': 124010,
 'time_signature': 4}

In [106]:
af_df = pd.DataFrame.from_dict(data=audio_features,orient='columns')

In [107]:
af_df.head()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature
0,0.522,0.196,1,-10.809,1,0.0346,0.852,0.0,0.235,0.3,119.382,audio_features,5xZ6rxWy8Npyic9pbUM700,spotify:track:5xZ6rxWy8Npyic9pbUM700,https://api.spotify.com/v1/tracks/5xZ6rxWy8Npy...,https://api.spotify.com/v1/audio-analysis/5xZ6...,124010,4
1,0.389,0.479,7,-13.355,1,0.0667,0.581,0.894,0.681,0.612,83.31,audio_features,1ZVLO6DiMLrSwX5Ifere1Y,spotify:track:1ZVLO6DiMLrSwX5Ifere1Y,https://api.spotify.com/v1/tracks/1ZVLO6DiMLrS...,https://api.spotify.com/v1/audio-analysis/1ZVL...,648213,4
2,0.743,0.196,4,-11.533,1,0.0318,0.876,0.0,0.419,0.766,91.954,audio_features,3dLDsIVjWAeTx8idxQ6ZYL,spotify:track:3dLDsIVjWAeTx8idxQ6ZYL,https://api.spotify.com/v1/tracks/3dLDsIVjWAeT...,https://api.spotify.com/v1/audio-analysis/3dLD...,118043,4
3,0.542,0.264,3,-12.414,1,0.0913,0.911,0.000369,0.0958,0.464,206.791,audio_features,4k6JuN2UPFfChCzsaeI3qJ,spotify:track:4k6JuN2UPFfChCzsaeI3qJ,https://api.spotify.com/v1/tracks/4k6JuN2UPFfC...,https://api.spotify.com/v1/audio-analysis/4k6J...,179187,3
4,0.743,0.587,5,-6.476,1,0.0325,0.0909,1.3e-05,0.111,0.217,105.01,audio_features,2n4HM5xsEOoWA8mHL65V25,spotify:track:2n4HM5xsEOoWA8mHL65V25,https://api.spotify.com/v1/tracks/2n4HM5xsEOoW...,https://api.spotify.com/v1/audio-analysis/2n4H...,249800,4


In [110]:
# Remove unwanted features
af_df.drop(['type','uri','track_href','analysis_url','duration_ms','time_signature'],axis=1,inplace=True)

In [111]:
# Match join feature name
af_df.rename(columns={'id':'track_id'}, inplace=True)

In [112]:
final = pd.merge(df,af_df,how='inner',on='track_id')

In [113]:
final.head()

Unnamed: 0,track_name,artist_name,album_name,genre,duration_ms,popularity,explicit,track_id,artist_id,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,All I Want - Acoustic Covers Versions of Popul...,Covers Culture,Covers - Chill Covers Of Popular Songs,acoustic,124010,26,False,5xZ6rxWy8Npyic9pbUM700,5WqAC2eKe7UnQifadzN9r7,0.522,0.196,1,-10.809,1,0.0346,0.852,0.0,0.235,0.3,119.382
1,Wah Heat,Albert Collins,All Time Favorites: Albert Collins,acoustic,648213,2,False,1ZVLO6DiMLrSwX5Ifere1Y,1uFixbBAduJkFAeRKznkvW,0.389,0.479,7,-13.355,1,0.0667,0.581,0.894,0.681,0.612,83.31
2,Wah Heat,Albert Collins,All Time Favorites: Albert Collins,acoustic,648213,2,False,1ZVLO6DiMLrSwX5Ifere1Y,1uFixbBAduJkFAeRKznkvW,0.389,0.479,7,-13.355,1,0.0667,0.581,0.894,0.681,0.612,83.31
3,Wah Heat,Albert Collins,All Time Favorites: Albert Collins,blues,648213,2,False,1ZVLO6DiMLrSwX5Ifere1Y,1uFixbBAduJkFAeRKznkvW,0.389,0.479,7,-13.355,1,0.0667,0.581,0.894,0.681,0.612,83.31
4,Wah Heat,Albert Collins,All Time Favorites: Albert Collins,blues,648213,2,False,1ZVLO6DiMLrSwX5Ifere1Y,1uFixbBAduJkFAeRKznkvW,0.389,0.479,7,-13.355,1,0.0667,0.581,0.894,0.681,0.612,83.31


In [114]:
final.to_csv('../../../data/csv/mySpotifyData.csv')

In [115]:
final.shape

(254392, 20)

In [116]:
final.isnull().sum()

track_name          0
artist_name         0
album_name          0
genre               0
duration_ms         0
popularity          0
explicit            0
track_id            0
artist_id           0
danceability        0
energy              0
key                 0
loudness            0
mode                0
speechiness         0
acousticness        0
instrumentalness    0
liveness            0
valence             0
tempo               0
dtype: int64