In [1]:
# dependencies
import requests
import json
import pandas as pd
import time
import os

# import spotipy
#!pip install spotipy
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

In [2]:
from config import spotify_client_ID as sp_client, spotify_client_secret as sp_secret

In [3]:
# Authenticate against the Spotify Web API with the Client Credentials flow.
# spotipy docs: https://spotipy.readthedocs.io/en/master/?highlight=spotifyclientcredentials#client-credentials-flow
# Web API reference: https://developer.spotify.com/documentation/web-api/reference/#/

# The client ID and secret are read from the local `config` module.
credentials_manager = SpotifyClientCredentials(
    client_id=sp_client,
    client_secret=sp_secret,
)
sp = spotipy.Spotify(auth_manager=credentials_manager)

In [4]:
# Base URL of the Spotify Web API (v1).
# Overview:  https://developer.spotify.com/documentation/web-api/
# Reference: https://developer.spotify.com/documentation/web-api/reference/#/
sp_root_url = "https://api.spotify.com/v1/"

In [5]:
# Collect tracks from 2018 through the Spotify search endpoint, paging through
# the results 50 at a time, and time how long the whole collection takes.
import timeit
start = timeit.default_timer()

# containers for audio feature scores (left empty by this cell)
acousticness, danceability, energy, instrumentalness = [], [], [], []
key, liveness, loudness, mode = [], [], [], []
speechiness, tempo, time_signature, valence = [], [], [], []

# containers for artist data
artist_name, artist_URIs, artist_genre = [], [], []
popularity, artist_followers = [], []

# containers for track data
track_URIs, track_name, track_id = [], [], []

# 20 pages x 50 tracks = offsets 0, 50, ..., 950
for page_offset in range(0, 1000, 50):
    track_results = sp.search(q='year:2018', type='track', limit=50, offset=page_offset)
    page_items = track_results['tracks']['items']
    for item in page_items:
        # only the first credited artist of each track is recorded
        first_artist = item['artists'][0]
        artist_name.append(first_artist['name'])
        track_name.append(item['name'])
        track_id.append(item['id'])
        popularity.append(item['popularity'])

stop = timeit.default_timer()
elapsed_seconds = stop - start
print('Time to run this code (in seconds):', elapsed_seconds)

Time to run this code (in seconds): 7.9502476


In [6]:
# Assemble the per-track lists gathered by the search loop into one table.
# pandas is already imported in the notebook's first cell, so it is not
# re-imported here.
track_dataframe = pd.DataFrame({
    'artist_name': artist_name,
    'track_name': track_name,
    'track_id': track_id,
    'popularity': popularity,
})

# show the shape and a preview together using the notebook's rich display
display(track_dataframe.shape, track_dataframe.head(10))

(1000, 4)


Unnamed: 0,artist_name,track_name,track_id,popularity
0,Morgan Wallen,Whiskey Glasses,6foY66mWZN0pSRjZ408c00,82
1,The Weeknd,I Was Never There,1cKHdTo9u0ZymJdPGSh6nq,91
2,Joji,SLOW DANCING IN THE DARK,0rKtyWc8bvkriBthvHKY8d,86
3,The Weeknd,Call Out My Name,09mEdoA6zrmBPgTEN5qXmN,90
4,Lil Baby,Drip Too Hard (Lil Baby & Gunna),78QR3Wp35dqAhFEc2qAGjE,86
5,Lil Baby,Yes Indeed,6vN77lE9LK6HP2DewaN6HZ,83
6,Morgan Wallen,Chasin' You,5MwynWK9s4hlyKHqhkNn4A,80
7,Juice WRLD,All Girls Are The Same,4VXIryQMWpIdGgYR4TrjT1,84
8,Juice WRLD,Lucid Dreams,285pBltuF7vW8TeWk8hdRR,86
9,21 Savage,ball w/o you,50a8bKqlwDEqeiEknrzkTO,82


In [7]:
# Resolve every row's artist name to a Spotify artist URI.
#
# `q_search` ends up with one entry per row of `track_dataframe`, in row order.
# Rows whose artist cannot be resolved get None (and are reported below) so a
# failed lookup never silently reuses the URI of the previous artist.
#
# Spotify 'item' search
# https://developer.spotify.com/documentation/web-api/reference/#/operations/search
# https://spotipy.readthedocs.io/en/master/

q_search = []
artist_uri_by_name = {}   # cache: one API request per distinct artist name
unresolved_artists = []

for artist in track_dataframe['artist_name']:

    if artist not in artist_uri_by_name:
        # offset=0: only the top hit is needed. Using the row number as the
        # offset would ask for the N-th track of the artist, which usually
        # does not exist and returns an empty result.
        artist_search = sp.search(q='artist:' + artist, type='track', limit=1, offset=0)
        try:
            hit_artists = artist_search['tracks']['items'][0]['artists']
            # prefer the credited artist whose name matches the query; fall
            # back to the first credited artist when none matches exactly
            matching_uris = [
                credited['uri'] for credited in hit_artists
                if credited['name'].casefold() == artist.casefold()
            ]
            artist_uri = matching_uris[0] if matching_uris else hit_artists[0]['uri']
        except (KeyError, IndexError, TypeError):
            # empty or malformed search result for this artist
            artist_uri = None
            unresolved_artists.append(artist)
        artist_uri_by_name[artist] = artist_uri

    q_search.append(artist_uri_by_name[artist])

if unresolved_artists:
    print('No artist URI found for:', unresolved_artists)

# preview only; the full list has one entry per track
q_search[:10]

KeyboardInterrupt: 

In [None]:
# Add the resolved artist URIs as a column (q_search is aligned with the
# rows by position), then preview the first rows.
track_dataframe = track_dataframe.assign(artist_uri=q_search)
track_dataframe.head(5)

In [None]:
# get artist genre data
#
# Builds a one-hot genre table with ONE row per distinct artist URI and joins
# it onto the track table. Keeping the genre table unique per artist means:
#   * each artist is requested from the API once, not once per track, and
#   * the merge is many-to-one, so it cannot multiply rows for artists that
#     have several tracks.
# The result has the track columns first, followed by the genre columns in
# alphabetical order.

artist_URIs = q_search

# artist URI -> list of normalised genre labels
artist_genre_dict = {}
for artist in dict.fromkeys(artist_URIs):   # distinct URIs, first-seen order

    # rows without a resolved URI cannot be looked up; the inner merge below
    # leaves them out of full_df
    if pd.isna(artist):
        continue

    # get the artist's genre(s)
    artist_genres = sp.artist(artist)['genres']

    # format genre strings: lower case, spaces -> underscores
    artist_genre_dict[artist] = [x.replace(' ', '_').lower() for x in artist_genres]

# every distinct genre, in first-seen order
all_genres = list(dict.fromkeys(
    genre for genres in artist_genre_dict.values() for genre in genres
))

# one-hot table: 1 where the artist has the genre, 0 otherwise
genre_columns = sorted(all_genres)
known_artists = list(artist_genre_dict)
genre_df = pd.DataFrame(
    [
        [int(genre in artist_genre_dict[artist]) for genre in genre_columns]
        for artist in known_artists
    ],
    index=known_artists,
    columns=genre_columns,
)
genre_df['artist_uri'] = known_artists

# join genres onto the tracks; validate guards the one-row-per-artist invariant
full_df = (
    track_dataframe
    .merge(genre_df, how="inner", on="artist_uri", validate="many_to_one")
    .drop_duplicates()
)

In [None]:
# Random preview of up to 10 rows. The size is capped at the table length so
# a small table does not raise, and the seed keeps the displayed rows stable
# across re-runs.
full_df.sample(n=min(10, len(full_df)), random_state=42)

In [None]:
# Report the number of rows, then the number of columns, of the merged table.
row_count, column_count = full_df.shape
print(row_count)
print(column_count)

In [None]:
# keep the 50 top genres -- step 1: count how many rows carry each genre

# Everything in full_df except these track/artist columns is a 0/1 genre
# indicator. Dropping them by name (instead of summing every column and
# slicing off the first five positions) keeps text columns out of the sum and
# gives integer counts.
metadata_columns = ['artist_name', 'track_name', 'track_id', 'popularity', 'artist_uri']

# get the sum of each genre column
genre_counts = full_df.drop(columns=metadata_columns).sum(axis=0)
genre_counts_df = genre_counts.to_frame(name='count')
genre_counts_df

In [None]:
# Rank genres by count (highest first) and keep the first 50.
genres_by_count = genre_counts_df.sort_values(by='count', ascending=False)
top_genres_df = genres_by_count.iloc[:50]
top_genres = top_genres_df.index

# preview the 8 most common genres
top_genres_df.head(8)

In [None]:
# Genres ranked below the top 50 by count are removed from the dataset.
genres_ranked = genre_counts_df.sort_values(by='count', ascending=False)
drop_genres = genres_ranked.index[50:]

# new table without the low-count genre columns
genres_df = full_df.drop(columns=drop_genres)

# number of columns remaining
genres_df.shape[1]

In [None]:
# get audio features for all tracks
#
# Track IDs are sent in batches rather than one request per track, and a
# track for which the API returns no features (None) gets missing values
# instead of aborting the cell.
# https://developer.spotify.com/documentation/web-api/reference/#/operations/get-several-audio-features

# feature scores copied into the dataframe, one column each
AUDIO_FEATURE_NAMES = [
    'acousticness',
    'danceability',
    'energy',
    'instrumentalness',
    'key',
    'liveness',
    'loudness',
    'mode',
    'speechiness',
    'tempo',
    'time_signature',
    'valence',
]

# maximum number of track IDs per audio-features request
AUDIO_FEATURES_BATCH_SIZE = 100

track_ids = genres_df['track_id'].tolist()

# flat list with one entry per track, in row order: a dict of features, or
# None when the API has no features for that track
track_features = []
for batch_start in range(0, len(track_ids), AUDIO_FEATURES_BATCH_SIZE):
    batch_ids = track_ids[batch_start:batch_start + AUDIO_FEATURES_BATCH_SIZE]
    track_features.extend(sp.audio_features(batch_ids))

# the columns below are assigned by position, so the lengths must line up
assert len(track_features) == len(track_ids), "audio feature results do not match the track list"

# save feature data to the dataframe
for feature_name in AUDIO_FEATURE_NAMES:
    genres_df[feature_name] = [
        features[feature_name] if features else None
        for features in track_features
    ]


In [None]:
# preview the first five rows of the final table
genres_df.head(5)

In [None]:
# write dataframe to csv
# to_csv needs a file path; "00_data/" on its own is a directory, so the file
# is written inside it and the directory is created if it does not exist yet.
# NOTE(review): the file name is chosen here -- confirm what any downstream
# notebook expects to read.
output_dir = "00_data"
os.makedirs(output_dir, exist_ok=True)
genres_df.to_csv(os.path.join(output_dir, "spotify_2018_tracks.csv"))