In [141]:
import pandas as pd
import time
from tqdm import tqdm
import pickle
import spotipy
import spotipy.util as util
from spotipy.oauth2 import SpotifyClientCredentials
import spotipy.oauth2 as oauth2

# Spotify Data Collection

## 1. Establish credentials

In [None]:
#spotify API credentials

CLIENT_ID = "my_client_id"
CLIENT_SECRET = "my_client_secret"

credentials = oauth2.SpotifyClientCredentials(
        client_id=CLIENT_ID,
        client_secret=CLIENT_SECRET)

token = credentials.get_access_token()
sp = spotipy.Spotify(auth=token)

## 2. Import and format data from Pitchfork

In [143]:
#import dataframe from pitchfork web scraping
pitchfork_df = pd.read_pickle('pitchfork_df.pickle')

In [144]:
pitchfork_df.head()

Unnamed: 0,Album,Artist,Score,Genres,Summary
0,Petals for Armor,Hayley Williams,7.2,Pop/R&BRock,The Paramore singer’s debut solo album is emot...
1,Regresa,Buscabulla,7.7,Electronic,The Puerto Rican duo moved back to the island ...
2,"What’s New, Tomboy?",Damien Jurado,7.3,Rock,Channeling stripped-back pop songs into hushed...
3,Shadow Talk,Cafe Racer,7.4,Rock,The Chicago band’s third album brings wiry new...
4,It Was Good Until It Wasn’t,Kehlani,7.7,Pop/R&B,The cloudy grooves of the Oakland singer’s sec...


In [145]:
#creat list of lists for album-artist pairs in order to search spotify
alb_art_pairs = [[pitchfork_df.Album[x], pitchfork_df.Artist[x]] for x in range(len(pitchfork_df))]

In [146]:
len(alb_art_pairs)

6516

In [147]:
alb_art_pairs[0:5]

[['Petals for Armor', 'Hayley Williams'],
 ['Regresa', 'Buscabulla'],
 ['What’s New, Tomboy?', 'Damien Jurado'],
 ['Shadow Talk', 'Cafe Racer'],
 ['It Was Good Until It Wasn’t', 'Kehlani']]

## 3. Gather Spotify data about albums

In [138]:
#function for getting spotify album ID's from album name and artist pair
def GetAlbumID(album, artist):
    results = sp.search(q = "album:" + album + ' artist:' + artist, type = "album")
    return results['albums']['items'][0]['uri']

In [148]:
#test function
a = GetAlbumID('A Moon Shaped Pool', "Radiohead")
a

'spotify:album:6vuykQgDLUCiZ7YggIpLM9'

In [None]:
album_IDs2_combo = []
album_IDs2 = []

In [None]:
#loop through every pair of artist-album to get the spotify albums ID
for pair in tqdm(alb_art_pairs):
    time.sleep(1)
    try:
        album_IDs2_combo.append([GetAlbumID(pair[0],pair[1]),pair[0],pair[1]])
        album_IDs2.append([GetAlbumID(pair[0],pair[1])])
    except:
        album_IDs2_combo.append(['Spotify features Not found',pair[0],pair[1]])
        album_IDs2.append(['Spotify features Not found'])  

In [149]:
#function for getting the track ID's for all of the songs in an album
def GetTrackIDs(album_id):
    track_id_list = []
    track_ids = sp.album_tracks(album_id)
    for track in track_ids['items']:
        track_id_list.append(track['id'])
    return track_id_list

In [150]:
#test function
c = GetTrackIDs(a)
c

['3pcCifdPTc2BbqmWpEhtUd',
 '1uRxyAup7OYrlh2SHJb80N',
 '5rIhBK9aaVMck0W2YtOwci',
 '1kBGeOp1CDUHVdbK4ergqo',
 '4CzTgOmc3Sdm4EgKQWzjQl',
 '0eZN5WsQfmNFICHuw59Zfz',
 '3LhtqibvTtjOUrzKs7Vsz1',
 '6f6pEjgfTtuRROmJ4a7Gf3',
 '4eruRiSfDY1jdT03hjyi0i',
 '3cual6JOG286qZJmCxKRAT',
 '01ZpFhrMMqKPVCwyqXneVp']

In [None]:
#create lists to append song ids and their respective album
song_ids_album2 = []
song_ids2 = []

In [None]:
#get spotify ID's for all of the tracks in all albums
for album in tqdm(album_IDs2):
    for alb in album:
        time.sleep(1)
        try:
            song_ids_album2.append([GetTrackIDs(alb),alb])
            song_ids2.append([GetTrackIDs(alb)])
        except:
            song_ids_album2.append(['Spotify features Not found',alb])
            song_ids2.append(['Spotify features Not found'])        

In [None]:
#save the song ids
with open('song_ids.pkl', 'wb') as f:
    pickle.dump(song_ids2, f)

## 4. Get Spotify data about songs

In [151]:
#function for getting information for a list of songs from song IDs
def GetTrackDescription(track_ids):
    track_descriptions = []
    for track_id in track_ids:
        description = sp.track(track_id)
        track_descriptions.append(description)
    return track_descriptions

In [None]:
#test function
d = GetTrackDescription(c)
d

In [153]:
#function for getting audio features for a list of tracks
def GetTrackInfo(track_ids):
    tracks_info = []
    for track_id in track_ids:
        info = sp.audio_features(track_id)
        tracks_info.append(info)
    return tracks_info

In [154]:
#test function
e = GetTrackInfo(c)
e

[[{'danceability': 0.541,
   'energy': 0.847,
   'key': 11,
   'loudness': -6.52,
   'mode': 1,
   'speechiness': 0.0297,
   'acousticness': 0.303,
   'instrumentalness': 0.271,
   'liveness': 0.109,
   'valence': 0.62,
   'tempo': 148.937,
   'type': 'audio_features',
   'id': '3pcCifdPTc2BbqmWpEhtUd',
   'uri': 'spotify:track:3pcCifdPTc2BbqmWpEhtUd',
   'track_href': 'https://api.spotify.com/v1/tracks/3pcCifdPTc2BbqmWpEhtUd',
   'analysis_url': 'https://api.spotify.com/v1/audio-analysis/3pcCifdPTc2BbqmWpEhtUd',
   'duration_ms': 220609,
   'time_signature': 4}],
 [{'danceability': 0.299,
   'energy': 0.263,
   'key': 9,
   'loudness': -13.207,
   'mode': 0,
   'speechiness': 0.0323,
   'acousticness': 0.968,
   'instrumentalness': 0.857,
   'liveness': 0.126,
   'valence': 0.113,
   'tempo': 137.848,
   'type': 'audio_features',
   'id': '1uRxyAup7OYrlh2SHJb80N',
   'uri': 'spotify:track:1uRxyAup7OYrlh2SHJb80N',
   'track_href': 'https://api.spotify.com/v1/tracks/1uRxyAup7OYrlh2SHJb8

In [None]:
tracks_objects = []
tracks_audio_features = []

In [None]:
#get song objects and audio features
for songs in tqdm(song_ids2):
    for song in songs:
        if song != 'Spotify features Not found':
            time.sleep(1)
            objects = GetTrackDescription(song)
            tracks_objects.append(objects)
            features = GetTrackInfo(song)
            tracks_audio_features.append(features)
            with open('features_spotify.pkl', 'wb') as f:
                pickle.dump(tracks_audio_features, f)
            with open('object_spotify.pkl', 'wb') as f:
                pickle.dump(tracks_objects, f)
        else:
            tracks_objects.append({'status':'Not found'})
            tracks_audio_features.append({'status':'Not found'})
            with open('features_spotify.pkl', 'wb') as f:
                pickle.dump(tracks_audio_features, f)
            with open('object_spotify.pkl', 'wb') as f:
                pickle.dump(tracks_objects, f)

In [None]:
#had to stop at song_ids2[2337] due to time constraints

In [155]:
tracks_audio_features[0][0]

[{'danceability': 0.79,
  'energy': 0.599,
  'key': 7,
  'loudness': -7.417,
  'mode': 1,
  'speechiness': 0.0431,
  'acousticness': 0.218,
  'instrumentalness': 0.00257,
  'liveness': 0.118,
  'valence': 0.362,
  'tempo': 109.978,
  'type': 'audio_features',
  'id': '2moHnkHTSXBe9KjvPSQJvg',
  'uri': 'spotify:track:2moHnkHTSXBe9KjvPSQJvg',
  'track_href': 'https://api.spotify.com/v1/tracks/2moHnkHTSXBe9KjvPSQJvg',
  'analysis_url': 'https://api.spotify.com/v1/audio-analysis/2moHnkHTSXBe9KjvPSQJvg',
  'duration_ms': 266116,
  'time_signature': 4}]

In [156]:
tracks_objects[0][0]

{'album': {'album_type': 'album',
  'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/6Rx1JKzBrSzoKQtmbVmBnM'},
    'href': 'https://api.spotify.com/v1/artists/6Rx1JKzBrSzoKQtmbVmBnM',
    'id': '6Rx1JKzBrSzoKQtmbVmBnM',
    'name': 'Hayley Williams',
    'type': 'artist',
    'uri': 'spotify:artist:6Rx1JKzBrSzoKQtmbVmBnM'}],
  'available_markets': ['AD',
   'AE',
   'AR',
   'AT',
   'AU',
   'BE',
   'BG',
   'BH',
   'BO',
   'BR',
   'CA',
   'CH',
   'CL',
   'CO',
   'CR',
   'CY',
   'CZ',
   'DE',
   'DK',
   'DO',
   'DZ',
   'EC',
   'EE',
   'EG',
   'ES',
   'FI',
   'FR',
   'GB',
   'GR',
   'GT',
   'HK',
   'HN',
   'HU',
   'ID',
   'IE',
   'IL',
   'IN',
   'IS',
   'IT',
   'JO',
   'JP',
   'KW',
   'LB',
   'LI',
   'LT',
   'LU',
   'LV',
   'MA',
   'MC',
   'MT',
   'MX',
   'MY',
   'NI',
   'NL',
   'NO',
   'NZ',
   'OM',
   'PA',
   'PE',
   'PH',
   'PL',
   'PS',
   'PT',
   'PY',
   'QA',
   'RO',
   'SA',
   'SE',
   'SG',
   'S

## 5. Format collected data into pandas dataframe

In [157]:
#read in pickled data
tracks_audio_features = pd.read_pickle('features_spotify.pkl')
tracks_objects = pd.read_pickle('object_spotify.pkl')

In [158]:
#flatten lists
flat_tracks_objects = [item for sublist in tracks_objects for item in sublist]
flat_tracks_audio_features = [item for sublist in tracks_audio_features for item in sublist]

In [None]:
#create objects dataframe with desired columns
objects_df = pd.DataFrame(columns=[
    'name',
    'duration_ms',
    'popularity',
    'num_markets',
    'album',
    'disc_number',
    'is_explicit',
    'track_number',
    'release_date',
    'artist',
    'song_id'
])

In [None]:
#create song features dataframe with desired columns
features_df = pd.DataFrame(columns=[
    'danceability',
    'energy',
    'key',
    'loudness',
    'mode',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
    'tempo',
    'time_signature',
    'song_id'
])

In [None]:
#get song objects dictionary data into a dataframe
for song in flat_track_objects:
    if song == {'status': 'Not found'} or song == [None]:
        y = {
            'name': None,
            'duration_ms': None,
            'popularity': None,
            'num_markets': None,
            'album': None,
            'disc_number': None,
            'is_explicit': None,
            'track_number': None,
            'release_date': None,
            'artist': None,
            'song_id': None,
            'album_id': None
        }
        objects_df = objects_df.append(y, ignore_index=True)
    else:
        try:
            y = {
                'name': song['name'],
                'duration_ms': song['duration_ms'],
                'popularity':song['popularity'],
                'num_markets': len(song['available_markets']),
                'album': song['album']['name'],
                'disc_number': song['disc_number'],
                'is_explicit': song['explicit'],
                'track_number': song['track_number'],
                'release_date': song['album']['release_date'],
                'artist': song['artists'][0]['name'],
                'song_id': song['id'],
                'album_id': song['album']['id']
        }
        except:
             y = {
                'name': None,
                'duration_ms': None,
                'popularity': None,
                'num_markets': None,
                'album': None,
                'disc_number': None,
                'is_explicit': None,
                'track_number': None,
                'release_date': None,
                'artist': None,
                'song_id': None,
                'album_id': None
        }
        objects_df = objects_df.append(y, ignore_index=True)

In [160]:
objects_df.head()

Unnamed: 0,name,duration_ms,popularity,num_markets,album,disc_number,is_explicit,track_number,release_date,artist,song_id,album_id
0,Simmer,266115,64,79,Petals For Armor,1,True,1,2020-05-08,Hayley Williams,2moHnkHTSXBe9KjvPSQJvg,4HXpQ5KQBVWN25ltjnX7xa
1,Leave It Alone,245076,61,79,Petals For Armor,1,False,2,2020-05-08,Hayley Williams,5ccn0VZg8HQZsZXxGx7UZQ,4HXpQ5KQBVWN25ltjnX7xa
2,Cinnamon,211834,61,79,Petals For Armor,1,False,3,2020-05-08,Hayley Williams,48pL4zB6KXWPvd7Ln33ENy,4HXpQ5KQBVWN25ltjnX7xa
3,Creepin',178398,60,79,Petals For Armor,1,False,4,2020-05-08,Hayley Williams,5ZD0jxyhmdNZ3DwDRXqV5U,4HXpQ5KQBVWN25ltjnX7xa
4,Sudden Desire,187969,60,79,Petals For Armor,1,False,5,2020-05-08,Hayley Williams,1y2GDXFvtHPbCkHFYg49Gn,4HXpQ5KQBVWN25ltjnX7xa


In [161]:
objects_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23897 entries, 0 to 23896
Data columns (total 12 columns):
name            23525 non-null object
duration_ms     23525 non-null object
popularity      23525 non-null object
num_markets     23525 non-null object
album           23525 non-null object
disc_number     23525 non-null object
is_explicit     23525 non-null object
track_number    23525 non-null object
release_date    23525 non-null object
artist          23525 non-null object
song_id         23525 non-null object
album_id        23525 non-null object
dtypes: object(12)
memory usage: 2.2+ MB


In [None]:
#save dataframe
with open('song_objects_df.pkl', 'wb') as f:
    pickle.dump(objects_df, f)

In [None]:
#get song features dictionary data into a dataframe
for song_object in flat_tracks_audio_features:
    for song in song_object:
        if song == {'status': 'Not found'} or song == [None]:
            y = {
                'danceability': None,
                'energy': None,
                'key': None,
                'loudness': None,
                'mode': None,
                'speechiness': None,
                'acousticness': None,
                'instrumentalness': None,
                'liveness': None,
                'valence': None,
                'tempo': None,
                'time_signature': None,
                'song_id': None
                }
            features_df = features_df.append(y, ignore_index=True)
        else:
            try:
                y = {
                    'danceability': song['danceability'],
                    'energy': song['energy'],
                    'key': song['key'],
                    'loudness': song['loudness'],
                    'mode': song['mode'],
                    'speechiness': song['speechiness'],
                    'acousticness': song['acousticness'],
                    'instrumentalness': song['instrumentalness'],
                    'liveness': song['liveness'],
                    'valence': song['valence'],
                    'tempo': song['tempo'],
                    'time_signature': song['time_signature'],
                    'song_id': song['id']
            }
            except:
                 y = {
                    'danceability': None,
                    'energy': None,
                    'key': None,
                    'loudness': None,
                    'mode': None,
                    'speechiness': None,
                    'acousticness': None,
                    'instrumentalness': None,
                    'liveness': None,
                    'valence': None,
                    'tempo': None,
                    'time_signature': None,
                    'song_id': None
            }
            features_df = features_df.append(y, ignore_index=True)

In [163]:
features_df.head()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,song_id
0,0.79,0.599,7.0,-7.417,1.0,0.0431,0.218,0.00257,0.118,0.362,109.978,4.0,2moHnkHTSXBe9KjvPSQJvg
1,0.57,0.339,2.0,-11.857,0.0,0.0449,0.569,0.451,0.126,0.442,88.963,4.0,5ccn0VZg8HQZsZXxGx7UZQ
2,,,,,,,,,,,,,
3,0.877,0.598,6.0,-6.292,0.0,0.0296,0.353,0.00132,0.38,0.671,97.012,4.0,5ZD0jxyhmdNZ3DwDRXqV5U
4,0.564,0.391,7.0,-7.816,1.0,0.0785,0.235,5.4e-05,0.11,0.419,101.449,4.0,1y2GDXFvtHPbCkHFYg49Gn


In [164]:
features_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25757 entries, 0 to 25756
Data columns (total 13 columns):
danceability        23520 non-null float64
energy              23520 non-null float64
key                 23520 non-null object
loudness            23520 non-null float64
mode                23520 non-null object
speechiness         23520 non-null float64
acousticness        23520 non-null float64
instrumentalness    23520 non-null float64
liveness            23520 non-null float64
valence             23520 non-null float64
tempo               23520 non-null float64
time_signature      23520 non-null object
song_id             23520 non-null object
dtypes: float64(9), object(4)
memory usage: 2.6+ MB


In [None]:
#save dataframe
with open('song_features_df.pkl', 'wb') as f:
    pickle.dump(features_df, f)