## Collecting Data

In [1]:
#Import necessary packages
import json
import os
import sys

import pandas as pd
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

import config



In [2]:
#Define function to retrieve API keys from a Json file
def get_keys(path):
    """Read the JSON file at ``path`` and return its parsed contents."""
    with open(path) as key_file:
        credentials = json.load(key_file)
    return credentials

In [3]:
#Retrieve personal keys for the Spotify API from a JSON file in the current user's home directory
#expanduser resolves "~" so the notebook is not tied to one machine's absolute /Users/... path
keys = get_keys(os.path.expanduser("~/.secret/spotify_api.json"))
client_id = keys['client_id']
client_secret = keys['client_secret']

In [4]:
#Build the Spotipy client, authenticating with the client id and client secret loaded above
sp = spotipy.Spotify(
    auth_manager=SpotifyClientCredentials(
        client_id=client_id,
        client_secret=client_secret,
    )
)

In [5]:
# playlist_id = 'spotify:user:spotifycharts:playlist:37i9dQZEVXbJiZcmkrIHGU'
# results = sp.playlist(playlist_id)
# print(json.dumps(results, indent=4))

Before we extract our dataset, we will investigate ways to search for artists, songs, albums, etc. through the Spotify API.

In [6]:
#Run sp.search with the free-text query "The Weeknd" and print the raw response
search_str = 'The Weeknd'
result = sp.search(q=search_str)
print(result)

{'tracks': {'href': 'https://api.spotify.com/v1/search?query=The+Weeknd&type=track&offset=0&limit=10', 'items': [{'album': {'album_type': 'album', 'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/1Xyo4u8uXC1ZmMpatF05PJ'}, 'href': 'https://api.spotify.com/v1/artists/1Xyo4u8uXC1ZmMpatF05PJ', 'id': '1Xyo4u8uXC1ZmMpatF05PJ', 'name': 'The Weeknd', 'type': 'artist', 'uri': 'spotify:artist:1Xyo4u8uXC1ZmMpatF05PJ'}], 'available_markets': ['AD', 'AE', 'AL', 'AR', 'AT', 'AU', 'BA', 'BE', 'BG', 'BH', 'BO', 'BR', 'BY', 'CA', 'CH', 'CL', 'CO', 'CR', 'CY', 'CZ', 'DE', 'DK', 'DO', 'DZ', 'EC', 'EE', 'EG', 'ES', 'FI', 'FR', 'GB', 'GR', 'GT', 'HK', 'HN', 'HR', 'HU', 'ID', 'IE', 'IL', 'IN', 'IS', 'IT', 'JO', 'JP', 'KR', 'KW', 'KZ', 'LB', 'LI', 'LT', 'LU', 'LV', 'MA', 'MC', 'MD', 'ME', 'MK', 'MT', 'MX', 'MY', 'NI', 'NL', 'NO', 'NZ', 'OM', 'PA', 'PE', 'PH', 'PL', 'PS', 'PT', 'PY', 'QA', 'RO', 'RS', 'RU', 'SA', 'SE', 'SG', 'SI', 'SK', 'SV', 'TH', 'TN', 'TR', 'TW', 'UA', 'US', 'UY',

In [7]:
#Search for up to 50 tracks matching the 'year:2020' query and display the response
track_results = sp.search(
    q='year:2020',
    type='track',
    limit=50,
)
track_results

{'tracks': {'href': 'https://api.spotify.com/v1/search?query=year%3A2020&type=track&offset=0&limit=50',
  'items': [{'album': {'album_type': 'single',
     'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/7tYKF4w9nC0nq9CsPZTHyP'},
       'href': 'https://api.spotify.com/v1/artists/7tYKF4w9nC0nq9CsPZTHyP',
       'id': '7tYKF4w9nC0nq9CsPZTHyP',
       'name': 'SZA',
       'type': 'artist',
       'uri': 'spotify:artist:7tYKF4w9nC0nq9CsPZTHyP'}],
     'available_markets': ['AD',
      'AE',
      'AL',
      'AR',
      'AT',
      'AU',
      'BA',
      'BE',
      'BG',
      'BH',
      'BO',
      'BR',
      'BY',
      'CA',
      'CH',
      'CL',
      'CO',
      'CR',
      'CY',
      'CZ',
      'DE',
      'DK',
      'DO',
      'DZ',
      'EC',
      'EE',
      'EG',
      'ES',
      'FI',
      'FR',
      'GB',
      'GR',
      'GT',
      'HK',
      'HN',
      'HR',
      'HU',
      'ID',
      'IE',
      'IL',
      'IN',
      'IS',

In [8]:
# artist_name = []
# track_name = []
# popularity = []
# track_id = []
# for i in range(0,10000):
#     track_results = sp.search(q='year:2018', type='track', limit=50,offset=i)
# #     for i, t in enumerate(track_results['tracks']['items']):
# #         artist_name.append(t['artists'][0]['name'])
# #         track_name.append(t['name'])
# #         track_id.append(t['id'])
# #         popularity.append(t['popularity'])

In [9]:
# import pandas as pd
# track_dataframe = pd.DataFrame({'artist_name' : artist_name, 'track_name' : track_name, 'track_id' : track_id, 'popularity' : popularity})
# print(track_dataframe.shape)
# track_dataframe.head()

Look at all the possible genres in the Spotify database

In [10]:
#Use the recommendation_genre_seeds method to retrieve the available genre seeds
genres = sp.recommendation_genre_seeds()

In [11]:
#Display the list stored under the 'genres' key of the response
genres['genres']

['acoustic',
 'afrobeat',
 'alt-rock',
 'alternative',
 'ambient',
 'anime',
 'black-metal',
 'bluegrass',
 'blues',
 'bossanova',
 'brazil',
 'breakbeat',
 'british',
 'cantopop',
 'chicago-house',
 'children',
 'chill',
 'classical',
 'club',
 'comedy',
 'country',
 'dance',
 'dancehall',
 'death-metal',
 'deep-house',
 'detroit-techno',
 'disco',
 'disney',
 'drum-and-bass',
 'dub',
 'dubstep',
 'edm',
 'electro',
 'electronic',
 'emo',
 'folk',
 'forro',
 'french',
 'funk',
 'garage',
 'german',
 'gospel',
 'goth',
 'grindcore',
 'groove',
 'grunge',
 'guitar',
 'happy',
 'hard-rock',
 'hardcore',
 'hardstyle',
 'heavy-metal',
 'hip-hop',
 'holidays',
 'honky-tonk',
 'house',
 'idm',
 'indian',
 'indie',
 'indie-pop',
 'industrial',
 'iranian',
 'j-dance',
 'j-idol',
 'j-pop',
 'j-rock',
 'jazz',
 'k-pop',
 'kids',
 'latin',
 'latino',
 'malay',
 'mandopop',
 'metal',
 'metal-misc',
 'metalcore',
 'minimal-techno',
 'movies',
 'mpb',
 'new-age',
 'new-release',
 'opera',
 'pagode',

In [12]:
#Search for 5 tracks with the query 'genre: acoustic' and show the name of the fourth result (index 3)
#NOTE(review): Spotify field filters are usually written without a space ('genre:acoustic') - confirm this query filters by genre as intended
acoustics = sp.search(q='genre: acoustic', limit=5, type='track')['tracks']['items']
acoustics[3]['name']

'Come On Get Higher'

In [13]:
#Create a variable called 'hugeplaylist' that holds the items of a specific playlist, looked up by its playlist id
#offset=100 starts the returned page at the 101st item of the playlist
hugeplaylist = sp.user_playlist_tracks(playlist_id="54nv8jbrm4JoHEZ49Qvjgl", offset=100)["items"]

#Other

In [14]:
#Name of the first track in hugeplaylist
hugeplaylist[0]['track']['name']

'Turning Tables'

In [15]:
#Release date of the album that the first track in hugeplaylist belongs to
hugeplaylist[0]['track']['album']['release_date']

'2011-01-19'

In [16]:
# def analyze_playlist(creator, playlist_id):
    
#     # Create empty dataframe
#     playlist_features_list = ["artist","album","track_name",  "track_id","danceability","energy","key","loudness","mode", "speechiness","instrumentalness","liveness","valence","tempo", "duration_ms","time_signature"]
    
#     playlist_df = pd.DataFrame(columns = playlist_features_list)
    
#     # Loop through every track in the playlist, extract features and append the features to the playlist df
    
#     playlist = sp.user_playlist_tracks(creator, playlist_id)["items"]
#     for track in playlist:
#         # Create empty dict
#         playlist_features = {}
#         # Get metadata
#         playlist_features["artist"] = track["track"]["album"]["artists"][0]["name"]
#         playlist_features["album"] = track["track"]["album"]["name"]
#         playlist_features["track_name"] = track["track"]["name"]
#         playlist_features["release_date"] = track["track"]["album"]["release_date"]
#         playlist_features["track_id"] = track["track"]["id"]
        
#         # Get audio features
#         audio_features = sp.audio_features(playlist_features["track_id"])[0]
#         for feature in playlist_features_list[4:]:
#             playlist_features[feature] = audio_features[feature]
        
#         # Concat the dfs
#         track_df = pd.DataFrame(playlist_features, index = [0])
#         playlist_df = pd.concat([playlist_df, track_df], ignore_index = True)
        
#     return playlist_df

I will now extract data from one playlist on Spotify. The playlist was selected arbitrarily, but it was chosen for its large volume (approximately 10,000 songs). I will use this playlist as the dataset for the remainder of my analysis. The playlist data can be retrieved from the API by calling the user_playlist_tracks method with the playlist's ID.

In [19]:
#Page through the chosen Spotify playlist and collect up to approximately 10,000 of its songs

#Start with offset=0
offset = 0
#Create an empty playlist
playlist = []

#Request one page per iteration until 10,000 items have been covered or the playlist runs out
while offset < 10000:
    #Apply the user_playlist_tracks method from spotipy to extract one page of playlist data
    p1 = sp.user_playlist_tracks(playlist_id="54nv8jbrm4JoHEZ49Qvjgl", offset=offset)
    page_items = p1["items"]
    #An empty page means the offset is past the end of the playlist, so stop requesting
    if not page_items:
        break
    playlist.extend(page_items)
    #Advance by the number of items actually returned so nothing is skipped if a page is short
    offset += len(page_items)
    #The paging response carries a "next" link that is empty on the last page;
    #stopping here avoids a trailing request whose result would be thrown away
    if not p1.get("next"):
        break
        

In [20]:
#Display the number of playlist items that were collected
len(playlist)

9964

In [21]:
#Create a function to extract features of our playlist and append them to a DataFrame 
def df_playlist(playlist_id):
    """Build a DataFrame of basic track metadata from a collection of playlist items.

    Parameters
    ----------
    playlist_id : iterable of dict
        Despite its name, this argument is iterated as a collection of playlist
        items, not used as an id. Each item must have a "track" entry that is
        either None or a dict with "name", "id" and an "album" dict holding
        "name", "release_date" and a non-empty "artists" list.

    Returns
    -------
    pandas.DataFrame
        One row per item whose "track" is not None, in input order, with the
        columns artist, album, track_name, release_date and track_id. The
        artist is the first artist listed on the track's album. An empty input
        yields an empty DataFrame with the same columns.

    Raises
    ------
    KeyError
        If an item or its track/album lacks one of the keys listed above.
    IndexError
        If an album's "artists" list is empty.
    """
    # Set column names for the resulting DataFrame
    playlist_features_list = ["artist","album","track_name", "release_date", "track_id"]

    # Collect one fresh dict per track; building the DataFrame once at the end
    # avoids re-copying the whole frame on every iteration
    records = []
    for item in playlist_id:
        track = item["track"]
        # Items with no track payload are skipped
        if track is None:
            continue
        album = track["album"]
        records.append({
            "artist": album["artists"][0]["name"],
            "album": album["name"],
            "track_name": track["name"],
            "release_date": album["release_date"],
            "track_id": track["id"],
        })

    #Return final playlist dataframe
    return pd.DataFrame(records, columns=playlist_features_list)

In [22]:
#Apply df_playlist to the playlist items that were previously extracted and store the resulting DataFrame as df
df = df_playlist(playlist)

In [23]:
#Check the shape (rows, columns) of the new DataFrame
df.shape

(9963, 5)

In [24]:
#Display the first 5 rows of the dataset
df.head()

Unnamed: 0,artist,album,track_name,release_date,track_id
0,Katy Perry,Katy Perry - Teenage Dream: The Complete Confe...,Firework,2012-03-12,4lCv7b86sLynZbXhfScfm2
1,OneRepublic,Dreaming Out Loud,All We Are,2007-01-01,1Jx69b09LKTuBQxkEiFfVX
2,Amy Winehouse,Back To Black,Wake Up Alone,2006-01-01,4u83mwF5tUuWlXS86UOXdu
3,The Script,The Script,The Man Who Can't Be Moved,2008-09-08,4Musyaro0NM5Awx8b5c627
4,Adele,21,Rolling in the Deep,2011-01-19,1CkvWZme3pRgbzaxZnTl5X


In [25]:
#Export the DataFrame to a CSV file named 'data' (no file extension)
#to_csv writes the row index as an extra first column by default
df.to_csv('data')