This notebook requests an access token for the Spotify Web API, then collects the raw playlist data and saves it in `.csv` format.

In [1]:
# Import necessary packages

import json
from pathlib import Path

import numpy as np
import pandas as pd
from requests import post,get

The token endpoint URI and the format of the request are based on Spotify's [information page](https://developer.spotify.com/documentation/web-api/tutorials/getting-started#create-an-app).

In [2]:
# Load the client credentials. Note that "../.env" is parsed as JSON here and
# must provide the "CLIENT_ID" and "CLIENT_SECRET" keys.
with open("../.env", "r") as f:
    credentials = json.load(f)


url = "https://accounts.spotify.com/api/token"

headers = {"Content-Type": "application/x-www-form-urlencoded"}

data = {
    "grant_type": "client_credentials",
    "client_id": credentials["CLIENT_ID"],
    "client_secret": credentials["CLIENT_SECRET"]
}

# Make the POST request
response = post(url, headers=headers, data=data)
if response.status_code != 200:
    # Fail fast: without a token, the cell that builds the Authorization header
    # would otherwise stop with an unrelated NameError on `access_token`.
    raise RuntimeError(f"Token could not be obtained, status code: {response.status_code}")
access_token = response.json()['access_token']

    
def fetch_data(request, header):
    """Send a GET request and return the decoded JSON body.

    Parameters
    ----------
    request : str
        Full URL to request.
    header : dict
        HTTP headers sent along with the request.

    Returns
    -------
    The parsed JSON body when the status code is 200. For any other status
    the status code and the error body are printed and an empty dict is
    returned.
    """
    response = get(url=request, headers=header)
    if response.status_code == 200:
        return response.json()

    # An error body is not guaranteed to be JSON; fall back to the raw text so
    # that reporting a failure can never raise by itself.
    try:
        details = response.json()
    except ValueError:
        details = response.text
    print('Failure.', response.status_code, details)
    return {}
    

Having obtained the access token, we can start data collection. First, we will collect the links (API `href`s) of all Spotify playlists from a chosen category for the Netherlands region.

In [3]:
# Pop playlists were selected as they are most popular
category = "pop"

# Authorization header passed to fetch_data with every request below
header = {"Authorization": f"Bearer {access_token}"}

def get_playlists_links(num_of_playlists = 100):
    """Collect the API links (hrefs) of playlists in the module-level `category`.

    The playlists are requested page by page through `fetch_data`, using the
    module-level `header` for authorization.

    Parameters
    ----------
    num_of_playlists : int, default 100
        Maximum number of playlists to ask for.

    Returns
    -------
    list
        The unique playlist hrefs, in the order they were first returned.
        The list can be shorter than `num_of_playlists`; a message is printed
        when that happens.
    """
    # maximum number of items to return is set to 50 by Spotify
    one_request_max = 50

    def extract_links(request, header):
        # fetch_data returns {} when the request fails, so read every level
        # defensively instead of indexing straight into the response.
        _data_page = fetch_data(request, header)
        _items = (_data_page.get("playlists") or {}).get("items") or []
        # return the urls if they are present; empty entries are skipped
        return [_item["href"] for _item in _items if _item and _item.get("href")]

    playlist_links = []
    # Send repeated requests; the last page only asks for what is still missing
    for offset in range(0, num_of_playlists, one_request_max):
        limit = min(one_request_max, num_of_playlists - offset)
        url = f"https://api.spotify.com/v1/browse/categories/{category}/playlists?offset={offset}&limit={limit}"
        playlist_links.extend(extract_links(url, header))

    # Drop duplicates but keep the order of first appearance, so the result is
    # the same on every run (a set would return the links in arbitrary order).
    unique_links = list(dict.fromkeys(playlist_links))

    if num_of_playlists != len(unique_links):
        print(f"In the category {category}, there are fewer playlists than {num_of_playlists}, namely: {len(unique_links)}.")
    return unique_links
    

In [4]:
# Ask for up to 100 playlists; the printed message reports how many were actually found
playlist_urls = get_playlists_links(num_of_playlists=100)

In the category pop, there are fewer playlists than 100, namely: 23.


Having obtained all playlists from the prespecified category, we can now proceed to collect all the songs that are in those playlists.

In [5]:
def get_songs_from_playlist(url):
    """Fetch one playlist and gather its metadata together with all its track items.

    Parameters
    ----------
    url : str
        API href of the playlist.

    Returns
    -------
    dict
        A row with the keys "playlist_href", "followers", "name",
        "description" and "songs". "songs" is the list of raw track-item
        dictionaries, left untouched for the cleaning process.

    Raises
    ------
    KeyError
        If the playlist request itself fails (`fetch_data` then returns an
        empty dict) or the response lacks one of the fields read below.
    """
    _data = fetch_data(url, header)

    row = {}
    row["playlist_href"] = url
    row["followers"] = _data["followers"]["total"]
    row['name'] = _data['name']
    row['description'] = _data['description']
    # To investigate who is the owner of the playlist -> possible EDA
    _owner = _data['owner']['display_name']
    if _owner != 'Spotify':
        # f-string so that a missing (None) display name cannot break the message
        print(f"Different owner: {_owner}")
    ## In the end it looked like only Spotify playlists were available with my credentials but
    ## I decided to keep the two lines of code above anyway

    # A list of song dictionaries to be dealt with in the cleaning process
    # (copied so that extending it does not modify the response object)
    row['songs'] = list(_data['tracks']['items'])

    # Collects remaining songs if the playlist had more than 100 songs
    _next_url = _data['tracks']['next']
    while _next_url is not None:
        _page = fetch_data(_next_url, header)
        # A failed page request yields {}: keep what was collected and stop
        row['songs'].extend(_page.get('items', []))
        _next_url = _page.get('next')

    return row



In [6]:
# Now we apply this function for every playlist
playlist_urls_series = pd.Series(playlist_urls)

ser = playlist_urls_series.apply(get_songs_from_playlist)

# And transform it to a data frame
series_data = pd.Series(ser)
list_of_dicts = series_data.tolist()
df = pd.DataFrame(list_of_dicts)

Having collected all the data we were interested in, it is time to save it to a file for further processing.

In [7]:
# Create the output folder first so the export also works on a fresh checkout
output_path = Path('../data/raw/spotify_playlists.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)
