# Data Collection
- This notebook is responsible for collecting the dataset of tracks including audio features to be used in model training and test.
- Make sure you already have created a Spotify app, have your ***client_id*** and ***client_secret*** ready. They are necessary for getting an access token.
- Create two environment variables with exact same name on your machine and the notebook will automatically include them in the context.
- Access token is included in requests header and is valid for an hour.

In [1]:
# import libraries
import os
import json
import requests
import pandas as pd
from pathlib import Path

## Get Access Token

In [37]:
def get_access_token(client_id: str, client_secret: str, grant_type: str = 'client_credentials'):
    url = 'https://accounts.spotify.com/api/token?grant_type={}&client_id={}&client_secret={}'.format(grant_type, client_id, client_secret)
    response = requests.post(url, headers={'Content-Type':'application/x-www-form-urlencoded'})
    access_token = 'Bearer ' + json.loads(response.text)['access_token']

    return access_token

In [38]:
# get access token
grant_type = 'client_credentials'
client_id = os.getenv('SPOTIFY_CLIENT_ID')
client_secret = os.getenv('SPOTIFY_CLIENT_SECRET')

access_token = get_access_token(client_id, client_secret, grant_type)

In [28]:
def get_data(url: str, access_token: str, verbose: bool = False):
    response = requests.get(url, headers={'Authorization': access_token})
    result = json.loads(response.text)

    if verbose:
        print('Response body:\n', result)

    return result

## Get Tracks
|track_id|track_name|artist_name|popularity|genre|
|---|---|---|---|---|

In [21]:
def get_tracks(genres_list: list, steps: int, limit: int, offset: int,access_token: str):
    tracks_df = pd.DataFrame()

    for genre in genres_list:

        for step in range(steps):
            url = 'https://api.spotify.com/v1/search?q=genre:{}&type=track&limit={}&offset={}'.format(genre, limit, offset)
            search_item = get_data(url, access_token)

            for n in range(limit):
                track_id = search_item['tracks']['items'][n]['id']
                track_name = search_item['tracks']['items'][n]['name']
                artist_name = search_item['tracks']['items'][n]['artists'][0]['name']
                popularity = search_item['tracks']['items'][n]['popularity']

                tracks_df = tracks_df.append({
                    'track_id': track_id,
                    'track_name': track_name,
                    'artist_name': artist_name,
                    'popularity': popularity,
                    'genre': genre
                }, ignore_index=True)

            offset += limit

    return tracks_df

In [29]:
steps = 2
limit = 10 # max 50 allowed
offset = 0 # takes the n-1th value of limit
genres_list = ['rap', 'rock', 'metal', 'blues', 'jazz', 'classical', 'funk', 'techno', 'electronic', 'r&b']

In [30]:
tracks_df = get_tracks(genres_list, steps=steps, limit=limit, offset=offset, access_token=access_token)
tracks_df

Unnamed: 0,track_id,track_name,artist_name,popularity,genre
0,4FyesJzVpA39hbYvcseO2d,Just Wanna Rock,Lil Uzi Vert,90.0,rap
1,7aRCf5cLOFN1U7kvtChY1G,Search & Rescue,Drake,91.0,rap
2,7KA4W4McWYRpgf0fWsJZWB,See You Again (feat. Kali Uchis),"Tyler, The Creator",92.0,rap
3,1Qrg8KqiBpW07V7PNxwwwL,Kill Bill,SZA,94.0,rap
4,2dHHgzDwk4BJdRwy9uXhTO,Creepin' (with The Weeknd & 21 Savage),Metro Boomin,96.0,rap
...,...,...,...,...,...
195,2iUmqdfGZcHIhS3b9E9EWq,Everybody Talks,Neon Trees,80.0,r&b
196,6DCZcSspjsKoFjzjrWoCdn,God's Plan,Drake,86.0,r&b
197,2K7xn816oNHJZ0aVqdQsha,Softcore,The Neighbourhood,86.0,r&b
198,4yugZvBYaoREkJKtbG08Qr,Take It Easy - 2013 Remaster,Eagles,79.0,r&b


## Get Track Features

In [34]:
def get_track_features(tracks_df: pd.DataFrame, access_token: str):
    track_features_df = pd.DataFrame()

    for index, row in tracks_df.iterrows():
        track_id = tracks_df.iloc[index]['track_id']
        url = 'https://api.spotify.com/v1/audio-features/' + track_id
        track_features = get_data(url, access_token)
        track_features_df = track_features_df.append(track_features, ignore_index=True)

    # drop negligible features
    track_features_df.drop(columns=['type', 'uri', 'track_href', 'analysis_url'], inplace=True)
    track_features_df.rename(columns={'id':'track_id'}, inplace=True)

    return track_features_df

In [35]:
track_features_df = get_track_features(tracks_df, access_token)
track_features_df

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,track_id,duration_ms,time_signature
0,0.486,0.545,11.0,-7.924,1.0,0.0336,0.06520,0.004740,0.0642,0.0385,150.187,4FyesJzVpA39hbYvcseO2d,123891.0,4.0
1,0.817,0.440,10.0,-8.482,0.0,0.0734,0.06030,0.000001,0.3300,0.5440,142.024,7aRCf5cLOFN1U7kvtChY1G,272113.0,4.0
2,0.558,0.559,6.0,-9.222,1.0,0.0959,0.37100,0.000007,0.1090,0.6200,78.558,7KA4W4McWYRpgf0fWsJZWB,180387.0,4.0
3,0.644,0.735,8.0,-5.747,1.0,0.0391,0.05210,0.144000,0.1610,0.4180,88.980,1Qrg8KqiBpW07V7PNxwwwL,153947.0,4.0
4,0.715,0.620,1.0,-6.005,0.0,0.0484,0.41700,0.000000,0.0822,0.1720,97.950,2dHHgzDwk4BJdRwy9uXhTO,221520.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,0.471,0.924,8.0,-3.906,1.0,0.0586,0.00301,0.000000,0.3130,0.7250,154.961,2iUmqdfGZcHIhS3b9E9EWq,177280.0,4.0
196,0.754,0.449,7.0,-9.211,1.0,0.1090,0.03320,0.000083,0.5520,0.3570,77.169,6DCZcSspjsKoFjzjrWoCdn,198973.0,4.0
197,0.575,0.568,9.0,-5.509,0.0,0.0300,0.04840,0.000417,0.2860,0.3700,93.986,2K7xn816oNHJZ0aVqdQsha,206280.0,4.0
198,0.575,0.670,7.0,-10.390,1.0,0.0318,0.34300,0.000005,0.1290,0.7400,139.191,4yugZvBYaoREkJKtbG08Qr,211578.0,4.0


In [36]:
# merge tracks & features
df = tracks_df.merge(track_features_df, on='track_id')
df

Unnamed: 0,track_id,track_name,artist_name,popularity,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,4FyesJzVpA39hbYvcseO2d,Just Wanna Rock,Lil Uzi Vert,90.0,rap,0.486,0.545,11.0,-7.924,1.0,0.0336,0.0652,0.004740,0.0642,0.0385,150.187,123891.0,4.0
1,7aRCf5cLOFN1U7kvtChY1G,Search & Rescue,Drake,91.0,rap,0.817,0.440,10.0,-8.482,0.0,0.0734,0.0603,0.000001,0.3300,0.5440,142.024,272113.0,4.0
2,7KA4W4McWYRpgf0fWsJZWB,See You Again (feat. Kali Uchis),"Tyler, The Creator",92.0,rap,0.558,0.559,6.0,-9.222,1.0,0.0959,0.3710,0.000007,0.1090,0.6200,78.558,180387.0,4.0
3,1Qrg8KqiBpW07V7PNxwwwL,Kill Bill,SZA,94.0,rap,0.644,0.735,8.0,-5.747,1.0,0.0391,0.0521,0.144000,0.1610,0.4180,88.980,153947.0,4.0
4,2dHHgzDwk4BJdRwy9uXhTO,Creepin' (with The Weeknd & 21 Savage),Metro Boomin,96.0,rap,0.715,0.620,1.0,-6.005,0.0,0.0484,0.4170,0.000000,0.0822,0.1720,97.950,221520.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
209,37F7E7BKEw2E4O2L7u0IEp,Limbo,Freddie Dredd,84.0,r&b,0.802,0.623,11.0,-5.862,0.0,0.4590,0.0292,0.055000,0.1130,0.4570,74.987,169947.0,4.0
210,1PS1QMdUqOal0ai3Gt7sDQ,Gold Digger,Kanye West,81.0,r&b,0.629,0.696,1.0,-5.572,0.0,0.3480,0.0195,0.000000,0.0554,0.6230,93.034,207627.0,4.0
211,1e1JKLEDKP7hEQzJfNAgPl,Magnolia,Playboi Carti,80.0,r&b,0.791,0.582,11.0,-7.323,0.0,0.2860,0.0114,0.000000,0.3500,0.4430,162.991,181812.0,4.0
212,6DCZcSspjsKoFjzjrWoCdn,God's Plan,Drake,86.0,r&b,0.754,0.449,7.0,-9.211,1.0,0.1090,0.0332,0.000083,0.5520,0.3570,77.169,198973.0,4.0


In [None]:
# export data
file_path = Path('genre-classification/data/tracks.csv')
file_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(file_path)