# Data Collection
- This notebook is responsible for collecting / processing the dataset of tracks including audio features to be used in model training and test.
- Make sure you already have created a Spotify app, have your ***client_id*** and ***client_secret*** ready. They are necessary for getting an access token.
- Create two environment variables with exact same name on your machine and the notebook will automatically include them in the context.
- Access token is included in requests header and is valid for an hour.

In [1]:
# import libraries

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning) # supress future warnings
import os
import json
import requests
import pandas as pd
from pathlib import Path
from tqdm import tqdm

## Get Access Token

In [2]:
def get_access_token(client_id: str, client_secret: str, grant_type: str = 'client_credentials'):
    url = 'https://accounts.spotify.com/api/token?grant_type={}&client_id={}&client_secret={}'.format(grant_type, client_id, client_secret)
    response = requests.post(url, headers={'Content-Type':'application/x-www-form-urlencoded'})
    access_token = 'Bearer ' + json.loads(response.text)['access_token']

    return access_token

In [3]:
# get access token
grant_type = 'client_credentials'
client_id = os.getenv('SPOTIFY_CLIENT_ID')
client_secret = os.getenv('SPOTIFY_CLIENT_SECRET')

access_token = get_access_token(client_id, client_secret, grant_type)

In [4]:
print(access_token)

Bearer BQAWCnYDeO7FO0mLBa-F0VqsZlaglhMrltsMegmkrWgBzN209GFxr0kz0YwZwC1Cr6aNvzlbDx_a-WfGWvj4oGhVg6IYUj76pENR-_Tf_JxgVF6ZxpQ


In [5]:
def get_data(url: str, access_token: str, verbose: bool = False):
    response = requests.get(url, headers={'Authorization': access_token})
    result = json.loads(response.text)

    if verbose:
        print('Response body:\n', result)

    return result

## Get Tracks
|track_id|track_name|artist_name|popularity|genre|
|---|---|---|---|---|

In [6]:
def get_tracks(genres_list: list, steps: int, limit: int, offset: int,access_token: str):
    tracks_df = pd.DataFrame()
    _initial_offset = offset
    progress_bar = tqdm(total= len(genres_list) * steps * limit)

    for genre in genres_list:
        
        for step in range(steps):
            url = 'https://api.spotify.com/v1/search?q=genre:{}&type=track&limit={}&offset={}'.format(genre, limit, offset)
            search_item = get_data(url, access_token)

            for n in range(limit):   
                # print(search_item.get("error",""))
                track_id = search_item['tracks']['items'][n]['id']
                track_name = search_item['tracks']['items'][n]['name']
                artist_name = search_item['tracks']['items'][n]['artists'][0]['name']
                popularity = search_item['tracks']['items'][n]['popularity']

                tracks_df = tracks_df.append({
                    'track_id': track_id,
                    'track_name': track_name,
                    'artist_name': artist_name,
                    'popularity': popularity,
                    'genre': genre
                }, ignore_index=True)
                progress_bar.update(1)

            offset += limit
        offset = _initial_offset
    progress_bar.close()

    return tracks_df

In [7]:
steps = 20 # 20 * 50 => 1000 per category
limit = 50 # max 50 allowed
offset = 0 # takes the n-1th value of limit
genres_list = ['rock', 'rap', 'metal', 'blues', 'jazz', 'classical', 'funk', 'techno', 'electronic', 'r&b']

In [8]:
tracks_df = get_tracks(genres_list, steps=steps, limit=limit, offset=offset, access_token=access_token)
tracks_df

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [01:40<00:00, 99.21it/s]


Unnamed: 0,track_id,track_name,artist_name,popularity,genre
0,2QjOHCTQ1Jl3zawyYOpxh6,Sweater Weather,The Neighbourhood,91,rock
1,4h9wh7iOZ0GGn8QVp4RAOB,I Ain't Worried,OneRepublic,93,rock
2,58ge6dfP91o9oXMzq3XkIS,505,Arctic Monkeys,82,rock
3,003vvx7Niy0yvhvHt4a68B,Mr. Brightside,The Killers,87,rock
4,5XeFesFbtLpXzIVDNQP22n,I Wanna Be Yours,Arctic Monkeys,94,rock
...,...,...,...,...,...
9995,6cEguiQecbXrFlsnMi2ysr,Come and See Me (feat. Drake),PARTYNEXTDOOR,75,r&b
9996,6ihL9TjfRjadfEePzXXyVF,Gives You Hell,The All-American Rejects,73,r&b
9997,67eX1ovaHyVPUinMHeUtIM,Hurts So Good,John Mellencamp,74,r&b
9998,5ht9obvnnWeW4eoRtPAoQD,Fact (feat. Lil Uzi Vert),Ghostluvme,73,r&b


In [9]:
tracks_df['duplicated'] = tracks_df['track_id'].duplicated(keep='first') # add column about duplicate
tracks_df.groupby(['duplicated'])['duplicated'].count()

duplicated
False    8421
True     1579
Name: duplicated, dtype: int64

In [10]:
# drop duplicate tracks
tracks_df_no_duplicates = tracks_df.drop_duplicates('track_id', keep='first')
tracks_df_no_duplicates.reset_index(drop=True, inplace=True)
len(tracks_df_no_duplicates)

8421

In [11]:
tracks_df_no_duplicates.groupby('genre')['genre'].count()

genre
blues          948
classical      891
electronic     969
funk           817
jazz           883
metal          765
r&b            184
rap            965
rock           999
techno        1000
Name: genre, dtype: int64

## Get Track Features

In [12]:
def get_track_features(tracks_df: pd.DataFrame, access_token: str):
    track_features_df = pd.DataFrame()
    progress_bar = tqdm(total= len(tracks_df))

    with progress_bar:
            for index, row in tracks_df.iterrows():
                track_id = tracks_df.iloc[index]['track_id']
                url = 'https://api.spotify.com/v1/audio-features/' + track_id
                track_features = get_data(url, access_token)
                track_features_df = track_features_df.append(track_features, ignore_index=True)
                progress_bar.update(1)

            progress_bar.close()

    # drop negligible features
    track_features_df.drop(columns=['type', 'uri', 'track_href', 'analysis_url'], inplace=True)
    track_features_df.rename(columns={'id':'track_id'}, inplace=True)

    return track_features_df

In [13]:
track_features_df = get_track_features(tracks_df, access_token)
track_features_df

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [19:05<00:00,  8.73it/s]


Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,track_id,duration_ms,time_signature
0,0.612,0.807,10,-2.810,1,0.0336,0.04950,0.017700,0.1010,0.3980,124.053,2QjOHCTQ1Jl3zawyYOpxh6,240400,4
1,0.704,0.797,0,-5.927,1,0.0475,0.08260,0.000745,0.0546,0.8250,139.994,4h9wh7iOZ0GGn8QVp4RAOB,148486,4
2,0.520,0.852,0,-5.866,1,0.0543,0.00237,0.000058,0.0733,0.2340,140.267,58ge6dfP91o9oXMzq3XkIS,253587,4
3,0.352,0.911,1,-5.230,1,0.0747,0.00121,0.000000,0.0995,0.2360,148.033,003vvx7Niy0yvhvHt4a68B,222973,4
4,0.464,0.417,0,-9.345,0,0.0256,0.13600,0.022000,0.0974,0.4790,67.528,5XeFesFbtLpXzIVDNQP22n,183956,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0.770,0.347,2,-10.265,1,0.0623,0.05110,0.000003,0.0870,0.0583,84.957,6cEguiQecbXrFlsnMi2ysr,235477,4
9996,0.718,0.691,4,-6.440,1,0.0387,0.01590,0.000000,0.0627,0.5520,100.008,6ihL9TjfRjadfEePzXXyVF,213107,4
9997,0.785,0.737,9,-5.306,1,0.0363,0.04200,0.000107,0.1080,0.9710,125.447,67eX1ovaHyVPUinMHeUtIM,218960,4
9998,0.779,0.684,8,-4.963,1,0.1080,0.18100,0.000000,0.0691,0.5260,124.034,5ht9obvnnWeW4eoRtPAoQD,174240,4


In [14]:
track_features_df['duplicated'] = track_features_df['track_id'].duplicated(keep='first') # add column about duplicate
track_features_df.groupby(['duplicated'])['duplicated'].count()

duplicated
False    8421
True     1579
Name: duplicated, dtype: int64

In [15]:
# drop duplicate tracks
track_features_df_no_duplicates = track_features_df.drop_duplicates('track_id', keep='first')
track_features_df_no_duplicates.reset_index(drop=True, inplace=True)
len(track_features_df_no_duplicates)

8421

### Merge datasets

In [16]:
df = tracks_df.merge(track_features_df_no_duplicates, on='track_id')
df

Unnamed: 0,track_id,track_name,artist_name,popularity,genre,duplicated_x,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,duplicated_y
0,2QjOHCTQ1Jl3zawyYOpxh6,Sweater Weather,The Neighbourhood,91,rock,False,0.612,0.807,10,-2.810,1,0.0336,0.04950,0.017700,0.1010,0.398,124.053,240400,4,False
1,2QjOHCTQ1Jl3zawyYOpxh6,Sweater Weather,The Neighbourhood,91,r&b,True,0.612,0.807,10,-2.810,1,0.0336,0.04950,0.017700,0.1010,0.398,124.053,240400,4,False
2,4h9wh7iOZ0GGn8QVp4RAOB,I Ain't Worried,OneRepublic,93,rock,False,0.704,0.797,0,-5.927,1,0.0475,0.08260,0.000745,0.0546,0.825,139.994,148486,4,False
3,4h9wh7iOZ0GGn8QVp4RAOB,I Ain't Worried,OneRepublic,93,r&b,True,0.704,0.797,0,-5.927,1,0.0475,0.08260,0.000745,0.0546,0.825,139.994,148486,4,False
4,58ge6dfP91o9oXMzq3XkIS,505,Arctic Monkeys,82,rock,False,0.520,0.852,0,-5.866,1,0.0543,0.00237,0.000058,0.0733,0.234,140.267,253587,4,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,22I3h5AOENlH4CqXJsEbFR,Collide (feat. Tyga),Justine Skye,81,r&b,False,0.418,0.310,11,-9.869,0,0.0625,0.09500,0.000000,0.2160,0.131,139.614,260170,4,False
9996,2BJy4svtrGACqRB5BFLOK6,Way of the Triune God - Hallelujah Version,Tyler Childers,73,r&b,False,0.696,0.583,10,-6.384,1,0.0307,0.62900,0.000000,0.1230,0.543,79.057,208813,4,False
9997,47EiUVwUp4C9fGccaPuUCS,DÁKITI,Bad Bunny,82,r&b,False,0.731,0.573,4,-10.059,0,0.0544,0.40100,0.000052,0.1130,0.145,109.928,205090,4,False
9998,3bqSvjiSGtzNvxhlAPDBWA,JAMAICA,Feid,0,r&b,False,0.520,0.542,1,-7.404,0,0.1550,0.03830,0.000000,0.3710,0.669,81.131,224240,4,False


# Data Processing
- Remove duplicates and irrelevant attributes.

In [17]:
print('Number of rows:\t', len(df))

Number of rows:	 10000


In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 0 to 9999
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   track_id          10000 non-null  object 
 1   track_name        10000 non-null  object 
 2   artist_name       10000 non-null  object 
 3   popularity        10000 non-null  int64  
 4   genre             10000 non-null  object 
 5   duplicated_x      10000 non-null  bool   
 6   danceability      10000 non-null  float64
 7   energy            10000 non-null  float64
 8   key               10000 non-null  int64  
 9   loudness          10000 non-null  float64
 10  mode              10000 non-null  int64  
 11  speechiness       10000 non-null  float64
 12  acousticness      10000 non-null  float64
 13  instrumentalness  10000 non-null  float64
 14  liveness          10000 non-null  float64
 15  valence           10000 non-null  float64
 16  tempo             10000 non-null  float64

In [19]:
# get duplicates
df[df[['track_name', 'artist_name']].duplicated()]

Unnamed: 0,track_id,track_name,artist_name,popularity,genre,duplicated_x,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,duplicated_y
1,2QjOHCTQ1Jl3zawyYOpxh6,Sweater Weather,The Neighbourhood,91,r&b,True,0.612,0.807,10,-2.810,1,0.0336,0.04950,0.017700,0.1010,0.398,124.053,240400,4,False
3,4h9wh7iOZ0GGn8QVp4RAOB,I Ain't Worried,OneRepublic,93,r&b,True,0.704,0.797,0,-5.927,1,0.0475,0.08260,0.000745,0.0546,0.825,139.994,148486,4,False
5,58ge6dfP91o9oXMzq3XkIS,505,Arctic Monkeys,82,r&b,True,0.520,0.852,0,-5.866,1,0.0543,0.00237,0.000058,0.0733,0.234,140.267,253587,4,False
7,003vvx7Niy0yvhvHt4a68B,Mr. Brightside,The Killers,87,r&b,True,0.352,0.911,1,-5.230,1,0.0747,0.00121,0.000000,0.0995,0.236,148.033,222973,4,False
9,5XeFesFbtLpXzIVDNQP22n,I Wanna Be Yours,Arctic Monkeys,94,r&b,True,0.464,0.417,0,-9.345,0,0.0256,0.13600,0.022000,0.0974,0.479,67.528,183956,4,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9979,4EDijkJdHBZZ0GwJ12iTAj,Stargirl Interlude,The Weeknd,82,r&b,False,0.638,0.474,5,-12.700,0,0.1240,0.35100,0.191000,0.1050,0.441,179.771,111640,4,False
9982,4FAKtPVycI4DxoOHC01YqD,Yandel 150,Yandel,90,r&b,False,0.783,0.729,6,-3.549,0,0.0691,0.04920,0.000272,0.1000,0.580,167.968,216148,4,False
9987,2LlOeW5rVcvl3QcPNPcDus,Always,Daniel Caesar,85,r&b,True,0.603,0.441,9,-8.178,1,0.0518,0.69200,0.000001,0.1050,0.167,136.137,225313,4,False
9992,7dSZ6zGTQx66c2GF91xCrb,PROVENZA,KAROL G,84,r&b,False,0.870,0.516,1,-8.006,1,0.0541,0.65600,0.008230,0.1100,0.530,111.005,210200,4,False


In [20]:
df[df[['track_name']].duplicated()]

Unnamed: 0,track_id,track_name,artist_name,popularity,genre,duplicated_x,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,duplicated_y
1,2QjOHCTQ1Jl3zawyYOpxh6,Sweater Weather,The Neighbourhood,91,r&b,True,0.612,0.807,10,-2.810,1,0.0336,0.04950,0.017700,0.1010,0.398,124.053,240400,4,False
3,4h9wh7iOZ0GGn8QVp4RAOB,I Ain't Worried,OneRepublic,93,r&b,True,0.704,0.797,0,-5.927,1,0.0475,0.08260,0.000745,0.0546,0.825,139.994,148486,4,False
5,58ge6dfP91o9oXMzq3XkIS,505,Arctic Monkeys,82,r&b,True,0.520,0.852,0,-5.866,1,0.0543,0.00237,0.000058,0.0733,0.234,140.267,253587,4,False
7,003vvx7Niy0yvhvHt4a68B,Mr. Brightside,The Killers,87,r&b,True,0.352,0.911,1,-5.230,1,0.0747,0.00121,0.000000,0.0995,0.236,148.033,222973,4,False
9,5XeFesFbtLpXzIVDNQP22n,I Wanna Be Yours,Arctic Monkeys,94,r&b,True,0.464,0.417,0,-9.345,0,0.0256,0.13600,0.022000,0.0974,0.479,67.528,183956,4,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9982,4FAKtPVycI4DxoOHC01YqD,Yandel 150,Yandel,90,r&b,False,0.783,0.729,6,-3.549,0,0.0691,0.04920,0.000272,0.1000,0.580,167.968,216148,4,False
9986,2LlOeW5rVcvl3QcPNPcDus,Always,Daniel Caesar,85,r&b,False,0.603,0.441,9,-8.178,1,0.0518,0.69200,0.000001,0.1050,0.167,136.137,225313,4,False
9987,2LlOeW5rVcvl3QcPNPcDus,Always,Daniel Caesar,85,r&b,True,0.603,0.441,9,-8.178,1,0.0518,0.69200,0.000001,0.1050,0.167,136.137,225313,4,False
9992,7dSZ6zGTQx66c2GF91xCrb,PROVENZA,KAROL G,84,r&b,False,0.870,0.516,1,-8.006,1,0.0541,0.65600,0.008230,0.1100,0.530,111.005,210200,4,False


In [21]:
t = tracks_df.loc[tracks_df['genre']=='r&b']
t[t[['track_name']].duplicated()]

Unnamed: 0,track_id,track_name,artist_name,popularity,genre,duplicated
9011,3OHfY25tqY28d16oZczHc8,Kill Bill,SZA,92,r&b,True
9151,2Ch7LmS7r2Gy2kc64wv3Bz,Die For You,The Weeknd,87,r&b,False
9204,0RiRZpuVRbi7oqRdSMwhQY,Sunflower - Spider-Man: Into the Spider-Verse,Post Malone,84,r&b,True
9224,6bnF93Rx87YqUBLSgjiMU8,Heartless,The Weeknd,86,r&b,False
9296,6OfOzTitafSnsaunQLuNFw,DOGTOOTH,"Tyler, The Creator",80,r&b,True
9357,4PMqSO5qyjpvzhlLI5GnID,Good Days,SZA,80,r&b,True
9541,54ipXppHLA8U4yqpOFTUhr,Bones,Imagine Dragons,87,r&b,True
9564,2mEXFcbDPRVN5yEGaerasl,Starboy,The Weeknd,5,r&b,False
9568,0NbWSA51peslttxjmRosj5,Save Your Tears,The Weeknd,0,r&b,False
9569,0JvmRjqQsw922wI7u19UoP,Save Your Tears,The Weeknd,5,r&b,False


There appears to be multiple scenarios causing duplicates:
- Totally identical rows: Some songs are published multiple times e.g. in the original album release and best of compilation of a band.
- Same song different versions: Probably original version, radio edit and remix under the same name.
- A song associated with multiple genres
- A joint song of multiple artists. Column `artist_name` represents only one artist if it's a collab song and same track can come twice under each artist's name.

Strategy:
- Remove duplicates of `track_name` & `artist_name`.

In [22]:
print('Number of duplicate tracks:\t', len(df[df[['track_name']].duplicated()]))
df = df.drop_duplicates(['track_name', 'artist_name'], keep='first')

Number of duplicate tracks:	 2461


In [23]:
print('Number of tracks by genre:')
df.groupby(by=['genre'])['genre'].count().sort_values()

Number of tracks by genre:


genre
r&b           154
funk          634
metal         696
jazz          836
classical     861
blues         888
rap           904
electronic    928
techno        948
rock          970
Name: genre, dtype: int64

In [24]:
print('Average values by genre')
genre_averages_df = df.groupby(by=['genre']).mean().sort_values(by='genre')
genre_averages_df.reset_index(drop=False, inplace=True)
genre_averages_df

Average values by genre


Unnamed: 0,genre,popularity,duplicated_x,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,duplicated_y
0,blues,56.407658,0.0,0.546552,0.600044,5.315315,-8.716926,0.689189,0.055616,0.327383,0.071403,0.179546,0.587301,119.127936,220466.631757,3.887387,0.0
1,classical,23.077816,0.0,0.350112,0.126068,5.004646,-23.972818,0.616725,0.045524,0.96521,0.706961,0.135561,0.374249,105.991048,227573.547038,3.66899,0.0
2,electronic,54.28556,0.0,0.616458,0.681744,5.268319,-7.60318,0.488147,0.079093,0.185209,0.30659,0.194092,0.380393,123.64379,236311.121767,3.962284,0.0
3,funk,54.646688,0.0,0.691402,0.686662,5.65142,-7.405505,0.580442,0.110622,0.189028,0.045641,0.19138,0.645836,115.807639,242920.697161,3.9653,0.0
4,jazz,55.991627,0.0,0.570301,0.366922,5.301435,-13.082016,0.576555,0.066554,0.642248,0.385451,0.154144,0.458837,108.131738,218634.322967,3.87799,0.0
5,metal,66.469828,0.0,0.461443,0.857368,5.277299,-5.129343,0.576149,0.078955,0.021849,0.061271,0.213192,0.418952,127.908055,242896.433908,3.942529,0.0
6,r&b,78.798701,0.0,0.668617,0.600325,5.350649,-6.94774,0.558442,0.086279,0.286477,0.022228,0.175051,0.460697,115.931045,211424.12987,3.902597,0.0
7,rap,76.719027,0.0,0.711086,0.622854,5.255531,-6.645815,0.526549,0.187509,0.177098,0.012932,0.197434,0.433099,122.520257,203059.284292,3.978982,0.0
8,rock,75.430928,0.0,0.543754,0.71568,5.2,-7.183482,0.748454,0.054232,0.155517,0.037255,0.178193,0.56182,123.319913,234586.670103,3.948454,0.0
9,techno,48.270042,0.0,0.678075,0.76817,5.409283,-7.916874,0.549578,0.065848,0.080923,0.545062,0.168372,0.371679,128.43585,290686.738397,3.96308,0.0


In [26]:
# export data
file_path = Path('genre-classification/data/tracks_1000+_with_duplicates.csv')
file_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(file_path)