# Data Collection
- This notebook is responsible for collecting / processing the dataset of tracks including audio features to be used in model training and test.
- Make sure you already have created a Spotify app, have your ***client_id*** and ***client_secret*** ready. They are necessary for getting an access token.
- Create two environment variables with exact same name on your machine and the notebook will automatically include them in the context.
- Access token is included in requests header and is valid for an hour.

In [1]:
# import libraries

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning) # supress future warnings
import os
import json
import requests
import pandas as pd
from pathlib import Path
from tqdm import tqdm

## Get Access Token

In [2]:
def get_access_token(client_id: str, client_secret: str, grant_type: str = 'client_credentials'):
    url = 'https://accounts.spotify.com/api/token?grant_type={}&client_id={}&client_secret={}'.format(grant_type, client_id, client_secret)
    response = requests.post(url, headers={'Content-Type':'application/x-www-form-urlencoded'})
    access_token = 'Bearer ' + json.loads(response.text)['access_token']

    return access_token

In [3]:
# get access token
grant_type = 'client_credentials'
client_id = os.getenv('SPOTIFY_CLIENT_ID')
client_secret = os.getenv('SPOTIFY_CLIENT_SECRET')

access_token = get_access_token(client_id, client_secret, grant_type)

In [4]:
def get_data(url: str, access_token: str, verbose: bool = False):
    response = requests.get(url, headers={'Authorization': access_token})
    result = json.loads(response.text)

    if verbose:
        print('Response body:\n', result)

    return result

## Get Tracks
|track_id|track_name|artist_name|popularity|genre|
|---|---|---|---|---|

In [5]:
def get_tracks(genres_list: list, steps: int, limit: int, offset: int,access_token: str):
    tracks_df = pd.DataFrame()
    _initial_offset = offset
    progress_bar = tqdm(total= len(genres_list) * steps * limit)

    for genre in genres_list:
        
        for step in range(steps):
            url = 'https://api.spotify.com/v1/search?q=genre:{}&type=track&limit={}&offset={}'.format(genre, limit, offset)
            search_item = get_data(url, access_token)

            for n in range(limit):   
                # print(search_item.get("error",""))
                track_id = search_item['tracks']['items'][n]['id']
                track_name = search_item['tracks']['items'][n]['name']
                artist_name = search_item['tracks']['items'][n]['artists'][0]['name']
                popularity = search_item['tracks']['items'][n]['popularity']

                tracks_df = tracks_df.append({
                    'track_id': track_id,
                    'track_name': track_name,
                    'artist_name': artist_name,
                    'popularity': popularity,
                    'genre': genre
                }, ignore_index=True)
                progress_bar.update(1)

            offset += limit
        offset = _initial_offset
    progress_bar.close()

    return tracks_df

In [6]:
steps = 20 # 20 * 50 => 1000 per category
limit = 50 # max 50 allowed
offset = 0 # takes the n-1th value of limit
genres_list = ['rock', 'rap', 'metal', 'blues', 'jazz', 'classical', 'funk', 'techno', 'electronic', 'r&b']

In [7]:
tracks_df = get_tracks(genres_list, steps=steps, limit=limit, offset=offset, access_token=access_token)
tracks_df

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [01:23<00:00, 119.75it/s]


Unnamed: 0,track_id,track_name,artist_name,popularity,genre
0,2QjOHCTQ1Jl3zawyYOpxh6,Sweater Weather,The Neighbourhood,91,rock
1,4h9wh7iOZ0GGn8QVp4RAOB,I Ain't Worried,OneRepublic,93,rock
2,58ge6dfP91o9oXMzq3XkIS,505,Arctic Monkeys,82,rock
3,003vvx7Niy0yvhvHt4a68B,Mr. Brightside,The Killers,87,rock
4,5XeFesFbtLpXzIVDNQP22n,I Wanna Be Yours,Arctic Monkeys,94,rock
...,...,...,...,...,...
9995,22I3h5AOENlH4CqXJsEbFR,Collide (feat. Tyga),Justine Skye,81,r&b
9996,124NFj84ppZ5pAxTuVQYCQ,Take Care,Drake,78,r&b
9997,2BJy4svtrGACqRB5BFLOK6,Way of the Triune God - Hallelujah Version,Tyler Childers,73,r&b
9998,5HQVUIKwCEXpe7JIHyY734,"Young, Wild & Free (feat. Bruno Mars)",Snoop Dogg,79,r&b


In [8]:
tracks_df['duplicated'] = tracks_df['track_id'].duplicated(keep='first') # add column about duplicate
tracks_df.groupby(['duplicated'])['duplicated'].count()

duplicated
False    8480
True     1520
Name: duplicated, dtype: int64

In [9]:
# drop duplicate tracks
tracks_df_no_duplicates = tracks_df.drop_duplicates('track_id', keep='first')
tracks_df_no_duplicates.reset_index(drop=True, inplace=True)
len(tracks_df_no_duplicates)

8480

In [10]:
tracks_df_no_duplicates.groupby('genre')['genre'].count()

genre
blues          949
classical      936
electronic     968
funk           819
jazz           882
metal          764
r&b            199
rap            965
rock           998
techno        1000
Name: genre, dtype: int64

## Get Track Features

In [25]:
def get_track_features(tracks_df: pd.DataFrame, access_token: str):
    track_features_df = pd.DataFrame()
    progress_bar = tqdm(total= len(tracks_df))

    with progress_bar: 
            idsToRequest = []
            for index, row in tracks_df.iterrows():
                track_id = tracks_df.iloc[index]['track_id']
                idsToRequest += [track_id]
            
            for i in range(len(idsToRequest) // 100 + 1):
                _list = idsToRequest[i*100:(i+1)*100]
                if len(_list) == 0: 
                    break
                request_text = ",".join(_list) 
                url = 'https://api.spotify.com/v1/audio-features?ids=' + request_text
                result = get_data(url, access_token)
                track_features_list = result["audio_features"]
                for track_features in track_features_list:
                    track_features_df = track_features_df.append(track_features, ignore_index=True)
                progress_bar.update(len(_list))

            progress_bar.close()

    # drop negligible features
    track_features_df.drop(columns=['type', 'uri', 'track_href', 'analysis_url'], inplace=True)
    track_features_df.rename(columns={'id':'track_id'}, inplace=True)

    return track_features_df

In [26]:
track_features_df = get_track_features(tracks_df, access_token)
track_features_df

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [00:57<00:00, 173.24it/s]


Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,track_id,duration_ms,time_signature
0,0.612,0.807,10,-2.810,1,0.0336,0.04950,0.017700,0.1010,0.398,124.053,2QjOHCTQ1Jl3zawyYOpxh6,240400,4
1,0.704,0.797,0,-5.927,1,0.0475,0.08260,0.000745,0.0546,0.825,139.994,4h9wh7iOZ0GGn8QVp4RAOB,148486,4
2,0.520,0.852,0,-5.866,1,0.0543,0.00237,0.000058,0.0733,0.234,140.267,58ge6dfP91o9oXMzq3XkIS,253587,4
3,0.352,0.911,1,-5.230,1,0.0747,0.00121,0.000000,0.0995,0.236,148.033,003vvx7Niy0yvhvHt4a68B,222973,4
4,0.464,0.417,0,-9.345,0,0.0256,0.13600,0.022000,0.0974,0.479,67.528,5XeFesFbtLpXzIVDNQP22n,183956,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0.418,0.310,11,-9.869,0,0.0625,0.09500,0.000000,0.2160,0.131,139.614,22I3h5AOENlH4CqXJsEbFR,260170,4
9996,0.629,0.515,0,-10.358,0,0.2650,0.02670,0.000012,0.0888,0.299,121.845,124NFj84ppZ5pAxTuVQYCQ,277387,4
9997,0.696,0.583,10,-6.384,1,0.0307,0.62900,0.000000,0.1230,0.543,79.057,2BJy4svtrGACqRB5BFLOK6,208813,4
9998,0.715,0.655,0,-6.425,1,0.1370,0.05250,0.000000,0.1150,0.531,95.078,5HQVUIKwCEXpe7JIHyY734,207333,4


In [27]:
track_features_df['duplicated'] = track_features_df['track_id'].duplicated(keep='first') # add column about duplicate
track_features_df.groupby(['duplicated'])['duplicated'].count()

duplicated
False    8480
True     1520
Name: duplicated, dtype: int64

In [28]:
# drop duplicate tracks
track_features_df_no_duplicates = track_features_df.drop_duplicates('track_id', keep='first')
track_features_df_no_duplicates.reset_index(drop=True, inplace=True)
len(track_features_df_no_duplicates)

8480

### Merge datasets

In [29]:
df = tracks_df.merge(track_features_df_no_duplicates, on='track_id')
df

Unnamed: 0,track_id,track_name,artist_name,popularity,genre,duplicated_x,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,duplicated_y
0,2QjOHCTQ1Jl3zawyYOpxh6,Sweater Weather,The Neighbourhood,91,rock,False,0.612,0.807,10,-2.810,1,0.0336,0.04950,0.017700,0.1010,0.398,124.053,240400,4,False
1,2QjOHCTQ1Jl3zawyYOpxh6,Sweater Weather,The Neighbourhood,91,r&b,True,0.612,0.807,10,-2.810,1,0.0336,0.04950,0.017700,0.1010,0.398,124.053,240400,4,False
2,4h9wh7iOZ0GGn8QVp4RAOB,I Ain't Worried,OneRepublic,93,rock,False,0.704,0.797,0,-5.927,1,0.0475,0.08260,0.000745,0.0546,0.825,139.994,148486,4,False
3,4h9wh7iOZ0GGn8QVp4RAOB,I Ain't Worried,OneRepublic,93,r&b,True,0.704,0.797,0,-5.927,1,0.0475,0.08260,0.000745,0.0546,0.825,139.994,148486,4,False
4,58ge6dfP91o9oXMzq3XkIS,505,Arctic Monkeys,82,rock,False,0.520,0.852,0,-5.866,1,0.0543,0.00237,0.000058,0.0733,0.234,140.267,253587,4,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,7dSZ6zGTQx66c2GF91xCrb,PROVENZA,KAROL G,84,r&b,False,0.870,0.516,1,-8.006,1,0.0541,0.65600,0.008230,0.1100,0.530,111.005,210200,4,False
9996,6leiB1fEsTnVCuPiielde5,Something Like That,Tim McGraw,74,r&b,False,0.507,0.850,5,-5.679,1,0.0472,0.37800,0.000000,0.0567,0.847,171.800,183733,4,False
9997,4qwKWGKzuq8mgIunO6EaA1,Sleep Well,d4vd,79,r&b,False,0.449,0.377,8,-9.438,1,0.0343,0.81500,0.000012,0.1420,0.181,155.560,175875,3,False
9998,22I3h5AOENlH4CqXJsEbFR,Collide (feat. Tyga),Justine Skye,81,r&b,False,0.418,0.310,11,-9.869,0,0.0625,0.09500,0.000000,0.2160,0.131,139.614,260170,4,False


# Data Processing
- Remove duplicates and irrelevant attributes.

In [30]:
print('Number of rows:\t', len(df))

Number of rows:	 10000


In [31]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 0 to 9999
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   track_id          10000 non-null  object 
 1   track_name        10000 non-null  object 
 2   artist_name       10000 non-null  object 
 3   popularity        10000 non-null  int64  
 4   genre             10000 non-null  object 
 5   duplicated_x      10000 non-null  bool   
 6   danceability      10000 non-null  float64
 7   energy            10000 non-null  float64
 8   key               10000 non-null  int64  
 9   loudness          10000 non-null  float64
 10  mode              10000 non-null  int64  
 11  speechiness       10000 non-null  float64
 12  acousticness      10000 non-null  float64
 13  instrumentalness  10000 non-null  float64
 14  liveness          10000 non-null  float64
 15  valence           10000 non-null  float64
 16  tempo             10000 non-null  float64

In [32]:
# get duplicates
df[df[['track_name', 'artist_name']].duplicated()]

Unnamed: 0,track_id,track_name,artist_name,popularity,genre,duplicated_x,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,duplicated_y
1,2QjOHCTQ1Jl3zawyYOpxh6,Sweater Weather,The Neighbourhood,91,r&b,True,0.612,0.807,10,-2.810,1,0.0336,0.04950,0.017700,0.1010,0.398,124.053,240400,4,False
3,4h9wh7iOZ0GGn8QVp4RAOB,I Ain't Worried,OneRepublic,93,r&b,True,0.704,0.797,0,-5.927,1,0.0475,0.08260,0.000745,0.0546,0.825,139.994,148486,4,False
5,58ge6dfP91o9oXMzq3XkIS,505,Arctic Monkeys,82,r&b,True,0.520,0.852,0,-5.866,1,0.0543,0.00237,0.000058,0.0733,0.234,140.267,253587,4,False
7,003vvx7Niy0yvhvHt4a68B,Mr. Brightside,The Killers,87,r&b,True,0.352,0.911,1,-5.230,1,0.0747,0.00121,0.000000,0.0995,0.236,148.033,222973,4,False
9,5XeFesFbtLpXzIVDNQP22n,I Wanna Be Yours,Arctic Monkeys,94,r&b,True,0.464,0.417,0,-9.345,0,0.0256,0.13600,0.022000,0.0974,0.479,67.528,183956,4,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9980,4EDijkJdHBZZ0GwJ12iTAj,Stargirl Interlude,The Weeknd,82,r&b,False,0.638,0.474,5,-12.700,0,0.1240,0.35100,0.191000,0.1050,0.441,179.771,111640,4,False
9983,3NZJlJemX3mzjf56MqC5ML,Forever,Chris Brown,79,r&b,True,0.672,0.820,11,-4.456,1,0.0459,0.03680,0.000188,0.1840,0.438,120.005,278573,4,False
9985,4FAKtPVycI4DxoOHC01YqD,Yandel 150,Yandel,90,r&b,False,0.783,0.729,6,-3.549,0,0.0691,0.04920,0.000272,0.1000,0.580,167.968,216148,4,False
9988,2SLwbpExuoBDZBpjfefCtV,Out of Time,The Weeknd,82,r&b,False,0.650,0.760,0,-4.422,0,0.0446,0.25500,0.000000,0.3390,0.838,93.057,214194,4,False


In [33]:
df[df[['track_name']].duplicated()]

Unnamed: 0,track_id,track_name,artist_name,popularity,genre,duplicated_x,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,duplicated_y
1,2QjOHCTQ1Jl3zawyYOpxh6,Sweater Weather,The Neighbourhood,91,r&b,True,0.612,0.807,10,-2.810,1,0.0336,0.04950,0.017700,0.1010,0.398,124.053,240400,4,False
3,4h9wh7iOZ0GGn8QVp4RAOB,I Ain't Worried,OneRepublic,93,r&b,True,0.704,0.797,0,-5.927,1,0.0475,0.08260,0.000745,0.0546,0.825,139.994,148486,4,False
5,58ge6dfP91o9oXMzq3XkIS,505,Arctic Monkeys,82,r&b,True,0.520,0.852,0,-5.866,1,0.0543,0.00237,0.000058,0.0733,0.234,140.267,253587,4,False
7,003vvx7Niy0yvhvHt4a68B,Mr. Brightside,The Killers,87,r&b,True,0.352,0.911,1,-5.230,1,0.0747,0.00121,0.000000,0.0995,0.236,148.033,222973,4,False
9,5XeFesFbtLpXzIVDNQP22n,I Wanna Be Yours,Arctic Monkeys,94,r&b,True,0.464,0.417,0,-9.345,0,0.0256,0.13600,0.022000,0.0974,0.479,67.528,183956,4,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9983,3NZJlJemX3mzjf56MqC5ML,Forever,Chris Brown,79,r&b,True,0.672,0.820,11,-4.456,1,0.0459,0.03680,0.000188,0.1840,0.438,120.005,278573,4,False
9985,4FAKtPVycI4DxoOHC01YqD,Yandel 150,Yandel,90,r&b,False,0.783,0.729,6,-3.549,0,0.0691,0.04920,0.000272,0.1000,0.580,167.968,216148,4,False
9988,2SLwbpExuoBDZBpjfefCtV,Out of Time,The Weeknd,82,r&b,False,0.650,0.760,0,-4.422,0,0.0446,0.25500,0.000000,0.3390,0.838,93.057,214194,4,False
9990,2LlOeW5rVcvl3QcPNPcDus,Always,Daniel Caesar,85,r&b,False,0.603,0.441,9,-8.178,1,0.0518,0.69200,0.000001,0.1050,0.167,136.137,225313,4,False


There appears to be multiple scenarios causing duplicates:
- Totally identical rows: Some songs are published multiple times e.g. in the original album release and best of compilation of a band.
- Same song different versions: Probably original version, radio edit and remix under the same name.
- A song associated with multiple genres
- A joint song of multiple artists. Column `artist_name` represents only one artist if it's a collab song and same track can come twice under each artist's name.

Strategy:
- Remove duplicates of `track_name` & `artist_name`.

In [34]:
print('Number of duplicate tracks:\t', len(df[df[['track_name']].duplicated()]))

print('Number of unique tracks by genre:')
df_no_duplicates = df.drop_duplicates(['track_name', 'artist_name'], keep='first')
df_no_duplicates.groupby(by=['genre'])['genre'].count().sort_values()

Number of duplicate tracks:	 2437
Number of unique tracks by genre:


genre
r&b           159
funk          635
metal         697
jazz          834
blues         885
rap           898
classical     902
electronic    925
techno        943
rock          965
Name: genre, dtype: int64

In [35]:
print('Average values by genre')
genre_averages_df = df.groupby(by=['genre']).mean().sort_values(by='genre')
genre_averages_df.reset_index(drop=False, inplace=True)
genre_averages_df

Average values by genre


Unnamed: 0,genre,popularity,duplicated_x,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,duplicated_y
0,blues,55.791,0.051,0.543802,0.601717,5.243,-8.63088,0.698,0.056043,0.327326,0.070233,0.178929,0.583396,119.759554,221849.909,3.887,0.0
1,classical,21.368,0.064,0.345031,0.126276,4.948,-23.839342,0.615,0.045461,0.96414,0.695128,0.137121,0.369812,106.204014,230685.476,3.671,0.0
2,electronic,53.579,0.032,0.61693,0.684143,5.267,-7.52876,0.486,0.07873,0.181976,0.304557,0.192859,0.380322,123.631644,235003.708,3.963,0.0
3,funk,48.276,0.181,0.685648,0.699717,5.652,-6.969145,0.558,0.102108,0.183514,0.04015,0.186871,0.628613,115.268049,239096.486,3.976,0.0
4,jazz,53.732,0.118,0.56373,0.367953,5.29,-12.694265,0.596,0.065719,0.649845,0.320657,0.158348,0.468011,109.344007,215243.236,3.854,0.0
5,metal,65.306,0.236,0.470822,0.84026,5.277,-5.403389,0.589,0.073214,0.026662,0.054505,0.207564,0.436505,125.892015,244562.235,3.947,0.0
6,r&b,77.226,0.801,0.648535,0.654196,5.248,-6.736796,0.599,0.117449,0.184566,0.014643,0.185817,0.476308,120.735264,217613.361,3.967,0.0
7,rap,74.803,0.035,0.700609,0.6332,5.163,-6.550656,0.539,0.180702,0.170246,0.013576,0.197647,0.432464,122.374513,203548.452,3.977,0.0
8,rock,73.675,0.002,0.548334,0.717793,5.193,-7.104495,0.738,0.055335,0.153429,0.036507,0.177654,0.563378,123.019054,233140.48,3.95,0.0
9,techno,47.679,0.0,0.676588,0.76841,5.391,-7.833261,0.549,0.065055,0.081783,0.529936,0.168762,0.372563,128.487445,288143.704,3.964,0.0


In [37]:
# export data
file_path = Path('genre-classification/data/tracks_1000+_with_duplicates.csv')
file_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(file_path)