# Data Collection
- This notebook is responsible for collecting / processing the dataset of tracks including audio features to be used in model training and test.
- Make sure you already have created a Spotify app, have your ***client_id*** and ***client_secret*** ready. They are necessary for getting an access token.
- Create two environment variables with exact same name on your machine and the notebook will automatically include them in the context.
- Access token is included in requests header and is valid for an hour.

In [27]:
# import libraries

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning) # supress future warnings
import os
import json
import requests
import pandas as pd
from pathlib import Path
from tqdm import tqdm

## Get Access Token

In [28]:
def get_access_token(client_id: str, client_secret: str, grant_type: str = 'client_credentials'):
    url = 'https://accounts.spotify.com/api/token?grant_type={}&client_id={}&client_secret={}'.format(grant_type, client_id, client_secret)
    response = requests.post(url, headers={'Content-Type':'application/x-www-form-urlencoded'})
    access_token = 'Bearer ' + json.loads(response.text)['access_token']

    return access_token

In [29]:
# get access token
grant_type = 'client_credentials'
client_id = os.getenv('SPOTIFY_CLIENT_ID')
client_secret = os.getenv('SPOTIFY_CLIENT_SECRET')

access_token = get_access_token(client_id, client_secret, grant_type)

In [31]:
def get_data(url: str, access_token: str, verbose: bool = False):
    response = requests.get(url, headers={'Authorization': access_token})
    result = json.loads(response.text)

    if verbose:
        print('Response body:\n', result)

    return result

## Get Tracks
|track_id|track_name|artist_name|popularity|genre|
|---|---|---|---|---|

In [32]:
def get_tracks(genres_list: list, steps: int, limit: int, offset: int,access_token: str):
    tracks_df = pd.DataFrame()
    _initial_offset = offset
    progress_bar = tqdm(total= len(genres_list) * steps * limit)

    for genre in genres_list:
        
        for step in range(steps):
            url = 'https://api.spotify.com/v1/search?q=genre:{}&type=track&limit={}&offset={}'.format(genre, limit, offset)
            search_item = get_data(url, access_token)

            for n in range(limit):   
                # print(search_item.get("error",""))
                track_id = search_item['tracks']['items'][n]['id']
                track_name = search_item['tracks']['items'][n]['name']
                artist_name = search_item['tracks']['items'][n]['artists'][0]['name']
                popularity = search_item['tracks']['items'][n]['popularity']

                tracks_df = tracks_df.append({
                    'track_id': track_id,
                    'track_name': track_name,
                    'artist_name': artist_name,
                    'popularity': popularity,
                    'genre': genre
                }, ignore_index=True)
                progress_bar.update(1)

            offset += limit
        offset = _initial_offset
    progress_bar.close()

    return tracks_df

In [33]:
steps = 20 # 20 * 50 => 1000 per category
limit = 50 # max 50 allowed
offset = 0 # takes the n-1th value of limit
genres_list = ['rock', 'rap', 'metal', 'blues', 'jazz', 'classical', 'funk', 'techno', 'electronic', 'r&b']

In [34]:
tracks_df = get_tracks(genres_list, steps=steps, limit=limit, offset=offset, access_token=access_token)
tracks_df

100%|██████████| 10000/10000 [01:44<00:00, 95.94it/s]


Unnamed: 0,track_id,track_name,artist_name,popularity,genre
0,2QjOHCTQ1Jl3zawyYOpxh6,Sweater Weather,The Neighbourhood,91.0,rock
1,4h9wh7iOZ0GGn8QVp4RAOB,I Ain't Worried,OneRepublic,93.0,rock
2,58ge6dfP91o9oXMzq3XkIS,505,Arctic Monkeys,82.0,rock
3,003vvx7Niy0yvhvHt4a68B,Mr. Brightside,The Killers,87.0,rock
4,5XeFesFbtLpXzIVDNQP22n,I Wanna Be Yours,Arctic Monkeys,94.0,rock
...,...,...,...,...,...
9995,6cEguiQecbXrFlsnMi2ysr,Come and See Me (feat. Drake),PARTYNEXTDOOR,75.0,r&b
9996,6ihL9TjfRjadfEePzXXyVF,Gives You Hell,The All-American Rejects,73.0,r&b
9997,67eX1ovaHyVPUinMHeUtIM,Hurts So Good,John Mellencamp,74.0,r&b
9998,5ht9obvnnWeW4eoRtPAoQD,Fact (feat. Lil Uzi Vert),Ghostluvme,73.0,r&b


In [35]:
tracks_df['duplicated'] = tracks_df['track_id'].duplicated(keep='first') # add column about duplicate
tracks_df.groupby(['duplicated'])['duplicated'].count()

duplicated
False    8431
True     1569
Name: duplicated, dtype: int64

In [36]:
# drop duplicate tracks
tracks_df_no_duplicates = tracks_df.drop_duplicates('track_id', keep='first')
tracks_df_no_duplicates.reset_index(drop=True, inplace=True)
len(tracks_df_no_duplicates)

8431

In [37]:
tracks_df_no_duplicates.groupby('genre')['genre'].count()

genre
blues          948
classical      900
electronic     970
funk           815
jazz           885
metal          765
r&b            185
rap            964
rock           999
techno        1000
Name: genre, dtype: int64

## Get Track Features

In [38]:
def get_track_features(tracks_df: pd.DataFrame, access_token: str):
    track_features_df = pd.DataFrame()
    progress_bar = tqdm(total= len(tracks_df))

    with progress_bar:
            for index, row in tracks_df.iterrows():
                track_id = tracks_df.iloc[index]['track_id']
                url = 'https://api.spotify.com/v1/audio-features/' + track_id
                track_features = get_data(url, access_token)
                track_features_df = track_features_df.append(track_features, ignore_index=True)
                progress_bar.update(1)

            progress_bar.close()

    # drop negligible features
    track_features_df.drop(columns=['type', 'uri', 'track_href', 'analysis_url'], inplace=True)
    track_features_df.rename(columns={'id':'track_id'}, inplace=True)

    return track_features_df

In [39]:
track_features_df = get_track_features(tracks_df, access_token)
track_features_df

100%|██████████| 10000/10000 [24:53<00:00,  6.70it/s]


Unnamed: 0,error,acousticness,danceability,duration_ms,energy,track_id,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,{'status': 429},,,,,,,,,,,,,,
1,{'status': 429},,,,,,,,,,,,,,
2,{'status': 429},,,,,,,,,,,,,,
3,{'status': 429},,,,,,,,,,,,,,
4,{'status': 429},,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,{'status': 429},,,,,,,,,,,,,,
9996,{'status': 429},,,,,,,,,,,,,,
9997,{'status': 429},,,,,,,,,,,,,,
9998,{'status': 429},,,,,,,,,,,,,,


In [40]:
track_features_df['duplicated'] = track_features_df['track_id'].duplicated(keep='first') # add column about duplicate
track_features_df.groupby(['duplicated'])['duplicated'].count()

duplicated
False       5
True     9995
Name: duplicated, dtype: int64

In [41]:
# drop duplicate tracks
track_features_df_no_duplicates = track_features_df.drop_duplicates('track_id', keep='first')
track_features_df_no_duplicates.reset_index(drop=True, inplace=True)
len(track_features_df_no_duplicates)

5

### Merge datasets

In [42]:
df = tracks_df.merge(track_features_df_no_duplicates, on='track_id')
df

Unnamed: 0,track_id,track_name,artist_name,popularity,genre,duplicated_x,error,acousticness,danceability,duration_ms,...,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,duplicated_y
0,4iW98ZK1gNzcXmQ5TgTD9X,I Just Want To Celebrate,Rare Earth,51.0,blues,False,,0.105,0.435,217560.0,...,2e-06,9.0,0.015,-7.7,1.0,0.235,175.448,4.0,0.65,False
1,5Iev69m1Jnf7oc79VvPjpd,Cupid,Sam Cooke,60.0,jazz,False,,0.476,0.485,158014.0,...,0.0,7.0,0.0861,-10.503,1.0,0.0342,120.471,4.0,0.825,False
2,6cgSmS2mTPOMmplqUfQaeZ,"Prelude & Fugue in C-Sharp Minor, BWV 849: I. ...",Johann Sebastian Bach,22.0,classical,False,,0.996,0.297,176080.0,...,0.927,1.0,0.0882,-27.217,0.0,0.0533,182.862,4.0,0.277,False
3,0KubEonULpUEsK2O8Nzhj3,"Piano Concerto No. 20 in D Minor, K. 466: II. ...",Wolfgang Amadeus Mozart,14.0,classical,False,,0.987,0.204,598782.0,...,0.873,10.0,0.0983,-21.589,1.0,0.0317,80.701,4.0,0.118,False


# Data Processing
- Remove duplicates and irrelevant attributes.

In [43]:
print('Number of rows:\t', len(df))

Number of rows:	 4


In [44]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4 entries, 0 to 3
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   track_id          4 non-null      object 
 1   track_name        4 non-null      object 
 2   artist_name       4 non-null      object 
 3   popularity        4 non-null      float64
 4   genre             4 non-null      object 
 5   duplicated_x      4 non-null      bool   
 6   error             0 non-null      object 
 7   acousticness      4 non-null      float64
 8   danceability      4 non-null      float64
 9   duration_ms       4 non-null      float64
 10  energy            4 non-null      float64
 11  instrumentalness  4 non-null      float64
 12  key               4 non-null      float64
 13  liveness          4 non-null      float64
 14  loudness          4 non-null      float64
 15  mode              4 non-null      float64
 16  speechiness       4 non-null      float64
 17  t

In [45]:
# get duplicates
df[df[['track_name', 'artist_name']].duplicated()]

Unnamed: 0,track_id,track_name,artist_name,popularity,genre,duplicated_x,error,acousticness,danceability,duration_ms,...,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,duplicated_y


In [46]:
df[df[['track_name']].duplicated()]

Unnamed: 0,track_id,track_name,artist_name,popularity,genre,duplicated_x,error,acousticness,danceability,duration_ms,...,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,duplicated_y


There appears to be multiple scenarios causing duplicates:
- Totally identical rows: Some songs are published multiple times e.g. in the original album release and best of compilation of a band.
- Same song different versions: Probably original version, radio edit and remix under the same name.
- A song associated with multiple genres
- A joint song of multiple artists. Column `artist_name` represents only one artist if it's a collab song and same track can come twice under each artist's name.

Strategy:
- Remove duplicates of `track_name` & `artist_name`.

In [47]:
print('Number of duplicate tracks:\t', len(df[df[['track_name']].duplicated()]))

print('Number of unique tracks by genre:')
df_no_duplicates = df.drop_duplicates(['track_name', 'artist_name'], keep='first')
df_no_duplicates.groupby(by=['genre'])['genre'].count().sort_values()

Number of duplicate tracks:	 0
Number of unique tracks by genre:


genre
blues        1
jazz         1
classical    2
Name: genre, dtype: int64

In [48]:
print('Average values by genre')
genre_averages_df = df.groupby(by=['genre']).mean().sort_values(by='genre')
genre_averages_df.reset_index(drop=False, inplace=True)
genre_averages_df

Average values by genre


Unnamed: 0,genre,popularity,duplicated_x,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,duplicated_y
0,blues,51.0,0.0,0.105,0.435,217560.0,0.929,2e-06,9.0,0.015,-7.7,1.0,0.235,175.448,4.0,0.65,0.0
1,classical,18.0,0.0,0.9915,0.2505,387431.0,0.08245,0.9,5.5,0.09325,-24.403,0.5,0.0425,131.7815,4.0,0.1975,0.0
2,jazz,60.0,0.0,0.476,0.485,158014.0,0.433,0.0,7.0,0.0861,-10.503,1.0,0.0342,120.471,4.0,0.825,0.0


In [49]:
# export data
file_path = Path('genre-classification/data/tracks_1000+_with_duplicates.csv')
file_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(file_path)