Using the spotify API, we will attempt to gather the same datapoints on additional songs, primarily from earlier years

In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

auth_manager = SpotifyClientCredentials(client_id='aced1b25cbb44328924111ae0db5ea05', client_secret='1bc0d6a0d6fc4c70ad35653a4e3a748e')
sp = spotipy.Spotify(auth_manager=auth_manager)

In [148]:
data_extension = pd.DataFrame()
songs_per_year = 50
limit=50 # <=50
pages_per_year = max(int(songs_per_year/limit), 1)

for year in tqdm(range(1900, 2021)):
    response = sp.search(q=f'year:{year}', limit=limit, type='track')
    
    collector = []
    
    for page in (range(pages_per_year)):
        tracks = response['tracks']
        for i, track in enumerate(tracks['items']):
            name = track['name']
            track_id = track['id']
        
            collector.append([year, name, track_id, track['duration_ms'], track['explicit']])
            
        if tracks['next']:
            tracks = sp.next(tracks)
        else:
            break
    data_extension = pd.concat([data_extension, pd.DataFrame(collector)])

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 121/121 [00:47<00:00,  2.53it/s]


In [149]:
data_extension.columns = ['year', 'name', 'id', 'duration_ms', 'explicit']                
data_extension

Unnamed: 0,year,name,id,duration_ms,explicit
0,1900,Sledgehammer,3wLZ69kr5J2sb934Kpv02c,295653,False
1,1900,I Was Drunk,1SbSX9AWecOf7v0CFTHZ4Q,243840,False
2,1900,Mighty Mighty Man,5tBDBrsPypLVJ9Rbpy3MNm,143230,False
3,1900,Mercy Street,20tRwxVyguyMwijkNtH8oP,283373,False
4,1900,Mighty Mighty Man (Alternate Take),6Y35mTzWPIztc6FhLu4ZaN,147160,False
...,...,...,...,...,...
45,2020,Gravity - Remastered 2020,4zN1YCljFntHRccaX35RNg,244448,False
46,2020,1 Scale (feat. G Herbo),5gmkw2031VVIboYqPIjSWB,176040,True
47,2020,Year 2020,03VrhBjoCp0rSjajdhNE5N,329142,False
48,2020,Godzilla (feat. Juice WRLD),7FIWs0pqAYbP91WWM0vlTQ,210800,True


Here we sort out all songs that are already in the originial dataset
    

In [150]:
orig_data = pd.read_csv('../data/tracks_features.csv')
orig_data = orig_data[orig_data.year != 0].reset_index(drop=True)

In [156]:
clean_data_extension = data_extension[~data_extension.id.isin(orig_data.id)]
clean_data_extension = clean_data_extension.drop_duplicates()
clean_data_extension

Unnamed: 0,year,name,id,duration_ms,explicit
0,1900,Sledgehammer,3wLZ69kr5J2sb934Kpv02c,295653,False
1,1900,I Was Drunk,1SbSX9AWecOf7v0CFTHZ4Q,243840,False
2,1900,Mighty Mighty Man,5tBDBrsPypLVJ9Rbpy3MNm,143230,False
3,1900,Mercy Street,20tRwxVyguyMwijkNtH8oP,283373,False
4,1900,Mighty Mighty Man (Alternate Take),6Y35mTzWPIztc6FhLu4ZaN,147160,False
...,...,...,...,...,...
44,2020,Laugh Now Cry Later (feat. Lil Durk),2SAqBLGA283SUiwJ3xOUVI,261492,True
45,2020,Gravity - Remastered 2020,4zN1YCljFntHRccaX35RNg,244448,False
47,2020,Year 2020,03VrhBjoCp0rSjajdhNE5N,329142,False
48,2020,Godzilla (feat. Juice WRLD),7FIWs0pqAYbP91WWM0vlTQ,210800,True


In [157]:
# this cell makes an effort of marking audiobooks in the data
chapter_lang = ['chapter', 'capítulo', 'kapitel', 'глава']
clean_data_extension['book'] = [np.any([chapter.lower() in name.lower() for chapter in chapter_lang]) for name in clean_data_extension.name]

In [158]:
clean_data_extension = clean_data_extension[clean_data_extension.book != True]

In [159]:
clean_data_extension

Unnamed: 0,year,name,id,duration_ms,explicit,book
0,1900,Sledgehammer,3wLZ69kr5J2sb934Kpv02c,295653,False,False
1,1900,I Was Drunk,1SbSX9AWecOf7v0CFTHZ4Q,243840,False,False
2,1900,Mighty Mighty Man,5tBDBrsPypLVJ9Rbpy3MNm,143230,False,False
3,1900,Mercy Street,20tRwxVyguyMwijkNtH8oP,283373,False,False
4,1900,Mighty Mighty Man (Alternate Take),6Y35mTzWPIztc6FhLu4ZaN,147160,False,False
...,...,...,...,...,...,...
44,2020,Laugh Now Cry Later (feat. Lil Durk),2SAqBLGA283SUiwJ3xOUVI,261492,True,False
45,2020,Gravity - Remastered 2020,4zN1YCljFntHRccaX35RNg,244448,False,False
47,2020,Year 2020,03VrhBjoCp0rSjajdhNE5N,329142,False,False
48,2020,Godzilla (feat. Juice WRLD),7FIWs0pqAYbP91WWM0vlTQ,210800,True,False


Now it is time to get the audio features for all of these songs

In [217]:
no_of_songs = clean_data_extension.shape[0]
limit_ = 100
no_of_requests = int(clean_data_extension.shape[0] / limit_) +1

audio_features = pd.DataFrame()

for i in tqdm(range(1)):
    lower_limit = i * limit_
    upper_limit = min((i+1)*limit_,no_of_songs)
    ids = clean_data_extension.id[lower_limit:upper_limit]
    response = sp.audio_features(tracks=ids)
    
    print(response[25])
    #short_df = pd.DataFrame(response[49:50])


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  6.84it/s]

{'danceability': 0.889, 'energy': 0.563, 'key': 9, 'loudness': -8.954, 'mode': 1, 'speechiness': 0.051, 'acousticness': 0.218, 'instrumentalness': 0, 'liveness': 0.0735, 'valence': 0.795, 'tempo': 98.99, 'type': 'audio_features', 'id': '3rrAvv30aPruW6oHhsRtBQ', 'uri': 'spotify:track:3rrAvv30aPruW6oHhsRtBQ', 'track_href': 'https://api.spotify.com/v1/tracks/3rrAvv30aPruW6oHhsRtBQ', 'analysis_url': 'https://api.spotify.com/v1/audio-analysis/3rrAvv30aPruW6oHhsRtBQ', 'duration_ms': 176600, 'time_signature': 4}



