In [1]:
import spotipy

In [2]:
from tqdm import tqdm

In [3]:
from pathlib import Path

In [24]:
# The project root is the parent of the current working directory; all data
# files live in its "data" sub-folder.
project_dir = Path.cwd().parent
data_dir = project_dir / "data"

In [5]:
from spotipy.oauth2 import SpotifyOAuth
from dotenv import load_dotenv, find_dotenv
import os
import math
from itertools import chain
import time
# Load environment variables from the .env file located by find_dotenv().
# NOTE(review): presumably this supplies the Spotify client credentials that
# SpotifyOAuth reads from the environment - confirm against the .env file.
load_dotenv(find_dotenv())
# OAuth scopes requested for this session, as a single space-separated string.
# The scopes must NOT be pre-encoded with "%20": the client library URL-encodes
# the value itself, and it compares scopes by splitting on whitespace, so a
# "%20"-joined string is treated as one unknown scope name.
scope = " ".join([
    "user-top-read",
    "user-read-currently-playing",
    "user-read-playback-state",
    "playlist-read-collaborative",
    "playlist-read-private",
    "user-library-read",
    "user-read-recently-played",
    "user-follow-read",
])

In [6]:
import flatdict
import pandas as pd
# Register tqdm's progress-bar integration with pandas.
# NOTE(review): no progress_apply call appears in the visible cells, so this
# may be unused - confirm before removing.
tqdm.pandas()

  from pandas import Panel


In [7]:
def chunk(data, n):
    """Split a sliceable sequence into consecutive pieces of at most ``n`` items.

    Args:
        data: Any sequence supporting ``len()`` and slicing (list, str, tuple...).
        n: Maximum size of each piece; must be a positive integer.

    Returns:
        A list of slices of ``data`` in original order. Every piece has ``n``
        items except possibly the last. Empty input yields an empty list.

    Raises:
        ValueError: If ``n`` is zero or negative. A negative step would
            otherwise produce an empty result and silently drop all the data.
    """
    if n <= 0:
        raise ValueError(f"chunk size must be a positive integer, got {n!r}")
    return [data[start:start + n] for start in range(0, len(data), n)]

In [8]:
# Build the Spotify client used by every request below; authorization goes
# through the OAuth flow for the scopes requested in `scope`.
auth_manager = SpotifyOAuth(scope=scope)
sp = spotipy.Spotify(auth_manager=auth_manager)

In [9]:
def get_saved_tracks(page):
    """Return the items of one 50-track page of the current user's saved tracks.

    Args:
        page: Zero-based page index; converted to a track offset of ``page * 50``.

    Returns:
        The ``'items'`` entry of the API response for that page.
    """
    page_size = 50
    response = sp.current_user_saved_tracks(limit=page_size, offset=page * page_size)
    return response['items']

In [10]:
def get_track_features(track_ids):
    """Fetch Spotify audio features for the given track ids.

    A single request is limited to 100 ids. Inputs within that limit are sent
    as one request, exactly as before. Longer inputs are split into batches of
    100 and the per-batch results are concatenated, instead of printing a
    message and returning ``None`` (which made callers fail later with an
    unrelated ``TypeError``).

    Args:
        track_ids: Sequence of Spotify track ids.

    Returns:
        The audio-feature entries returned by the API, in request order.
        Entries may be falsy; the notebook filters those out downstream.
    """
    max_ids_per_request = 100
    if len(track_ids) <= max_ids_per_request:
        return sp.audio_features(track_ids)

    features = []
    for start in range(0, len(track_ids), max_ids_per_request):
        batch = track_ids[start:start + max_ids_per_request]
        # Guard against an empty/None response so one bad batch cannot break
        # the concatenation.
        features.extend(sp.audio_features(batch) or [])
    return features

In [11]:
# NOTE(review): all_results is not used in the visible cells; kept so that any
# code relying on the name still finds it.
all_results = []
# The first page carries both the library size ('total') and the first 50
# tracks, so reuse it instead of requesting page 0 a second time.
results = sp.current_user_saved_tracks(limit=50)
total_results = results['total']
total_pages = math.ceil(total_results/50)
remaining_pages = [get_saved_tracks(page) for page in tqdm(range(1, total_pages))]
all_tracks = list(chain.from_iterable([results['items'], *remaining_pages]))

Couldn't read cache at: .cache


Enter the URL you were redirected to:  https://spot-yir.herokuapp.com/?code=[REDACTED]


100%|██████████████████████████████████████████████████████████████████████████████████| 46/46 [00:05<00:00,  8.06it/s]


In [12]:
# Each saved-track item nests the track object under 'track'; collect its id.
all_track_ids = [
    saved_item['track']['id']
    for saved_item in all_tracks
]

In [13]:
# Split the ids into batches of 100, the largest batch get_track_features
# accepts in a single call.
chunked_track_ids = chunk(all_track_ids, 100)

In [14]:
# Fetch audio features batch by batch, then flatten the batches into a single
# list, skipping falsy entries (tracks for which no features came back).
feature_batches = [get_track_features(id_batch) for id_batch in tqdm(chunked_track_ids)]
all_audio_features = [
    features
    for features in chain.from_iterable(feature_batches)
    if features
]

100%|██████████████████████████████████████████████████████████████████████████████████| 23/23 [00:04<00:00,  4.85it/s]


In [15]:
# Collapse every nested saved-track record into a single-level dict whose keys
# are ':'-joined paths (e.g. 'track:album:artists:0:id'), ready for a DataFrame.
flattened_tracks = [
    dict(flatdict.FlatterDict(saved_item))
    for saved_item in tqdm(all_tracks)
]

100%|████████████████████████████████████████████████████████████████████████████| 2281/2281 [00:01<00:00, 1386.41it/s]


In [16]:
tracks_df = pd.DataFrame(flattened_tracks)
# Columns whose name contains "market" are dropped outright.
market_cols = [name for name in tracks_df.columns if "market" in name]
# Keep only columns that are non-null in at least 60% of the rows
# (i.e. drop columns that are more than ~40% null).
min_non_null = int(0.6 * len(tracks_df))
subset_tracks_df = (
    tracks_df
    .drop(columns=market_cols)
    .dropna(axis="columns", thresh=min_non_null)
)

In [17]:
# Preview one row to sanity-check the flattened, pruned track columns.
subset_tracks_df.head(1)

Unnamed: 0,added_at,track:album:album_type,track:album:artists:0:external_urls:spotify,track:album:artists:0:href,track:album:artists:0:id,track:album:artists:0:name,track:album:artists:0:type,track:album:artists:0:uri,track:album:external_urls:spotify,track:album:href,...,track:external_urls:spotify,track:href,track:id,track:is_local,track:name,track:popularity,track:preview_url,track:track_number,track:type,track:uri
0,2020-12-21T06:12:47Z,single,https://open.spotify.com/artist/1Z4kMiUwBM1dko...,https://api.spotify.com/v1/artists/1Z4kMiUwBM1...,1Z4kMiUwBM1dkoO6TyXXhn,Savon,artist,spotify:artist:1Z4kMiUwBM1dkoO6TyXXhn,https://open.spotify.com/album/1hUyVK6jMsMeCSL...,https://api.spotify.com/v1/albums/1hUyVK6jMsMe...,...,https://open.spotify.com/track/52GpvIHeTyRAQGR...,https://api.spotify.com/v1/tracks/52GpvIHeTyRA...,52GpvIHeTyRAQGR7SskbzU,False,Forecast,12,https://p.scdn.co/mp3-preview/4c76430958f5a1f9...,3,track,spotify:track:52GpvIHeTyRAQGR7SskbzU


In [18]:
# One row per audio-features record; the 'id' column is the track id used for
# the merge with the track table below.
track_features_df = pd.DataFrame(all_audio_features)

In [19]:
# Preview one row to confirm the audio-feature columns and the 'id' join key.
track_features_df.head(1)

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature
0,0.78,0.703,11,-6.651,1,0.147,0.57,0.0172,0.206,0.386,130.009,audio_features,52GpvIHeTyRAQGR7SskbzU,spotify:track:52GpvIHeTyRAQGR7SskbzU,https://api.spotify.com/v1/tracks/52GpvIHeTyRA...,https://api.spotify.com/v1/audio-analysis/52Gp...,256620,4


In [20]:
# Attach audio features to each saved track by track id. This is an inner join
# (the merge default), so tracks without a features row are dropped.
tracks_and_features_df = subset_tracks_df.merge(
    track_features_df,
    left_on="track:id",
    right_on="id",
)

In [21]:
# Preview one merged row: track metadata followed by its audio features.
tracks_and_features_df.head(1)

Unnamed: 0,added_at,track:album:album_type,track:album:artists:0:external_urls:spotify,track:album:artists:0:href,track:album:artists:0:id,track:album:artists:0:name,track:album:artists:0:type,track:album:artists:0:uri,track:album:external_urls:spotify,track:album:href,...,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature
0,2020-12-21T06:12:47Z,single,https://open.spotify.com/artist/1Z4kMiUwBM1dko...,https://api.spotify.com/v1/artists/1Z4kMiUwBM1...,1Z4kMiUwBM1dkoO6TyXXhn,Savon,artist,spotify:artist:1Z4kMiUwBM1dkoO6TyXXhn,https://open.spotify.com/album/1hUyVK6jMsMeCSL...,https://api.spotify.com/v1/albums/1hUyVK6jMsMe...,...,0.206,0.386,130.009,audio_features,52GpvIHeTyRAQGR7SskbzU,spotify:track:52GpvIHeTyRAQGR7SskbzU,https://api.spotify.com/v1/tracks/52GpvIHeTyRA...,https://api.spotify.com/v1/audio-analysis/52Gp...,256620,4


In [25]:
spotify_path = Path(data_dir, "SpotifyLiked.csv")
# to_csv does not create missing folders, so make sure the data directory
# exists before writing (a no-op when it is already there).
spotify_path.parent.mkdir(parents=True, exist_ok=True)
# Write the merged table without the positional index column.
tracks_and_features_df.to_csv(spotify_path, index=False)

In [None]:
# TODO: Also include songs from playlists that aren't in the liked songs?