In [1]:
import spotipy

In [2]:
from tqdm import tqdm

In [3]:
from pathlib import Path

In [4]:
# The notebook is expected to run from a subfolder of the project, so the
# project root is the parent of the current working directory.
project_dir = Path.cwd().parent
# Folder under the project root used for data files.
data_dir = project_dir / "data"

In [5]:
from spotipy.oauth2 import SpotifyOAuth
from dotenv import load_dotenv, find_dotenv
import os
import math
from itertools import chain
import time
# Locate a .env file and load its variables into the process environment.
# NOTE(review): presumably this supplies the Spotify client credentials that
# SpotifyOAuth reads from the environment — confirm against the .env file.
load_dotenv(find_dotenv())
# OAuth scopes requested from Spotify, joined with a URL-encoded space ("%20").
# NOTE(review): spotipy documents `scope` as a list or a comma-separated
# string; confirm that the "%20"-joined form is accepted as intended.
scope = "%20".join([
    "user-top-read",
    "user-read-currently-playing",
    "user-read-playback-state",
    "playlist-read-collaborative",
    "playlist-read-private",
    "user-library-read",
    "user-read-recently-played",
    "user-follow-read",
])

In [6]:
import flatdict
import pandas as pd
# Register tqdm's progress-aware pandas helpers (e.g. `progress_apply`).
# NOTE(review): no `progress_*` call is visible in this notebook — confirm
# whether this registration is still needed.
tqdm.pandas()

  from pandas import Panel


In [7]:
def chunk(data, n):
    """Split ``data`` into consecutive slices of at most ``n`` items.

    Args:
        data: A sliceable object with a length (e.g. a list or string).
        n: Maximum number of items per slice; must be a positive integer.

    Returns:
        A list of slices of ``data`` in their original order. The last slice
        may be shorter than ``n``; empty ``data`` yields an empty list.

    Raises:
        ValueError: If ``n`` is less than 1. Without this check a negative
            ``n`` would silently return ``[]`` and drop every item.
    """
    if n < 1:
        raise ValueError(f"chunk size must be a positive integer, got {n!r}")
    return [data[start:start + n] for start in range(0, len(data), n)]

In [8]:
sp = spotipy.Spotify(auth_manager=SpotifyOAuth(scope=scope))

In [9]:
def get_saved_tracks(page, page_size=50):
    """Fetch one page of the current user's saved tracks.

    Args:
        page: Zero-based page index; the request offset is ``page * page_size``.
        page_size: Number of items requested per page. Defaults to 50, the
            value previously hard-coded for both ``limit`` and ``offset``.

    Returns:
        The ``'items'`` list from the ``current_user_saved_tracks`` response.
    """
    # Force 1 second sleep before every request to space out the API calls.
    time.sleep(1)
    response = sp.current_user_saved_tracks(limit=page_size, offset=page * page_size)
    return response['items']

In [10]:
all_results = []
# Probe request: only the reported library size ('total') is read from it.
results = sp.current_user_saved_tracks(limit=50)
total_results = results['total']
# Saved tracks are fetched 50 per page, so round up to cover the last partial page.
total_pages = math.ceil(total_results / 50)
# Walk every page in order and flatten the per-page item lists into one list.
all_tracks = [
    saved_item
    for page in tqdm(range(total_pages))
    for saved_item in get_saved_tracks(page)
]

100%|██████████| 48/48 [00:53<00:00,  1.12s/it]


In [11]:
# Collect the Spotify ID nested under each saved item's 'track' entry.
all_track_ids = [saved_item['track']['id'] for saved_item in all_tracks]
# Display how many IDs were collected.
len(all_track_ids)

2371

In [12]:
# Batch the IDs in groups of 100, matching the batch-size cap checked in
# get_track_features.
chunked_track_ids = chunk(all_track_ids, 100)

In [13]:
def get_track_features(track_ids):
    """Fetch audio features for one batch of track IDs.

    Args:
        track_ids: Sequence of at most 100 Spotify track IDs.

    Returns:
        Whatever ``sp.audio_features`` returns for the batch.

    Raises:
        ValueError: If more than 100 IDs are passed. Previously this case
            printed a message and returned ``None``, which made the caller's
            flattening step fail later with an unrelated ``TypeError``.
    """
    # Validate before sleeping so an invalid batch fails immediately.
    if len(track_ids) > 100:
        raise ValueError(
            f"Too many tracks: got {len(track_ids)}, at most 100 per request"
        )
    # Force 2 second sleep before every request to space out the API calls.
    time.sleep(2)
    return sp.audio_features(track_ids)

In [14]:
# Request features batch by batch, flatten the batches into one list, and
# drop falsy entries (e.g. None placeholders) along the way.
all_audio_features = [
    features
    for chunked_tracks in tqdm(chunked_track_ids)
    for features in get_track_features(chunked_tracks)
    if features
]

100%|██████████| 24/24 [00:54<00:00,  2.28s/it]


In [15]:
# Flatten each nested saved-track item into a single-level dict. The renames
# further down rely on the resulting ":"-joined keys with list positions as
# path segments (e.g. "track:album:artists:0:id").
flattened_tracks = [dict(flatdict.FlatterDict(track)) for track in tqdm(all_tracks)]

100%|██████████| 2371/2371 [00:02<00:00, 936.94it/s] 


In [16]:
# One row per saved track, one column per flattened key.
tracks_df = pd.DataFrame(flattened_tracks)
# Column names containing "market". NOTE(review): only referenced by the
# commented-out drop below in the visible notebook — confirm it is still needed.
market_cols = [col for col in tracks_df.columns if "market" in col]

In [17]:
#Drop columns with more than 60% nulls and market cols
#subset_tracks_df = tracks_df.drop(market_cols, axis=1).dropna(axis=1, thresh=int(0.6*len(tracks_df)))

In [120]:
# Maps flattened saved-track keys (":"-joined paths) to the short column names
# used downstream. Only the first entry of each list (index 0) is mapped, i.e.
# the first album artist, first track artist and first album image.
# The values of this dict also define which columns are kept later on.
track_col_renames = {
    # --- album ---
    "track:album:album_type" : "album_type",
    "track:album:artists:0:external_urls:spotify": "album_artist_spurl",
    "track:album:artists:0:id": "album_artist_spid",
    "track:album:artists:0:name": "album_artist_name",
    "track:album:artists:0:type": "album_artist_type",
    "track:album:external_urls:spotify": "album_spurl",
    "track:album:id": "album_spid",
    "track:album:images:0:url": "album_img_url",
    "track:album:name": "album_name",
    "track:album:release_date": "album_release_date",
    "track:album:total_tracks": "album_tracks_count",
    "track:album:type": "album_track_type",
    # --- first track artist ---
    "track:artists:0:external_urls:spotify": "artist_spurl",
    "track:artists:0:id": "artist_spid",
    "track:artists:0:name": "artist_name",
    "track:artists:0:type": "artist_type",
    # --- track ---
    # Deliberately left unmapped. NOTE(review): presumably because the
    # audio-features frame already provides a `track_duration_ms` column after
    # its "track_" prefix is applied, which would collide on merge — confirm.
#    "track:duration_ms": "track_duration_ms",
    "track:explicit": "track_explicit",
    "track:external_ids:isrc": "track_isrc",
    "track:external_urls:spotify": "track_spurl",
    "track:id": "track_spid",
    "track:is_local": "track_is_local",
    "track:name": "track_name",
    "track:popularity": "track_popularity",
    "track:preview_url": "track_preview_url",
    "track:track_number": "track_number",
    "track:type": "track_type"
}

In [121]:
# Columns to keep: the save timestamp plus every renamed track column,
# in the order the rename mapping declares them.
des_tracks_cols = ["added_at", *track_col_renames.values()]

In [122]:
# Apply the short column names, then keep only the desired columns
# (a missing column raises KeyError here).
subset_tracks_df = (
    tracks_df
    .rename(columns=track_col_renames)
    .loc[:, des_tracks_cols]
)

In [123]:
# Build the audio-features frame: drop the link/type columns, rename the
# feature "id" to "spid", then prefix every column with "track_" so the key
# becomes "track_spid", matching the tracks frame.
track_features_df = (
    pd.DataFrame(all_audio_features)
    .drop(columns=["uri", "track_href", "analysis_url", "type"])
    .rename(columns={"id": "spid"})
    .add_prefix("track_")
)

In [125]:
# Inner join (the pd.merge default) on the shared track ID: tracks without a
# matching audio-features row are dropped from the result.
tracks_and_features_df = subset_tracks_df.merge(
    track_features_df,
    how="inner",
    on="track_spid",
)

In [126]:
# Preview the first merged row as a quick sanity check.
tracks_and_features_df.head(1)

Unnamed: 0,added_at,album_type,album_artist_spurl,album_artist_spid,album_artist_name,album_artist_type,album_spurl,album_spid,album_img_url,album_name,...,track_loudness,track_mode,track_speechiness,track_acousticness,track_instrumentalness,track_liveness,track_valence,track_tempo,track_duration_ms,track_time_signature
0,2021-01-19T03:58:25Z,album,https://open.spotify.com/artist/5snNHNlYT2UrtZ...,5snNHNlYT2UrtZo5HCJkiw,Epik High,artist,https://open.spotify.com/album/5U1iQJeJurJANhd...,5U1iQJeJurJANhdos9GugT,https://i.scdn.co/image/ab67616d0000b27321f706...,Epik High Is Here 上 (Part 1),...,-7.729,0,0.0777,0.302,0.0,0.0798,0.736,76.712,214642,4


In [72]:
# List the merged frame's column names.
# NOTE(review): the recorded output for this cell shows unprefixed feature
# columns (e.g. 'danceability'), so it appears to predate the "track_" prefix
# step — re-run to refresh.
tracks_and_features_df.columns

Index(['added_at', 'album_type', 'album_artist_spurl', 'album_artist_spid',
       'album_artist_name', 'album_artist_type', 'album_spurl', 'album_spid',
       'album_img_url', 'album_name', 'album_release_date',
       'album_tracks_count', 'album_track_type', 'artist_spurl', 'artist_spid',
       'artist_name', 'artist_type', 'track_duration_ms', 'track_explicit',
       'track_isrc', 'track_spurl', 'track_spid', 'track_is_local',
       'track_name', 'track_popularity', 'track_preview_url', 'track_number',
       'track_type', 'danceability', 'energy', 'key', 'loudness', 'mode',
       'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo', 'duration_ms', 'time_signature'],
      dtype='object')

In [75]:
#artist_cols = [col for col in tracks_and_features_df.columns if "artist" in col and "spid" in col]

In [76]:
#artist_cols

['album_artist_spid', 'artist_spid']

In [78]:
# all_artist_ids = set() 
# for col in artist_cols:
#     all_artist_ids.update(tracks_and_features_df[col].tolist())
# all_artist_ids = list(all_artist_ids)
# len(all_artist_ids)

1072

In [132]:
# Distinct IDs of each track's first listed artist, in order of first
# appearance (album-artist IDs are not included).
all_artist_ids = (
    tracks_and_features_df["artist_spid"]
    .drop_duplicates()
    .tolist()
)

In [133]:
# Display the number of distinct artist IDs to look up.
len(all_artist_ids)

1061

In [134]:
def get_artist_features(artist_ids):
    """Fetch artist details for one batch of artist IDs.

    Args:
        artist_ids: Sequence of at most 50 Spotify artist IDs.

    Returns:
        Whatever ``sp.artists`` returns for the batch; the caller reads its
        ``'artists'`` entry.

    Raises:
        ValueError: If more than 50 IDs are passed. Previously this case
            printed "Too many tracks" (wrong entity) and returned ``None``,
            which made the caller's ``['artists']`` lookup fail with a
            ``TypeError``.
    """
    # Validate before sleeping so an invalid batch fails immediately.
    if len(artist_ids) > 50:
        raise ValueError(
            f"Too many artists: got {len(artist_ids)}, at most 50 per request"
        )
    # Force 2 second sleep before every request to space out the API calls.
    time.sleep(2)
    return sp.artists(artist_ids)

In [135]:
# Batch the IDs in groups of 50, matching the batch-size cap checked in
# get_artist_features.
chunked_artist_ids = chunk(all_artist_ids, 50)

In [136]:
# One API call per batch, in order; keep only each response's 'artists' list.
intermediate_list = [
    artist_batch['artists']
    for artist_batch in map(get_artist_features, tqdm(chunked_artist_ids))
]

100%|██████████| 22/22 [00:45<00:00,  2.08s/it]


In [137]:
# Flatten the per-batch artist lists into one list, skipping falsy entries.
all_artist_features = [
    artist
    for artist_batch in intermediate_list
    for artist in artist_batch
    if artist
]

In [138]:
# Display how many artist records came back; compare with len(all_artist_ids).
len(all_artist_features)

1061

In [153]:
def flatten_artist_features(artist_features):
    """Reduce one artist record to a flat dict of selected fields.

    Args:
        artist_features: Dict describing one artist. The keys read are
            ``followers`` (dict with ``total``), ``genres``, ``id``,
            ``images`` (list of dicts with ``url``) and ``popularity``.
            Any of them may be missing; ``followers`` and ``images`` may
            also be present with a ``None`` value.

    Returns:
        A dict with the keys ``artist_follower_total``, ``artist_genres``,
        ``artist_spid``, ``artist_img_url`` (URL of the first image, or
        ``None`` when there is none) and ``artist_popularity``.
    """
    # `or` guards against keys that are present but None; a plain .get default
    # only covers the missing-key case and would crash on None.
    followers = artist_features.get("followers") or {}
    images = artist_features.get("images") or []

    # Only the first image is kept.
    artist_img_url = images[0].get("url") if images else None

    return {
        "artist_follower_total": followers.get("total"),
        "artist_genres": artist_features.get("genres", []),
        "artist_spid": artist_features.get("id"),
        "artist_img_url": artist_img_url,
        "artist_popularity": artist_features.get("popularity"),
    }

In [155]:
# Flatten every artist record into the reduced field set.
flattened_artist_features = list(map(flatten_artist_features, all_artist_features))

In [156]:
# One row per artist; columns are the keys produced by flatten_artist_features.
# NOTE(review): this frame is not merged into tracks_and_features_df anywhere
# in the visible notebook — confirm whether a join on "artist_spid" is intended.
artist_features_df = pd.DataFrame(flattened_artist_features)

In [157]:
# Preview the first artist row as a quick sanity check.
artist_features_df.head(1)

Unnamed: 0,artist_follower_total,artist_genres,artist_spid,artist_img_url,artist_popularity
0,418931,"[k-indie, k-pop, k-rap, korean pop]",5snNHNlYT2UrtZo5HCJkiw,https://i.scdn.co/image/853b1c7e60b3231eafe65f...,59


In [22]:
# Destination CSV for the exported dataset, inside the data folder.
spotify_path = data_dir / "SpotifyLikedFeaturesAndArtists.csv"
# Export is currently disabled.
# NOTE(review): the original commented-out line was garbled
# ("FeaturesAndArtiststracks_and_features_df..."); presumably the call below
# was meant — confirm which frame should be written before re-enabling.
#tracks_and_features_df.to_csv(spotify_path, index=False)

In [23]:
#TODO: Also include songs in playlists that aren't in liked songs?