In [1]:
import spotipy

In [2]:
from tqdm import tqdm

In [3]:
from pathlib import Path

In [4]:
# The project root is one level above the notebook's working directory;
# exported data lives in its "data" sub-folder.
project_dir = Path.cwd().parent
data_dir = project_dir / "data"

In [5]:
from spotipy.oauth2 import SpotifyOAuth
from dotenv import load_dotenv, find_dotenv
import os
import math
from itertools import chain
import time
# Load environment variables from the .env file located by find_dotenv()
# (presumably where the Spotify API credentials are kept — confirm).
load_dotenv(find_dotenv())

True

In [6]:
import flatdict
import pandas as pd
# Register tqdm's pandas integration so progress-bar variants of pandas
# operations (e.g. progress_apply) are available on frames and series.
tqdm.pandas()

In [7]:
def chunk(data, n):
    """Split ``data`` into consecutive slices holding at most ``n`` items each.

    Slices keep the original order; the last one is shorter when
    ``len(data)`` is not a multiple of ``n``. An empty input yields ``[]``.
    """
    pieces = []
    for start in range(0, len(data), n):
        stop = start + n
        pieces.append(data[start:stop])
    return pieces

In [8]:
# OAuth scopes requested for this session, written as a single string with the
# individual scope names separated by a literal "%20".
# NOTE(review): spotipy documents `scope` as a list or a comma/space separated
# string — confirm that the "%20" separators are interpreted as intended.
scope = "user-top-read%20user-read-currently-playing%20user-read-playback-state%20playlist-read-collaborative%20playlist-read-private%20user-library-read%20user-read-recently-played%20user-follow-read"
# Spotify client used by every cell below. Only the scope is passed to
# SpotifyOAuth, so the client credentials presumably come from the environment
# — confirm.
sp = spotipy.Spotify(auth_manager=SpotifyOAuth(scope=scope))

# Get Liked Songs

In [9]:
def get_saved_track_page_count(page_size=50):
    """Return how many pages are needed to fetch every saved ("liked") track.

    Args:
        page_size: Number of tracks that will be requested per page when the
            library is downloaded. Defaults to 50, the page size used by the
            rest of this notebook.

    Returns:
        int: ``ceil(total_saved_tracks / page_size)``; 0 for an empty library.

    Raises:
        ValueError: If ``page_size`` is smaller than 1.
    """
    if page_size < 1:
        raise ValueError(f"page_size must be at least 1, got {page_size}")
    # Only the 'total' field of the response is used, so ask for a single
    # item instead of downloading a full page that is then thrown away.
    first_page_saved_tracks = sp.current_user_saved_tracks(limit=1)
    count_saved_songs = first_page_saved_tracks['total']
    return math.ceil(count_saved_songs / page_size)

In [10]:
def get_saved_tracks(page_num, page_size=50):
    """Fetch one page of the current user's saved ("liked") tracks.

    Args:
        page_num: Zero-based page index; the request offset is
            ``page_num * page_size``.
        page_size: Number of tracks requested per page. Defaults to 50, the
            page size used by the rest of this notebook.

    Returns:
        The ``'items'`` list of the API response for that page.
    """
    # Short pause before every request to space out successive API calls.
    time.sleep(0.25)
    saved_tracks_page = sp.current_user_saved_tracks(
        limit=page_size,
        offset=page_num * page_size,
    )
    return saved_tracks_page['items']

In [11]:
# Download the Liked Songs library page by page and collect every item
# into a single flat list.
total_pages_saved_songs = get_saved_track_page_count()
liked_tracks = []
for page_num in tqdm(list(range(total_pages_saved_songs))):
    liked_tracks.extend(get_saved_tracks(page_num))

100%|██████████| 55/55 [00:22<00:00,  2.49it/s]


In [12]:
# Flatten each nested saved-track payload into a plain single-level dict.
flattened_liked_tracks = list(
    map(lambda track: dict(flatdict.FlatterDict(track)), liked_tracks)
)

In [13]:
# One row per saved track, one column per flattened key.
full_liked_tracks_df = pd.DataFrame(flattened_liked_tracks)

In [14]:
# Maps the flattened, ":"-delimited keys of a saved-track payload to the short
# column names used in the rest of the notebook. Only the first album artist,
# first track artist and first album image (index 0) are kept.
track_col_renames = {
    "track:album:album_type" : "album_type",
    "track:album:artists:0:external_urls:spotify": "album_artist_spurl",
    "track:album:artists:0:id": "album_artist_spid",
    "track:album:artists:0:name": "album_artist_name",
    "track:album:artists:0:type": "album_artist_type",
    "track:album:external_urls:spotify": "album_spurl",
    "track:album:id": "album_spid",
    "track:album:images:0:url": "album_img_url",
    "track:album:name": "album_name",
    "track:album:release_date": "album_release_date",
    "track:album:total_tracks": "album_tracks_count",
    "track:album:type": "album_track_type",
    "track:artists:0:external_urls:spotify": "artist_spurl",
    "track:artists:0:id": "artist_spid",
    "track:artists:0:name": "artist_name",
    "track:artists:0:type": "artist_type",
    # Disabled: presumably because the audio-features frame already supplies
    # a "track_duration_ms" column, which would collide on merge — confirm.
#    "track:duration_ms": "track_duration_ms",
    "track:explicit": "track_explicit",
    "track:external_ids:isrc": "track_isrc",
    "track:external_urls:spotify": "track_spurl",
    "track:id": "track_spid",
    "track:is_local": "track_is_local",
    "track:name": "track_name",
    "track:popularity": "track_popularity",
    "track:preview_url": "track_preview_url",
    "track:track_number": "track_number",
    "track:type": "track_type"
}

In [15]:
# Columns to keep: the save timestamp followed by every renamed track column,
# in the order they appear in the rename mapping.
des_tracks_cols = ["added_at", *track_col_renames.values()]

In [16]:
# Apply the short column names, then keep only the desired columns.
liked_tracks_df = (
    full_liked_tracks_df
    .rename(columns=track_col_renames)
    [des_tracks_cols]
)

In [17]:
# Preview the first row to check the renamed columns.
liked_tracks_df.head(1)

Unnamed: 0,added_at,album_type,album_artist_spurl,album_artist_spid,album_artist_name,album_artist_type,album_spurl,album_spid,album_img_url,album_name,...,track_explicit,track_isrc,track_spurl,track_spid,track_is_local,track_name,track_popularity,track_preview_url,track_number,track_type
0,2021-03-07T06:31:46Z,single,https://open.spotify.com/artist/3TVXtAsR1Inumw...,3TVXtAsR1Inumwj472S9r4,Drake,artist,https://open.spotify.com/album/5LuoozUhs2pl3gl...,5LuoozUhs2pl3glZeAJl89,https://i.scdn.co/image/ab67616d0000b2738b20e4...,Scary Hours 2,...,True,USUG12101041,https://open.spotify.com/track/3aQem4jVGdhtg11...,3aQem4jVGdhtg116TmJnHz,False,What’s Next,79,https://p.scdn.co/mp3-preview/5385c2f0cc631996...,1,track


# Get Liked Songs Features

In [18]:
# Distinct Spotify ids of the liked tracks. Rows without an id (presumably
# local files, see "track_is_local" — confirm) are dropped because a null id
# cannot be looked up on the audio-features endpoint.
liked_track_ids = liked_tracks_df["track_spid"].dropna().unique().tolist()
len(liked_track_ids)

2737

In [19]:
# Batches of at most 100 ids, the largest batch get_track_features accepts.
chunked_liked_track_ids = chunk(liked_track_ids, 100)

In [20]:
def get_track_features(track_ids):
    """Fetch audio features for one batch of Spotify track ids.

    Args:
        track_ids: Sequence of at most 100 track ids.

    Returns:
        The result of ``sp.audio_features`` for the batch; callers iterate it
        and discard falsy entries. An empty batch returns ``[]`` without
        calling the API.

    Raises:
        ValueError: If more than 100 ids are supplied. Previously this case
            printed a message and returned ``None``, which only failed later
            with an unrelated ``TypeError`` when the results were combined.
    """
    # Validate before sleeping so a bad batch fails immediately.
    if len(track_ids) > 100:
        raise ValueError(
            f"Too many tracks: got {len(track_ids)}, at most 100 are allowed per request"
        )
    if len(track_ids) == 0:
        return []
    # Short pause before every request to space out successive API calls.
    time.sleep(0.25)
    return sp.audio_features(track_ids)

In [21]:
# Fetch the audio features batch by batch; each entry holds one batch's result.
chunked_liked_track_features = []
for chunked_tracks in tqdm(chunked_liked_track_ids):
    chunked_liked_track_features.append(get_track_features(chunked_tracks))

100%|██████████| 28/28 [00:13<00:00,  2.01it/s]


In [22]:
# Flatten the per-batch results into one list, skipping falsy entries.
liked_track_features = [
    features
    for batch in chunked_liked_track_features
    for features in batch
    if features
]

In [23]:
#Drop columns with more than 60% nulls and market cols
#subset_tracks_df = tracks_df.drop(market_cols, axis=1).dropna(axis=1, thresh=int(0.6*len(tracks_df)))

In [24]:
# Build the audio-features frame: drop the link/type columns, rename "id" to
# "spid", then prefix every column with "track_" so the key becomes "track_spid".
unused_feature_cols = ["uri", "track_href", "analysis_url", "type"]
liked_track_features_df = (
    pd.DataFrame(liked_track_features)
    .drop(columns=unused_feature_cols)
    .rename(columns={"id": "spid"})
    .add_prefix("track_")
)

In [25]:
# Preview the first row of the audio-features frame.
liked_track_features_df.head(1)

Unnamed: 0,track_danceability,track_energy,track_key,track_loudness,track_mode,track_speechiness,track_acousticness,track_instrumentalness,track_liveness,track_valence,track_tempo,track_spid,track_duration_ms,track_time_signature
0,0.781,0.594,0,-6.959,0,0.0485,0.0136,0.0,0.162,0.0628,129.895,3aQem4jVGdhtg116TmJnHz,178154,4


# Get Features of Liked Songs Artists

In [26]:
# Distinct Spotify ids of the primary artists of the liked tracks. Rows
# without an artist id are dropped because a null id cannot be looked up.
liked_artist_ids = liked_tracks_df["artist_spid"].dropna().unique().tolist()
len(liked_artist_ids)

1192

In [27]:
def get_artist_features(artist_ids):
    """Fetch artist metadata for one batch of Spotify artist ids.

    Args:
        artist_ids: Sequence of at most 50 artist ids.

    Returns:
        The result of ``sp.artists`` for the batch; callers read its
        ``'artists'`` entry. An empty batch returns ``{"artists": []}``
        without calling the API.

    Raises:
        ValueError: If more than 50 ids are supplied. Previously this case
            printed a message (which wrongly said "tracks") and returned
            ``None``, so the caller's ``['artists']`` lookup failed with a
            ``TypeError``.
    """
    # Validate before sleeping so a bad batch fails immediately.
    if len(artist_ids) > 50:
        raise ValueError(
            f"Too many artists: got {len(artist_ids)}, at most 50 are allowed per request"
        )
    if len(artist_ids) == 0:
        return {"artists": []}
    # Short pause before every request to space out successive API calls.
    time.sleep(0.25)
    return sp.artists(artist_ids)

In [28]:
# Batches of at most 50 ids, the largest batch get_artist_features accepts.
chunked_liked_artist_ids = chunk(liked_artist_ids, 50)

In [29]:
# Fetch artist metadata batch by batch, keeping only each response's 'artists' list.
chunked_liked_artist_features = []
for chunked_artists in tqdm(chunked_liked_artist_ids):
    artists_response = get_artist_features(chunked_artists)
    chunked_liked_artist_features.append(artists_response['artists'])

100%|██████████| 24/24 [00:08<00:00,  2.93it/s]


In [30]:
# Flatten the per-batch artist lists into one list, skipping falsy entries.
liked_artist_features = [
    artist
    for batch in chunked_liked_artist_features
    for artist in batch
    if artist
]

In [31]:
# Number of artists returned; compare with the number of ids requested above.
len(liked_artist_features)

1192

In [32]:
def flatten_artist_features(artist_features):
    """Reduce one artist payload to the flat set of fields used downstream.

    Missing keys and keys that are present with a null value are treated the
    same way, so a payload such as ``{"followers": None, "images": None}``
    no longer raises.

    Args:
        artist_features: Mapping describing a single artist. The keys read
            are ``followers`` (mapping with ``total``), ``genres``, ``id``,
            ``images`` (list of mappings with ``url``) and ``popularity``.

    Returns:
        dict with the keys ``artist_follower_total``, ``artist_genres``,
        ``artist_spid``, ``artist_img_url`` and ``artist_popularity``.
        Unavailable scalar values are ``None``; unavailable genres are ``[]``.
    """
    # `or` also covers keys that exist but hold None, which `.get(key, default)` does not.
    followers = artist_features.get("followers") or {}
    images = artist_features.get("images") or []

    # Only the first image is kept; guard against an empty list or a null entry.
    first_image = (images[0] or {}) if images else {}

    return {
        "artist_follower_total": followers.get("total"),
        "artist_genres": artist_features.get("genres") or [],
        "artist_spid": artist_features.get("id"),
        "artist_img_url": first_image.get("url"),
        "artist_popularity": artist_features.get("popularity"),
    }

In [33]:
# Reduce every artist payload to its flat record.
flattened_liked_artist_features = list(
    map(flatten_artist_features, liked_artist_features)
)

In [34]:
# One row per artist, one column per flattened field.
liked_artist_features_df = pd.DataFrame(flattened_liked_artist_features)

In [35]:
# Preview the first row of the artist frame.
liked_artist_features_df.head(1)

Unnamed: 0,artist_follower_total,artist_genres,artist_spid,artist_img_url,artist_popularity
0,53510664,"[canadian hip hop, canadian pop, hip hop, pop ...",3TVXtAsR1Inumwj472S9r4,https://i.scdn.co/image/60cfab40c6bb160a1906be...,99


# Join and Save Data

In [36]:
# Combine tracks, audio features and artist metadata, newest additions first.
# Both joins use merge's default (inner) behaviour, so tracks without a
# matching feature or artist row are left out.
liked_songs_info_df = (
    liked_tracks_df
    .merge(liked_track_features_df, on="track_spid")
    .merge(liked_artist_features_df, on="artist_spid")
    .sort_values(by="added_at", ascending=False)
)

In [39]:
# Tag every row with the same constant describing where the track came from.
liked_songs_info_df['interaction_style'] = "Liked Songs"

In [40]:
# Preview the first row of the combined frame.
liked_songs_info_df.head(1)

Unnamed: 0,added_at,album_type,album_artist_spurl,album_artist_spid,album_artist_name,album_artist_type,album_spurl,album_spid,album_img_url,album_name,...,track_liveness,track_valence,track_tempo,track_duration_ms,track_time_signature,artist_follower_total,artist_genres,artist_img_url,artist_popularity,interaction_style
0,2021-03-07T06:31:46Z,single,https://open.spotify.com/artist/3TVXtAsR1Inumw...,3TVXtAsR1Inumwj472S9r4,Drake,artist,https://open.spotify.com/album/5LuoozUhs2pl3gl...,5LuoozUhs2pl3glZeAJl89,https://i.scdn.co/image/ab67616d0000b2738b20e4...,Scary Hours 2,...,0.162,0.0628,129.895,178154,4,53510664,"[canadian hip hop, canadian pop, hip hop, pop ...",https://i.scdn.co/image/60cfab40c6bb160a1906be...,99,Liked Songs


In [40]:
# Write the combined frame to <data_dir>/<current user's Spotify id>.csv.
spotify_path = Path(data_dir, f"{sp.me()['id']}.csv")
# to_csv does not create missing folders, so make sure the target directory
# exists (a fresh checkout may not have a data folder yet).
spotify_path.parent.mkdir(parents=True, exist_ok=True)
liked_songs_info_df.to_csv(spotify_path, index=False)

In [38]:
#TODO: Also include songs in playlists that aren't in liked songs?