In [1]:
import sys
from pathlib import Path
sys.path.insert(0, str(Path.cwd().parent))

import pandas as pd
from local_utils.retrieval import get_table_data

In [2]:
# Load the distinct (main_artist, main_artist_id) pairs from temp_saved_songs.
table_name = "temp_saved_songs"
table_columns = ["main_artist", "main_artist_id"]
# Build the column list outside the f-string: reusing double quotes inside an
# f-string replacement field is a SyntaxError on Python versions before 3.12.
column_list = ", ".join(table_columns)
retrieval_query = f"SELECT DISTINCT {column_list} from {table_name}"

artists = get_table_data(table_name=table_name, table_columns=table_columns, query=retrieval_query)
df_artists = pd.DataFrame.from_records(artists)
print(df_artists.shape)
df_artists.head()

(1593, 2)


Unnamed: 0,main_artist,main_artist_id
0,damon r.,3LE39lgBs68N41XjxH5cLD
1,Title Fight,2CnhqfjUG0qzsru0SMuhrk
2,yunè pinku,2sY4BbYrbvNVgsNzo6HddD
3,Berhana,0WjtdWS6su0f3jrW9aqEHl
4,africakid,0T3txFkkvO5AI6skW9mHzY


In [3]:
# Load the distinct (featured_artists, featured_artists_ids) pairs from
# temp_saved_songs. As the preview below shows, each value is either a list
# (names / ids in matching order) or null when a song has no featured artist.
table_name = "temp_saved_songs"
table_columns = ["featured_artists", "featured_artists_ids"]
# Build the column list outside the f-string: reusing double quotes inside an
# f-string replacement field is a SyntaxError on Python versions before 3.12.
column_list = ", ".join(table_columns)
retrieval_query = f"SELECT DISTINCT {column_list} from {table_name}"

featured = get_table_data(table_name=table_name, table_columns=table_columns, query=retrieval_query)
df_featured = pd.DataFrame.from_records(featured)
print(df_featured.shape)
df_featured.head()

(875, 2)


Unnamed: 0,featured_artists,featured_artists_ids
0,"[Mechatok, SALEM]","[4poYOxVqlpDTelhhiJcSrW, 09CJcG6ndtL82D8x9VxaeT]"
1,,
2,[Charli xcx],[25uiPmTg16RbhZWAqwLBy5]
3,"[Kim Petras, Slayyyter]","[3Xt3RrJMFv5SZkCfUE8C1J, 4QM5QCHicznALtX885CnZC]"
4,[[bsd.u]],[3m1IaEwT7D9hFKOfpz5VHk]


In [4]:
# Turn each list-valued row into one row per (featured artist, id) pair by
# exploding both columns together, then rename the id column to
# "main_artist_id" so it matches the key column of df_artists.
# NOTE: this overwrites df_featured, so re-running this cell on its own fails
# (the "featured_artists_ids" column no longer exists after the rename);
# re-run the cell that loads df_featured first.
df_featured = (
    df_featured
    .explode(["featured_artists", "featured_artists_ids"])
    .rename(columns={"featured_artists_ids": "main_artist_id"})
)
df_featured.head()

Unnamed: 0,featured_artists,main_artist_id
0,Mechatok,4poYOxVqlpDTelhhiJcSrW
0,SALEM,09CJcG6ndtL82D8x9VxaeT
1,,
2,Charli xcx,25uiPmTg16RbhZWAqwLBy5
3,Kim Petras,3Xt3RrJMFv5SZkCfUE8C1J


In [5]:
# Outer-join main and featured artists on the shared id column so that ids
# present in only one of the two frames are kept. The featured name column is
# dropped afterwards, which leaves main_artist null for rows that came only
# from df_featured.
df_all_artists = pd.merge(
    df_artists,
    df_featured,
    how="outer",
    on="main_artist_id",
).drop(columns="featured_artists")
df_all_artists

Unnamed: 0,main_artist,main_artist_id
0,Lana Del Rey,00FQb4jTyendYWaN8pK0wa
1,Alexzander Pray,00UxHRUP58SZJUFJKUSrAm
2,Petite League,00XrFl3G12emNX9Qqm6Gd4
3,Levi Ryan,00hxNB9gMEeMFLLaW06F4J
4,The Feminine Complex,01KjnhCy6NazKcRifV68Mm
...,...,...
2477,,7yjWDiLDpsHxobHP1fWYh8
2478,Sedat The Turkish Avenger,7z03fFS6IEEPLlNayMNGBi
2479,Michael Learns To Rock,7zMVPOJPs5jgU8NorRxqJe
2480,Cruza,7zxS4o4zmwxJNe5UvC2Fx5


In [6]:
# Keep only rows that carry an artist id, collapse exact duplicate rows, and
# then count how many of the remaining rows have no main_artist name.
has_artist_id = df_all_artists["main_artist_id"].notna()
df_all_artists = df_all_artists.loc[has_artist_id].drop_duplicates()
df_all_artists["main_artist"].isna().value_counts()

main_artist
False    1593
True      625
Name: count, dtype: int64

In [7]:
# Plain Python list of every artist id; show the first five and the total count.
artist_ids = df_all_artists["main_artist_id"].to_list()
print(artist_ids[:5], len(artist_ids), sep="\n")

['00FQb4jTyendYWaN8pK0wa', '00UxHRUP58SZJUFJKUSrAm', '00XrFl3G12emNX9Qqm6Gd4', '00hxNB9gMEeMFLLaW06F4J', '01KjnhCy6NazKcRifV68Mm']
2218


## Retrieval of missing artists + genres

In [8]:
import spotipy
import json
from spotipy.oauth2 import SpotifyOAuth

# Read the client id / secret from the local secrets file (kept out of the
# notebook itself) and build a Spotify client authenticated through OAuth.
client_json = json.loads(Path("../secrets/data-retriever.json").read_text())

auth_manager = SpotifyOAuth(
    client_id=client_json["client_id"],
    client_secret=client_json["client_secret"],
    redirect_uri="http://127.0.0.1:8000/callback",
)
sp = spotipy.Spotify(auth_manager=auth_manager)

In [9]:
from pathlib import Path

# Directory that receives the artist JSONL files written further down.
# save_dir stays a string (with trailing slash) because the save step builds
# file paths by concatenating it with the file name.
save_dir = "../data/spotify/artists/"
dir_path = Path(save_dir)
# parents=True creates missing intermediate directories (a bare mkdir() raises
# FileNotFoundError when "../data/spotify" does not exist yet); exist_ok=True
# makes the cell safe to re-run and avoids the check-then-create race.
dir_path.mkdir(parents=True, exist_ok=True)

In [11]:
from local_utils.loading import save_to_jsonl

# Fetch artist metadata in batches and write each batch to its own JSONL file
# (artist_001.jsonl, artist_002.jsonl, ...) inside save_dir.
batch_size = 50  # number of ids sent per sp.artists() request
for idx, batch_start in enumerate(range(0, len(artist_ids), batch_size), 1):
    filename = f"artist_{idx:03d}.jsonl"

    results = sp.artists(artists=artist_ids[batch_start:batch_start + batch_size])

    artist_batch = []
    for artist in results["artists"]:
        # Defensive: skip null placeholders (the API may return one for an id
        # it cannot resolve); indexing into None would abort the whole run.
        if artist is None:
            continue

        artist_batch.append(
            {
                "name": artist["name"],
                "artist_id": artist["id"],
                # Store None instead of an empty list when no genres are set.
                "genres": artist["genres"] or None,
                "followers": artist["followers"]["total"],
                "popularity": artist["popularity"],
            }
        )

    # save_dir ends with "/", so plain concatenation yields a valid path.
    save_to_jsonl(artist_batch, save_dir + filename)