In [5]:
import os
import base64
import requests
from requests import post, get
import json
from tqdm import tqdm
import pandas as pd
import time
import math
import re

In [6]:
client_credentials = [
('99f2f345b8394b50ac1b8dc1f0c00fe3', '7a2d2e2264114afe8dcd06a9698f7c40'),
('9b3fd46dd05143f3b2a8a1d09d13b3e6', 'a8214552ed424e8f9e77e667938e1569'),
('f1e33b9fd79f4c27bb1959b543397b5b', '24bf3c98a1b64d2c8d0814cc00c395a2'),
('8980a46d6c6f4b04afdc1dc811ab0302', 'a449aeb444ef457bbca21fcba09fb3ff'),
('bfbd3a53eff148899f1c003f71990c62', '083442bd2552477d92d8d956a3dca081'),
('828f192529984132960c2964fdf6c25a', 'd3fbcedb1b2e4446b0aaaa8002542407'),
('c6281585d64c4d77b535b0a5054347ab', '40de31ae41b84f1ca8cc6588ad3738c5'),
('7e803f814d69411eb214f4643fb43d99', '7ba59fa4c14f42efb15679f756ff9d94'),
('6fca169b877c489b82dbbfb1626ecb69', '7f4cefd772a0409d8cf47bdc495e21e7'),
('69b74a932a4e40aba2c07fe8f235a9a1', '609d13e078e3421a866a3a9568e96c47')

]



credential_index = 0

def get_token(client_id, client_secret):
    auth_string = client_id + ":" + client_secret
    auth_bytes = auth_string.encode("utf-8")
    auth_base64 = str(base64.b64encode(auth_bytes), "utf-8")

    url = "https://accounts.spotify.com/api/token"
    headers = {
        "Authorization": "Basic " + auth_base64,
        "Content-Type": "application/x-www-form-urlencoded"
    }
    data = {"grant_type": "client_credentials"}

    response = requests.post(url, headers=headers, data=data)
    json_result = json.loads(response.content)

    if "access_token" in json_result:
        token = json_result["access_token"]
        return token
    else:
        print("Error: Failed to get access token")
        print(json_result)
        return None

def get_auth_header(token):
    return {"Authorization": "Bearer " + token}

def get_albums_for_artist(token, artist_id):
    url = f"https://api.spotify.com/v1/artists/{artist_id}/albums"
    headers = get_auth_header(token)
    params = {"include_groups": "album"}
    response = requests.get(url, headers=headers, params=params)

    if response.status_code == 200:
        albums = json.loads(response.text)["items"]
        album_dict = {}
        for album in albums:
            album_id = album["id"]
            album_name = album["name"]
            album_tracks = album["total_tracks"]
            album_type = album["album_type"]
            album_release_date = album["release_date"]
            album_artists = [artist["name"] for artist in album["artists"]]
            album_images = album["images"]

            if album_name in album_dict:
                existing_tracks = album_dict[album_name]["total_tracks"]
                if album_tracks > existing_tracks:
                    album_dict[album_name] = {
                        "album_id": album_id,
                        "album_name": album_name,
                        "total_tracks": album_tracks,
                        "album_type": album_type,
                        "release_date": album_release_date,
                        "artists": album_artists,
                        "images": album_images
                    }
            else:
                album_dict[album_name] = {
                    "album_id": album_id,
                    "album_name": album_name,
                    "total_tracks": album_tracks,
                    "album_type": album_type,
                    "release_date": album_release_date,
                    "artists": album_artists,
                    "images": album_images
                }
        return album_dict
    else:
        print("Error: Failed to get albums for artist")
        print(response.content)
        return None
    
def get_tracks_for_album(token, album_id):
    url = f"https://api.spotify.com/v1/albums/{album_id}/tracks"
    headers = get_auth_header(token)
    params = {"market": "US"}
    response = requests.get(url, headers=headers, params=params)

    if response.status_code == 200:
        tracks = json.loads(response.text)["items"]
        track_list = []
        for track in tracks:
            track_name = track["name"]
            track_id = track["id"]
            track_number = track["track_number"]
            track_explicit = track["explicit"]
            track_artists = [artist["name"] for artist in track["artists"]]
            track_dict = {
                "track_name": track_name,
                "track_id": track_id,
                "track_number": track_number,
                "track_explicit": track_explicit,
                "track_artists": track_artists,
            }
            track_list.append(track_dict)
        return track_list
    else:
        print("Error: Failed to get tracks for album")
        print(response.content)
        return None
    
def get_audio_features(token, track_ids):
    audio_features = []

    for i in range(0, len(track_ids), 50):
        chunk = track_ids[i:i + 50]
        ids = ','.join(chunk)
        url = f"https://api.spotify.com/v1/audio-features?ids={ids}"
        headers = {"Authorization": f"Bearer {token}"}
        r = requests.get(url, headers=headers)
        if r.status_code != 200:
            print(f"Error getting audio features for tracks: {r.status_code}")
            return None

        try:
            features = r.json()["audio_features"]
        except KeyError:
            features = []

        audio_features.extend(features)

    audio_features_filtered = []
    for feature in audio_features:
        if feature is not None:
            audio_features_filtered.append(feature)

    return pd.DataFrame(audio_features_filtered)

def load_checkpoint():
    if os.path.exists('checkpoint_f2.json'):
        with open('checkpoint_f2.json', 'r') as f:
            return json.load(f)
    else:
        return {"artist_id": "", "progress": 0}

def save_checkpoint(artist_id, progress):
    with open('checkpoint_f2.json', 'w') as f:
        json.dump({"artist_id": artist_id, "progress": progress}, f)

def load_failed_requests():
    if os.path.exists('failed_requests_f2.parquet'):
        return pd.read_parquet('failed_requests_f2.parquet')
    else:
        return pd.DataFrame(columns=['Spotify ID'])

def save_failed_requests(df):
    df.to_parquet('failed_requests_f2.parquet')

def refresh_token(token_start_time):
    global credential_index
    current_time = time.time()
    elapsed_time = current_time - token_start_time
    if elapsed_time >= 1800:
        credential_index = (credential_index + 1) % len(client_credentials)
        client_id, client_secret = client_credentials[credential_index]
        return get_token(client_id, client_secret), current_time
    return None, token_start_time

def sanitize_filename(filename):
    return re.sub(r'[\\/:"*?<>|]+', '_', filename)

In [7]:
with open('lost_artist_ids_sub_dict_2.json', 'r') as f:
    artist_ids = json.load(f)
len(artist_ids)

3200

In [8]:
token = get_token(client_credentials[0][0], client_credentials[0][1])
token_start_time = time.time()

checkpoint = load_checkpoint()
failed_requests = load_failed_requests()

start_from = checkpoint["artist_id"] if checkpoint["artist_id"] else list(artist_ids.keys())[0]
start_index = list(artist_ids.keys()).index(start_from)


iteration_counter = 0
sleep_interval = 50

for artist_id, artist_name in list(artist_ids.items())[start_index:]:
    sanitized_artist_name = sanitize_filename(artist_name)
    parquet_file_path = f'results/{sanitized_artist_name}_tracks.parquet'
    if os.path.exists(parquet_file_path):
        print(f"Parquet file already exists for {artist_name}, skipping...")
        continue
    try:
        refreshed_token, new_token_start_time = refresh_token(token_start_time)
        if refreshed_token:
            token = refreshed_token
            token_start_time = new_token_start_time

        albums = get_albums_for_artist(token, artist_id)

        if albums is None:
            failed_requests = failed_requests.append({"Spotify ID": artist_id}, ignore_index=True)
            save_failed_requests(failed_requests)
            continue

        album_df = pd.DataFrame.from_dict(albums, orient="index")
        album_df.index.name = "album_name"

        all_tracks_df = pd.DataFrame()
        for album_id in tqdm(album_df['album_id'], desc=f"Processing {artist_name}"):
            time.sleep(1)
            tracks = get_tracks_for_album(token, album_id)
            if tracks is None:
                failed_requests = failed_requests.append({"Spotify ID": artist_id}, ignore_index=True)
                save_failed_requests(failed_requests)
                continue

            tracks = pd.DataFrame(tracks)
            tracks['album_id'] = album_id
            all_tracks_df = pd.concat([all_tracks_df, tracks], axis=0)

        af_df = get_audio_features(token, all_tracks_df['track_id'].tolist())
        af_df = af_df.rename(columns={'id': 'track_id'})
        all_aftracks_df = pd.merge(all_tracks_df, af_df, on='track_id')
        album_all_aftracks_df = pd.merge(album_df, all_aftracks_df, on='album_id')
        sanitized_artist_name = sanitize_filename(artist_name)
        album_all_aftracks_df.to_parquet(f'results/{sanitized_artist_name}_tracks.parquet')

        save_checkpoint(artist_id, artist_name)
        iteration_counter += 1
        if iteration_counter % sleep_interval == 0:
            time.sleep(10)
            
    except Exception as e:
        print(f"Error processing {artist_name}: {e}")
        failed_requests = failed_requests.append({"Spotify ID": artist_id}, ignore_index=True)
        save_failed_requests(failed_requests)
        continue

print("Completed processing all artists.")

Parquet file already exists for The Manhattan Love Suicides, skipping...
Parquet file already exists for Jim Kweskin, skipping...
Parquet file already exists for DJ Shadow, skipping...
Parquet file already exists for Yo Gotti, skipping...
Parquet file already exists for Dead Kennedys, skipping...
Parquet file already exists for Jacques Brel, skipping...
Parquet file already exists for Holy Mountain, skipping...
Parquet file already exists for The Stone Roses, skipping...
Parquet file already exists for Slightly Stoopid, skipping...
Parquet file already exists for Roscoe Holcomb, skipping...
Parquet file already exists for The Well Wishers, skipping...
Parquet file already exists for Baby Island, skipping...
Parquet file already exists for Édith Piaf, skipping...
Parquet file already exists for The Soft Hills, skipping...
Parquet file already exists for Big Boi, skipping...
Parquet file already exists for Husky, skipping...
Parquet file already exists for Bobby McFerrin, skipping...
Par

Processing Gerry Rafferty:   0%|          | 0/14 [00:00<?, ?it/s]

Error processing Joseph Angel: 'album_id'
Parquet file already exists for fun., skipping...
Parquet file already exists for Buck Owens, skipping...
Parquet file already exists for Marshall Crenshaw, skipping...
Parquet file already exists for Bettie Serveert, skipping...
Parquet file already exists for ZZ Top, skipping...
Parquet file already exists for The Soviettes, skipping...
Parquet file already exists for Boston Bun, skipping...
Parquet file already exists for Micah P. Hinson, skipping...
Parquet file already exists for Billy Talent, skipping...
Parquet file already exists for Lester Bowie, skipping...
Parquet file already exists for Enrico Rava, skipping...
Parquet file already exists for Rival Consoles, skipping...
Parquet file already exists for The Fire Theft, skipping...
Parquet file already exists for Elton Britt, skipping...
Parquet file already exists for Donavon Frankenreiter, skipping...
Parquet file already exists for Lafayette Gilchrist, skipping...
Parquet file alrea

Processing Gerry Rafferty: 100%|██████████| 14/14 [00:15<00:00,  1.12s/it]
Processing The Buckinghams:   0%|          | 0/9 [00:00<?, ?it/s]

Parquet file already exists for AlunaGeorge, skipping...
Parquet file already exists for The Edwin Hawkins Singers, skipping...
Parquet file already exists for The Shaggs, skipping...
Parquet file already exists for Danzig, skipping...
Parquet file already exists for Wreckless Eric, skipping...
Parquet file already exists for Concrete Blonde, skipping...
Parquet file already exists for Schlammpeitziger, skipping...
Parquet file already exists for Virgin Prunes, skipping...
Parquet file already exists for Rhonda Smith, skipping...
Parquet file already exists for The Makes Nice, skipping...
Parquet file already exists for Elvin Bishop, skipping...
Parquet file already exists for Gore Gore Girls, skipping...
Parquet file already exists for Rick Wilhite, skipping...
Parquet file already exists for Califone, skipping...
Parquet file already exists for Lake Trout, skipping...
Parquet file already exists for Screeching Weasel, skipping...
Parquet file already exists for Relient K, skipping...

Processing The Buckinghams: 100%|██████████| 9/9 [00:10<00:00,  1.13s/it]
Processing Bilal:   0%|          | 0/5 [00:00<?, ?it/s]

Parquet file already exists for Atlantic Starr, skipping...
Parquet file already exists for Carter Burwell, skipping...
Parquet file already exists for E.U., skipping...


Processing Bilal: 100%|██████████| 5/5 [00:05<00:00,  1.12s/it]
Processing Thursday:   0%|          | 0/10 [00:00<?, ?it/s]

Parquet file already exists for Static Daydream, skipping...
Parquet file already exists for Judith Hill, skipping...
Parquet file already exists for Steve "Silk" Hurley, skipping...


Processing Thursday: 100%|██████████| 10/10 [00:11<00:00,  1.10s/it]
Processing David Lynch: 100%|██████████| 12/12 [00:13<00:00,  1.13s/it]
Processing Des'ree: 100%|██████████| 7/7 [00:07<00:00,  1.11s/it]


Parquet file already exists for The Greyboy Allstars, skipping...
Parquet file already exists for Oakley Hall, skipping...
Error processing Birdman: 'album_id'
Parquet file already exists for Ash Ra Tempel, skipping...
Parquet file already exists for Jennifer Lopez, skipping...
Parquet file already exists for Akon, skipping...
Parquet file already exists for Jakob Dylan, skipping...
Parquet file already exists for Ashley Parker Angel, skipping...
Parquet file already exists for Ennio Morricone, skipping...
Parquet file already exists for B Boys, skipping...
Parquet file already exists for Frankie Miller, skipping...
Parquet file already exists for Beulah, skipping...
Parquet file already exists for Young Widows, skipping...
Parquet file already exists for Scott Fisher, skipping...
Parquet file already exists for GG Allin, skipping...
Parquet file already exists for Figures On A Beach, skipping...
Parquet file already exists for Joshua Starmer, skipping...
Parquet file already exists fo