In [112]:
import pandas as pd
import requests
import json
from pandas import json_normalize
import logging
import time
import os

# Write log records at INFO level and above to data_retrieval.log (relative path).
logging.basicConfig(filename='data_retrieval.log', level=logging.INFO)


In [113]:
# Spotify app credentials come from the environment (None when a variable is
# unset), so no secret is written into the notebook itself.
sid = os.environ.get('SPOTIFY_CLIENT_ID')
skey = os.environ.get('SPOTIFY_CLIENT_SECRET')

# Root URL of the Spotify Web API.
baseURL = 'https://api.spotify.com'

### Endpoints

In [114]:
def request_access_token(cid=sid, skey=skey):
    """Request an access token using the client-credentials grant.

    Args:
        cid: Spotify client id; defaults to the module-level ``sid``.
        skey: Spotify client secret; defaults to the module-level ``skey``.

    Returns:
        The ``access_token`` value from the token response, or ``None`` when
        the token endpoint answers with anything other than HTTP 200.

    Raises:
        requests.RequestException: if the request cannot be completed
            (connection failure, timeout, ...).
    """
    url = "https://accounts.spotify.com/api/token"
    headers = {
        "Content-Type": "application/x-www-form-urlencoded"
    }
    data = {
        "grant_type": "client_credentials",
        "client_id": cid,
        "client_secret": skey
    }

    # requests waits indefinitely when no timeout is given; bound the wait so a
    # stalled connection cannot hang the notebook.
    response = requests.post(url, headers=headers, data=data, timeout=30)
    if response.status_code == 200:
        return response.json().get('access_token')

    message = f"Failed to obtain token, status code: {response.status_code}"
    logging.error(message)  # also recorded in the log file, like the other helpers
    print(message)
    return None

def fetch_tracks_info(track_ids):
    """Request metadata for several tracks in one call to ``/v1/tracks``.

    Args:
        track_ids: iterable of Spotify track id strings; they are joined with
            commas into the ``ids`` query parameter.

    Returns:
        The raw ``requests.Response``. The HTTP status is not checked here.

    Raises:
        requests.RequestException: re-raised after being reported when the
            request itself fails (connection error, timeout, ...).
    """
    ids = ','.join(track_ids)
    url = f"https://api.spotify.com/v1/tracks?ids={ids}"

    try:
        # `headers` is the module-level Authorization header dict.
        # The timeout keeps a stalled connection from blocking the batch loop forever.
        return requests.get(url, headers=headers, timeout=30)
    except requests.RequestException as e:
        logging.error(f"Request failed: {e}")
        print(f"Request failed: {e}")
        raise

def fetch_tracks_audiofeatures(track_ids):
    """Request audio features for several tracks via ``/v1/audio-features``.

    Args:
        track_ids: iterable of Spotify track id strings; they are joined with
            commas into the ``ids`` query parameter.

    Returns:
        The raw ``requests.Response``. Neither the HTTP status nor the body is
        inspected here, so error responses reach the caller untouched.

    Raises:
        requests.RequestException: re-raised after being reported when the
            request itself fails (connection error, timeout, ...).
    """
    ids = ','.join(track_ids)
    url = f"https://api.spotify.com/v1/audio-features?ids={ids}"

    try:
        # `headers` is the module-level Authorization header dict.
        # The body is deliberately not parsed here: an error response may not be
        # JSON, and decoding it would raise before the caller can check the status.
        return requests.get(url, headers=headers, timeout=30)
    except requests.RequestException as e:
        # No response object exists when the request itself failed, so only the
        # exception is reported.
        logging.error(f"Request failed: {e}")
        print(f"Request failed: {e}")
        raise


### Endpoint utils

In [115]:
def new_session_headers():
    """Build the Authorization header dict used for Spotify Web API calls.

    Returns:
        ``{"Authorization": "Bearer <token>"}`` built from a freshly requested
        access token.

    Raises:
        RuntimeError: if ``request_access_token()`` returns no token. Without
            this check the header would read ``"Bearer None"`` and every later
            request would fail with an authorization error instead.
    """
    token = request_access_token()
    if not token:
        raise RuntimeError(
            "Could not obtain a Spotify access token; "
            "check SPOTIFY_CLIENT_ID and SPOTIFY_CLIENT_SECRET."
        )

    return {
        "Authorization": f'Bearer {token}',
    }

headers = new_session_headers()
# Display only which headers are set: echoing the dict itself would store the
# live bearer token in the notebook's saved output.
{name: '<redacted>' for name in headers}

{'Authorization': '<redacted>'}

### Data

In [116]:
# all_files = [f"./data_2022/MyData_streaming_hist/endsong_{i}.json" for i in range(12)]
# df_list = [pd.read_json(file) for file in all_files]
# df = pd.concat(df_list, ignore_index=True)
# df.ts = pd.to_datetime(df.ts)



# df = df.sort_values(by='ts').reset_index(drop = True)

# df = df[['spotify_track_uri', 'ts']].copy()

# Load the URIs to fetch and keep only rows whose URI contains 'track'.
# Missing or non-string URIs are treated as non-tracks and dropped as well.
tracks_raw = pd.read_csv("tracks_to_fetch.csv")
is_track = tracks_raw.spotify_track_uri.str.contains('track', na=False)
non_tracks = tracks_raw.index[~is_track]  # index labels of the rejected rows
df = tracks_raw.loc[is_track, ['spotify_track_uri']].reset_index(drop=True)

### Metadata retrieval

In [117]:
def get_batch_metadata(batch_list, start_id=0, end_id=1):
    """Fetch track metadata for one batch and flatten it into a DataFrame.

    Args:
        batch_list: track ids passed on to ``fetch_tracks_info``.
        start_id: start position of the batch; used only in log messages.
        end_id: end position of the batch; used only in log messages.

    Returns:
        A DataFrame with one row per non-null entry of the response's
        ``tracks`` list, or an empty DataFrame when there are none.

    Raises:
        requests.HTTPError: re-raised after logging when the response status
            indicates an error.
        Exception: any other failure is logged and re-raised unchanged.
    """
    try:
        fetched_info = fetch_tracks_info(batch_list)
        fetched_info.raise_for_status()  # Raises an HTTPError if the response was an error

        # `or []` also covers a payload where "tracks" is present but null.
        tracks_data = fetched_info.json().get('tracks') or []
        frames = [json_normalize(track) for track in tracks_data if track is not None]
        if not frames:
            # pd.concat raises ValueError on an empty list; returning an empty
            # frame lets the caller's `.empty` check handle this case.
            return pd.DataFrame()
        return pd.concat(frames, ignore_index=True)
    except requests.HTTPError as e:
        logging.error(f"Failed for IDs {start_id} to {end_id}. HTTP Error: {e}")
        raise  # Re-raise the exception after logging it
    except Exception as e:
        logging.error(f"Unexpected error for IDs {start_id} to {end_id}: {e}")
        raise  # Re-raise unexpected exceptions

def get_batch_audiofeatures(batch_list, start_id=0, end_id=1):
    """Fetch audio features for one batch and flatten them into a DataFrame.

    Args:
        batch_list: track ids passed on to ``fetch_tracks_audiofeatures``.
        start_id: start position of the batch; used only in log messages.
        end_id: end position of the batch; used only in log messages.

    Returns:
        A DataFrame with one row per non-null entry of the response's
        ``audio_features`` list (empty when there are none), or, when the API
        answers HTTP 429, the number of seconds to wait as an ``int``.

    Raises:
        requests.exceptions.HTTPError: re-raised after logging for any error
            status other than 429.
        Exception: any other failure is logged and re-raised unchanged.
    """
    try:
        fetched_info = fetch_tracks_audiofeatures(batch_list)
        fetched_info.raise_for_status()

        # `or []` also covers a payload where "audio_features" is present but null.
        tracks_data = fetched_info.json().get('audio_features') or []
        frames = [json_normalize(track) for track in tracks_data if track is not None]
        if not frames:
            # pd.concat raises ValueError on an empty list; returning an empty
            # frame lets the caller's `.empty` check handle this case.
            return pd.DataFrame()
        return pd.concat(frames, ignore_index=True)
    except requests.exceptions.HTTPError as e:
        if e.response is not None and e.response.status_code == 429:
            try:
                retry_after = int(e.response.headers.get('Retry-After'))
            except (TypeError, ValueError):
                # Header missing or not a plain number of seconds: fall back to
                # a short default wait instead of crashing inside the handler.
                retry_after = 1
            logging.info(f"Rate limit exceeded. Retrying after {retry_after} seconds.")
            return retry_after
        else:
            logging.error(f"HTTP Error for IDs {start_id} to {end_id}: {e}")
            raise
    except Exception as e:
        logging.error(f"Unexpected error for IDs {start_id} to {end_id}: {e}")
        raise

def save_data(data, start_id, end_id, folder):
    """Write one fetched batch to ``<folder>/track_data_<start_id>.csv``.

    Args:
        data: DataFrame to write (the index is not written).
        start_id: start position of the batch; becomes part of the file name.
        end_id: end position of the batch; used only in the log message.
        folder: target directory; it is created if it does not exist yet.

    Returns:
        ``True`` once the file has been written and the save logged.
    """
    # Create the target directory on demand so a missing folder cannot discard
    # a batch that has already been fetched.
    if folder:
        os.makedirs(folder, exist_ok=True)

    data.to_csv(f'{folder}/track_data_{start_id}.csv', index = False)
    logging.info(f'Data saved from {start_id} to {end_id}')

    return True

In [118]:
def batchit(chunk_size=50, fetch_batch=None,
            folder='fetched_data/batches_supplement', pause_seconds=1):
    """Fetch and save data for every track URI in the module-level ``df``.

    The frame is processed in consecutive slices of ``chunk_size`` rows.
    Duplicate URIs inside a slice are dropped, the ids are handed to
    ``fetch_batch`` and the resulting frame is written with ``save_data``.

    Args:
        chunk_size: number of rows per request. Keep it at or below the
            per-request id limit documented for the endpoint in use.
        fetch_batch: callable ``(batch_list, start, end)`` that returns the
            batch data; defaults to ``get_batch_metadata``. Pass
            ``get_batch_audiofeatures`` to download audio features instead.
        folder: directory handed to ``save_data``.
        pause_seconds: seconds to sleep after each saved batch.

    Returns:
        ``None`` when every batch was saved. Otherwise the loop stops at the
        first batch whose result is not a non-empty DataFrame and returns that
        result as-is.
    """
    if fetch_batch is None:
        # Resolved at call time so the default does not depend on definition order.
        fetch_batch = get_batch_metadata

    for start in range(0, len(df), chunk_size):
        end = start + chunk_size
        chunk = df.iloc[start:end].drop_duplicates(subset='spotify_track_uri')
        # A URI looks like "spotify:track:<id>"; only the trailing id is sent.
        batch_list = [uri.split(':')[-1] for uri in chunk.spotify_track_uri]
        data = fetch_batch(batch_list, start, end)

        if isinstance(data, pd.DataFrame) and not data.empty:
            save_data(data, start, end, folder)
            print('Finished:', start, batch_list)
        else:
            print(f'No data returned for batch starting at {start}.')
            return data

        time.sleep(pause_seconds)
    return None


# batchresult is None when every batch was saved; otherwise it holds whatever
# the fetcher returned for the first batch that yielded no non-empty DataFrame.
batchresult = batchit()

Finished: 0 ['7wKp9THGLEsCRLm8Btccgx', '3tL1nu7XzYe1VjZ6k6yQLX', '1S3Ah8sHYYWbIE1ZrxQuwa', '0YX63bl9FUtOLa0XoB6UvP', '0jg5uxIa6BCnQmbrRDkgc0', '4ICSoCyB9P4HsseLG5KS3v', '1CGC8QT5iwd6Flp9c1W4m5', '6nYsYnBq3V9Anlw1qjFEtX', '3xy99WsoXDyCWqxgr87xWC', '5KImYDn0L8ugwoSbyxtBxy', '5NlG6wkRWwPrKel04bKkYK', '0zeajOT0iYO8w3w4r6wFPX', '3HAP8EuzkRKQFSJHFqRAeV', '4vQtji75FLtbBIaivaWiAc', '6WA4uISv1Xn8XsaL2zJSDQ', '3rfV0emjPY7hJpEauhOJ79', '2NxQATnBJP7zfkWpQsbcvN', '2LES01awWEQFVqJLMoXkhh', '3ctooH9V8ZMbjPVxAcJY43', '5SojbZOB1O6TReZYPYYbnM', '4yqBk3iguOdXANg5qURLSB', '2rFkNlTVabumfUKnh4Kn3W', '7uaCn66SkDSuG2Tp57vLjk', '1W18TK5X3opzY2ksB4Rf8X', '5PSaVwQxQ3RpHa0TicXGRI', '1IdKUKv6GkmmonDX5pFpc2', '4i3PXGlNBIluJPVXpAEOnD', '7sxgyjjTLW9irJv0gfbtCd', '4n6MToGfoXwqxrxL5BJhV2', '6pifdJnm5L0bsQLQkJAYJZ', '1bhYSo8vmBwgq3n6Su3rcc', '708AblfH21HU5gOFBWAQHx', '0VDohahWOVVp0lXrCQtucu', '1nx2J8SyOuz3ITycgMFRsV', '4htEjLpOGBkjMHxoUslMgn', '5LPqeQT2Z6JTuVCqpA5wCK', '5RJWP9XOqYuGftwK6HaH1B', '1alY3KN3onDPmIPHZB4q4i',

In [108]:
df

Unnamed: 0,spotify_track_uri
0,spotify:track:7wKp9THGLEsCRLm8Btccgx
1,spotify:track:3tL1nu7XzYe1VjZ6k6yQLX
2,spotify:track:1S3Ah8sHYYWbIE1ZrxQuwa
3,spotify:track:0YX63bl9FUtOLa0XoB6UvP
4,spotify:track:0jg5uxIa6BCnQmbrRDkgc0
...,...
7653,spotify:track:2QtYwWmMjZpV03YgRPHPYs
7654,spotify:track:2kucmvJte18FofIyb251H9
7655,spotify:track:4ATZdotWJdN3HVRktDLIvl
7656,spotify:track:6ADF14VCHY0cNi6QyBU7Vp
