In [249]:
import json
import logging
import os
import time

import pandas as pd
import requests
from pandas import json_normalize

# Route log records at INFO level and above to the file 'data_retrieval.log'
# (a relative path, so it is resolved against the kernel's working directory).
logging.basicConfig(filename='data_retrieval.log', level=logging.INFO)


In [242]:
# Spotify API credentials are read from the environment so that secrets are
# never stored in the notebook itself. Export SPOTIFY_CLIENT_ID and
# SPOTIFY_CLIENT_SECRET before starting the kernel.
# NOTE(review): the values that used to be hardcoded here were saved in plain
# text; treat them as compromised and rotate them.
sid = os.environ.get('SPOTIFY_CLIENT_ID')
skey = os.environ.get('SPOTIFY_CLIENT_SECRET')
if not sid or not skey:
    # Surface the misconfiguration right away instead of through a later HTTP error.
    print("SPOTIFY_CLIENT_ID / SPOTIFY_CLIENT_SECRET are not set; token requests will fail.")

baseURL = 'https://api.spotify.com'


### Endpoints

In [243]:
def request_access_token(cid = sid, skey = skey ):
    """Request an access token using the client-credentials grant.

    Args:
        cid: Client id sent as ``client_id``. Defaults to the module-level
            ``sid`` as it was when this cell was executed (Python binds
            default values at definition time).
        skey: Client secret sent as ``client_secret``. Defaults to the
            module-level ``skey``, bound the same way.

    Returns:
        The ``access_token`` value from the JSON response, or ``None`` when
        the token endpoint answers with any status other than 200.

    Raises:
        requests.RequestException: If the request cannot be completed,
            including when the timeout below is exceeded.
    """
    url = "https://accounts.spotify.com/api/token"
    form_headers = {
        "Content-Type": "application/x-www-form-urlencoded"
    }
    form = {
        "grant_type": "client_credentials",
        "client_id": cid,
        "client_secret": skey
    }

    # Without a timeout, requests waits indefinitely on a stalled connection,
    # which would freeze the notebook.
    response = requests.post(url, headers=form_headers, data=form, timeout=30)
    if response.status_code == 200:
        return response.json().get('access_token')

    print(f"Failed to obtain token, status code: {response.status_code}")
    return None

def fetch_tracks_info(track_ids):
    """GET ``/v1/tracks`` for several track ids in one request.

    The request is sent with the module-level ``headers`` dict, so that name
    must be defined before this function is called.

    Args:
        track_ids: Iterable of track id strings; they are joined with commas
            into the ``ids`` query parameter.

    Returns:
        The ``requests.Response`` exactly as received. The HTTP status is NOT
        checked here; callers decide how to treat error responses.

    Raises:
        requests.RequestException: Re-raised after printing when the request
            itself fails (connection error, timeout, ...).
    """
    ids = ','.join(track_ids)
    url = f"https://api.spotify.com/v1/tracks?ids={ids}"

    try:
        # A timeout keeps a stalled connection from blocking the batch loop forever.
        return requests.get(url, headers=headers, timeout=30)
    except requests.RequestException as e:
        print(f"Request failed: {e}")
        # Propagate so the caller knows the batch was not fetched.
        raise

def fetch_tracks_audiofeatures(track_ids):
    """GET ``/v1/audio-features`` for several track ids in one request.

    The request is sent with the module-level ``headers`` dict, so that name
    must be defined before this function is called.

    Args:
        track_ids: Iterable of track id strings; they are joined with commas
            into the ``ids`` query parameter.

    Returns:
        The ``requests.Response`` exactly as received. The HTTP status is NOT
        checked here; callers decide how to treat error responses.

    Raises:
        requests.RequestException: Re-raised after printing when the request
            itself fails (connection error, timeout, ...).
    """
    ids = ','.join(track_ids)
    url = f"https://api.spotify.com/v1/audio-features?ids={ids}"

    try:
        # A timeout keeps a stalled connection from blocking the batch loop forever.
        return requests.get(url, headers=headers, timeout=30)
    except requests.RequestException as e:
        print(f"Request failed: {e}")
        raise


### Endpoint utils

In [244]:
def new_session_headers():
    """Build the Authorization header dict for API requests.

    Returns:
        A dict with the single key ``"Authorization"`` whose value is
        ``"Bearer <token>"``, using a token freshly obtained from
        ``request_access_token()``.

    Raises:
        RuntimeError: If ``request_access_token()`` returns no token. Without
            this check the header would silently become ``"Bearer None"`` and
            every later request would fail with an unrelated-looking error.
    """
    token = request_access_token()
    if not token:
        raise RuntimeError(
            "Could not obtain an access token; check the client credentials."
        )

    return {
        "Authorization": f'Bearer {token}',
    }

In [245]:
# Obtain a token once and keep the resulting header dict in the module-level
# name `headers`, which fetch_tracks_info and fetch_tracks_audiofeatures read.
# NOTE(review): nothing refreshes this token after the cell has run; presumably
# it expires during long batch runs — confirm, and re-run this cell if requests
# start failing.
headers = new_session_headers()

### Core data

In [246]:
# Read the twelve endsong_<i>.json files and stack them into a single frame.
all_files = [f"./data_2022/MyData_streaming_hist/endsong_{i}.json" for i in range(12)]
df_list = [pd.read_json(path) for path in all_files]
df = pd.concat(df_list, ignore_index=True)
df['ts'] = pd.to_datetime(df['ts'])

# Rows whose URI is missing or does not contain 'track' are discarded.
is_track = df['spotify_track_uri'].str.contains('track', na=False)
non_tracks = df.loc[~is_track].index
df = df.drop(index=non_tracks)

# Order the rows by timestamp, renumber them, and keep only the URI and
# timestamp columns.
df = (
    df.sort_values(by='ts')
      .reset_index(drop=True)
      [['spotify_track_uri', 'ts']]
      .copy()
)

### Metadata retrieval

In [259]:
def get_batch_metadata(batch_list, start_id=0, end_id=1):
    """Fetch track metadata for one batch of ids and flatten it into a DataFrame.

    Args:
        batch_list: Track ids passed to ``fetch_tracks_info``.
        start_id: First row position of the batch; used only in log messages.
        end_id: End row position of the batch; used only in log messages.

    Returns:
        A DataFrame with one row per track object in the response's
        ``tracks`` list (nested objects flattened by ``json_normalize``).
        An empty DataFrame is returned when the response contains no track
        objects.

    Raises:
        requests.HTTPError: If the response carries an error status.
        Exception: Any other failure is logged and re-raised unchanged.
    """
    try:
        fetched_info = fetch_tracks_info(batch_list)
        fetched_info.raise_for_status()  # Raises an HTTPError if the response was an error

        tracks_data = fetched_info.json().get('tracks') or []
        # The API reference states that ids which cannot be resolved come
        # back as null entries; json_normalize rejects None, so drop them
        # before flattening.
        found = [track for track in tracks_data if track is not None]
        if not found:
            # Nothing to flatten: return an empty frame instead of raising.
            return pd.DataFrame()
        return json_normalize(found)
    except requests.HTTPError as e:
        logging.error(f"Failed for IDs {start_id} to {end_id}. HTTP Error: {e}")
        raise  # Re-raise the exception after logging it
    except Exception as e:
        logging.error(f"Unexpected error for IDs {start_id} to {end_id}: {e}")
        raise  # Re-raise unexpected exceptions

def get_batch_audiofeatures(batch_list, start_id=0, end_id=1):
    """Fetch audio features for one batch of ids and flatten them into a DataFrame.

    Args:
        batch_list: Track ids passed to ``fetch_tracks_audiofeatures``.
        start_id: First row position of the batch; used only in log messages.
        end_id: End row position of the batch; used only in log messages.

    Returns:
        * On HTTP 200: a DataFrame with one row per non-null entry of the
          response's ``audio_features`` list (empty when there are none).
        * On any other status: the raw response object itself, after logging
          the status code, so the caller can inspect it.

    Raises:
        Exception: Any failure while fetching or parsing is logged and
            re-raised unchanged.
    """
    try:
        fetched_info = fetch_tracks_audiofeatures(batch_list)
        if fetched_info.status_code != 200:
            logging.error(f"Failed for IDs {start_id} to {end_id}. HTTP Status Code: {fetched_info.status_code}")
            # Hand the response back as-is; callers check the return type.
            return fetched_info

        tracks_data = fetched_info.json().get('audio_features') or []
        # The API reference states that ids without audio features come back
        # as null entries; json_normalize rejects None, so drop them before
        # flattening.
        found = [features for features in tracks_data if features is not None]
        if not found:
            # Nothing to flatten: return an empty frame instead of raising.
            return pd.DataFrame()
        return json_normalize(found)
    except Exception as e:
        logging.error(f"Unexpected error for IDs {start_id} to {end_id}: {e}")
        raise  # Re-raise unexpected exceptions

def save_data(data, start_id, end_id, folder):
    """Write one batch to ``<folder>/track_data_<start_id>.csv``.

    Args:
        data: DataFrame to write (the index is not written).
        start_id: First row position of the batch; used in the file name and
            the log message.
        end_id: End row position of the batch; used only in the log message.
        folder: Target directory. It is created, including parents, when it
            does not exist yet.

    Returns:
        ``True`` once the file has been written and the save has been logged.
    """
    if folder:
        # to_csv does not create missing directories, so make sure it exists.
        os.makedirs(folder, exist_ok=True)

    data.to_csv(f'{folder}/track_data_{start_id}.csv', index = False)
    logging.info(f'Data saved from {start_id} to {end_id}')

    return True

In [283]:
def batchit(start=99950, chunk_size=50, folder='fetched_audiofeatures', pause=1):
    """Download audio features for the module-level ``df`` in fixed-size batches.

    Each batch is fetched with ``get_batch_audiofeatures`` and, when a
    non-empty DataFrame comes back, written with ``save_data``. The loop stops
    at the first batch that yields anything else.

    Args:
        start: Row position in ``df`` at which to begin. The default keeps
            the offset that was previously hard-coded in the loop.
        chunk_size: Number of rows (track ids) per request.
        folder: Directory handed to ``save_data``.
        pause: Seconds to sleep after each successful batch.

    Returns:
        ``None`` when every batch was fetched and saved; otherwise whatever
        ``get_batch_audiofeatures`` returned for the first batch that produced
        no data (a response object or an empty DataFrame).
    """
    for batch_start in range(start, len(df), chunk_size):
        batch_end = batch_start + chunk_size
        chunk = df.iloc[batch_start:batch_end]  # positional slice, made explicit
        # Keep only the last ':'-separated segment of each URI (the track id).
        batch_list = [uri.split(':')[-1] for uri in chunk.spotify_track_uri]
        data = get_batch_audiofeatures(batch_list, batch_start, batch_end)
        # data = get_batch_metadata(batch_list, batch_start, batch_end)

        if isinstance(data, pd.DataFrame) and not data.empty:
            save_data(data, batch_start, batch_end, folder)
            print('Finished:', batch_start, batch_list)
        else:
            print(f'No data returned for batch starting at {batch_start}.')
            # Return the offending result so it can be inspected in the notebook.
            return data

        time.sleep(pause)
    return None

# Run the download loop. batchresult is None when every batch was saved;
# otherwise it is whatever get_batch_audiofeatures returned for the first
# batch that produced no data.
batchresult = batchit()

No data returned for batch starting at 99950.


In [289]:
# Inspect the headers of the response returned for the failed batch.
# Only meaningful when batchit() returned a response object; if it returned
# None (all batches succeeded) this line raises AttributeError.
batchresult.headers

{'content-type': 'application/json; charset=utf-8', 'cache-control': 'private, max-age=0', 'access-control-allow-origin': '*', 'access-control-allow-headers': 'Accept, App-Platform, Authorization, Content-Type, Origin, Retry-After, Spotify-App-Version, X-Cloud-Trace-Context, client-token, content-access-token', 'access-control-allow-methods': 'GET, POST, OPTIONS, PUT, DELETE, PATCH', 'access-control-allow-credentials': 'true', 'access-control-max-age': '604800', 'content-encoding': 'gzip', 'strict-transport-security': 'max-age=31536000', 'x-content-type-options': 'nosniff', 'date': 'Sun, 03 Mar 2024 00:14:36 GMT', 'server': 'envoy', 'Via': 'HTTP/2 edgeproxy, 1.1 google', 'Alt-Svc': 'h3=":443"; ma=2592000,h3-29=":443"; ma=2592000', 'Transfer-Encoding': 'chunked'}