In [241]:
import pandas as pd
import requests
import json
from pandas import json_normalize
import logging

logging.basicConfig(filename='data_retrieval.log', level=logging.INFO)


In [242]:
sid = '09dceb980747489297e7ea0a51077a66'
skey = 'f3543385c18c48cd98018f4cc39bf12a'
baseURL = 'https://api.spotify.com'


### Endpoints

In [243]:
def request_access_token(cid = sid, skey = skey ):
    url = "https://accounts.spotify.com/api/token"
    headers = {
        "Content-Type": "application/x-www-form-urlencoded"
    }
    data = {
        "grant_type": "client_credentials",
        "client_id": cid,
        "client_secret": skey
    }

    response = requests.post(url, headers=headers, data=data)
    if response.status_code == 200:
        return response.json().get('access_token')
    else:
        print(f"Failed to obtain token, status code: {response.status_code}")
        return None

def fetch_tracks_info(track_ids):
    ids = ','.join(track_ids)
    url = f"https://api.spotify.com/v1/tracks?ids={ids}"
    
    try:
        response = requests.get(url, headers=headers)
        return response
    except requests.RequestException as e:
        # Log the error or handle it as needed
        print(f"Request failed: {e}")
        # Consider raising an exception here to indicate the failure to the caller
        raise

def fetch_tracks_audiofeatures(track_ids):
    ids = ','.join(track_ids)
    url = f"https://api.spotify.com/v1/audio-features?ids={ids}"
    
    try:
        response = requests.get(url, headers=headers)
        return response
    except requests.RequestException as e:
        print(f"Request failed: {e}")
        raise


### Endpoint utils

In [244]:
def new_session_headers():
    headers = {
        "Authorization": f'Bearer {request_access_token()}',
    }

    return headers

In [245]:
headers = new_session_headers()

### Core data

In [246]:
all_files = [f"./data_2022/MyData_streaming_hist/endsong_{i}.json" for i in range(12)]
df_list = [pd.read_json(file) for file in all_files]
df = pd.concat(df_list, ignore_index=True)
df.ts = pd.to_datetime(df.ts)

non_tracks = df.loc[df.spotify_track_uri.str.contains('track') != True].index
df.drop(index=non_tracks, inplace=True)

df = df.sort_values(by='ts').reset_index(drop = True)

df = df[['spotify_track_uri', 'ts']].copy()

### Metadata retrieval

In [247]:
def get_batch_metadata(batch_list, start_id=0, end_id=1):
    try:
        fetched_info = fetch_tracks_info(batch_list)
        fetched_info.raise_for_status()  # Raises an HTTPError if the response was an error
        
        tracks_data = fetched_info.json().get('tracks', [])
        frames = [json_normalize(track) for track in tracks_data]
        df = pd.concat(frames, ignore_index=True)
        return df
    except requests.HTTPError as e:
        logging.error(f"Failed for IDs {start_id} to {end_id}. HTTP Error: {e}")
        raise  # Re-raise the exception after logging it
    except Exception as e:
        logging.error(f"Unexpected error for IDs {start_id} to {end_id}: {e}")
        raise  # Re-raise unexpected exceptions

def get_batch_audiofeatures(batch_list, start_id=0, end_id=1):
    try:
        fetched_info = fetch_tracks_audiofeatures(batch_list)
        fetched_info.raise_for_status()  # Raises an HTTPError if the response was an error
        
        tracks_data = fetched_info.json().get('audio_features', [])
        frames = [json_normalize(track) for track in tracks_data]
        df = pd.concat(frames, ignore_index=True)
        return df
    except requests.HTTPError as e:
        logging.error(f"Failed for IDs {start_id} to {end_id}. HTTP Error: {e}")
        raise  # Re-raise the exception after logging it
    except Exception as e:
        logging.error(f"Unexpected error for IDs {start_id} to {end_id}: {e}")
        raise  # Re-raise unexpected exceptions

def save_data(data, start_id, end_id, folder):
    data.to_csv(f'{folder}/track_data_{start_id}.csv', index = False)
    logging.info(f'Data saved from {start_id} to {end_id}')

    return True

In [248]:
chunk_size = 50
for start in range(0, len(df), chunk_size):
    end = start + chunk_size
    chunk = df[start:end]
    batch_list = list(chunk.spotify_track_uri.apply(lambda x: x.split(':')[-1])) #get just spotify id from uri
    print(batch_list)
    data = get_batch_audiofeatures(batch_list, start, end)
    # data = get_batch_metadata(batch_list, start, end)

    if not data.empty:
        save_data(data, start, end, 'fetched_audiofeatures')


['7crplFzt7spG80NCbpmuCp', '1tYt8PbpbeTuqsNmprAZYY', '52izjvF7wwveRG1rDJsGWe', '1xQv4unOladJFtLUOy2eb7', '3vv9phIu6Y1vX3jcqaGz5Z', '0r4SsYcwvd8URat6AS2m6f', '6WhzFzROw3aq3rPWjgYlxr', '37k7igIs7IrMVUadwWKHJ6', '6Knv6wdA0luoMUuuoYi2i1', '6WjmPTZKeDylStKmGHLcqQ', '6mNMp0g3zkg1K7uBpn07zl', '6NljjfZDdfVh8x31Dwvj7J', '2MNv9hlHsQUlxG00IKNxUY', '3G2grVPTDrTQskKKsoEnRA', '0PaemlGLiH2O6nAxRCzmee', '7L5jgZtAyfiU7elB8DIqCx', '6splO3UF8hSmK33y5fJTNk', '3FQdtOMXdfDcDDjY1OHAqz', '2Pjazbe7W5svzVZt1sBtb1', '6WLS56xiRrXjSm9RSdL3zD', '2H8LPaY3NSjmOFMH2Rygnz', '0w3Q3VFdrYzo24QUIGnBNy', '1omIb7BrAk7qwRrCSHJTc5', '2OkU57LVQoynnVyLE1LrPN', '4oHmgneU9dwYoqg0SJSOCf', '3NBDgwEAGMj0aKRsU8zoO9', '203t5YqpCFmIdxj8hIotxo', '42JmSZ977by4p0AW0y558P', '5IDra44Yb9g0GHzZCn1vX4', '2QTp2o96FG61yVAxZTHqSV', '5snyhxAh55A2wlNRH7VVZJ', '0z8yrlXSjnI29Rv30RssNI', '73JYmIV6UitbrIDyGb2lWf', '4n5wWTTgnLtb8nVwPWVzgx', '76fM7d7hS1jHSzCGIy69DN', '7tSMnub7dlynTcPGwvjOMP', '7yiAjQfyR6f2AJ8sxlRect', '3PnOOmkxruDDAiMYZZTgO7', '2d29B8frPb