# Spotify EDA: Compiling list of Missing Artist/Tracks URIs

This notebook compiles file(s) containing all URIs that return errors when querying the Spotify API for data cleanup. 

## Setup

In [14]:
import os
import sys
import json
import time
import pickle

from IPython.display import clear_output

from sqlalchemy.ext.declarative import declarative_base 
from sqlalchemy import Table, Column, Integer, String, MetaData, and_, or_, func,distinct

import importlib

from spotify_api import get_spotify_data, get_tracks, get_artists, get_audiofeatures
from spotify_database import get_session, display_time
from spotify_utils import Table_Generator, List_Generator

%env SPOTIFY_CLIENT_ID=aff9ad651d0b4181a541bc73c6852474
%env SPOTIFY_CLIENT_SECRET=258c115477814b09b93b0b04d507e929
%env SPOTIFY_REDIRECT_URI=FinalProjectCS109a://callback

env: SPOTIFY_CLIENT_ID=aff9ad651d0b4181a541bc73c6852474
env: SPOTIFY_CLIENT_SECRET=258c115477814b09b93b0b04d507e929
env: SPOTIFY_REDIRECT_URI=FinalProjectCS109a://callback


In [15]:
# Just a test to see that the credentails are setup
display(os.environ['SPOTIFY_CLIENT_ID'])
display(os.environ['SPOTIFY_CLIENT_SECRET'])
display(os.environ['SPOTIFY_REDIRECT_URI'])

'aff9ad651d0b4181a541bc73c6852474'

'258c115477814b09b93b0b04d507e929'

'FinalProjectCS109a://callback'

In [16]:
# Set database path on local drive
db_path = "../data/spotify_songsv2.db"

# Get sesion
session = get_session(db_path)

# Get Classes
Playlists = getattr(get_session, "Playlists")
Artists = getattr(get_session, "Artists")
Tracks = getattr(get_session, "Tracks")

In [17]:
# test
result = display_time(session.query(Tracks).first)
display(result.track_uri)

result = display_time(session.query(Artists).first)
display(result.artist_uri)

Time to Execute: 0.01 seconds


'spotify:track:2d7LPtieXdIYzf7yHPooWd'

Time to Execute: 0.0 seconds


'spotify:artist:3tBUUrQ26CCErBABEw2EXo'

## Test - Artists

In [18]:
# build table generator
tbl_gen = Table_Generator(query=session.query(Artists), batch_size=50).batch_generator()

Creating Table Generator:
	batch size :  50
	length     :  296014
	num batches:  5921


In [19]:
# get 50 artists, extract URIs
first50_artists = tbl_gen.__next__()[0:50]
first50_artist_uris = [a.artist_uri for a in first50_artists]

first50_artist_uris

HBox(children=(IntProgress(value=0, max=296014), HTML(value='')))

['spotify:artist:3tBUUrQ26CCErBABEw2EXo',
 'spotify:artist:614CRLw0nKYoThHBfYETJ5',
 'spotify:artist:1ZeMjRwmR1Mwcc8OHBhnuo',
 'spotify:artist:29AMdFTZ4Ahs3fFXIq8Dic',
 'spotify:artist:7F0ZmNdg6ofNy2S4gRiIfI',
 'spotify:artist:6ylPKBT7j4mNXtoJeKlBoj',
 'spotify:artist:5Ny8cN3lGmui6Pu5H26Xqj',
 'spotify:artist:3MLPFTe4BrpEV2eOVG0gLK',
 'spotify:artist:6kN7AbeUV4yRewiVUToouk',
 'spotify:artist:1mkThrz3Up9YGWnb3NDyZV',
 'spotify:artist:4WaoY882GlpN7vKtNB25zC',
 'spotify:artist:51S9dRF8a9hrdg4N1PbEWT',
 'spotify:artist:7d779nStwID81AfOrbz7s6',
 'spotify:artist:1stsCjH4vP2aRdIhkxdguc',
 'spotify:artist:2vBXjxCF5BK4tEobAXl10f',
 'spotify:artist:0mu8JFOMnbPRERjQu4nYvX',
 'spotify:artist:4KQBlgVeRGs6D2kG5rdwRQ',
 'spotify:artist:4xW7Z8zF4QZpzPjJUsSw1f',
 'spotify:artist:2t2iHbw9NMohNNDBDAZzOz',
 'spotify:artist:6f1O7MaPkmSMsTpaXsGpLk',
 'spotify:artist:7AutmNQynw0f6nuuKHqNY2',
 'spotify:artist:3lcmzWPTzejqs36ZSSlV0S',
 'spotify:artist:74tKoyDq7xSvvgUXUU1FHG',
 'spotify:artist:4SqkB08U0CNJuLAvx

In [None]:
# use spotify API to get artist info based on above URIs
artist_api_test_response = get_artists(first50_artist_uris)

In [None]:
artist_api_test_response

In [None]:
# change a few URIs in the example to be "bad"
first50_artist_uris[0] = 'spotify:artist:3tBUUrQ26CCErBABEwbad1'
first50_artist_uris[10] = 'spotify:artist:3tBUUrQ26CCErBABEwbad2'
first50_artist_uris[48] = 'spotify:artist:3tBUUrQ26CCErBABEwbad3'

In [None]:
# rerun request
artist_api_test_response = get_artists(first50_artist_uris)

In [None]:
artist_api_test_response

In [None]:
# test - get bad indices
bad_uris_indices = [i for i,v in enumerate(artist_api_test_response) if v == None]

display(bad_uris_indices)

for i in bad_uris_indices:
    print(first50_artist_uris[i])
    
# test - get bad URIs
bad_uris = [first50_artist_uris[i] for i in bad_uris_indices]

bad_uris

In [None]:
# function that takes a chunk of 50 artists and returns a list of bad URIs
def get_bad_artist_uris(artist_uris:list) -> list:
    response = get_artists(artist_uris)
    bad_uris_indices = [i for i,v in enumerate(response) if v == None]
    return [artist_uris[i] for i in bad_uris_indices]

In [None]:
# test function above
display(get_bad_artist_uris(first50_artist_uris))

In [None]:
# test function to process a few batches (adding in some bad apples as we go)
def find_all_bad_artist_uris(tbl_gen, num_iterations=5) -> list:
    bad_artist_uris = []
    
    for i in range(num_iterations):
        batch = tbl_gen.__next__()
        artist_uris = [a.artist_uri for a in batch]
        
        artist_uris[0] = 'spotify:artist:3tBUUrQ26CCErBABEwbad1'
        artist_uris[10] = 'spotify:artist:3tBUUrQ26CCErBABEwbad2'
        artist_uris[48] = 'spotify:artist:3tBUUrQ26CCErBABEwbad3'
        
        l = get_bad_artist_uris(artist_uris)
        bad_artist_uris.extend(l)
    
    return bad_artist_uris

In [None]:
# test function
test_list = find_all_bad_artist_uris(tbl_gen)

In [None]:
test_list

In [None]:
# dump list to pickle
with open('test_pickle.pkl', 'wb') as f:
    pickle.dump(test_list, f)

In [None]:
# test reload of pickle
with open('test_pickle.pkl', 'rb') as f:
    test_open_list = pickle.load(f)

In [None]:
test_open_list

## Run - Artists

In [None]:
# FUNCTIONS TO PARSE DATA (part of this is copied from above to keep it in one place)

# function that takes a chunk of 50 artists and returns a list of bad URIs
def get_bad_artist_uris(artist_uris:list) -> list:
    response = get_artists(artist_uris)
    bad_uris_indices = [i for i,v in enumerate(response) if v == None]
    return [artist_uris[i] for i in bad_uris_indices]

# function that takes a table generator and iterates over it to find bad artist URIs
def find_all_bad_artist_uris(tbl_gen) -> list:
    bad_artist_uris = []
    
    for batch in tbl_gen:
        artist_uris = [a.artist_uri for a in batch]
        l = get_bad_artist_uris(artist_uris)
        bad_artist_uris.extend(l)
    
    return bad_artist_uris

In [None]:
# set up table generator again for full run-through
tbl_gen = Table_Generator(query=session.query(Artists), batch_size=50).batch_generator()

In [None]:
# run API requests to build full list of bad URIs
full_list = find_all_bad_artist_uris(tbl_gen)

In [None]:
# check list
display(len(full_list))
display(full_list)

In [None]:
# dump list to pickle
with open('bad_artist_uris.pkl', 'wb') as f:
    pickle.dump(full_list, f)

In [None]:
# test load
with open('bad_artist_uris.pkl', 'rb') as f:
    test_open_list = pickle.load(f)

In [None]:
test_open_list

#### NOTE: Zero bad artists found.

## Test - tracks

In [None]:
# build table generator
tbl_gen = Table_Generator(query=session.query(Tracks), batch_size=50).batch_generator()

In [None]:
# get 50 artists, extract URIs
first50_tracks = tbl_gen.__next__()[0:50]
first50_track_uris = [a.track_uri for a in first50_tracks]

first50_track_uris

In [None]:
# use spotify API to get artist info based on above URIs
track_api_test_response = get_tracks(first50_track_uris)

In [None]:
track_api_test_response

In [None]:
# change a few URIs in the example to be "bad"
first50_track_uris[0] = 'spotify:track:3tBUUrQ26CCErBABEwbad1'
first50_track_uris[10] = 'spotify:track:3tBUUrQ26CCErBABEwbad2'
first50_track_uris[48] = 'spotify:track:3tBUUrQ26CCErBABEwbad3'

In [None]:
# function that takes a chunk of 50 artists and returns a list of bad URIs
def get_bad_track_uris(track_uris:list) -> list:
    response = get_tracks(track_uris)
    bad_uris_indices = [i for i,v in enumerate(response) if v == None]
    return [track_uris[i] for i in bad_uris_indices]

In [None]:
# test function above
display(get_bad_track_uris(first50_track_uris))

## Run - Tracks

In [5]:
# FUNCTIONS TO PARSE DATA (part of this is copied from above to keep it in one place)

# function that takes a chunk of 50 tracks and returns a list of bad URIs
def get_bad_track_uris(track_uris:list) -> list:
    response = get_tracks(track_uris)
    bad_uris_indices = [i for i,v in enumerate(response) if v == None]
    return [track_uris[i] for i in bad_uris_indices]

# function that takes a table generator and iterates over it to find bad artist URIs
def find_all_bad_track_uris(tbl_gen) -> list:
    bad_track_uris = []
    
    for batch in tbl_gen:
        track_uris = [t.track_uri for t in batch]
        l = get_bad_track_uris(track_uris)
        bad_track_uris.extend(l)
    
    return bad_track_uris

In [6]:
# build table generator
tbl_gen = Table_Generator(query=session.query(Tracks), batch_size=50).batch_generator()

Creating Table Generator:
	batch size :  50
	length     :  2262181
	num batches:  45244


In [7]:
# run API requests to build full list of bad URIs
full_list = find_all_bad_track_uris(tbl_gen)

HBox(children=(IntProgress(value=0, max=2262181), HTML(value='')))

Setting credentials
token():INFO:   Getting initial token
token():INFO:   Token refreshed
token():INFO:   Token refreshed
token():INFO:   Token refreshed
token():INFO:   Token refreshed
token():INFO:   Token refreshed
token():INFO:   Token refreshed
token():INFO:   Token refreshed



In [8]:
len(full_list)

584

In [9]:
# dump list to pickle
with open('bad_track_uris.pkl', 'wb') as f:
    pickle.dump(full_list, f)

In [3]:
# test load
with open('bad_track_uris.pkl', 'rb') as f:
    test_open_list = pickle.load(f)

In [4]:
test_open_list

['spotify:track:1pTy8JZSD7UTU5Xg4Cs8VX',
 'spotify:track:05iPvBxNaTbMi1twl3rnTJ',
 'spotify:track:2nOyWmvoS9AffQFUc9I7v0',
 'spotify:track:11NIehXWtiykABhxSCRwFI',
 'spotify:track:7sRmheYLtEzJtc2MeVbT6F',
 'spotify:track:4WHjf37BBXUo3WYBmJPdoU',
 'spotify:track:0KiU8DYL1oUZP0MrEHC5CS',
 'spotify:track:0jcNaEKRJSwdW1Oz7LvuG7',
 'spotify:track:5nxkWaHFSqk1ep5wS3WelK',
 'spotify:track:2rGNgMHMn9NvjeoUCX2wc4',
 'spotify:track:0crHjy8Lrvkz7oPPRq1L6E',
 'spotify:track:5wWSOTMSqYUGFJ5Fl599LI',
 'spotify:track:1SuyTHBUdlsDJeZ77viTVY',
 'spotify:track:18JnPuzORjaFXgdcVJWjjI',
 'spotify:track:5mbnHTT20qMKEbj31NmpHF',
 'spotify:track:6cePxNkX02pGoQHhiVohO1',
 'spotify:track:5WJ2L1bDMN2mXiwyQaXUz0',
 'spotify:track:0zA7okPLt66GVf2ilDN9Eq',
 'spotify:track:3o9RCvd6Vg8wD8b9SLBhDr',
 'spotify:track:35yeDq2AngBhej5A3qNnpX',
 'spotify:track:6UnswPYevOy5rxgZPslKGn',
 'spotify:track:14FXVh0pq4KjGzzSsHfSoy',
 'spotify:track:3naqPs5OCvHpl6c3ViHIqu',
 'spotify:track:4BivwSVjlIZiRIPvxBk5s7',
 'spotify:track:

In [11]:
# test that these are in fact missing
N_URIS = len(test_open_list)
BATCH_SZ = 50

i = 0
while i < N_URIS:
    sz = min(BATCH_SZ, N_URIS-i)
    response = get_tracks(test_open_list[i:i+sz])
    print(response)
    i += BATCH_SZ

[None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None]
[None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None]
[None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None]
[None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, 

## Artist URI mismatches

Rather than looking for missing URIs (which there are none) this modification to the earlier code creates a dictionary mapping the URIs in the DB to the actual URI returned by spotify.

In [32]:
# function that takes a table generator and iterates over it to find mismatched artist URIs
# returns a dict mapping current URIs to the returned (from API) URIs
def find_artist_mismatch_uris(tbl_gen) -> dict:
    mismatch_uris = {}
    
    for batch in tbl_gen:
        artist_uris = [a.artist_uri for a in batch]
        response = get_artists(artist_uris)
        
        for i in range(len(response)):
            if response[i]['uri'] != artist_uris[i]:
                mismatch_uris[artist_uris[i]] = response[i]['uri']
                
    return mismatch_uris

In [33]:
# set up table generator again for full run-through
tbl_gen = Table_Generator(query=session.query(Artists), batch_size=50).batch_generator()

Creating Table Generator:
	batch size :  50
	length     :  296014
	num batches:  5921


In [34]:
mismatched_dict = find_artist_mismatch_uris(tbl_gen)

HBox(children=(IntProgress(value=0, max=296014), HTML(value='')))

In [35]:
len(mismatched_dict)

17

In [36]:
mismatched_dict

{'spotify:artist:6U7UttPmVs321tWQZnQoEt': 'spotify:artist:1vc9BdsxN00MpgIY1lifuB',
 'spotify:artist:0cVSr2fyCKZhweKdgDtzpv': 'spotify:artist:2URUumnB8mguimUYWej7Vh',
 'spotify:artist:2X559F6FePOMHWNjY926ja': 'spotify:artist:5JZYv0z6YzsztblsEpst8g',
 'spotify:artist:2t4d35GMIVX8cQFYbP89j8': 'spotify:artist:1XujSdsxykPhP3dn6HaT4l',
 'spotify:artist:3S4Y2OLp04uG7EgeLUuP5v': 'spotify:artist:6ISSzeT1ACKbGLO0JBZ23r',
 'spotify:artist:3nWDbRY3mWD7ObzRUva5Tf': 'spotify:artist:1XvC09ZYvWgsa5pXROeT9Y',
 'spotify:artist:3qTW0weIxjRdGSEKMbGRNZ': 'spotify:artist:4rv0B6KyCBBpV1xcNmqmCM',
 'spotify:artist:3ts3Txswf4tjfQIY2nC43m': 'spotify:artist:6fTEkjuuZ3hH6fdeBxxbA2',
 'spotify:artist:40dWpjd3A0HVYI2Y7ReobD': 'spotify:artist:6GqL4bwmVbFVhvupr8HF4u',
 'spotify:artist:4WIIabZbvghOasFdVWyl68': 'spotify:artist:1vc9BdsxN00MpgIY1lifuB',
 'spotify:artist:4mFx0DNvB2QGSC2XfEidWA': 'spotify:artist:2R6hhNDQT8vBWQgCCmDnK0',
 'spotify:artist:6Qix2PtaC8xSwi0cbepQc8': 'spotify:artist:2UocIcNiHj5n4tj1CnBzRq',
 'sp

In [37]:
test = get_artists(['spotify:artist:6U7UttPmVs321tWQZnQoEt'])
test

[{'external_urls': {'spotify': 'https://open.spotify.com/artist/1vc9BdsxN00MpgIY1lifuB'},
  'followers': {'href': None, 'total': 265},
  'genres': ['reggae', 'roots reggae'],
  'href': 'https://api.spotify.com/v1/artists/1vc9BdsxN00MpgIY1lifuB',
  'id': '1vc9BdsxN00MpgIY1lifuB',
  'images': [{'height': 640,
    'url': 'https://i.scdn.co/image/ab67616d0000b273adbe12caba8da1342c74d896',
    'width': 640},
   {'height': 300,
    'url': 'https://i.scdn.co/image/ab67616d00001e02adbe12caba8da1342c74d896',
    'width': 300},
   {'height': 64,
    'url': 'https://i.scdn.co/image/ab67616d00004851adbe12caba8da1342c74d896',
    'width': 64}],
  'name': 'Ras Michael and The Sons Of Negus',
  'popularity': 22,
  'type': 'artist',
  'uri': 'spotify:artist:1vc9BdsxN00MpgIY1lifuB'}]

In [38]:
test2 = get_artists(['spotify:artist:3MLPFTe4BrpEV2eOVG0gLK'])
test2

[{'external_urls': {'spotify': 'https://open.spotify.com/artist/3MLPFTe4BrpEV2eOVG0gLK'},
  'followers': {'href': None, 'total': 14362},
  'genres': ['fado', 'morna'],
  'href': 'https://api.spotify.com/v1/artists/3MLPFTe4BrpEV2eOVG0gLK',
  'id': '3MLPFTe4BrpEV2eOVG0gLK',
  'images': [{'height': 640,
    'url': 'https://i.scdn.co/image/d4856c118db23a50247d798a3dfa4e92001c120a',
    'width': 640},
   {'height': 320,
    'url': 'https://i.scdn.co/image/3c8c083b6fef768189b335849047781407be2c60',
    'width': 320},
   {'height': 160,
    'url': 'https://i.scdn.co/image/c51fced09660babac3ac564cb11e3377db0a3385',
    'width': 160}],
  'name': 'Camane',
  'popularity': 37,
  'type': 'artist',
  'uri': 'spotify:artist:3MLPFTe4BrpEV2eOVG0gLK'}]

In [39]:
# dump dict to pickle
with open('mismatched_artist_uris.pkl', 'wb') as f:
    pickle.dump(mismatched_dict, f)

In [40]:
# test
with open('mismatched_artist_uris.pkl', 'rb') as f:
    test_open_dict = pickle.load(f)

In [41]:
test_open_dict

{'spotify:artist:6U7UttPmVs321tWQZnQoEt': 'spotify:artist:1vc9BdsxN00MpgIY1lifuB',
 'spotify:artist:0cVSr2fyCKZhweKdgDtzpv': 'spotify:artist:2URUumnB8mguimUYWej7Vh',
 'spotify:artist:2X559F6FePOMHWNjY926ja': 'spotify:artist:5JZYv0z6YzsztblsEpst8g',
 'spotify:artist:2t4d35GMIVX8cQFYbP89j8': 'spotify:artist:1XujSdsxykPhP3dn6HaT4l',
 'spotify:artist:3S4Y2OLp04uG7EgeLUuP5v': 'spotify:artist:6ISSzeT1ACKbGLO0JBZ23r',
 'spotify:artist:3nWDbRY3mWD7ObzRUva5Tf': 'spotify:artist:1XvC09ZYvWgsa5pXROeT9Y',
 'spotify:artist:3qTW0weIxjRdGSEKMbGRNZ': 'spotify:artist:4rv0B6KyCBBpV1xcNmqmCM',
 'spotify:artist:3ts3Txswf4tjfQIY2nC43m': 'spotify:artist:6fTEkjuuZ3hH6fdeBxxbA2',
 'spotify:artist:40dWpjd3A0HVYI2Y7ReobD': 'spotify:artist:6GqL4bwmVbFVhvupr8HF4u',
 'spotify:artist:4WIIabZbvghOasFdVWyl68': 'spotify:artist:1vc9BdsxN00MpgIY1lifuB',
 'spotify:artist:4mFx0DNvB2QGSC2XfEidWA': 'spotify:artist:2R6hhNDQT8vBWQgCCmDnK0',
 'spotify:artist:6Qix2PtaC8xSwi0cbepQc8': 'spotify:artist:2UocIcNiHj5n4tj1CnBzRq',
 'sp