Using the artists schema, this script iterates over the artist IDs, fetches the list of song IDs for each artist, and appends every resulting (artistID, songID) pair to the song_ids.txt file.

In [1]:
# functions _get and get_artist_songs() in this code 
# are courtesy of GitHub user imdkm:
# https://gist.github.com/imdkm/a60247b59ff1881fa4bb8846a9b44c96

In [2]:
import requests, json
from time import sleep
import sys
import pandas as pd
import os

In [3]:
# Secret token
# Read it inside a `with` block so the file handle is closed, and strip
# surrounding whitespace so a trailing newline in the file does not end up
# inside the Authorization header built from this value.
with open("ACCESS_TOKEN.txt", "r") as token_file:
    token = token_file.read().strip()

In [4]:
# constant values.
BASE_URL = "https://api.genius.com"  # root URL that request paths are appended to
CLIENT_ACCESS_TOKEN = token  # sent as the Bearer token in the Authorization header
QUERY_SIZE = 200  # NOTE(review): not referenced anywhere in the visible code - confirm it is still needed

In [5]:
# send request and get response in json format.
def _get(path, params=None, headers=None, timeout=30):
    """Send an authenticated GET request and return the decoded JSON body.

    path    -- endpoint path, joined to BASE_URL with a single '/'.
    params  -- optional dict of query-string parameters.
    headers -- optional dict of extra request headers. It is copied, never
               modified; the Authorization header is set on the copy.
    timeout -- seconds to wait on the server before requests gives up, so a
               stalled connection cannot block the run indefinitely.

    Raises requests.HTTPError (through raise_for_status) when the server
    answers with a 4xx/5xx status.
    """
    # generate request URL
    requrl = '/'.join([BASE_URL, path])

    # work on a copy so the caller's dict is left untouched
    request_headers = dict(headers) if headers else {}
    request_headers['Authorization'] = "Bearer {}".format(CLIENT_ACCESS_TOKEN)

    response = requests.get(url=requrl, params=params,
                            headers=request_headers, timeout=timeout)
    response.raise_for_status()

    return response.json()

def get_artist_songs(artist_id):
    """Return the ids of all songs whose primary artist is `artist_id`.

    Requests artists/<artist_id>/songs/ page by page, starting at page 1,
    and stops at the first page whose song list is empty. Songs on which
    the artist is not the primary artist are dropped from the result.
    """
    endpoint = "artists/{}/songs/".format(artist_id)
    collected = []
    page_number = 1

    while True:
        payload = _get(path=endpoint, params={'page': page_number})
        batch = payload['response']['songs']

        # an empty page means there is nothing left to fetch
        if not batch:
            break

        collected.extend(batch)
        page_number += 1

    # keep only the ids of songs where this artist is the primary artist
    return [entry["id"] for entry in collected
            if entry["primary_artist"]["id"] == artist_id]

In [6]:
def get_from(artist_id):
    """Return the lines of artists.txt that follow the entry for `artist_id`.

    Each line's id is its first comma-separated field, compared as a string
    after stripping surrounding whitespace, so a line that holds only an id
    (and therefore ends in a newline) still matches. The first matching line
    wins. Returned lines are unmodified and keep their trailing newline.

    Returns an empty list when `artist_id` is the last entry or is not
    present in the file at all.
    """
    target = str(artist_id)
    with open("artists.txt", "r") as f:
        lines = f.readlines()

    for idx, line in enumerate(lines):
        if line.strip().split(',')[0] == target:
            return lines[idx + 1:]
    return []

In [23]:
# read in the list of artist ids that still have to be queried
#
# If song_ids.txt already holds results, resume after the artist on its
# last non-blank line; otherwise start from the top of artists.txt. Plain
# Python file reads are used instead of the `!wc` / `!tail` shell magics so
# the cell does not depend on IPython or on Unix tools.

import os.path

last_artist = None
if os.path.isfile('song_ids.txt'):
    with open('song_ids.txt', 'r') as results_file:
        for row in results_file:
            if row.strip():
                last_artist = row.strip().split(',')[0]

if last_artist is not None:
    artists = get_from(last_artist)
else:
    # nothing recorded yet: every artist is still pending
    with open('artists.txt', 'r') as artists_file:
        artists = artists_file.readlines()

# first comma-separated field of each line is the artist id; skip blank
# lines and the literal placeholder 'None'
artist_ids = []
for line in artists:
    first_field = line.strip().split(',')[0]
    if first_field and first_field != 'None':
        artist_ids.append(first_field)

print(str(len(artists))+" artists total")

0 artists total


In [24]:
# collect the ids of all artists that already have rows in song_ids.txt
artist_set = set()
try:
    with open("song_ids.txt", "r") as f:
        for line in f:
            # only the first field is needed; split once so blank lines or
            # rows with extra commas cannot raise on tuple unpacking
            artist_id = line.strip().split(',', 1)[0]
            if artist_id:
                artist_set.add(artist_id)
except FileNotFoundError:
    # first run: no results file yet, so nothing has been queried
    pass

print("done reading")

done reading


In [25]:
# number of distinct artist ids currently held in artist_set
len(artist_set)

1678

In [22]:
# query the artists that are still pending; narrow the slice below to
# process only a subset per run and go easier on the server

artists_chunk = artist_ids[0:]

# populate song ids using artist ids
for artist_id in artists_chunk:
    print(artist_id)

    # check if we've already queried this artist:
    if artist_id in artist_set:
        print(artist_id + " already queried, skipping")
        continue
    artist_set.add(artist_id)

    # get all song ids and make a list.
    # [None] is the fallback: a failed lookup is still recorded as a single
    # "<artist_id>,None" row in song_ids.txt
    song_ids = [None]
    try:
        song_ids = get_artist_songs(int(artist_id))
        print("-> " + str(len(song_ids))+" results")
    except Exception as err:
        # Exception rather than a bare except, so Ctrl-C (KeyboardInterrupt)
        # and SystemExit stop the run instead of being logged as a miss;
        # the error is printed because not every failure is a missing artist
        print("NOT FOUND: {!r}".format(err))

    # append after every artist so an interrupted run keeps its progress
    with open("song_ids.txt", "a") as f:
        f.writelines("{},{}\n".format(artist_id, song_id)
                     for song_id in song_ids)

    # pause between artists to limit the request rate
    sleep(.5)

# audible completion notice; relies on a `say` command being available
# (macOS ships one) - TODO confirm for other platforms
os.system('say "Done"')

329461
-> 2 results
350433
-> 14 results
4749
-> 186 results
6060
-> 1 results
507
507 already queried, skipping
365765
-> 25 results
338591
338591 already queried, skipping
36594
-> 190 results
368665
-> 6 results
2548
-> 50 results
15740
15740 already queried, skipping
26507
26507 already queried, skipping
737
-> 114 results
30242
-> 257 results
209879
-> 1 results
2957
2957 already queried, skipping
251397
-> 184 results
1042428
-> 45 results
368037
-> 38 results
59441
-> 106 results
552422
-> 36 results
332633
-> 60 results
358379
-> 117 results
1679
-> 67 results
66456
-> 13 results
4
4 already queried, skipping
11195
-> 435 results
511335
-> 11 results
59805
-> 120 results
482683
-> 2 results
344478
-> 167 results
39192
-> 2 results
353342
-> 115 results
28089
-> 37 results
157840
-> 78 results
16965
-> 254 results
351663
-> 108 results
372112
-> 22 results
152920
-> 192 results
100951
-> 374 results
48147
48147 already queried, skipping
1766
-> 6 results
385577
-> 2 results
1128

0

In [28]:
# load the (artist, song) id pairs; column names are supplied here
songids = pd.read_csv('song_ids.txt', names=['artist', 'song'], sep=',')

In [34]:
# keep a single row per song id (the first occurrence is retained)
sids = songids.drop_duplicates(subset=['song'], keep='first')

In [37]:
# write the de-duplicated pairs to song_ids_unique.csv, omitting the DataFrame index
sids.to_csv('song_ids_unique.csv', index=False)