In [1]:
import requests as rq
from lxml import html

def xpath_query_url(url, xpath_query, payload=dict(), params=dict()):
    '''Gets url and performs an XPath Query'''
    headers = {'User-Agent': 'Mozilla/5.0 Gecko/20100101 Firefox/35.0'}
    if payload:
        headers.update(payload)
    try:
        response = rq.get(url, headers=headers, params=params)
        #creates an html tree from the data
        tree = html.fromstring(response.text)
        #XPATH query to grab all of the artist urls, then we grab the first
        return tree.xpath(xpath_query)
    except Exception as e:
        print(e)
        return ''

In [10]:
from re import sub
def fetch_artist_names(random_sample=None):
    '''Fetches top artists from wikipedia'''
    base = 'https://en.wikipedia.org/wiki/List_of_hip_hop_musicians'
    query = '//li/a/@title'
    results = xpath_query_url(base, query)

    if results:
        results = [sub(' \([\w ]+\)', '', a) for a in results]

    return results

wiki_artists = fetch_artist_names()
wiki_artists += ['2Pac', 'Joey BADA$$']
wiki_artist_count = len(wiki_artists)
print('Scraped {} potential artists'.format(wiki_artist_count))

Scraped 1176 potential artists


In [7]:
import gevent
from gevent.pool import Pool
from gevent import monkey
from functools import partial
import signal

gevent.monkey.patch_all(thread=False)
gevent.signal(signal.SIGQUIT, gevent.kill)

def grab_artist_links(artist):
    base = 'http://genius.com/search/artists'
    params = {'q': artist}
    xpath = '//li/a[@class="artist_link"]/@href'
    artist_links = partial(xpath_query_url, 
                           xpath_query=xpath, 
                           url=base)
    return artist, artist_links(params=params)

artist_link_lists = Pool(100).map(grab_artist_links, wiki_artists)
# filters out artists we couldn't find links for
# as well as false positives that aren't even artists 
artist_link_lists = [alist for alist in artist_link_lists if alist[1]]
# grabs just the first link from each list
artist_link_lists = [(alist[0], alist[1][0]) for alist in artist_link_lists if alist[1]]

In [12]:
def grab_artist_id(args):
    artist, artist_url = args
    xpath = '//meta[@name="newrelic-resource-path"]/@content'
    artist_id_raw = xpath_query_url(artist_url, xpath)
    
    if artist_id_raw:
        delisted = artist_id_raw[0]
        if delisted and 'artists/' in delisted:
            artist_id = delisted.split('artists/')[1]
            return artist, artist_id
    return artist, None

artist_ids = Pool(100).map(grab_artist_id, artist_link_lists)
# filters out missing entries that couldn't be resolved
artist_ids_filtered = [a for a in artist_ids if all(a)]

filtered_artist_count = len(artist_ids_filtered)
artist_count_diff = wiki_artist_count - filtered_artist_count
feedback = 'Found {} artist_ids, filtered out {} negative results'
print(feedback.format(filtered_artist_count, artist_count_diff))

Found 1108 artist_ids, filtered out 68 negative results


In [16]:
class Genius:
    def __init__(self):
        token = 'nmovpnwjelwjWJG_jjQHu-Emfhsv0WX0iv_WZxz5LRpzjjAezDXMhW56jhzwC634'
        headers = {'Authorization': 'Bearer {}'.format(token)}
        self._sesh = rq.Session()
        self._sesh.headers.update(headers)
        self._base = 'http://api.genius.com'
        
    def _key_response(self, response, subkey):
        r = response.get('response', None)
        if r and subkey in r:
            return r[subkey]
        return {}

    def get_artist(self, artist_id):
        url = '{}/artists/{}'.format(self._base, artist_id)
        response = self._sesh.get(url, params={'text_format': 'plain'}).json()
        return self._key_response(response, 'artist')
    
    def get_artist_songs(self, artist_id, page, sort='title', per_page=50):
        url = '{}/artists/{}/songs'.format(self._base, artist_id)
        params = {'sort': sort, 'per_page': per_page, 'page': page}
        response = self._sesh.get(url, params=params).json()
        return response.get('response', None)
    
    def get_song(self, song_id):
        url = '{}/songs/{}'.format(self._base, song_id)
        response = self._sesh.get(url, params={'text_format': 'plain'}).json()
        return self._key_response(response, 'song')

In [18]:
api = Genius()
api.get_artist_songs(28906, 1)

{'next_page': 2,
 'songs': [{'annotation_count': 11,
   'api_path': '/songs/716144',
   'full_title': 'Adore by\xa0Cashmere\xa0Cat (Ft.\xa0Ariana\xa0Grande)',
   'header_image_thumbnail_url': 'https://images.genius.com/e7cd910c669142c161e494dae0c161fe.300x300x1.png',
   'header_image_url': 'https://images.genius.com/e7cd910c669142c161e494dae0c161fe.1000x1000x1.png',
   'id': 716144,
   'lyrics_owner_id': 104344,
   'path': '/Cashmere-cat-adore-lyrics',
   'primary_artist': {'api_path': '/artists/49698',
    'header_image_url': 'https://images.genius.com/30a2628f1de2b58dccd193e87f051415.1000x668x1.jpg',
    'id': 49698,
    'image_url': 'https://images.genius.com/36a655defd25e908cc0cba4aa197db24.1000x1000x1.jpg',
    'is_meme_verified': False,
    'is_verified': False,
    'name': 'Cashmere Cat',
    'url': 'http://genius.com/artists/Cashmere-cat'},
   'pyongs_count': 33,
   'stats': {'hot': False, 'pageviews': 22401, 'unreviewed_annotations': 0},
   'title': 'Adore',
   'url': 'http://

In [19]:
artist_ids_stripped = [a[1] for a in artist_ids_filtered]
artist_info = Pool(100).map(api.get_artist, artist_ids_stripped)
artist_info[9]

{'alternate_names': ["Curtis '50 Cent' Jackson", 'Curtis Jackson'],
 'api_path': '/artists/108',
 'current_user_metadata': {'interactions': {'following': False},
  'permissions': ['view_activity_stream']},
 'description': {'plain': "Curtis James Jackson III (born July 6, 1975), better known by his stage name 50 Cent, is an American rapper, entrepreneur, investor, record producer, and actor.\n\nAfter recording a never-released album with Columbia, 50 got on his grind and rose to fame via a string of well-received mixtapes, thus setting the template for rap stardom in the modern era.\n\n50’s crowning success came in 2003 with the release of the album Get Rich or Die Tryin' which sold 872,000 copies in four days and was certified 8x platinum by the RIAA. Since that monster hit, he has continued to make music while also delving into movies, sports drinks, video games, mass charity campaigns and more."},
 'description_annotation': {'_type': 'referent',
  'annotatable': {'api_path': '/artist

In [None]:
from gevent.queue import JoinableQueue
import os 
import msgpack
from datetime import datetime

q_artists = JoinableQueue()
q_songs = JoinableQueue()
q_serial = JoinableQueue()
    
def artist_worker():
    while True:
        artist_id, page_n = q_artists.get()
        
        try:
            song_page = api.get_artist_songs(artist_id, page_n)

            for song in song_page['songs']:
                q_songs.put(song['id'])
            
            next_page = song_page['next_page']
            if next_page:
                q_artists.put((artist_id, next_page))
        except Exception as e:
            print('fucked up grabbing artist songs')
            print(e)
        finally:
            q_artists.task_done()

def song_worker():
    while True:
        song_id = q_songs.get()
        
        try:
            song_details = api.get_song(song_id)
            gevent.sleep(0)
            song_lyrics = xpath_query_url(song_details['url'], '//lyrics//text()')
            gevent.sleep(0)
            song_lyrics_clean = [l.replace('\n','') for l in lyrics \
                                 if len(l.strip()) > 1 and 'googletag' not in l]
            song_details['lyrics'] = song_lyrics_clean
            q_serial.put(song_details)
        except Exception as e:
            print('fucked up grabbing song')
            print(e)
        finally:
            q_songs.task_done()

def serial_worker():
    while True:
        song = q_serial.get()        
        try:       
            artist = song['primary_artist']['url'].split('/')[-1]
            artist_path = 'lyrics/{}'.format(artist)
            
            if not os.path.isdir(artist_path):
                os.makedirs(artist_path)     
                
            filepath = '{}/{}'.format(artist_path, song['path'])
            
            with open(filepath, 'wb') as f:
                msgpack.pack(song, f)
        except Exception as e:
            print('fucked up serializing song')
            print(e)
        finally:
            q_serial.task_done()

for artist_id in artist_ids_stripped:
    q_artists.put((artist_id, 1))
    
artist_workers = []
for _ in range(50):
    artist_workers.append(gevent.spawn(artist_worker))
    
song_workers = []
for _ in range(50):
    song_workers.append(gevent.spawn(song_worker))
    
serial_workers = []
for _ in range(25):
    serial_workers.append(gevent.spawn(serial_worker))
t1 = datetime.now()
q_artists.join()
q_songs.join()
q_serial.join()
gevent.killall(artist_workers)
gevent.killall(song_workers)
gevent.killall(serial_workers)
print('Took {}'.format(datetime.now() - t1))

fucked up grabbing artist songs
'NoneType' object is not subscriptable


In [81]:
def consolidate_lyrics(q):
    '''Consolidates all of the lyric files by their respective
    primary artist.'''
    while True:
        artist = q.get()
        try:
            songs = {}
            artist_path = 'lyrics/{}'.format(artist)
            for song_name in os.listdir(artist_path):
                song_path = '{}/{}'.format(artist_path, song_name)
                with open(song_path, 'rb') as s_file:
                    song_details = msgpack.unpack(s_file)
                    songs[song_name] = song_details

            if not os.path.isdir('lyrics_consolidated'):
                os.makedirs('lyrics_consolidated')

            with open('lyrics_consolidated/{}.mpk'.format(artist), 'wb') as a_file:
                msgpack.pack(songs, a_file)
        except Exception as e:
            print(artist, artist_path, song_name, song_path)
        finally:
            q.task_done()

q_artists = JoinableQueue()
for artist in os.listdir('lyrics'):
    q_artists.put(artist)

consolidate_workers = []
for _ in range (10):
    work = gevent.spawn(consolidate_lyrics, q_artists)
    consolidate_workers.append(work)
    
q_artists.join()
gevent.killall(consolidate_workers)


In [None]:
for artist in os.listdir('lyrics'):
    file = os.listdir('lyrics/{}'.format())
