First, we install the lyricsgenius package, a Python client for the Genius API:

In [1]:
!pip install lyricsgenius



We import the libraries and set a path template for our input files of artists (one CSV file per page):

In [58]:
from lyricsgenius import Genius
import json
import csv
import multiprocessing
import queue
import logging

# OS agnostic
import os 
# Path template for the paginated artist CSV files; the %s placeholder is
# filled in with the page number by get_artists().
CSV_PATH = os.path.join(os.path.curdir, 'artists', '10000-MTV-Music-Artists-page-%s.csv')

Now we import the artist dataset into an array. This consists of the top 10,000 artists listed on MTV:

8344


We set up a lyricsgenius token, and use the API to pull the lyrics data for each artist in the dataset:

In [None]:
# Genius setup
# SECURITY NOTE(review): an API token is committed in this file. Prefer
# supplying it through the GENIUS_ACCESS_TOKEN environment variable; the
# literal below is kept only as a fallback so existing runs keep working,
# and it should be revoked and removed.
token = os.environ.get("GENIUS_ACCESS_TOKEN") or (
    "EBufquOcw_ts4Y4V7yiddUNyUakTdqCpnMZhiI3XtAScWOntEom8Hj4T87gAV_cA"
)
genius = Genius(token, retries=2)

genius.verbose = False
genius.remove_section_headers = True
genius.skip_non_songs = True
genius.excluded_terms = ["(Remix)", "(Live)"]

# Multiprocessing: two worker processes per reported CPU
process_number = multiprocessing.cpu_count() * 2
multiprocessing.log_to_stderr().setLevel(logging.DEBUG)

# Data management
# Manager-backed list, so rows appended by worker processes are visible to
# the parent process that writes the CSV.
final_ = multiprocessing.Manager().list()

# Artist names that get_artists() must not enqueue again.
checked_artists = set()

# Pull out artists
def get_artists(queue):
    """Enqueue every not-yet-processed artist name from the paginated CSVs.

    Reads pages 1 through 4 of ``CSV_PATH``, skips each file's header row and
    puts the first column of every remaining row onto *queue*, unless that
    name is already in ``checked_artists``.

    Args:
        queue: Any object with a ``put(item)`` method. The parameter name
            shadows the ``queue`` module inside this function only.

    Raises:
        FileNotFoundError: If one of the page files does not exist.
    """
    for page in range(1, 5):
        path = CSV_PATH % str(page)
        # newline="" lets the csv module handle line endings itself, so
        # quoted fields containing line breaks are parsed correctly.
        with open(path, encoding="UTF-8", newline="") as csvfile:
            rows = csv.reader(csvfile)

            # Skip header; the default keeps an empty file from raising
            # StopIteration.
            next(rows, None)
            for row in rows:
                # Blank lines come back as empty lists: no artist column.
                if not row:
                    continue
                artist = row[0]
                # Skip artists whose data we already have.
                if artist not in checked_artists:
                    queue.put(artist)
                      


# File management
def write_to_csv(data, columns=("artist", "song", "data"), file_name="song_data.csv"):
    """Write song rows to ``./data/<file_name>`` as CSV with a header row.

    Args:
        data: Iterable of dictionaries keyed by *columns*
            ({artist, song, data}).
        columns: Field names, in output order.
        file_name: Name of the file created inside the ``data`` directory;
            an existing file is overwritten.

    Raises:
        ValueError: If a row contains a key that is not in *columns*
            (raised by ``csv.DictWriter``).
    """
    data_dir = os.path.join(os.path.curdir, 'data')
    # The output directory may not exist yet on a first run.
    os.makedirs(data_dir, exist_ok=True)

    csv_path = os.path.join(data_dir, file_name)
    # newline='' stops the csv module's own line endings from being
    # translated a second time (which produces blank rows on Windows).
    with open(csv_path, 'w', newline='') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=columns)
        writer.writeheader()
        writer.writerows(data)
        

def read_csv(file_name="song_lyrics.csv"):
    """Load previously saved rows so already-scraped artists can be skipped.

    Every row of ``./data/<file_name>`` is appended to the shared ``final_``
    list and its ``artist`` value is recorded in ``checked_artists``, which
    get_artists() consults before enqueueing a name. A missing file is
    treated as "no previous data".

    NOTE(review): this default name differs from write_to_csv's default
    ("song_data.csv"), so with both defaults a previous run's output is not
    picked up -- confirm which file name is intended.

    Args:
        file_name: Name of the CSV file inside the ``data`` directory.

    Raises:
        KeyError: If the file has no ``artist`` column.
    """
    csv_path = os.path.join(os.path.curdir, 'data', file_name)

    try:
        # newline='' is what the csv module expects for correct parsing of
        # quoted fields that contain line breaks.
        with open(csv_path, mode='r', newline='') as file:
            for entry in csv.DictReader(file):
                # Record the artist so it is not searched again; previously
                # the names were collected into a local set and discarded.
                checked_artists.add(entry["artist"])
                final_.append(entry)
    except FileNotFoundError:
        # First run: nothing to resume from.
        pass
    
# Processing
def clean_data(data):
    """Return *data* with every newline and every comma replaced by a space."""
    flattened = data
    for separator in ("\n", ","):
        flattened = flattened.replace(separator, " ")
    return flattened

def process_artist(artist):
    """Placeholder for artist-level processing.

    Calls ``artist.to_dict()`` but does not use the result, and always
    returns an empty string.
    """
    artist.to_dict()
    return ""

def process_song(song):
    """Return ``song.lyrics`` after it has been passed through clean_data()."""
    return clean_data(song.lyrics)
                           
def build_entry(artist, song, data, columns=("artist", "song", "data")):
    """Build one output row as a dictionary.

    Args:
        artist: Value for the first column.
        song: Value for the second column.
        data: Value for the third column.
        columns: The three column names, in (artist, song, data) order.

    Returns:
        A dict mapping each name in *columns* to the corresponding value.
        The row is returned as a plain dict (not wrapped in a list) so it
        can be written directly by ``csv.DictWriter``.
    """
    return dict(zip(columns, (artist, song, data)))
    

# Run genius search
def search_genius(args):
    """Worker loop: take artists off a queue and collect their song lyrics.

    For every artist name read from the queue, the artist is looked up with
    the module-level ``genius`` client and one row per song is appended to
    the shared ``final_`` list. The loop ends when ``None`` is read.

    Args:
        args: ``(artist_queue, num)`` tuple. ``artist_queue`` yields artist
            names and a ``None`` stop sentinel; ``num`` is a worker number
            used only in log output.

    Returns:
        None.
    """
    artist_queue, num = args
    print("[{num}] Starting\n".format(num=num), end='')

    while True:
        # Deliberately outside the try below: a failing queue cannot be
        # retried, and swallowing the error here would loop forever.
        artist = artist_queue.get()
        if artist is None:
            print("[{num}] Done\n".format(num=num), end='')
            return

        name = artist.strip()
        print("[{num}] Searching {artist}\n".format(num=num, artist=name), end='')

        # Failures are handled per artist so that one bad lookup does not
        # end this worker and strand the remaining queue entries.
        try:
            # Pull data for artist from genius
            genius_artist = genius.search_artist(artist, per_page=50, get_full_info=False)
            if genius_artist is None:
                print("[{num}] {artist} not found\n".format(num=num, artist=name), end='')
                continue

            # Placeholder hook; its result is currently unused.
            process_artist(genius_artist)

            print("[{num}] {artist} number of songs: {song_num}\n".format(
                num=num, artist=name, song_num=len(genius_artist.songs)), end='')

            for song in genius_artist.songs:
                # Store the song title, not the Song object, in the row.
                final_.append(build_entry(artist, song.title, process_song(song)))

        except Exception as e:
            print("[{num}] Something went wrong: {error}\n".format(num=num, error=e), end='')
    
    
def run(multi_core=False):
    """Scrape lyrics for all queued artists and write the results to CSV.

    Previously saved rows are loaded first, the artist queue is filled, and
    search_genius() is run either in a process pool or in this process.
    Whatever has been collected in ``final_`` is written out at the end,
    including after a KeyboardInterrupt or an error.

    Args:
        multi_core: When true, use a pool of ``process_number`` worker
            processes; otherwise run a single worker in this process.
    """
    # Load in any previous data
    print("Reading previous")
    read_csv()

    pool = None
    try:
        if multi_core:
            print("Multiprocessing with {process_number} processes".format(process_number=process_number))

            artist_queue = multiprocessing.Manager().Queue()
            get_artists(artist_queue)

            # One stop sentinel per worker so every worker terminates.
            for _ in range(process_number):
                artist_queue.put(None)

            print(artist_queue.qsize())
            # creating processes
            pool = multiprocessing.Pool(process_number)
            args = [(artist_queue, x) for x in range(process_number)]
            pool.map(search_genius, args)
            # join() is only valid once the pool has been closed.
            pool.close()
            pool.join()

        else:
            print("Running single core")
            artist_queue = queue.Queue()
            get_artists(artist_queue)
            artist_queue.put(None)
            print(artist_queue.qsize())
            # search_genius unpacks its argument as (artist_queue, num).
            search_genius((artist_queue, 1))

    except KeyboardInterrupt:
        if pool:
            pool.terminate()
            pool.join()
        print("KeyboardInterrupt: Writing results")

    finally:
        write_to_csv(list(final_))


# Guard the entry point so that importing this module (as worker processes do
# under the "spawn" start method) does not start another scrape.
if __name__ == "__main__":
    run(multi_core=True)




[DEBUG/SyncManager-412] Queue._after_fork()
[DEBUG/SyncManager-412] Queue._after_fork()
[INFO/SyncManager-412] incref failed: [Errno 2] No such file or directory
[INFO/SyncManager-412] incref failed: [Errno 2] No such file or directory
[DEBUG/SyncManager-412] Queue._after_fork()
[DEBUG/SyncManager-412] Queue._after_fork()
[DEBUG/SyncManager-412] Queue._after_fork()
[DEBUG/SyncManager-412] Queue._after_fork()
[DEBUG/SyncManager-412] Queue._after_fork()
[DEBUG/SyncManager-412] Queue._after_fork()
[DEBUG/SyncManager-412] Queue._after_fork()
[DEBUG/SyncManager-412] Queue._after_fork()
[INFO/SyncManager-412] incref failed: [Errno 2] No such file or directory
[INFO/SyncManager-412] incref failed: [Errno 2] No such file or directory
[INFO/SyncManager-412] incref failed: [Errno 2] No such file or directory
[INFO/SyncManager-412] incref failed: [Errno 2] No such file or directory
[INFO/SyncManager-412] incref failed: [Errno 2] No such file or directory
[INFO/SyncManager-412] incref failed: [Err

Reading previous
Multiprocessing with 32 processes


[DEBUG/SyncManager-413] Queue._after_fork()
[DEBUG/SyncManager-413] Queue._after_fork()
[INFO/SyncManager-413] incref failed: [Errno 2] No such file or directory
[INFO/SyncManager-413] incref failed: [Errno 2] No such file or directory
[DEBUG/SyncManager-413] Queue._after_fork()
[DEBUG/SyncManager-413] Queue._after_fork()
[DEBUG/SyncManager-413] Queue._after_fork()
[DEBUG/SyncManager-413] Queue._after_fork()
[DEBUG/SyncManager-413] Queue._after_fork()
[DEBUG/SyncManager-413] Queue._after_fork()
[DEBUG/SyncManager-413] Queue._after_fork()
[DEBUG/SyncManager-413] Queue._after_fork()
[INFO/SyncManager-413] incref failed: [Errno 2] No such file or directory
[INFO/SyncManager-413] incref failed: [Errno 2] No such file or directory
[INFO/SyncManager-413] incref failed: [Errno 2] No such file or directory
[INFO/SyncManager-413] incref failed: [Errno 2] No such file or directory
[INFO/SyncManager-413] incref failed: [Errno 2] No such file or directory
[INFO/SyncManager-413] incref failed: [Err

8376


[DEBUG/MainProcess] added worker
[DEBUG/MainProcess] added worker
[DEBUG/ForkPoolWorker-414] Queue._after_fork()[DEBUG/MainProcess] added worker

[DEBUG/MainProcess] added worker
[DEBUG/ForkPoolWorker-414] Queue._after_fork()
[INFO/ForkPoolWorker-414] incref failed: [Errno 2] No such file or directory[DEBUG/ForkPoolWorker-415] Queue._after_fork()

[DEBUG/MainProcess] added worker
[INFO/ForkPoolWorker-414] incref failed: [Errno 2] No such file or directory[DEBUG/MainProcess] added worker
[DEBUG/ForkPoolWorker-415] Queue._after_fork()

[DEBUG/ForkPoolWorker-414] Queue._after_fork()[INFO/ForkPoolWorker-415] incref failed: [Errno 2] No such file or directory

[DEBUG/ForkPoolWorker-414] Queue._after_fork()[INFO/ForkPoolWorker-415] incref failed: [Errno 2] No such file or directory

[DEBUG/ForkPoolWorker-414] Queue._after_fork()[DEBUG/ForkPoolWorker-416] Queue._after_fork()
[DEBUG/ForkPoolWorker-415] Queue._after_fork()
[DEBUG/ForkPoolWorker-414] Queue._after_fork()
[DEBUG/MainProcess] added

[INFO/ForkPoolWorker-414] incref failed: [Errno 2] No such file or directory[INFO/ForkPoolWorker-417] incref failed: [Errno 2] No such file or directory[DEBUG/ForkPoolWorker-419] Queue._after_fork()
[INFO/ForkPoolWorker-416] incref failed: [Errno 2] No such file or directory[DEBUG/MainProcess] added worker
[DEBUG/MainProcess] added worker
[INFO/ForkPoolWorker-415] incref failed: [Errno 2] No such file or directory

[INFO/ForkPoolWorker-418] incref failed: [Errno 2] No such file or directory


[INFO/ForkPoolWorker-417] incref failed: [Errno 2] No such file or directory[DEBUG/ForkPoolWorker-419] Queue._after_fork()
[DEBUG/ForkPoolWorker-420] Queue._after_fork()[INFO/ForkPoolWorker-414] incref failed: [Errno 2] No such file or directory[INFO/ForkPoolWorker-415] incref failed: [Errno 2] No such file or directory[INFO/ForkPoolWorker-416] incref failed: [Errno 2] No such file or directory
[INFO/ForkPoolWorker-418] incref failed: [Errno 2] No such file or directory




[INFO/ForkPoolWorker-41

[DEBUG/ForkPoolWorker-422] Queue._after_fork()[INFO/ForkPoolWorker-420] incref failed: [Errno 2] No such file or directory[INFO/ForkPoolWorker-418] incref failed: [Errno 2] No such file or directory[DEBUG/ForkPoolWorker-421] Queue._after_fork()

[DEBUG/ForkPoolWorker-423] Queue._after_fork()[DEBUG/ForkPoolWorker-417] INCREF '1130abeb8'
[DEBUG/MainProcess] added worker

[DEBUG/MainProcess] added worker


[INFO/ForkPoolWorker-419] incref failed: [Errno 2] No such file or directory[DEBUG/ForkPoolWorker-424] Queue._after_fork()

[DEBUG/ForkPoolWorker-421] Queue._after_fork()[INFO/ForkPoolWorker-420] incref failed: [Errno 2] No such file or directory[INFO/ForkPoolWorker-418] incref failed: [Errno 2] No such file or directory[DEBUG/ForkPoolWorker-422] Queue._after_fork()

[INFO/ForkPoolWorker-417] child process calling self.run()[DEBUG/ForkPoolWorker-423] Queue._after_fork()




[INFO/ForkPoolWorker-419] incref failed: [Errno 2] No such file or directory[INFO/ForkPoolWorker-424] incref faile

[INFO/ForkPoolWorker-423] incref failed: [Errno 2] No such file or directory[INFO/ForkPoolWorker-419] child process calling self.run()[INFO/ForkPoolWorker-420] incref failed: [Errno 2] No such file or directory[INFO/ForkPoolWorker-421] incref failed: [Errno 2] No such file or directory[INFO/ForkPoolWorker-422] incref failed: [Errno 2] No such file or directory

[DEBUG/ForkPoolWorker-426] Queue._after_fork()[DEBUG/ForkPoolWorker-427] Queue._after_fork()[INFO/ForkPoolWorker-424] incref failed: [Errno 2] No such file or directory


[DEBUG/MainProcess] added worker



[DEBUG/ForkPoolWorker-429] Queue._after_fork()[INFO/ForkPoolWorker-425] incref failed: [Errno 2] No such file or directory


[DEBUG/ForkPoolWorker-428] Queue._after_fork()[INFO/ForkPoolWorker-423] incref failed: [Errno 2] No such file or directory[DEBUG/ForkPoolWorker-430] Queue._after_fork()[INFO/ForkPoolWorker-419] child process calling self.run()[INFO/ForkPoolWorker-420] incref failed: [Errno 2] No such file or directory[I

[INFO/ForkPoolWorker-423] incref failed: [Errno 2] No such file or directory[INFO/ForkPoolWorker-422] incref failed: [Errno 2] No such file or directory[DEBUG/ForkPoolWorker-433] Queue._after_fork()
[DEBUG/ForkPoolWorker-421] INCREF '1131d8488'
[DEBUG/ForkPoolWorker-430] Queue._after_fork()[DEBUG/ForkPoolWorker-432] Queue._after_fork()[INFO/ForkPoolWorker-420] child process calling self.run()[INFO/ForkPoolWorker-434] incref failed: [Errno 2] No such file or directory


[DEBUG/ForkPoolWorker-431] Queue._after_fork()[INFO/ForkPoolWorker-426] incref failed: [Errno 2] No such file or directory[INFO/ForkPoolWorker-427] incref failed: [Errno 2] No such file or directory
[INFO/ForkPoolWorker-425] incref failed: [Errno 2] No such file or directory


[INFO/ForkPoolWorker-424] incref failed: [Errno 2] No such file or directory

[DEBUG/ForkPoolWorker-429] Queue._after_fork()[INFO/ForkPoolWorker-428] incref failed: [Errno 2] No such file or directory

[INFO/ForkPoolWorker-423] incref failed: [Errn

[INFO/ForkPoolWorker-428] incref failed: [Errno 2] No such file or directory



[INFO/ForkPoolWorker-429] incref failed: [Errno 2] No such file or directory[INFO/ForkPoolWorker-436] incref failed: [Errno 2] No such file or directory
[DEBUG/ForkPoolWorker-435] Queue._after_fork()[DEBUG/ForkPoolWorker-437] Queue._after_fork()[DEBUG/ForkPoolWorker-438] Queue._after_fork()
[DEBUG/ForkPoolWorker-423] INCREF '1130abeb8'
[INFO/ForkPoolWorker-433] incref failed: [Errno 2] No such file or directory[DEBUG/ForkPoolWorker-439] Queue._after_fork()

[INFO/ForkPoolWorker-430] incref failed: [Errno 2] No such file or directory[INFO/ForkPoolWorker-422] child process calling self.run()[INFO/ForkPoolWorker-432] incref failed: [Errno 2] No such file or directory[INFO/ForkPoolWorker-431] incref failed: [Errno 2] No such file or directory[DEBUG/ForkPoolWorker-434] Queue._after_fork()

[INFO/ForkPoolWorker-426] incref failed: [Errno 2] No such file or directory



[INFO/ForkPoolWorker-427] incref failed: [Er

[DEBUG/ForkPoolWorker-425] INCREF '1130abeb8'[INFO/ForkPoolWorker-429] incref failed: [Errno 2] No such file or directory


[INFO/ForkPoolWorker-424] child process calling self.run()[INFO/ForkPoolWorker-435] incref failed: [Errno 2] No such file or directory[DEBUG/ForkPoolWorker-436] Queue._after_fork()
[INFO/ForkPoolWorker-441] incref failed: [Errno 2] No such file or directory
[DEBUG/ForkPoolWorker-437] Queue._after_fork()
[INFO/ForkPoolWorker-440] incref failed: [Errno 2] No such file or directory[INFO/ForkPoolWorker-431] incref failed: [Errno 2] No such file or directory[DEBUG/ForkPoolWorker-439] Queue._after_fork()[DEBUG/ForkPoolWorker-438] Queue._after_fork()[INFO/ForkPoolWorker-433] incref failed: [Errno 2] No such file or directory
[INFO/ForkPoolWorker-430] incref failed: [Errno 2] No such file or directory
[INFO/ForkPoolWorker-434] incref failed: [Errno 2] No such file or directory[INFO/ForkPoolWorker-432] incref failed: [Errno 2] No such file or directory[DEBUG/ForkPoolWorker

[INFO/ForkPoolWorker-438] incref failed: [Errno 2] No such file or directory[INFO/ForkPoolWorker-439] incref failed: [Errno 2] No such file or directory[INFO/ForkPoolWorker-433] incref failed: [Errno 2] No such file or directory[DEBUG/ForkPoolWorker-416] INCREF '1130abeb8'[DEBUG/ForkPoolWorker-440] Queue._after_fork()[INFO/ForkPoolWorker-430] incref failed: [Errno 2] No such file or directory[DEBUG/ForkPoolWorker-414] INCREF '1130abeb8'[DEBUG/ForkPoolWorker-415] INCREF '1130abeb8'[DEBUG/ForkPoolWorker-417] INCREF '1130abeb8'[DEBUG/ForkPoolWorker-418] INCREF '1130abeb8'[DEBUG/ForkPoolWorker-419] INCREF '1130abeb8'[DEBUG/ForkPoolWorker-420] INCREF '1130abeb8'[DEBUG/ForkPoolWorker-423] INCREF '1130abeb8'[DEBUG/ForkPoolWorker-421] INCREF '1130abeb8'[DEBUG/ForkPoolWorker-425] INCREF '1130abeb8'[DEBUG/ForkPoolWorker-422] INCREF '1130abeb8'[DEBUG/ForkPoolWorker-424] INCREF '1130abeb8'

[INFO/ForkPoolWorker-432] incref failed: [Errno 2] No such file or directory[INFO/ForkPoolWorker-434] incref

[0] Starting
[2] Starting
[1] Starting
[3] Starting
[4] Starting
[9] Starting
[5] Starting
[7] Starting
[6] Starting
[11] Starting
[8] Starting
[10] Starting



[INFO/ForkPoolWorker-434] incref failed: [Errno 2] No such file or directory
[INFO/ForkPoolWorker-432] incref failed: [Errno 2] No such file or directory[DEBUG/ForkPoolWorker-426] INCREF '1130abeb8'
[DEBUG/ForkPoolWorker-442] Queue._after_fork()[DEBUG/ForkPoolWorker-444] Queue._after_fork()


[DEBUG/ForkPoolWorker-428] INCREF '1130abeb8'

[INFO/ForkPoolWorker-427] child process calling self.run()




[DEBUG/ForkPoolWorker-414] thread 'MainThread' does not own a connection[DEBUG/ForkPoolWorker-415] thread 'MainThread' does not own a connection[DEBUG/ForkPoolWorker-417] thread 'MainThread' does not own a connection[DEBUG/ForkPoolWorker-416] thread 'MainThread' does not own a connection[DEBUG/ForkPoolWorker-423] thread 'MainThread' does not own a connection[DEBUG/ForkPoolWorker-418] thread 'MainThread' does not own a connection[DEBUG/ForkPoolWorker-421] thread 'MainThread' does not own a connection[DEBUG/ForkPoolWorker-419] thread 'MainThread' does not own a connection[DEBUG/ForkPoolWork

[12] Starting


[DEBUG/ForkPoolWorker-442] Queue._after_fork()
[DEBUG/ForkPoolWorker-428] INCREF '1130abeb8'






[DEBUG/ForkPoolWorker-414] thread 'MainThread' does not own a connection
[DEBUG/ForkPoolWorker-427] INCREF '1130abeb8'
[DEBUG/ForkPoolWorker-417] thread 'MainThread' does not own a connection[DEBUG/ForkPoolWorker-415] thread 'MainThread' does not own a connection[DEBUG/ForkPoolWorker-416] thread 'MainThread' does not own a connection[DEBUG/ForkPoolWorker-421] thread 'MainThread' does not own a connection[DEBUG/ForkPoolWorker-423] thread 'MainThread' does not own a connection[DEBUG/ForkPoolWorker-418] thread 'MainThread' does not own a connection[DEBUG/ForkPoolWorker-419] thread 'MainThread' does not own a connection[DEBUG/ForkPoolWorker-422] thread 'MainThread' does not own a connection[DEBUG/ForkPoolWorker-420] thread 'MainThread' does not own a connection
[DEBUG/ForkPoolWorker-425] thread 'MainThread' does not own a connection[DEBUG/ForkPoolWorker-424] thread 'MainThread' does not own a

[13] Starting


[DEBUG/ForkPoolWorker-415] making connection to manager[DEBUG/ForkPoolWorker-417] making connection to manager
[DEBUG/ForkPoolWorker-423] making connection to manager[DEBUG/ForkPoolWorker-421] making connection to manager[DEBUG/ForkPoolWorker-416] making connection to manager[DEBUG/ForkPoolWorker-419] making connection to manager[DEBUG/ForkPoolWorker-422] making connection to manager
[DEBUG/ForkPoolWorker-424] making connection to manager[DEBUG/ForkPoolWorker-418] making connection to manager[DEBUG/ForkPoolWorker-425] making connection to manager
[DEBUG/ForkPoolWorker-420] making connection to manager[DEBUG/ForkPoolWorker-426] making connection to manager[DEBUG/ForkPoolWorker-429] INCREF '1130abeb8'[INFO/ForkPoolWorker-436] incref failed: [Errno 2] No such file or directory[INFO/ForkPoolWorker-441] incref failed: [Errno 2] No such file or directory[INFO/ForkPoolWorker-435] incref failed: [Errno 2] No such file or directory


[DEBUG/ForkPoolWorker-431] INCREF '1131d8488'[INFO/ForkPoolWo