First, we install lyricsgenius (a Python client for the Genius API), as well as multiprocess to speed up data scraping:

In [15]:
!pip install multiprocess
!pip install lyricsgenius



We import the libraries and set a path for our input file of artists:

In [4]:
import json
import csv
import multiprocess
import queue
import logging
import sys

from requests.exceptions import HTTPError, ConnectionError, RequestException
from lyricsgenius import Genius


# OS agnostic
import os 
CSV_PATH = os.path.join(os.path.curdir, 'artists', '10000-MTV-Music-Artists-page-%s.csv')

# Scrape Data

We set up a Genius API token and use lyricsgenius to pull the lyrics for every artist in our input dataset, MTV's list of the top 10,000 music artists.

In [5]:
# Genius setup
             
def genius_setup():
    """
    Build a configured lyricsgenius client.

    The access token is read from the GENIUS_ACCESS_TOKEN environment
    variable instead of being embedded in the notebook, so the credential
    is not leaked when the notebook is shared or committed. Any token that
    was ever stored in a shared copy of this notebook should be regenerated.

    Returns:
        A Genius client with verbose output disabled, section headers
        removed from lyrics, non-song results skipped, and "(Remix)" /
        "(Live)" titles excluded.

    Raises:
        RuntimeError: if GENIUS_ACCESS_TOKEN is unset or empty.
    """
    import os

    token = os.environ.get("GENIUS_ACCESS_TOKEN")
    if not token:
        raise RuntimeError(
            "Set the GENIUS_ACCESS_TOKEN environment variable to your Genius API token"
        )

    genius = Genius(token, retries=2)

    genius.verbose = False
    genius.remove_section_headers = True
    genius.skip_non_songs = True
    genius.excluded_terms = ["(Remix)", "(Live)"]

    return genius


# Multiprocessing cores
# Number of worker processes: twice the CPU count reported by multiprocess.
process_number = int(multiprocess.cpu_count()) * 2

# Data management
# Manager-backed list proxy that collects scraped entries; it is handed to the
# workers in run() and filled by search_genius() / read_csv().
final_ = multiprocess.Manager().list()

# artist_queue = queue.Queue()
# final_ = []
# Artist names already present in the output CSV (filled by read_csv());
# get_artists() skips these so a re-run resumes where the last one stopped.
checked_artists = set()

# Output CSV file name, resolved under ./data by write_to_csv() and read_csv().
file_name = "song_data_2.csv"


# Pull out artists
def get_artists(queue, pages=range(1, 5)):
    """
    Enqueue every artist from the input CSV pages that has not been scraped yet.

    Args:
        queue: any object with a ``put`` method (queue.Queue or a manager
            queue proxy); receives one artist name per call to ``put``.
        pages: page numbers substituted into CSV_PATH. Defaults to pages
            1 through 4, matching the original hard-coded range.

    The first row of each file is treated as a header and skipped. Artists
    already present in the module-level ``checked_artists`` set are not
    enqueued.
    """
    for page in pages:
        path = CSV_PATH % str(page)
        # newline="" is what the csv module expects for files it parses itself
        with open(path, encoding="UTF-8", newline="") as csvfile:
            reader = csv.reader(csvfile)

            # Skip header; the default keeps an empty file from raising StopIteration
            next(reader, None)
            for row in reader:
                # Blank lines come back as empty lists and have no artist column
                if not row:
                    continue
                artist = row[0]
                # Skip artists whose data was already found in a previous run
                if artist not in checked_artists:
                    queue.put(artist)


# File management
def write_to_csv(data):
    """
    Write the scraped entries to ./data/<file_name> as CSV.

    Args:
        data: list of dictionaries {artist, song, data}. The keys of the
            first dictionary become the CSV header.

    When ``data`` is empty nothing is written and any existing output file
    is left untouched; previously the file was truncated and the function
    then crashed on ``data[0]``.
    """
    print("Entries: {num}".format(num=len(data)))
    if not data:
        print("No entries to write; existing output left untouched")
        return

    out_dir = os.path.join(os.path.curdir, 'data')
    # The output directory may not exist on a fresh checkout
    os.makedirs(out_dir, exist_ok=True)
    csv_path = os.path.join(out_dir, file_name)

    # UTF-8 matches the encoding read_csv() uses; newline='' lets the csv
    # module control line endings (avoids blank rows on Windows)
    with open(csv_path, 'w', encoding="UTF-8", newline='') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=list(data[0].keys()))

        # writing headers (field names)
        writer.writeheader()

        # writing data rows
        writer.writerows(data)
        

def read_csv():
    """
    Reload results from an earlier run, if the output CSV exists.

    Each row of ./data/<file_name> is appended to the shared ``final_``
    list and its "artist" value is recorded in ``checked_artists``. A
    missing file is silently ignored.
    """
    previous_path = os.path.join(os.path.curdir, 'data', file_name)

    try:
        with open(previous_path, mode='r', encoding="UTF-8") as handle:
            for row in csv.DictReader(handle):
                checked_artists.add(row["artist"])
                final_.append(row)
    except FileNotFoundError:
        # No earlier output to resume from
        return

    print("Number of artists already found {num}".format(num=len(checked_artists)))
    

# Run genius search
def search_genius(args):
    """
    Worker loop: pull artist names off a queue and collect their lyrics.

    Args:
        args: tuple ``(artist_queue, num, genius, final_)`` where
            artist_queue yields artist names and a ``None`` sentinel that
            stops the worker, num is the worker id used as a log prefix,
            genius is the client whose ``search_artist`` is called, and
            final_ is the list that receives one
            ``{"artist", "song", "data"}`` dict per song.

    Lyrics are flattened for CSV storage: newlines become "|" and commas
    become spaces. A failure while handling one artist is logged and the
    worker moves on to the next artist instead of exiting.
    """
    # Imported locally, presumably so the function is self-contained when it
    # is shipped to spawned worker processes.
    import sys
    from requests.exceptions import RequestException

    artist_queue, num, genius, final_ = args
    max_attempts = 5

    def log(string):
        # Single print call with an explicit newline keeps lines from
        # different workers from interleaving mid-line
        print("[{num}] ".format(num=num) + string + "\n", end='')
        sys.stdout.flush()

    def clean_data(data):
        return data.replace("\n", "|").replace(",", " ")

    def build_entry(artist, song, data):
        return {"artist": artist, "song": song, "data": data}

    def fetch_artist(artist):
        # Retry the lookup; returns None when every attempt fails or when
        # the client itself reports no match
        for attempt in range(1, max_attempts + 1):
            try:
                return genius.search_artist(artist, per_page=50, get_full_info=False)
            except RequestException:
                log("HTTPSConnectionPool exception. Attempt {}/{}".format(attempt, max_attempts))
            except Exception:
                log("Exception. Attempt {}/{}".format(attempt, max_attempts))
        return None

    log("Starting")
    while True:
        artist = artist_queue.get()
        if artist is None:
            log("Done")
            return

        # Per-artist guard: one bad artist must not stop the whole worker
        try:
            name = artist.strip()
            log("Remaining: [{queue}]. Searching {artist}".format(queue=artist_queue.qsize(), artist=name))

            genius_artist = fetch_artist(artist)
            if genius_artist is None:
                log("{artist} not found".format(artist=name))
                continue

            log("Finished {artist}".format(artist=name))
            log("{artist} number of songs: {song_num}".format(artist=name, song_num=len(genius_artist.songs)))

            for song in genius_artist.songs:
                if song.lyrics is None:
                    log("Skipping {song}: no lyrics".format(song=song.title))
                    continue

                # Add to final list
                final_.append(build_entry(artist, song.title, clean_data(song.lyrics)))

        except Exception as e:
            log("Something went wrong: {error}".format(error=e))
    
    
def run(multi_core: bool = False) -> None: 
    """
    Scrape lyrics for every queued artist and write the results to CSV.

    Args:
        multi_core: when True, work is spread over ``process_number``
            spawned worker processes sharing a manager queue; when False,
            a single in-process worker drains a plain queue.Queue.

    Previously saved results are loaded first via read_csv(). Whatever is in
    ``final_`` is written out in the ``finally`` clause, including after a
    KeyboardInterrupt.
    """
    
    # Setup Genius
    genius = genius_setup()
    
    # Load in any previous data
    print("Reading previous")
    read_csv()
    
    # Stays None unless the multi-process branch creates a pool; checked in the
    # KeyboardInterrupt handler below.
    pool = None
    try:  
        if multi_core:
            # multiprocess.log_to_stderr().setLevel(logging.DEBUG)
            print("Multiprocessing with {process_number} processes".format(process_number=process_number))
            
            artist_queue = multiprocess.Manager().Queue()
            get_artists(artist_queue)
            
            # One None sentinel per worker so each search_genius loop terminates
            for x in range(process_number):
                artist_queue.put(None)
            
            print(artist_queue.qsize())
            # creating processes
            with multiprocess.get_context("spawn").Pool(process_number) as pool:
                # Each worker gets the shared queue, its id, the client and the shared result list
                args = [(artist_queue, x, genius, final_) for x in range(process_number)]
                pool.map(search_genius, args)
                pool.close()
                pool.join()
            
        else:
            print("Running single core")
            artist_queue = queue.Queue()
            get_artists(artist_queue)
            # Single sentinel for the single in-process worker
            artist_queue.put(None)
            print(artist_queue.qsize())
            search_genius((artist_queue, 0, genius, final_))

    
    except KeyboardInterrupt:
        # Stop any workers before saving what has been collected so far
        if pool:
            pool.close()
            pool.terminate()
            pool.join()
        print("KeyboardInterrupt: Writing results")
    
    finally:
        # Copy the shared list into a plain list before writing
        write_to_csv(list(final_))                       



In [6]:
# run(multi_core=True) 

In [7]:
# genius = genius_setup()
# genius_artist = genius.search_artist("Sam Hunt", per_page=50, get_full_info=False)

In [8]:
# genius_artist.songs[30].lyrics

# TensorFlow

We use TensorFlow to generate text embeddings for our lyric data. First we install TensorFlow, TensorFlow Hub, and Annoy (Approximate Nearest Neighbors Oh Yeah), Spotify's nearest-neighbor search library.

In [6]:
!pip3 install tensorflow
!pip3 install tensorflow_hub
!pip3 install annoy



In [2]:
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior() 
# tf.get_logger().setLevel(logging.ERROR)
import tensorflow_hub as hub
from annoy import AnnoyIndex
import pickle
import os
import csv
import numpy as np
import random


Instructions for updating:
non-resource variables are not supported in the long term


Here we set global variables to store our Annoy index and our universal counter to keep track of lyric entries

# Grab data

We define several helper functions to load individual lyric lines (our documents) from the scraped CSV, and use TensorFlow to compute their embeddings.

In [3]:

# Define input path
# Same file name the scraping section writes to under ./data.
input_file="song_data_2.csv"
input_csv_path = os.path.join(os.path.curdir, 'data', input_file)
# Raise the csv module's per-field size limit; presumably needed because a
# whole song's lyrics are stored in a single "data" field.
csv.field_size_limit(100000000)


def batch_load(batch_size=1000):
    """
    Load the next batch of songs and return their lyric lines.

    Not used unless batching input.

    Args:
        batch_size: maximum number of songs to consume (default 1000, the
            original hard-coded value).

    Returns:
        A list with every "|"-separated lyric line of the consumed songs.
        If fewer than ``batch_size`` songs remain, the partial batch is
        returned instead of raising StopIteration.

    Relies on module-level state: ``counter`` (advanced once per line),
    ``documents`` (an iterator of row dicts with 'data', 'song' and 'artist'
    keys) and ``ids`` (a dict mapping line id -> (line, song, artist)).
    NOTE(review): ``documents`` and ``ids`` are not defined in the visible
    part of this notebook; they must be created before calling this.
    """
    from itertools import islice

    global counter

    batch_sentences = []

    print("loading batch of songs, starting at song", counter)

    # islice stops cleanly when the iterator runs out
    for document in islice(documents, batch_size):
        song = document['song']
        artist = document['artist']

        for line in document['data'].split("|"):
            ids[counter] = (line, song, artist)
            batch_sentences.append(line)
            counter += 1

    return batch_sentences


def get_lines(csv_path=None):
    """
    Get individual lyric lines from the input CSV, to use as the input for embeddings.

    Args:
        csv_path: CSV file to read; defaults to the module-level
            ``input_csv_path``. The file needs 'data', 'song' and 'artist'
            columns.

    Returns:
        A list of ``[lyric, song, artist]`` lists, one per "|"-separated
        line of each song. Songs are shuffled, but the lines of a song stay
        contiguous and in their original order.
    """
    path = input_csv_path if csv_path is None else csv_path
    songs = []

    # Read-only access is enough; newline='' is what the csv module expects
    with open(path, mode='r', encoding='utf-8', newline='') as file:
        # DictReader already consumes the header row, so no extra next()
        # call here: that would silently drop the first song
        for row in csv.DictReader(file):
            songs.append([row['data'], row['song'], row['artist']])

    random.shuffle(songs)
    lines = []
    for data, song, artist in songs:
        for lyric in data.split("|"):
            lines.append([lyric, song, artist])

    # Release the per-song copies before the caller starts embedding
    del songs
    print("total lines added: {}".format(len(lines)))

    return lines

In [4]:
# Retrieves the embedding for a batch of sentences 

# Load the Universal Sentence Encoder (version 1) module from TF Hub.
embed_module = hub.Module("https://tfhub.dev/google/universal-sentence-encoder/1")
# String placeholder that get_embeddings() feeds with the sentences to encode.
placeholder = tf.placeholder(dtype=tf.string)
# Graph node producing the module's output for whatever is fed to the placeholder.
embed = embed_module(placeholder)
# One session is created here and reused by every get_embeddings() call.
session = tf.Session()
session.run([tf.global_variables_initializer(), tf.tables_initializer()])

def get_embeddings(sentences):
    """
    Encode a batch of sentences with the shared TensorFlow session.

    sentences: the values fed to the module-level string placeholder
    Returns whatever session.run yields for the ``embed`` node.
    """
    feed = {placeholder: sentences}
    return session.run(embed, feed_dict=feed)

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


Metal device set to: Apple M1 Pro


2021-12-07 15:40:13.767337: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2021-12-07 15:40:13.768471: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2021-12-07 15:40:13.769332: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
2021-12-07 15:40:13.984952: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is e

# Build the Annoy Index
Finally we build the index for our dataset.

In [5]:
# Set up mapping and counter to keep track of documents
# counter: next item id to assign in the index (advanced by add_items_to_index).
counter = 0
# mapping: item id -> the entry stored for that id by add_items_to_index.
mapping = {}

# Initialize the ANNOY index 
# 512 dimensions with angular distance; assumes the encoder emits
# 512-dimensional vectors (TODO confirm against the TF Hub module).
ann = AnnoyIndex(512, metric='angular')

def add_items_to_index(batch, embeddings):
    """
    Register a batch of entries and their vectors in the ANNOY index.

    batch: the entries to remember, paired positionally with ``embeddings``
    embeddings: one vector per entry in ``batch``

    Every pair receives the next value of the global ``counter`` as its
    item id: the vector goes into ``ann`` and the entry into ``mapping``.
    """
    global counter

    for entry, vector in zip(batch, embeddings):
        item_id = counter
        ann.add_item(item_id, vector)
        mapping[item_id] = entry
        counter = item_id + 1

def build_ann_index(batch_size=100000):
    """
    Embed every lyric line and add it to the ANNOY index.

    Args:
        batch_size: number of lines embedded per call to get_embeddings().

    Raises:
        ValueError: if ``batch_size`` is not positive (the loop would never
            make progress).

    Lines come from get_lines(). The index is prepared for an on-disk build
    in the file "annoy_index". Processed lines are deleted from the front of
    the list as the loop advances, so memory is released batch by batch.
    This function only adds items; building the index is left to the caller.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    print("getting lines from CSV file...")
    lines = get_lines()
    print("lines retrieved, getting embeddings...")

    ann.on_disk_build("annoy_index")

    while len(lines) > 0:
        # The final batch may be shorter than batch_size
        end = min(batch_size, len(lines))

        print("getting embeddings for lines {} - {}".format(counter, counter + end))

        batch = lines[:end]

        lyrics = [entry[0] for entry in batch]
        embeddings = get_embeddings(lyrics)
        add_items_to_index(batch, embeddings)

        # Drop references promptly; batches can be large
        del embeddings, lyrics
        del lines[:end]

        print("{} left".format(len(lines)))

    

In [None]:
# Add all items to the index, then build it and save the id -> entry mapping.
try:
    build_ann_index(batch_size=200000)
except KeyboardInterrupt:
    # Allow a manual interrupt; the finally clause still builds from what was added.
    print("KeyboardInterrupt")
finally:
    # Runs on success, on interrupt, and on any other error raised above.
    print("Building index...")
    ann.build(20)  # 20 trees
    ann.unload()
    
    # Persist the mapping next to the index so results can be resolved later.
    with open('annoy_index.mapping', 'wb') as handle:
        pickle.dump(mapping, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
    print('mapping saved')

getting lines from CSV file...


In [7]:
# Scratch cell: a small list for trying out slice deletion.
lis = [1, 2, 3 ,4]

In [9]:
# Scratch cell: delete the slice [2:3] in place; presumably a check of the
# `del lines[:end]` idiom used when building the index.
del lis[2:3]

In [10]:
# Scratch cell: display the list after the slice deletion.
lis

[1, 2, 4]

# Save Model

# Load the model 

In [4]:
# Re-create the index object with the same dimension and metric used to build it.
ann = AnnoyIndex(512, metric='angular')
ann.load("annoy_index", prefault=True)
print('annoy index loaded.')
# NOTE(security): pickle.load can execute arbitrary code; only load a mapping
# file that this notebook produced itself.
with open('annoy_index.mapping', 'rb') as handle:
    mapping = pickle.load(handle)
print('mapping file loaded.')

annoy index loaded.


prefault is set to true, but MAP_POPULATE is not defined on this platform

mapping file loaded.


Finally, we use a sample query to test the performance of our retrieval system:

In [7]:
input_sentence = "black dog"

# Announce the step before doing the work it describes
print("getting query embeddings")
query_embeddings = get_embeddings([input_sentence])[0]

# Return 10 nearest neighbors
print("getting nearest neighbors")
nns = ann.get_nns_by_vector(query_embeddings, 10, include_distances=False)

print("Closest: ")
for idx, item in enumerate(nns):
    hit_song, hit_artist = mapping[item][1], mapping[item][2]
    print("{}. {} - {}:".format(idx+1, hit_song, hit_artist))

    # Show the matched line with its neighbouring lines for context
    for x in range(item-3, item+3):
        context = mapping.get(x)
        # Ids before the first or after the last line do not exist, and
        # adjacent ids may belong to a different song; skip both cases
        if context is None or context[1] != hit_song or context[2] != hit_artist:
            continue
        if x == item:
            print("==== {} ====".format(context[0]))
        else:
            print("     {}     ".format(context[0]))

    print("\n")


getting query embeddings
getting nearest neighbors
Closest: 
1. Waitin’ on 5 -  Chris Janson :
     If you wanting overtime  well sorry I'm sick     
     Waitin' on five to start on six     
          
==== Everybody watching that tick tock tick ====
     Slower it goes the closer it gets     
     We'll be cracking and popping and giving it a twist     


2. Somebody New (Unreleased) -  Demi Lovato :
     But if you're coming to the show     
     There's something you should know     
          
==== I got somebody new ====
     I look at him the way I used to     
     Look at you     


3. Song Covers -  Ellie Goulding :
     "Life Round Here" - James Blake     
     "Mirrors" - Justin Timberlake     
     "Only Girl In The World" - Rihanna     
==== "Roscoe" - Midlake ====
     "Some Nights" - Fun     
     "Sweet Disposition" - The Temper Trap     


4. We Carry On -  Tim McGraw :
     Just twelve weeks along and she's got a life inside     
     Says she's never ever felt so al

2021-12-07 10:54:27.640021: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


# Interactive Application

We need ipywidgets to display interactive widgets:

In [4]:
!pip install ipywidgets



First we set up our widgets.

In [21]:
import ipywidgets as widgets
from IPython.display import clear_output

# Free-text query box
query = widgets.Text(
        value='Sitting in my truck',
        description='Input query:')

button = widgets.Button(description='Submit')

# Number of nearest neighbours to return
slider = widgets.IntSlider(
         value=5,
         min=0,
         max=20,
         step=1,
         description='# of results:')

# Output area the click handler prints into; a Text widget cannot be used as
# a context manager, an Output widget can
output = widgets.Output()

def on_click(_):
    """Echo the current query text into the output area when Submit is pressed."""
    with output:
        clear_output()
        print(query.value)

# Register the handler; without this the button does nothing
button.on_click(on_click)

interact = widgets.TwoByTwoLayout(top_left=query,
                       bottom_left=slider,
                       top_right=button,
                       bottom_right=output)

interact

TwoByTwoLayout(children=(Text(value='Sitting in my truck', description='Input query:', layout=Layout(grid_area…

Now we run the query defined above, looking for the number of results requested.

In [25]:
# This cell reads the widget values at the moment it is executed; re-run it
# after changing the query text or the number of results.

user_query = query.value
print("running query...")

query_embeddings = get_embeddings([user_query])[0]

# Return X nearest neighbors
nns = ann.get_nns_by_vector(query_embeddings, slider.value, include_distances=False)

print("Top {} results for \'{}\'".format(slider.value, query.value))
for idx, item in enumerate(nns):
    hit_song, hit_artist = mapping[item][1], mapping[item][2]
    print("{}. {} - {}:".format(idx+1, hit_song, hit_artist))

    # Show the matched line with its neighbouring lines for context
    for x in range(item-3, item+3):
        context = mapping.get(x)
        # Ids before the first or after the last line do not exist, and
        # adjacent ids may belong to a different song; skip both cases
        if context is None or context[1] != hit_song or context[2] != hit_artist:
            continue
        if x == item:
            print("==== {} ====".format(context[0]))
        else:
            print("     {}     ".format(context[0]))

    print("\n")

running query...
Top 5 results for 'guy in a truck'
1. My Neck Of The Woods -  Blake Shelton :
     Or pouring rain     
     Sells tomatoes     
     From the back     
==== Of his pickup truck ====
     Reads the Bible line for line     
     While sipping on     


2. Like You Were Mine -  Jason Aldean :
     Still feeling you like you never said goodbye     
     Like you were mine     
          
==== Sometimes this take-me-back truck ====
     Talks me into burning gas     
     Past your house     


3. Cop Car -  Sam Hunt :
     We thought we had all night     
     There was no need to rush     
     That's when those cops     
==== Came pulling up ====
     And I thought     
     Man  ain't this some shhhh     


4. Cop Car (Acoustic) -  Sam Hunt :
     We thought we had all night     
     There  was no need to rush     
     That's  when those cops     
==== Came pulling up ====
     And  I thought     
     Man  ain't this some shhhh     


5. Meija -  Porno for Pyros :
 