First, we install the packages this notebook needs: multiprocess, the lyricsgenius API client, and metapy:

In [1]:
!pip install multiprocess

!pip install lyricsgenius
!pip install metapy



We import the libraries and set a path for our input file of artists:

In [2]:
import json
import csv
import multiprocess
import queue
import logging
import sys

from requests.exceptions import HTTPError, ConnectionError, RequestException
from lyricsgenius import Genius


# OS agnostic
import os 
CSV_PATH = os.path.join(os.path.curdir, 'artists', '10000-MTV-Music-Artists-page-%s.csv')

# Scrape Data

We set up a lyricsgenius token, and use the API to pull the lyrics data for each artist in the dataset for the top 10,000 artists from MTV.

In [None]:
# Genius setup
             
def genius_setup():
    """
    Build the lyricsgenius client used for scraping.

    The API token is read from the GENIUS_ACCESS_TOKEN environment variable
    rather than being stored in the notebook, so the credential is not
    published together with the code.

    returns: a Genius client created with retries=2 and with verbose,
             remove_section_headers, skip_non_songs and excluded_terms set
    raises: RuntimeError if GENIUS_ACCESS_TOKEN is unset or empty
    """
    token = os.environ.get("GENIUS_ACCESS_TOKEN")
    if not token:
        raise RuntimeError(
            "Set the GENIUS_ACCESS_TOKEN environment variable to your Genius API token"
        )

    genius = Genius(token, retries=2)

    genius.verbose = False
    genius.remove_section_headers = True
    genius.skip_non_songs = True
    genius.excluded_terms = ["(Remix)", "(Live)"]

    return genius
# Multiprocessing cores
# Number of worker processes: twice the CPU count reported by multiprocess.
process_number = int(multiprocess.cpu_count()) * 2

# Data management
# Manager-backed list proxy: it is handed to the worker processes, which append
# entries to it, and run() reads it back when writing the CSV.
final_ = multiprocess.Manager().list()

# artist_queue = queue.Queue()
# final_ = []
# Artist names already present in the output CSV; filled by read_csv() and
# consulted by get_artists() to skip artists that were scraped before.
checked_artists = set()

# Name of the CSV file inside ./data that read_csv() and write_to_csv() use.
file_name = "song_data_2.csv"

# Pull out artists
def get_artists(queue):
    """
    Put every artist that still needs scraping onto a queue.

    Reads pages 1-4 of the artist CSV files (CSV_PATH filled in with the page
    number), skips each file's header row, and queues the first column of
    every remaining row unless it is already in checked_artists.

    queue: any object with a put() method; receives the artist names
    """
    for page in range(1, 5):
        path = CSV_PATH % str(page)
        # newline="" is what the csv module expects from files it parses
        with open(path, encoding="UTF-8", newline="") as csvfile:
            top_artists = csv.reader(csvfile)

            # Skip header; the default keeps an empty file from raising StopIteration
            next(top_artists, None)
            for row in top_artists:
                # Blank lines come back as empty lists and have no artist column
                if not row:
                    continue
                artist = row[0]
                # Check if we should skip this artist since we already found the data
                if artist not in checked_artists:
                    queue.put(artist)
                      


# File management
def write_to_csv(data):
    """
    Write the collected entries to ./data/<file_name>.

    data: list of dictionaries {artist, song, data}; the keys of the first
          entry become the CSV header

    An empty list is reported and then ignored, so an existing file is not
    truncated when there is nothing to write. The data directory is created
    if it does not exist yet.
    """
    global file_name

    print("Entries: {num}".format(num=len(data)))
    if not data:
        # Nothing to write: leave whatever was saved by an earlier run untouched
        return

    csv_path = os.path.join(os.path.curdir, 'data', file_name)
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)

    # UTF-8 matches what read_csv() expects; newline='' is required by the csv module
    with open(csv_path, 'w', encoding="UTF-8", newline='') as csv_file:
        # creating a csv dict writer object
        writer = csv.DictWriter(csv_file, fieldnames=list(data[0].keys()))

        # writing headers (field names)
        writer.writeheader()

        # writing data rows
        writer.writerows(data)
        

def read_csv():
    """
    Load entries saved by a previous run from ./data/<file_name>.

    Every row is appended to final_ and its "artist" value is added to
    checked_artists. A missing file is not an error: it is reported and the
    function returns without changing anything.
    """
    global final_, checked_artists, file_name

    csv_path = os.path.join(os.path.curdir, 'data', file_name)

    try:
        # newline='' is what the csv module expects from files it parses
        with open(csv_path, mode='r', encoding="UTF-8", newline='') as file:
            for entry in csv.DictReader(file):
                checked_artists.add(entry["artist"])
                final_.append(entry)
    except FileNotFoundError:
        print("No previous data found at {path}".format(path=csv_path))
        return

    print("Number of artists already found {num}".format(num=len(checked_artists)))
    

# Run genius search
def search_genius(args):
    """
    Worker loop: pull artists off a queue and collect their lyrics.

    args: tuple (artist_queue, num, genius, final_)
        artist_queue: queue of artist names, terminated by a None sentinel
        num: worker number, used only as a prefix in log lines
        genius: client exposing search_artist(name, per_page=..., get_full_info=...)
        final_: list-like object that receives {artist, song, data} entries

    Each artist is searched up to 5 times. A failure while handling one artist
    is logged and the worker moves on to the next one instead of stopping.
    All entries of an artist are added to final_ together, so an artist is
    either stored completely or not at all. Songs whose lyrics are None are
    skipped.
    """
    # Kept as a function-level import, as in the original; presumably so the
    # function is self-contained when sent to spawned worker processes.
    import sys

    artist_queue, num, genius, final_ = args
    max_attempts = 5

    def log(string):
        print("[{num}] ".format(num=num) + string + "\n", end='')
        sys.stdout.flush()

    def clean_data(data):
        # Keep each song on one CSV line: newlines become "|", commas become spaces
        return data.replace("\n", "|").replace(",", " ")

    def build_entry(artist, song, data):
        return {"artist": artist, "song": song, "data": data}

    def fetch_artist(artist):
        # Returns the search result (None when Genius finds nothing), or None
        # after max_attempts failed tries.
        for attempt in range(1, max_attempts + 1):
            try:
                return genius.search_artist(artist, per_page=50, get_full_info=False)
            except Exception as e:
                log("{kind}: {error}. Attempt {attempt}/{total}".format(
                    kind=type(e).__name__, error=e, attempt=attempt, total=max_attempts))
        return None

    log("Starting")
    while True:
        artist = artist_queue.get()
        if artist is None:
            log("Done")
            return

        try:
            name = artist.strip()
            log("Remaining: [{queue}]. Searching {artist}".format(queue=artist_queue.qsize(), artist=name))

            # Pull data for artist from genius
            genius_artist = fetch_artist(artist)

            log("Finished {artist}".format(artist=name))
            if genius_artist is None:
                log("{artist} not found".format(artist=name))
                continue

            log("{artist} number of songs: {song_num}".format(artist=name, song_num=len(genius_artist.songs)))

            entries = []
            for song in genius_artist.songs:
                if song.lyrics is None:
                    continue
                # The artist name is stored exactly as queued (unstripped)
                entries.append(build_entry(artist, song.title, clean_data(song.lyrics)))

            # Add to final list in one step
            final_.extend(entries)

        except Exception as e:
            log("Something went wrong with {artist!r}: {error}".format(artist=artist, error=e))
    
    
def run(multi_core=False):
    """
    Scrape lyrics for every queued artist and save the results.

    multi_core: when True, spread the work over process_number spawned
                worker processes; otherwise run in this process

    Previously saved entries are loaded first. Whatever has been collected in
    final_ is written to CSV at the end, including after a KeyboardInterrupt.
    """
    # Setup Genius
    genius = genius_setup()

    # Load in any previous data
    print("Reading previous")
    read_csv()

    pool = None
    try:
        if not multi_core:
            print("Running single core")
            artist_queue = queue.Queue()
            get_artists(artist_queue)
            # Single sentinel for the single worker
            artist_queue.put(None)
            print(artist_queue.qsize())
            search_genius((artist_queue, 0, genius, final_))
        else:
            print("Multiprocessing with {process_number} processes".format(process_number=process_number))

            artist_queue = multiprocess.Manager().Queue()
            get_artists(artist_queue)

            # One None sentinel per worker so each of them stops
            for _ in range(process_number):
                artist_queue.put(None)

            print(artist_queue.qsize())

            worker_args = [
                (artist_queue, worker_id, genius, final_)
                for worker_id in range(process_number)
            ]
            # creating processes
            with multiprocess.get_context("spawn").Pool(process_number) as pool:
                pool.map(search_genius, worker_args)
                pool.close()
                pool.join()

    except KeyboardInterrupt:
        if pool:
            pool.close()
            pool.terminate()
            pool.join()
        print("KeyboardInterrupt: Writing results")

    finally:
        write_to_csv(list(final_))



In [None]:
run(multi_core=True) 

In [None]:
genius = genius_setup()
genius_artist = genius.search_artist("Sam Hunt", per_page=50, get_full_info=False)

In [None]:
genius_artist.songs[30].lyrics

# Tokenizing 
Now we tokenize the lyrics into stemmed, lowercase unigrams:

In [None]:
import metapy
from tempfile import NamedTemporaryFile

# OS agnostic
import os 
import shutil

In [None]:
def tokenize(data):
    """
    Tokenize a string into unigrams.

    The text goes through an ICU tokenizer, a lowercase filter and a Porter2
    filter before a 1-gram word analyzer counts the tokens.

    data: a string to tokenize

    returns: a list with each distinct unigram token found by the analyzer
    """
    doc = metapy.index.Document()
    doc.content(data)

    tok = metapy.analyzers.ICUTokenizer(suppress_tags=True)
    tok = metapy.analyzers.LowercaseFilter(tok)
    tok = metapy.analyzers.Porter2Filter(tok)
    ana = metapy.analyzers.NGramWordAnalyzer(1, tok)

    # analyze() yields (token, count) pairs; only the tokens are returned
    unigrams = ana.analyze(doc)

    return [token for token, _count in unigrams.items()]


def tokenize_file(input_file="song_data.csv", output_file="song_data_tokenize.csv"):
    """
    Tokenize the "data" column of every row of a CSV file.

    Rows are read from ./data/<input_file>, their "data" value is replaced by
    the result of tokenize(), and the (artist, song, data) columns are written
    to a temporary file. Only after that file has been closed is it moved to
    ./data/<output_file>, so a partially written result never replaces the
    output. No header row is written.

    input_file: name of the CSV file inside ./data to read
    output_file: name of the CSV file inside ./data to create

    A missing file is printed rather than raised. The temporary file is
    removed if it was not moved into place.
    """
    print("Tokenizing data in", input_file)

    input_csv_path = os.path.join(os.path.curdir, 'data', input_file)
    output_csv_path = os.path.join(os.path.curdir, 'data', output_file)

    # Path of the temp file while it still has to be moved or cleaned up
    scratch_path = None

    try:
        # Open the input first so no temp file is created when it is missing
        with open(input_csv_path, mode='r', encoding='utf-8', newline='') as file:
            with NamedTemporaryFile('w+t', newline='', encoding='utf-8', delete=False) as scratch:
                scratch_path = scratch.name

                # read from main file, write to temp file
                reader = csv.DictReader(file)
                writer = csv.DictWriter(scratch, extrasaction='ignore',
                                        fieldnames=['artist', 'song', 'data'])

                for row in reader:
                    try:
                        row['data'] = tokenize(row['data'])
                        writer.writerow(row)
                    except UnicodeDecodeError:
                        print("Error decoding song {}".format(row['song']))

        # Both files are closed (and flushed) here, so the move sees all the data
        shutil.move(scratch_path, output_csv_path)
        scratch_path = None

    except FileNotFoundError as err:
        print(err)

    finally:
        if scratch_path is not None and os.path.exists(scratch_path):
            os.remove(scratch_path)
        

In [None]:
tokenize_file()

# Doc2Vec 
Using Doc2Vec to turn our sets of lyrics into vectors.

In [None]:
!pip3 install sklearn
!pip3 install gensim

In [None]:
import csv
import os
import re
import sys
from itertools import islice

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [None]:

input_file="song_data_2.csv"
input_csv_path = os.path.join(os.path.curdir, 'data', input_file)

def get_data(path=None):
    """
    Returns a generator over the rows of the song CSV file.

    path: CSV file to read; defaults to input_csv_path

    Each row is yielded as a dict keyed by the file's header row.
    """
    csv_path = input_csv_path if path is None else path
    with open(csv_path, mode='r', encoding='utf-8', newline='') as file:
        # DictReader consumes the header row itself, so no row is skipped
        # by hand here; doing so would drop the first song.
        for row in csv.DictReader(file):
            yield row


In [None]:
def tokenize_words(data):
    """
    Split a string into lowercase tokens.

    data: the text to tokenize

    returns: the tokens in order, leaving out any token that contains "embed"
    """
    stream = metapy.analyzers.LowercaseFilter(
        metapy.analyzers.ICUTokenizer(suppress_tags=True)
    )
    stream.set_content(data)
    return [token for token in stream if "embed" not in token]


# One TaggedDocument per lyric line, tagged with the (song, artist) it came from
tokenized_documents = []
documents = get_data()

# Only the first 1000 songs are used; islice stops cleanly if the file has fewer
for document in islice(documents, 1000):
    # Lines of a song are separated by "|" in the data column
    lines = document['data'].split("|")

    song = document['song']
    artist = document['artist']

    for line in lines:
        tokens = tokenize_words(line)
        tokenized_documents.append(TaggedDocument(tokens, [(song, artist)]))

In [None]:
tokenized_documents[:500]

In [None]:
import multiprocessing
cores = multiprocessing.cpu_count()
model = Doc2Vec(tokenized_documents, dm=1, vector_size=100, negative=5, hs=0, sample = 0, workers=cores)

In [None]:
doc = """
I'm tired of gettin' drunk, tired of bein' free
"""
test_doc = tokenize_words(doc.lower())
model.dv.most_similar(positive=[model.infer_vector(test_doc)],topn=200)

# TensorFlow

In [None]:
!pip3 install tensorflow
!pip3 install tensorflow_hub
!pip3 install annoy

In [3]:
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior() 
tf.get_logger().setLevel(logging.ERROR)
import tensorflow_hub as hub
from annoy import AnnoyIndex
import os
import csv
import numpy as np

Instructions for updating:
non-resource variables are not supported in the long term


Define the input file and a generator function to fetch the file row by row

In [4]:
input_file="song_data_2.csv"
input_csv_path = os.path.join(os.path.curdir, 'data', input_file)

def get_rows(path=None):
    """
    Returns a generator for each row of the CSV file

    path: CSV file to read; defaults to input_csv_path

    Each row is yielded as a dict keyed by the file's header row.
    """
    csv_path = input_csv_path if path is None else path
    with open(csv_path, mode='r', encoding='utf-8', newline='') as file:
        # DictReader consumes the header row itself, so no row is skipped
        # by hand here; doing so would drop the first song.
        for row in csv.DictReader(file):
            yield row


Define several helper functions to load documents in batches, fetch TF embeddings, and initialize our index

In [5]:
def batch_load(batch_size=1000):
    """
    Load a batch of songs and return their lyric lines.

    batch_size: maximum number of songs to pull from the global `documents`
                generator (default 1000)

    Every line is also recorded in the global `ids` under the running global
    `counter`, as a (line, song, artist) tuple.

    returns: the lyric lines of the batch; shorter than a full batch (possibly
             empty) once `documents` runs out

    NOTE(review): `ids` is not defined in the visible part of the notebook -
    confirm it exists before calling this (the later cells use `mapping`).
    """
    print("loading batch")

    batch_sentences = []
    global counter

    for _ in range(batch_size):
        # Stop at the end of the data instead of raising StopIteration,
        # so the lines gathered so far are still returned
        document = next(documents, None)
        if document is None:
            break

        data = document['data']
        lines = data.split("|")

        song = document['song']
        artist = document['artist']
        for line in lines:
            ids[counter] = (line, song, artist)
            batch_sentences.append(line)
            counter += 1

    return batch_sentences


In [6]:
def get_embeddings(sentences):
    """
    Gets embeddings for a list of sentences

    sentences: list of strings to embed

    returns: the result of running the encoder module on the sentences,
             one embedding per input sentence
    """
    print("getting embeddings")

    # Build the module in its own graph so repeated calls do not keep adding
    # it to the default graph, and close the session when the call is done.
    with tf.Graph().as_default():
        embed_module = hub.Module("https://tfhub.dev/google/universal-sentence-encoder/1")
        placeholder = tf.placeholder(dtype=tf.string)
        embed = embed_module(placeholder)

        with tf.Session() as session:
            session.run([tf.global_variables_initializer(), tf.tables_initializer()])
            embeddings = session.run(embed, feed_dict={placeholder: sentences})

    return embeddings


In [20]:
# Set up mapping and counter to keep track of documents
counter = 0
mapping = {}
documents = get_rows()

# Initialize the ANNOY index 
ann = AnnoyIndex(512, metric='angular')

In [21]:
def split_lines(batch_num=1000):
    """
    Split up the next batch of documents into lines

    batch_num: maximum number of rows to take from the global `documents`
               generator

    returns: a list of [lyric, song, artist] entries, one per lyric line;
             empty once `documents` is exhausted
    """
    lines = []
    for _ in range(batch_num):
        # The None default turns the end of the data into a clean stop
        # instead of a StopIteration that would discard the partial batch
        row = next(documents, None)

        if row is None:
            break

        data = row['data']
        lyrics = data.split("|")
        song = row['song']
        artist = row['artist']

        for lyric in lyrics:
            lines.append([lyric, song, artist])

    return lines
    
    
def add_items_to_index(sentences):
    """
    Embed a batch of lyric lines and add them to the ANNOY index

    sentences: list of entries whose first element is the text to embed

    Each embedding is stored in `ann` under the current `counter` value, and
    the whole entry is remembered in `mapping` under the same id.
    """
    print("adding items to index")

    global ann, counter, mapping

    texts = [entry[0] for entry in sentences]
    vectors = get_embeddings(texts)

    for position, vector in enumerate(vectors):
        ann.add_item(counter, vector)
        # keep the full entry so a hit can be traced back to its source
        mapping[counter] = sentences[position]
        counter += 1
    

        
# while True: # run entire data set
# Each pass indexes one batch from split_lines(); an empty batch ends the loop early.
for x in range(2): # run just a couple batches
    lines = split_lines()
    if len(lines) == 0:
        break
    add_items_to_index(lines)

# The index is built once, after the loop has added its items.
print("building index")
ann.build(n_trees=10)

adding items to index
getting embeddings
adding items to index
getting embeddings
building index


True

In [22]:
# input_sentence = "I'm sick of sittin' at the house, dyin' on my phone"
input_sentence = "feeding the dog"

query_embeddings = get_embeddings([input_sentence])[0]

# Return 10 nearest neighbors
print("getting nearest neighbors")
nns = ann.get_nns_by_vector(query_embeddings, 10, include_distances=False)

print("Closest: ")
for item in nns:
    print("{} - {}:".format(mapping[item][1], mapping[item][2]))
    # Show the hit with its neighbouring lines (4 before, 3 after)
    for x in range(item-4, item+4):
        # Ids outside the index (hit near the start or end) have no line to show
        if x not in mapping:
            continue
        if x == item:
            print("==== {} ====".format(mapping[x][0]))
        else:
            print("     {}     ".format(mapping[x][0]))


print("\n\n")


getting embeddings
getting nearest neighbors
Closest: 
Old Shit -  Miranda Lambert :
     Redman tobacco  Grandpa's two cents     
     Old timers  "there in a pinch"     
     I'm a fan of it  old shit     
     Splittin logs  smokin hogs     
==== Feedin leftovers to a three legged dog ====
     I'm a fan of it  old shit     
          
     One man's trash is another man's treasure     
Little Red Wagon -  Miranda Lambert :
     I live in Oklahoma     
     And I've got long  blonde hair     
     And I play guitar and I go on the road     
     And I do all the shit you wanna do     
==== And my dog does tricks ====
     And I ain't about drama y'all     
     I love my apron     
     But I ain't your momma     
It’s Christmas Time -  Joey + Rory :
     It's Christmas time  who's at the door     
     I'm sure there's room for 7 more     
     We'll make some pallets on the floor     
     That'll be just fine     
==== The turkeys done  pull up a chair ====
     Grab a hand  let'