First, we install the lyricsgenius API:

In [None]:
!pip install multiprocess

!pip install lyricsgenius
!pip install metapy

We import the libraries and set a path for our input file of artists:

In [23]:
import json
import csv
import multiprocess
import queue
import logging
import sys

from requests.exceptions import HTTPError, ConnectionError, RequestException
from lyricsgenius import Genius


# OS agnostic
import os 
CSV_PATH = os.path.join(os.path.curdir, 'artists', '10000-MTV-Music-Artists-page-%s.csv')

# Scrape Data

We set up a lyricsgenius token, and use the API to pull the lyrics data for each artist in the dataset for the top 10,000 artists from MTV.

In [30]:
# Genius setup
             
def genius_setup():
    token = "EBufquOcw_ts4Y4V7yiddUNyUakTdqCpnMZhiI3XtAScWOntEom8Hj4T87gAV_cA"
    genius = Genius(token, retries=2)

    genius.verbose = False
    genius.remove_section_headers = True
    genius.skip_non_songs = True
    genius.excluded_terms = ["(Remix)", "(Live)"]

    return genius    
# Multiprocessing cores
process_number = int(multiprocess.cpu_count()) * 2

# Data management
final_ = multiprocess.Manager().list()

# artist_queue = queue.Queue()
# final_ = []
checked_artists = set()

file_name = "song_data_2.csv"

# Pull out artists
def get_artists(queue):
    for x in range(1,5):
        path = CSV_PATH % str(x)
        with open(path, encoding="UTF-8") as csvfile:
            TopArtists = csv.reader(csvfile)
            
            # Skip header
            next(TopArtists)
            for row in TopArtists:
                artist = row[0]
                # Check if we should skip this artists since we already found the data
                if artist not in checked_artists:
                    queue.put(artist)
                      


# File management
def write_to_csv(data):
    """
    data: list of dictionaries {artist, song, data}
    """
    global file_name
    
    csv_path = os.path.join(os.path.curdir, 'data', file_name)
    with open(csv_path, 'w') as csv_file: 
        # creating a csv dict writer object 
        print("Entries: {num}".format(num=len(data)))
        keys = data[0].keys()
        writer = csv.DictWriter(csv_file, fieldnames = keys) 
        
        # writing headers (field names) 
        writer.writeheader() 
        
        # writing data rows 
        writer.writerows(data) 
        

def read_csv():
    global final_, checked_artists, file_name   
    
    csv_path = os.path.join(os.path.curdir, 'data', file_name)
    
    # opening the CSV file
    try:
        with open(csv_path, mode ='r', encoding="UTF-8") as file:   

            # reading the CSV file
            data = csv.DictReader(file)

            for entry in data:
                checked_artists.add(entry["artist"])
                final_.append(entry)
                
        print("Number of artists already found {num}".format(num=len(checked_artists)))
    except FileNotFoundError:
        pass
    

# Run genius search
def search_genius(args):
    import sys
    from requests.exceptions import RequestException
    artist_queue, num, genius, final_ = args
    
    def log(string):
        print("[{num}] ".format(num=num) + string + "\n", end='')
        sys.stdout.flush()
    
    # Processing
    def clean_data(data):
        cleaned_data = data.replace("\n", "|").replace(",", " ")
        return cleaned_data

    def process_artist(artist):
        artist_dict = artist.to_dict()
        return ""

    def process_song(song):
        lyrics = clean_data(song.lyrics)
        return lyrics

    def build_entry(artist, song, data, columns = ["artist", "song", "data"]):
        entry = {"artist": artist, "song": song, "data": data}
        return entry
    
    log("Starting")
    try:
        while True:
            genius_artist = None
            artist = artist_queue.get()
            if artist is None:
                log("Done")
                return
            log("Remaining: [{queue}]. Searching {artist}".format(queue=artist_queue.qsize(), artist=artist.strip()))
            
            # Pull data for artist from genius
            for x in range(5):
                try:
                    genius_artist = genius.search_artist(artist, per_page=50, get_full_info=False)
                    break
                except RequestException as e:
                    log("HTTPSConnectionPool exception. Attempt {}/5".format(x+1))
                except Exception as e:
                    log("Exception. Attempt {}/3".format(x+1))
            
            log("Finished {artist}".format(num=num, artist=artist.strip()))
            if genius_artist == None:
                log("{artist} not found".format(num=num, artist=artist.strip()))
                continue
                           
            artist_data =  process_artist(genius_artist)
                           
            log("{artist} number of songs: {song_num}".format(num=num, artist=artist.strip(), song_num=len(genius_artist.songs)))
            
            for song in genius_artist.songs:
                song_data = process_song(song)
                
                # Add to final list
                final_.append(build_entry(artist, song.title, song_data))
    
    except Exception as e:
        log("Something went wrong: {error}".format(num=num, error= e))
    
    
def run(multi_core=False): 
    
    # Setup Genius
    genius = genius_setup()
    
    # Load in any previous data
    print("Reading previous")
    read_csv()
    
    pool = None
    try:  
        if multi_core:
            # multiprocess.log_to_stderr().setLevel(logging.DEBUG)
            print("Multiprocessing with {process_number} processes".format(process_number=process_number))
            
            artist_queue = multiprocess.Manager().Queue()
            get_artists(artist_queue)
            
            for x in range(process_number):
                artist_queue.put(None)
            
            print(artist_queue.qsize())
            # creating processes
            with multiprocess.get_context("spawn").Pool(process_number) as pool:
                args = [(artist_queue, x, genius, final_) for x in range(process_number)]
                pool.map(search_genius, args)
                pool.close()
                pool.join()
            
        else:
            print("Running single core")
            artist_queue = queue.Queue()
            get_artists(artist_queue)
            artist_queue.put(None)
            print(artist_queue.qsize())
            search_genius((artist_queue, 0, genius, final_))

    
    except KeyboardInterrupt:
        if pool:
            pool.close()
            pool.terminate()
            pool.join()
        print("KeyboardInterrupt: Writing results")
    
    finally:
        write_to_csv(list(final_))                       



In [None]:
run(multi_core=True) 

Reading previous
Number of artists already found 2904
Multiprocessing with 32 processes
5455
[0] Starting
[0] Remaining: [5454]. Searching Peer van Mladen
[1] Starting
[1] Remaining: [5453]. Searching JD Shelburne
[2] Starting
[2] Remaining: [5452]. Searching Hearty2Raw
[5] Starting
[5] Remaining: [5451]. Searching Kurt Stevens
[4] Starting
[4] Remaining: [5450]. Searching Albert Phillips
[3] Starting
[3] Remaining: [5449]. Searching Ed Roman
[6] Starting
[6] Remaining: [5448]. Searching YonnieMcfly
[7] Starting
[7] Remaining: [5447]. Searching Madelyn Victoria
[8] Starting
[9] Starting
[8] Remaining: [5446]. Searching Xander Demos
[9] Remaining: [5445]. Searching Jony Privat
[15] Starting
[14] Starting
[21] Starting
[11] Starting
[17] Starting
[13] Starting
[20] Starting
[18] Starting
[15] Remaining: [5444]. Searching Kyle (Rapper)
[19] Starting
[14] Remaining: [5443]. Searching Heidi Feek
[10] Starting
[11] Remaining: [5441]. Searching Saint Gurmeet Ram Rahim Singh Ji Insan
[21] Rema

In [4]:
genius = genius_setup()
genius_artist = genius.search_artist("Sam Hunt", per_page=50, get_full_info=False)

In [13]:
genius_artist.songs[30].lyrics

"No watch\nNo phone\nNo shoes\nGot on a pair of cheap shades I'm probably gonna lose\nDug a hole in the sand with some kid's toy\nNow I'm kicking back in it like a lazy boy\nGot the radio right\nI'm lounging hard\nTake a picture of me, put it on a postcard\n\nHey, I'm on vacation\nI might not go back home\nAin't got no static on my reggae station\nI'm here\nI'm gone\nI'm on vacation\n\nThat parasailing don't look all that hard\nThat ain't nothing but fly fishing for sharks\nRunning on the beach is for the seabirds\nSay that for these overachievers\nI just sit right here on the breeze\nExcuse me honey, can you get my shoulders please?\nHey, I'm on vacation\nI might not go back home\nAin't got no static on my reggae station\nI'm here\nI'm gone\nI'm on vacation\n\nSitting by my swimming pool\nEating lobster tail in my swimming suit\nRight here holding me down, down by the ocean\nYou can find me somewhere in slow motion, slow motion\n\nHey, I'm on vacation\nI might not go back home\nAin't 

# Tokenizing 
Now we tokenize the lyrics into stemmed, lowercase unigrams:

In [5]:
import metapy
from tempfile import NamedTemporaryFile

# OS agnostic
import os 
import shutil

In [28]:
def tokenize(data):
    """
    data: a string to tokenize
    
    tokens: a list of tokenized ngrams
    """
    doc = metapy.index.Document()
    doc.content(data)

    tok = metapy.analyzers.ICUTokenizer(suppress_tags=True)
    tok = metapy.analyzers.LowercaseFilter(tok)
    tok = metapy.analyzers.Porter2Filter(tok)     
    ana = metapy.analyzers.NGramWordAnalyzer(1, tok)
    trigrams = ana.analyze(doc)
    tok.set_content(doc.content())

    tokens, counts = [], []
    for token, count in trigrams.items():
        tokens.append(token)
        counts.append(count)    

    return tokens


def tokenize_file(input_file="song_data.csv", output_file="song_data_tokenize.csv"):
    """
    processes a file (artist, song, data) into tokenized lyrics
    currently only processes first song (?)
    
    file_name: file to tokenize data rows in
    """
    print("Tokenizing data in", input_file)

    # set our path and temp file to write to - safer
    input_csv_path = os.path.join(os.path.curdir, 'data', input_file)
    output_csv_path = os.path.join(os.path.curdir, 'data', output_file)
    tempfile = NamedTemporaryFile('w+t', newline='', delete=False)
    
    # read in each lyric and tokenize it as a metapy document
    try:
        with open(input_csv_path, mode ='r+', encoding = 'utf-8') as file, tempfile:   
            # read from main file, write to temp file
            reader = csv.DictReader(file)
            writer = csv.DictWriter(tempfile, extrasaction='ignore', 
                                    fieldnames=['artist', 'song', 'data'])
            
            for row in reader:
                # print("Tokenizing song", row['song'])
                try:
                    row['data'] = tokenize(row['data'])
                    writer.writerow(row)
                except UnicodeDecodeError:
                    print("Error decoding sonng {}".format(row['song']))
            
            shutil.move(tempfile.name, output_csv_path)
    
    except (FileNotFoundError) as err:
        print(err)
        

In [9]:
tokenize_file()

Tokenizing data in song_data.csv
Error decoding sonng Half The World Away but if it was released


# Doc2Vec 
Using doc2vec to turn out sets of lyrics in to vectors

In [None]:
!pip3 install sklearn
!pip3 install gensim

In [6]:
# from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import os
import csv
import re
import sys

In [16]:

input_file="song_data_2.csv"
input_csv_path = os.path.join(os.path.curdir, 'data', input_file)

def get_data():
    documents = []
    with open(input_csv_path, mode ='r+', encoding = 'utf-8') as file:   
            datareader = csv.DictReader(file)
            
            next(datareader)
            for row in datareader:
                yield row


In [7]:
def tokenize_words(data):
    tok = metapy.analyzers.ICUTokenizer(suppress_tags=True)
    tok = metapy.analyzers.LowercaseFilter(tok)
    tok.set_content(data)
    tokens = []
    for t in tok:
        if not "embed" in t:
            tokens.append(t)
    return tokens


tokenized_documents = []
size = len(documents)
documents = get_data()
for x in range(1000):
    
    document = next(documents)
    
    data = document['data']
    lines = data.split("|")
    
    song = document['song']
    artist = document['artist']
    
    for line in lines:
        tokens = tokenize_words(line)
        tokenized_documents.append(TaggedDocument(tokens, [(song, artist)]))

In [15]:
tokenized_documents[:500]

[TaggedDocument(words=['got', 'a', 'girl', 'from', 'the', 'southside', 'got', 'braids', 'in', 'her', 'hair'], tags=[('Body Like a Back Road', ' Sam Hunt ')]),
 TaggedDocument(words=['first', 'time', 'i', 'seen', 'her', 'walk', 'by', 'man', 'i', "'", 'bout', 'fell', 'up', 'out', 'my', 'chair'], tags=[('Body Like a Back Road', ' Sam Hunt ')]),
 TaggedDocument(words=['had', 'to', 'get', 'her', 'number', 'it', 'took', 'me', 'like', 'six', 'weeks'], tags=[('Body Like a Back Road', ' Sam Hunt ')]),
 TaggedDocument(words=['now', 'me', 'and', 'her', 'go', 'way', 'back', 'like', 'cadillac', 'seats'], tags=[('Body Like a Back Road', ' Sam Hunt ')]),
 TaggedDocument(words=[], tags=[('Body Like a Back Road', ' Sam Hunt ')]),
 TaggedDocument(words=['body', 'like', 'a', 'back', 'road', 'drivin', "'", 'with', 'my', 'eyes', 'closed'], tags=[('Body Like a Back Road', ' Sam Hunt ')]),
 TaggedDocument(words=['i', 'know', 'every', 'curve', 'like', 'the', 'back', 'of', 'my', 'hand'], tags=[('Body Like a Ba

In [9]:
import multiprocessing
cores = multiprocessing.cpu_count()
model = Doc2Vec(tokenized_documents, dm=1, vector_size=100, negative=5, hs=0, sample = 0, workers=cores)

In [12]:
doc = """
I'm tired of gettin' drunk, tired of bein' free
"""
test_doc = tokenize_words(doc.lower())
model.dv.most_similar(positive=[model.infer_vector(test_doc)],topn=200)

[(('\u200bwebseries001.wav', ' Twenty One Pilots '), 0.9179467558860779),
 (('Disco.mp3', ' Twenty One Pilots '), 0.8726375102996826),
 (('You Gave Me Love', ' Fifth Harmony '), 0.8280184268951416),
 (('The Way You Look', ' Fifth Harmony '), 0.7855388522148132),
 (('I Thought There Was Time', ' Blake Shelton '), 0.7839843034744263),
 (('Holding on to You (Radio Version) [TV Track]', ' Twenty One Pilots '),
  0.7811504602432251),
 (('Daytime IHeart Radio Festival 2018 Setlist', ' 5 Seconds Of Summer '),
  0.7758229374885559),
 (('No Boys Allowed', ' Fifth Harmony '), 0.7733003497123718),
 (('Anything Could Happen (Finals)', ' Fifth Harmony '), 0.7668943405151367),
 (('Can’t Help Falling in Love', ' Twenty One Pilots '), 0.7501000761985779),
 (('Come Over (Acoustic)', ' Sam Hunt '), 0.7463387846946716),
 (('Give Your Heart a Break (Demi Lovato cover)', ' Fifth Harmony '),
  0.7438371777534485),
 (('We Don’t Believe What’s on TV', ' Twenty One Pilots '),
  0.7337716221809387),
 (('Don’t S

# TensorFlow

In [21]:
!pip3 install tensorflow
!pip3 install tensorflow_hub
!pip3 install annoy

Looking in indexes: https://pypi.python.org/simple, https://pypi.apple.com/simple
Looking in indexes: https://pypi.python.org/simple, https://pypi.apple.com/simple
Looking in indexes: https://pypi.python.org/simple, https://pypi.apple.com/simple
Collecting annoy
  Downloading annoy-1.17.0.tar.gz (646 kB)
     |████████████████████████████████| 646 kB 5.3 MB/s            
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: annoy
  Building wheel for annoy (setup.py) ... [?25ldone
[?25h  Created wheel for annoy: filename=annoy-1.17.0-cp36-cp36m-macosx_12_0_x86_64.whl size=69705 sha256=93e8b781a1bcceb167d22c9aecd310c920704de9b62d235f7d969282774a2303
  Stored in directory: /Users/braydenturner/Library/Caches/pip/wheels/1d/90/43/49cf1e7f7aaaebab491d3447e5d4063fdf7407173a3455f8c2
Successfully built annoy
Installing collected packages: annoy
Successfully installed annoy-1.17.0


In [1]:
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior() 
import tensorflow_hub as hub
from annoy import AnnoyIndex
import os
import csv
import numpy as np

Instructions for updating:
non-resource variables are not supported in the long term


In [3]:
input_file="song_data_2.csv"
input_csv_path = os.path.join(os.path.curdir, 'data', input_file)

def get_data():
    documents = []
    with open(input_csv_path, mode ='r+', encoding = 'utf-8') as file:   
            datareader = csv.DictReader(file)
            
            next(datareader)
            for row in datareader:
                yield row
                
documents = get_data()
batch_sentences = []
ids = dict()
counter = 0
for x in range(1000):
    
    document = next(documents)
    
    data = document['data']
    lines = data.split("|")
    
    song = document['song']
    artist = document['artist']
    for line in lines:
        ids[counter] = (line, song, artist)
        batch_sentences.append(line)
        counter+=1



In [5]:
def get_embeddings(sentences):
    embed_module = hub.Module("https://tfhub.dev/google/universal-sentence-encoder/1")
    placeholder = tf.placeholder(dtype=tf.string)
    embed = embed_module(placeholder)
    session = tf.Session()
    session.run([tf.global_variables_initializer(), tf.tables_initializer()])
    
    embeddings = session.run(embed, feed_dict={placeholder: sentences})
    
    return embeddings

g = tf.Graph()
with g.as_default():
    
    embeddings = get_embeddings(batch_sentences)
    
    #USE emits 512 dimensional vectors
    D=512

    #Default number of trees
    NUM_TREES=10

    index_file = "annoy_file"

    ann = AnnoyIndex(D)
    for index, embed in enumerate(embeddings):
        ann.add_item(index, embed)
    ann.build(NUM_TREES)
    ann.save("index")
    

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


In [20]:
input_sentence = "I'm sick of sittin' at the house, dyin' on my phone"

query_embeddings = get_embeddings([input_sentence])[0]

#Return 10 nearest neighbors
nns = ann.get_nns_by_vector(query_embeddings, 10, include_distances=False)
print("Closest: ")
for item in nns:
    print("{} - {}:".format(ids[item][1], ids[item][2]))
    for x in range(first-4, first+4):
        if x == first:
            print("==== {} ====".format(ids[x][0]))
        else:
            print("     {}     ".format(ids[x][0]))
    
print("\n\n")

first = nns[0]


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


Closest: 
[887] ("I'm sick of sittin' at the house  dyin' on my phone", "Breaking Up Was Easy in the 90's", ' Sam Hunt ')
[905] ("I'm sick of sittin' at the house  dyin' on my phone", "Breaking Up Was Easy in the 90's", ' Sam Hunt ')
[917] ("Baby  I'm sick of sittin' at the house  dyin' on my phone", "Breaking Up Was Easy in the 90's", ' Sam Hunt ')
[39053] ("Well  I don't have to hear no jingle bells  just the ringing of the phone", 'Time For Me To Come Home', ' Blake Shelton ')
[39065] ("Well  I don't have to hear no jingle bells  just the ringing of the phone", 'Time For Me To Come Home', ' Blake Shelton ')
[18446] ('Bad bitch in my door', 'Monies', ' Fifth Harmony ')
[18479] ('Bad bitch in my door', 'Monies', ' Fifth Harmony ')
[1858] ("I'm over drivin' around  windows down", 'Over You', ' Sam Hunt ')
[22107] ("Don't you wanna fall asleep with me tonight?", 'Don’t You Wanna Stay', ' Jason Aldean ')
[22118] ("Don't you wanna fall asleep with me tonight?", 'Don’t You Wanna Stay', ' J