# <font color='purple'>Predicting Song Genres Using Lyrical Analysis</font>

<i>Authors</i>: Zachary Zalman, Jacob Mannix

<i>Date</i>: 16 May 2020

### Load In Needed Materials and Functions

In [None]:
import pandas as pd
from datetime import datetime
from tqdm import tqdm_notebook as tqdm
import ast

import numpy as np

import spacy

import gensim
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument

from sklearn import utils
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split


# ask spaCy to prefer the GPU; the return value is not checked here
spacy.prefer_gpu()

In [None]:
# define a function to turn the string of lyric tokens into a list of lyric tokens
def format_column(text):
    """Parse a stringified list of lyric tokens and drop whitespace-only tokens.

    Parameters
    ----------
    text : str
        String form of a Python list of tokens, e.g. "['hello', 'world']".

    Returns
    -------
    list
        The parsed tokens in their original order, without any string entry
        that is empty or consists only of whitespace.

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``ast.literal_eval`` when ``text`` is not a valid literal.
    """
    lyrics_list = ast.literal_eval(text)

    # The previous check only matched the exact token '\n ' (newline + space),
    # so other whitespace tokens such as a bare newline, a double newline or a
    # lone space leaked into the result. Non-string entries are left untouched.
    return [
        entry
        for entry in lyrics_list
        if not (isinstance(entry, str) and not entry.strip())
    ]

### First Time Creation of Tokens

In [None]:
# load in the lyrics data set (path is relative to the notebook's working directory)
first_df = pd.read_csv("lyrics.csv")

In [None]:
# relabel the "index" column so that it serves as the song id, then show the size
first_df = first_df.rename(columns={"index": "song_id"})
first_df.shape

In [None]:
# discard the songs that have no lyrics, then show the remaining size
first_df = first_df.dropna(subset=["lyrics"])
first_df.shape

In [None]:
# separate the song metadata from the lyric text; song_id is kept in both frames
info_columns = ["song_id", "song", "year", "artist", "genre"]
lyrics_columns = ["song_id", "lyrics"]

info = first_df[info_columns]
lyrics = first_df[lyrics_columns]

print(info.columns)
print(lyrics.columns)

In [None]:
# load the spaCy pipeline object from the "en_core_web_md" English model
nlp = spacy.load("en_core_web_md")

Creating a tokenizer pipeline

In [None]:
# define the lemmatizer function
def lemmatizer(doc):
    """Rebuild ``doc`` from its lemmas, skipping the '-PRON-' placeholder lemma.

    The kept lemmas are joined with single spaces and the joined text is
    passed to ``nlp.make_doc``, whose result is returned.
    """
    kept_lemmas = []
    for token in doc:
        lemma = token.lemma_
        # '-PRON-' is what pronouns look like after lemmatization; leave them out
        if lemma != '-PRON-':
            kept_lemmas.append(lemma)

    return nlp.make_doc(u' '.join(kept_lemmas))

def remove_punct(doc):
    """Return the text of every non-punctuation token in ``doc`` as a list.

    Plain strings (``token.text``) are collected rather than token objects
    because Gensim needs strings.
    """
    words = []
    for token in doc:
        if not token.is_punct:
            words.append(token.text)
    return words


# add_pipe adds each function to the pipeline as a named component:
# 'lemmatizer' is inserted right after the 'ner' component, 'punct' is placed last
# NOTE(review): passing bare callables to add_pipe looks like the spaCy v2 API — confirm the pinned spaCy version
nlp.add_pipe(lemmatizer,name='lemmatizer',after='ner')
nlp.add_pipe(remove_punct, name="punct", last=True)

In [None]:
# pull out the lyrics column as a Series and display it
doc = lyrics["lyrics"]
doc

In [None]:
# check to see if any lyrics are missing: counts the NaN entries in the column
doc.isna().sum()

In [None]:
# laptop time to run = 
# desktop time to run = 

# Run every song's lyrics through the spaCy pipeline and collect the results in order.
# The loop variable gets its own name: the previous `for doc in tqdm(doc)` rebound
# `doc` from the lyrics Series to the last lyric string, so re-running this cell
# iterated over the characters of a single song instead of over all songs.
doc_list = []

# go through each song
for lyric_text in tqdm(doc):

    # tokenize the document and add the result to the list
    doc_list.append(nlp(lyric_text))

In [None]:
# check the pipeline output for the first song
doc_list[0]

In [None]:
# wrap the per-song token lists in a single-column dataframe named "tokens"
temp = pd.Series(doc_list).to_frame(name="tokens")

# give both frames a fresh 0..n-1 index so that rows line up by position
temp = temp.reset_index(drop=True)
first_df.reset_index(drop=True, inplace=True)

# attach the tokens to the original df by joining on the indices
df_with_tokens = first_df.merge(temp, left_index=True, right_index=True)

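Save and load the data as needed. The cells below work on `df` as read back from the saved file, where the `tokens` column is stored as text.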

In [None]:
# save the df as a pipe-delimited text file, without writing the index
df_with_tokens.to_csv("lyrics_with_tokens.txt", sep="|", index=False)

In [None]:
# load the data set back from disk. Every later cell works on `df`, which was
# never assigned while this line was commented out, and format_column expects
# the tokens column in the string form that the CSV round trip produces.
df = pd.read_csv("lyrics_with_tokens.txt", sep="|")

In [None]:
# drop the songs whose genre is a placeholder label, then show the class counts
excluded_genres = ["Not Available", "Other"]
df = df[~df.genre.isin(excluded_genres)]
df.genre.value_counts()

### Use Doc2Vec For Embeddings

In [None]:
# shuffle the df before splitting. A fixed random_state is passed so that the
# row order is the same after every kernel restart; without it each run
# produced a different ordering and therefore different downstream results.
df = shuffle(df, random_state=42)
df.reset_index(inplace=True, drop=True)
df

In [None]:
# perform a test/train split on the data: test_size=0.3 with a fixed random_state
train, test = train_test_split(df, test_size=0.3, random_state=42)

In [None]:
# turn the training data into tagged documents: each song's token string is
# lower-cased, parsed by format_column, and tagged with its position in
# `train` (as a string)
train_tagged = [
    TaggedDocument(words=format_column(token_text.lower()), tags=[str(position)])
    for position, token_text in enumerate(train.tokens)
]

# preview a few documents instead of rendering the whole corpus as cell output
train_tagged[:3]

In [None]:
# turn the test data into tagged documents: each song's token string is
# lower-cased, parsed by format_column, and tagged with its position in
# `test` (as a string)
test_tagged = [
    TaggedDocument(words=format_column(token_text.lower()), tags=[str(position)])
    for position, token_text in enumerate(test.tokens)
]

# preview a few documents instead of rendering the whole corpus as cell output
test_tagged[:3]

In [None]:
# inspect the first tagged training document (its words and its tag)
train_tagged[0]

Creating the model

In [None]:
# create the parameters of the Doc2Vec model: 300-dimensional vectors,
# a starting learning rate (alpha) of 0.025 and a floor (min_alpha) of 0.00025
# NOTE(review): dm=1 should select the PV-DM algorithm per the gensim docs — confirm
# NOTE(review): workers=8 is hard-coded — confirm it suits the machine this runs on
model = Doc2Vec(vector_size=300,
                window=5, 
                alpha=.025, 
                min_alpha=0.00025, 
                min_count=2, 
                dm=1, 
                workers=8)

# build the vocab of the model from the training documents only
model.build_vocab(train_tagged)

In [None]:
# total number of passes over the training corpus
N_EPOCHS = 50

# Train with a single call and let gensim manage the learning-rate schedule,
# decaying from the model's `alpha` down to its `min_alpha` over all epochs.
#
# The previous version called train() inside a 50-iteration loop with
# epochs=model.epochs, so the corpus was actually traversed
# 50 * model.epochs times, and it adjusted alpha/min_alpha by hand between
# calls, which the gensim documentation warns against. Re-running that cell
# also kept lowering alpha, so results depended on how often it was executed.
model.train(train_tagged,
            total_examples=model.corpus_count,
            epochs=N_EPOCHS)

# persist the trained model so later sessions can load it instead of retraining
model.save('lyricsDoc2Vec.model')

In [None]:
# load in the model if it's already trained (uncomment to skip retraining)
# model = Doc2Vec.load('lyricsDoc2Vec.model')

Creation of test/train X and y arrays

In [None]:
# training features: the learned vector of each training document, looked up by its string tag
train_tags = [str(i) for i in range(len(train_tagged))]
X_train = np.array([model.docvecs[tag] for tag in train_tags])

# training labels: the genre column of the training split
y_train = train['genre']

In [None]:
# test features: infer a vector for each test document from its first field (the words)
X_test = np.array([model.infer_vector(tagged_doc[0]) for tagged_doc in test_tagged])

# test labels: the genre column of the test split
y_test = test['genre']

In [None]:
# inspect the embedding vector of the first training document
X_train[0]

## Networks With Doc2Vec

### Create a Hierarchical Attention Network (HAN)

### Regular CNN

### Regular RNN

### RNN-CNN

## Networks With Internal Embeddings

### Create a Hierarchical Attention Network (HAN)

### Regular CNN

### Regular RNN

### RNN-CNN