# Introduction and Pipeline
\
Lyrics -> Remove punctuations/unnecessary symbols -> Bag of Words Representation -> Naive Bayes Classifier

The lyrics are provided in a pandas dataframe with the following columns:\
Lyrics | Mood

Mood is our label and we have 4 unique moods labels {'Calm', 'Energetic', 'Happy', 'Sad'}

In [20]:
import io, itertools, collections, os, shutil, re, string
import numpy as np
import pandas as pd
import tqdm

# Need word_tokenize for our bag of words
import nltk
# nltk.download('punkt')
from nltk.tokenize import word_tokenize, TweetTokenizer

song_df = pd.read_csv("data_moodsUPDATED2.csv")
num_rows = int(song_df.size/len(song_df.columns))
# Songs in database: song_df.size/len(song_df.columns)
print('Songs in database = {}'.format(num_rows))

labels = []
for i in range(0, num_rows):
    mood = str(song_df.at[i, 'mood'])
    labels.append(mood)

print("labels =",collections.Counter(labels))

Songs in database = 412
labels = Counter({'Sad': 173, 'Energetic': 125, 'Happy': 96, 'Calm': 18})


# Process Corpus and Labels

In [65]:
label_to_num = {'Sad': 0, 'Energetic': 1, 'Happy': 2, 'Calm': 3}
moods_in_label_order = ['Sad', 'Energetic', 'Happy', 'Calm']

Clean up data and create a bag of words:



In [66]:
# Will clean up the lyric and then store it in corpus
# Then, will use corpus later to create vector representations
BoW = []
labels = []

# Remove punctuation with a table
table = str.maketrans('', '', string.punctuation)

corpus = []

for i in range(0, num_rows):
    new_lyric = str(song_df.at[i, 'lyrics'])

    # Place spaces in front of certain characters
    # so they are tokenized correctly
    new_lyric = new_lyric.replace('.', ' . ')
    new_lyric = new_lyric.replace('\"', ' \" ')
    new_lyric = new_lyric.replace(',', ' , ')
    new_lyric = new_lyric.replace(';', ' ; ')

    # For now remove all symbols
    # In the future, we can examine symbols differently
        # for example, "()" could indicate backing vocals
        # and we would know to regard those lyrics differently
    new_lyric = new_lyric.translate(table)

    corpus.append(new_lyric)
    lyric_tokenizer = TweetTokenizer()
    BoW += [x for x in lyric_tokenizer.tokenize(new_lyric)]
    labels.append( label_to_num[str(song_df.at[i, 'mood'])] )

# Convert labels to numpy array for later use
labels = np.array(labels)

print(BoW[:100])
print(len(BoW))
print(len(corpus))
print(labels)
print(len(labels))

['Dont', 'worry', 'I', 'wont', 'hurt', 'you', 'I', 'only', 'want', 'you', 'to', 'have', 'some', 'fun', 'I', 'was', 'dreamin', 'when', 'I', 'wrote', 'this', 'Forgive', 'me', 'if', 'it', 'goes', 'astray', 'But', 'when', 'I', 'woke', 'up', 'this', 'morning', 'I', 'could', 'have', 'sworn', 'it', 'was', 'judgement', 'day', 'The', 'sky', 'was', 'all', 'purple', 'There', 'were', 'people', 'runnin', 'everywhere', 'Tryin', 'to', 'run', 'from', 'the', 'destruction', 'And', 'you', 'know', 'I', 'didnt', 'even', 'care', 'Cause', 'they', 'say', '2000', 'zero', 'zero', 'party', 'over', 'oops', 'out', 'of', 'time', 'So', 'tonight', 'Im', 'gonna', 'party', 'like', 'its', '1999', 'I', 'was', 'dreamin', 'when', 'I', 'wrote', 'this', 'So', 'sue', 'me', 'if', 'I', 'go', 'too', 'fast']
110977
412
[2 0 0 1 2 1 1 1 2 3 1 0 0 1 0 0 1 1 0 1 0 0 0 1 1 2 1 0 1 1 3 0 0 0 0 1 0
 2 1 0 0 2 0 0 0 1 1 1 0 0 1 1 1 0 0 2 2 1 0 1 2 1 2 1 0 1 2 1 3 1 0 0 2 1
 2 2 0 3 0 2 0 1 0 1 0 0 3 0 2 0 1 0 1 1 2 0 2 1 0 2 1 1 0 0 2 0

Build a vector out of bag of words list using the most frequent words.

In [67]:
import heapq

frequencies = collections.Counter(BoW)
most_freq = heapq.nlargest(8208, frequencies, key=frequencies.get)
# 9120 unique words in our corpus
# 90% of them = 8208 so we take those and leave out the 10% of least common words

print(most_freq)



In [68]:
# Convert lyrics from corpus into corresponding vector representation
lyric_vectors = []

# Go through lyrics in corpus
    # mood label will correlate to the row the lyric is on
    # because order preserved in lists
for lyric in corpus:
    # Same tokenization process as we had to get word frequencies
    # Already took care of creating a pre-processed corpus earlier
    lyric_tokenizer = TweetTokenizer()
    lyric_tokens = lyric_tokenizer.tokenize(lyric)
    lyric_vect = []
    for token in most_freq:
        if token in lyric_tokens:
            lyric_vect.append(1)
        else:
            lyric_vect.append(0)
    lyric_vectors.append(lyric_vect)

print(len(lyric_vectors)) # making sure we still have all of our 412 songs

412


In [69]:
# Convert our vectors into a matrix
lyric_vectors = np.asarray(lyric_vectors)

# Create Naive Bayes Model

In [70]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

Now we use sklearn to get an array of sparce tf-idf numerical vectors based on our corpus, bag of words, and what we selected as most frequent terms. This can be fed into traditional classifiers such as Naive Bayes.

In [71]:
vectorizer = TfidfVectorizer(analyzer=lambda x: x, min_df=1).fit(lyric_vectors)
BoWencoded = vectorizer.transform(lyric_vectors)

In [72]:
# Get training/test splits
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.25, random_state=1).split(lyric_vectors, labels)
train_indices, test_indices = next(sss)

# Train Model

Naive Bayes Classification on our BoW

In [73]:
model = MultinomialNB()
train_x = BoWencoded[train_indices]
test_x = BoWencoded[test_indices]
train_labels = labels[train_indices]
test_labels = labels[test_indices]

model.fit(train_x, train_labels)
predicted_labels = model.predict(test_x)

print(confusion_matrix(labels[test_indices], predicted_labels))
print(classification_report(labels[test_indices], predicted_labels, digits=4, target_names=moods_in_label_order))


[[43  0  0  0]
 [31  0  0  0]
 [24  0  0  0]
 [ 5  0  0  0]]
              precision    recall  f1-score   support

         Sad     0.4175    1.0000    0.5890        43
   Energetic     0.0000    0.0000    0.0000        31
       Happy     0.0000    0.0000    0.0000        24
        Calm     0.0000    0.0000    0.0000         5

    accuracy                         0.4175       103
   macro avg     0.1044    0.2500    0.1473       103
weighted avg     0.1743    0.4175    0.2459       103



  _warn_prf(average, modifier, msg_start, len(result))


# Conclusions

Model performed slightly better than RNN + LSTM for Sad song classification. Saw significantly worse results for other genres (energetic, happy, and calm). This makes me think that some part of the pre-processing/vectorization did not go the way I intended for it to so I will revisit that tonight in order to see how/if that can be improved.

# Next Steps

Going to look into SVMs and Transformers more closely.

* Looking at alternatives to BERT for lack of computational power - http://xplordat.com/2019/10/10/bow-vs-bert-classification/