# 1. Import Packages and Libraries

In [1]:
import tensorflow as tf
from tensorflow import keras
from keras.layers import Embedding
import keras.backend as K
from keras.models import load_model
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
import xgboost

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score,f1_score,confusion_matrix,recall_score

import scipy
import pandas as pd
import numpy as np
import gensim

import nltk
from nltk.data import find
import matplotlib.pyplot as plt
#import shap

import matplotlib
import sklearn
import pickle
import random
import multiprocessing
import os
import sys
import re
import gc

# 2. Read in Train/Val/Test Data

In [2]:
train = pickle.load(open('Train_Test_Data/genre_sub_genre_train.pkl','rb'))[['danceability', 'energy',
                                                                                 'loudness', 
                                                                                 'speechiness', 'acousticness',
                                                                                 'instrumentalness', 'liveness', 
                                                                                 'valence', 'tempo', 
                                                                                 'duration_ms',
                                                                                 'Lyrics','Major Genre']]

val = pickle.load(open('Train_Test_Data/genre_sub_genre_test.pkl','rb'))[['danceability', 'energy',
                                                                                 'loudness',
                                                                                 'speechiness', 'acousticness',
                                                                                 'instrumentalness', 'liveness', 
                                                                                 'valence', 'tempo',
                                                                                 'duration_ms', 
                                                                                 'Lyrics','Major Genre']].iloc[:1437]
test = pickle.load(open('Train_Test_Data/genre_sub_genre_test.pkl','rb'))[['danceability', 'energy',
                                                                                 'loudness', 
                                                                                 'speechiness', 'acousticness',
                                                                                 'instrumentalness', 'liveness', 
                                                                                 'valence', 'tempo',
                                                                                 'duration_ms', 
                                                                                 'Lyrics','Major Genre']].iloc[1437:]

np.random.seed(50)
all_data = pd.concat([train,val,test],ignore_index=True)
#all_data['Major Genre'] = all_data['Major Genre'].apply(lambda x:'Rock' if x == 'Metal' else x)
all_data = all_data.iloc[np.random.choice(all_data.index,len(all_data))]
train = all_data.iloc[:len(train)]
val = all_data.iloc[len(train):len(train) + len(val)]
test = all_data.iloc[len(train) + len(val):]

# 3. Clean Lyric Data

#### Genius API Generated Text at the beginning of lyrics, typically of the form Track Name or Album Name + ' ' + Lyrics, remove from input text

In [3]:
train['Lyrics'] = train['Lyrics'].apply(lambda x: ' '.join(x.split(' Lyrics')[1:]).lower())
val['Lyrics'] = val['Lyrics'].apply(lambda x: ' '.join(x.split(' Lyrics')[1:]).lower())
test['Lyrics'] = test['Lyrics'].apply(lambda x: ' '.join(x.split(' Lyrics')[1:]).lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['Lyrics'] = train['Lyrics'].apply(lambda x: ' '.join(x.split(' Lyrics')[1:]).lower())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val['Lyrics'] = val['Lyrics'].apply(lambda x: ' '.join(x.split(' Lyrics')[1:]).lower())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['Lyrics'] = test['Ly

#### Drop examples with over 1000 tokens

In [4]:
token_thresh = 1000

train_bool = train['Lyrics'].apply(lambda x:True if len(str(x).split()) <= token_thresh else False)
train = train[train_bool]

val_bool = val['Lyrics'].apply(lambda x:True if len(str(x).split()) <= token_thresh else False)
val = val[val_bool]

test_bool = test['Lyrics'].apply(lambda x:True if len(str(x).split()) <= token_thresh else False)
test = test[test_bool]

train.index = np.arange(0,len(train))
val.index = np.arange(0,len(val))
test.index = np.arange(0,len(test))

In [5]:
del train_bool,val_bool,test_bool
gc.collect()

0

#### Get Language Indicators Marking Natural Split Points Between Songs, shown by genius API with language between brackets to serve as potential text splitting criteria later

In [6]:
def split_text_into_regions(text):
    string = text
    
    #mark line breaks
    string = string.replace('\n','[]')
    string = string.replace('embed','')
    #find language indicators of song sections
    splits = re.findall('\[.*?\]',string)
    #find ad libs to remove
    ad_libs = re.findall('\(.*?\)',string)
    
    #remove ad libs
    if len(ad_libs) > 0:
        for ad_lib in ad_libs:
            string = string.replace(ad_lib,'')
        string = string.replace('  ',' ')
    
    #If there is no splitting criteria, single string is entire song without any additional groupings
    if len(splits) == 0:
        string = [string]
    else:
        #replace split criteria with makers for splitting
        for delim in splits:
            string = string.replace(delim,'[]')
        string = string.split('[]')
    
    #Identify sections of song, made up of groups of lyrics
    sections = []
    section = []
    last_part = ''
    for part in string:
        if part == '' and last_part != '':
            sections.append(section)
            section = []
        elif part != '':
            section.append(part)
        
        last_part = part
    
    try:
        if section != sections[-1]:
            sections.append(section)
    except:
        sections.append(section)
    
    return sections    

#### Cleaner version of lyrics by joining the lyric groups above

In [7]:
def single_text_lyrics(group_of_lyrics):
    lyrics = ''
    for group in group_of_lyrics:
        lyrics = lyrics + ' ' + ' '.join(group)
    return lyrics.strip() 

#### Create Modified Versions of Lyrics

In [8]:
# Lyric Groups
train['Lyric Group'] = train['Lyrics'].apply(lambda x:split_text_into_regions(x))
val['Lyric Group'] = val['Lyrics'].apply(lambda x:split_text_into_regions(x))
test['Lyric Group'] = test['Lyrics'].apply(lambda x:split_text_into_regions(x))

# Cleaner Lyrics
train['Cleaner Lyrics'] = train['Lyric Group'].apply(lambda x:single_text_lyrics(x))
val['Cleaner Lyrics'] = val['Lyric Group'].apply(lambda x:single_text_lyrics(x))
test['Cleaner Lyrics'] = test['Lyric Group'].apply(lambda x:single_text_lyrics(x))

In [9]:
train = train[train['Cleaner Lyrics'] != '']
val = val[val['Cleaner Lyrics'] != '']
test = test[test['Cleaner Lyrics'] != '']

In [10]:
display(train['Major Genre'].value_counts()/len(train))
display(val['Major Genre'].value_counts()/len(val))
display(test['Major Genre'].value_counts()/len(test))

Rock           0.321368
Indie          0.157749
Pop            0.142096
Metal          0.120842
Hip Hop        0.100600
Alternative    0.096822
Blues          0.060522
Name: Major Genre, dtype: float64

Rock           0.316679
Indie          0.164489
Pop            0.146810
Metal          0.126826
Hip Hop        0.094543
Alternative    0.086856
Blues          0.063797
Name: Major Genre, dtype: float64

Rock           0.310185
Indie          0.172068
Pop            0.153549
Alternative    0.105710
Metal          0.105710
Hip Hop        0.095679
Blues          0.057099
Name: Major Genre, dtype: float64

# 4. Class Weights

In [11]:
label_weights = train['Major Genre'].value_counts().max()/train['Major Genre'].value_counts()
class_weights = {}
label_mapping = {}
weights = {}

for num in range(len(label_weights)):
    class_weights[label_weights.index[num]] = label_weights.iloc[num]
    label_mapping[label_weights.index[num]] = num
    weights[num] = label_weights.iloc[num]

In [12]:
display(class_weights)
display(label_mapping)
display(weights)

{'Rock': 1.0,
 'Indie': 2.0372112917023095,
 'Pop': 2.261633428300095,
 'Metal': 2.6594081518704633,
 'Hip Hop': 3.194500335345406,
 'Alternative': 3.3191637630662023,
 'Blues': 5.309921962095875}

{'Rock': 0,
 'Indie': 1,
 'Pop': 2,
 'Metal': 3,
 'Hip Hop': 4,
 'Alternative': 5,
 'Blues': 6}

{0: 1.0,
 1: 2.0372112917023095,
 2: 2.261633428300095,
 3: 2.6594081518704633,
 4: 3.194500335345406,
 5: 3.3191637630662023,
 6: 5.309921962095875}

# 5. Separate Audio and Lyric Features, labels, Standardize Audio Features

In [13]:
scaler = StandardScaler()
scaler.fit(train.iloc[:,:10])

StandardScaler()

In [14]:
# Train Audio + Lyrics
train_audio = scaler.transform(train.iloc[:,:10])
train_lyrics = train.iloc[:,-1]

# Val Audio + Lyrics
val_audio = scaler.transform(val.iloc[:,:10])
val_lyrics = val.iloc[:,-1]

# Test Audio + Lyrics
test_audio = scaler.transform(test.iloc[:,:10])
test_lyrics = test.iloc[:,-1]

#Train/Val/Test Labels
train_labels = train.iloc[:,-3].map(label_mapping)
val_labels = val.iloc[:,-3].map(label_mapping)
test_labels = test.iloc[:,-3].map(label_mapping)

# 6. Feed Forward Network w/ Audio Features

In [15]:
def class_recall(y_true,y_pred):
    #true labels
    true = y_true.numpy()
    #predicted prob of each class for each sample
    pred = y_pred.numpy()
    #prob to class based off max predicted prob
    pred = np.array([x.argmax() for x in pred])
    #confusion matrix
    confuse = confusion_matrix(true,pred)
    confuse_sum = confuse.sum(axis=1)
    score = 0
    for num in range(len(confuse_sum)):
        if confuse_sum[num]!=0:
            score = score + confuse[num][num]/confuse_sum[num]
    
    return score/len(confuse_sum)

In [16]:
def create_ff(hidden_layers = [100,100],hidden_layer_activation = 'relu',dropout_rate = 0.3,shape=(10,),
             output_layer_size = 7, output_layer_activation = 'softmax',learning_rate = 0.001,epochs = 10):
    
    #input layer
    input_layer = tf.keras.layers.Input(shape=shape)
    
    x = input_layer
    for layer in hidden_layers:
        #hidden layer
        hidden = tf.keras.layers.Dense(layer,activation=hidden_layer_activation)(x)
        dropout = tf.keras.layers.Dropout(rate=dropout_rate)(hidden)
        x = dropout
    
    #classification
    classification = tf.keras.layers.Dense(output_layer_size,activation= output_layer_activation)(x)
    
    #model
    model = tf.keras.models.Model(inputs = [input_layer], outputs = [classification])
    
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001,decay=learning_rate/epochs),
                            loss=tf.keras.losses.SparseCategoricalCrossentropy(), 
                            metrics=['accuracy',class_recall],
                 run_eagerly=True)
    
    display(model.summary())
    return model

In [17]:
epochs = 30
model = create_ff(shape=(train_audio.shape[1],),epochs = epochs)
stoppage = keras.callbacks.EarlyStopping(monitor = 'val_class_recall',verbose=1,patience=3,mode='max')
model.fit(np.array(train_audio),np.array(train_labels),
         validation_data=(np.array(val_audio),np.array(val_labels)),
         epochs = epochs,
         batch_size=8,
         class_weight = weights,
         shuffle=True,
         callbacks = [stoppage])

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 10)]              0         
                                                                 
 dense (Dense)               (None, 100)               1100      
                                                                 
 dropout (Dropout)           (None, 100)               0         
                                                                 
 dense_1 (Dense)             (None, 100)               10100     
                                                                 
 dropout_1 (Dropout)         (None, 100)               0         
                                                                 
 dense_2 (Dense)             (None, 7)                 707       
                                                                 
Total params: 11,907
Trainable params: 11,907
Non-trainable p

2022-07-22 00:34:54.537953: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


None

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 12: early stopping


<keras.callbacks.History at 0x7fb8a44b0250>

# 7. Standardized Term Density Features

In [18]:
vectorizer = CountVectorizer()
vectorizer.fit(train['Cleaner Lyrics'])

train_lyrics = pd.DataFrame(vectorizer.transform(train['Cleaner Lyrics']).todense(),columns = vectorizer.get_feature_names())
train_lyrics = (train_lyrics/np.array(train_lyrics).sum(axis = 1).repeat(len(vectorizer.get_feature_names())).reshape(train_lyrics.shape)).astype('float32')

val_lyrics = pd.DataFrame(vectorizer.transform(val['Cleaner Lyrics']).todense(),columns = vectorizer.get_feature_names())
val_lyrics = (val_lyrics/np.array(val_lyrics).sum(axis = 1).repeat(len(vectorizer.get_feature_names())).reshape(val_lyrics.shape)).astype('float32')

test_lyrics = pd.DataFrame(vectorizer.transform(test['Cleaner Lyrics']).todense(),columns = vectorizer.get_feature_names())
test_lyrics = (test_lyrics/np.array(test_lyrics).sum(axis = 1).repeat(len(vectorizer.get_feature_names())).reshape(test_lyrics.shape)).astype('float32')

In [19]:
# Standardize Term Density Features
scaler = StandardScaler()
scaler.fit(train_lyrics)

train_lyrics = scaler.transform(train_lyrics)
val_lyrics = scaler.transform(val_lyrics)
test_lyrics = scaler.transform(test_lyrics)

In [20]:
epochs = 30
model = create_ff(shape=(train_lyrics.shape[1]),dropout_rate=0.3,hidden_layers=[100,100],epochs = epochs)
stoppage = keras.callbacks.EarlyStopping(monitor = 'val_class_recall',verbose=1,patience=3,mode='max')
model.fit(train_lyrics,np.array(train_labels),
         validation_data=(val_lyrics,np.array(val_labels)),
         epochs = epochs,
         batch_size=8,
         class_weight = weights,
         shuffle=True,
         callbacks=[stoppage])

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 83372)]           0         
                                                                 
 dense_3 (Dense)             (None, 100)               8337300   
                                                                 
 dropout_2 (Dropout)         (None, 100)               0         
                                                                 
 dense_4 (Dense)             (None, 100)               10100     
                                                                 
 dropout_3 (Dropout)         (None, 100)               0         
                                                                 
 dense_5 (Dense)             (None, 7)                 707       
                                                                 
Total params: 8,348,107
Trainable params: 8,348,107
Non-tra

None

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 13: early stopping


<keras.callbacks.History at 0x7fb941c01a90>

In [21]:
epochs = 30
model = create_ff(shape=(train_lyrics.shape[1] + train_audio.shape[1]),epochs = epochs)
stoppage = keras.callbacks.EarlyStopping(monitor = 'val_class_recall',verbose=1,patience=3,mode='max')
model.fit(np.hstack((train_lyrics,train_audio)),np.array(train_labels),
         validation_data=(np.hstack((val_lyrics,val_audio)),np.array(val_labels)),
         epochs = epochs,
         batch_size=8,
         class_weight = weights,
         shuffle=True,
         callbacks=[stoppage])

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 83382)]           0         
                                                                 
 dense_6 (Dense)             (None, 100)               8338300   
                                                                 
 dropout_4 (Dropout)         (None, 100)               0         
                                                                 
 dense_7 (Dense)             (None, 100)               10100     
                                                                 
 dropout_5 (Dropout)         (None, 100)               0         
                                                                 
 dense_8 (Dense)             (None, 7)                 707       
                                                                 
Total params: 8,349,107
Trainable params: 8,349,107
Non-tra

None

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 13: early stopping


<keras.callbacks.History at 0x7fb8a44e4d00>