In [1]:
import tensorflow as tf
from tensorflow import keras
from keras.layers import Embedding
import keras.backend as K
from keras.callbacks import EarlyStopping

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.layers import Conv2D
from tensorflow.keras.layers import MaxPooling2D
from tensorflow.keras.layers import Activation
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Model
from tensorflow.keras.layers import concatenate

from sklearn.feature_extraction.text import CountVectorizer

!pip install transformers
from transformers import BertTokenizer, TFBertModel

import pickle as pkl
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import math
import string
import multiprocessing
import unicodedata
import re
import gc
import sklearn

import gensim
import nltk
nltk.download('word2vec_sample')
from nltk.data import find
nltk.download('stopwords')
from nltk.corpus import stopwords
stopWords = set(stopwords.words('english'))


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


[nltk_data] Downloading package word2vec_sample to /root/nltk_data...
[nltk_data]   Package word2vec_sample is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
!pwd

/Users/trespimentel/Desktop/w266_final_project/Genre_Classification


# 1. Import data, filter out problematic data, create normalized feature set

In [None]:
# import train and test data
df_test = pkl.load(open('Train_Test_Data/genre_sub_genre_test.pkl', 'rb'))
df_train = pkl.load(open('Train_Test_Data/genre_sub_genre_train.pkl', 'rb'))

In [None]:
df_train['Lyrics'].iloc[:5]

16303    Unbreakable Lyrics[Intro]\nGo take it all\nYou...
8721     NOBODY LyricsTake a bitch\nThat I have in one ...
11930    Worth It Lyrics[Verse 1]\nYour eyes are just l...
7945     Bloodrush Lyrics[Intro: Denzel Curry]\nUgh\nUg...
15504    Age of Man Lyrics[Intro]\nIn an age of darknes...
Name: Lyrics, dtype: object

data cleanliness issue: it seems that a few things were causing us to get incorrect lyrics from genius:

1. our dataset contains versions of songs e.g. xyz remastered, xyz live version, etc. which genius thinks is a different song than the actual "base" song

2. genius saves songs/artists in "unaccented" characters (e.g. cafe vs café) - our dataset has these accents which is causing us to get incorrect results

we can solve these problems in 2 ways - cleaning pre-genius query or filtering post-query.  for now, i have filtered them post-query but we can always re-run the query if we want.

based on exploration of this error, it appears that the incorrect data we are getting are all very long documents - there exists a risk that we are getting incorrect lyric data that is the same size as the correct results, but i have no way of checking this other than spot-checking (which i have done).  the only additional errors i found using this method had to do with a special case of (2) above - i believe filtering out any rows where the song or artist contains accented characters will be enough to solve this issue.

In [None]:
# in case we want to clean pre-query, here are variables we can use to find problem words/chars
#problem_words = ['acoustic', 'version', 'remastered', 'anniversary', 'remaster']
#accented_characters = "ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ"

In [None]:
# filter out results
df_train.drop(df_train[df_train['Lyrics'].str.len() > 5000].index, inplace=True)
df_train[df_train['Artist Name'].str.contains("ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ") == False]
df_train[df_train['Track Name'].str.contains("ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ") == False]

df_test.drop(df_test[df_test['Lyrics'].str.len() > 5000].index, inplace=True)
df_test[df_test['Artist Name'].str.contains("ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ") == False]
df_test[df_test['Track Name'].str.contains("ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ") == False]

Unnamed: 0,Artist Name,Track Name,Popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,...,Sub-Genre: modern alternative rock,Sub-Genre: southern hip hop,Sub-Genre: nu metal,Sub-Genre: israeli mediterranean,Sub-Genre: thrash metal,Sub-Genre: pop rock,Sub-Genre: chicago blues,Sub-Genre: indie pop,Sub-Genre: classic rock,Sub-Genre: hardcore hip hop
5,Stevie Ray Vaughan,Life By The Drop,51,0.659,0.163,6,-11.864,0,0.0388,0.76600,...,0,0,0,0,0,0,0,0,1,0
6,DARKSIDE,Paper Trails,55,0.947,0.419,8,-13.043,0,0.0578,0.77800,...,0,0,0,0,0,0,0,0,0,0
11,"Christone ""Kingfish"" Ingram",Outside Of This Town,48,0.418,0.866,11,-4.033,0,0.0513,0.00381,...,0,0,0,0,0,0,0,0,0,0
28,Jesse Cook,I Put A Spell On You,34,0.420,0.373,1,-9.302,0,0.0320,0.92200,...,0,0,0,0,0,0,0,0,0,0
31,"Christone ""Kingfish"" Ingram",Before I'm Old,41,0.534,0.649,2,-5.526,1,0.0410,0.04380,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19316,Woolbright,Tuesday,23,0.514,0.819,11,-6.713,0,0.0375,0.01220,...,0,0,0,0,0,0,0,0,0,0
19321,Runnin' Wild,How You Want It Done,27,0.614,0.953,9,-3.539,1,0.0517,0.07710,...,0,0,0,0,0,0,0,0,0,0
19344,Four Year Strong,Go Down in History,48,0.505,0.985,5,-4.401,1,0.1190,0.00006,...,0,0,0,0,0,0,0,0,0,0
19355,Nathaniel Rateliff & The Night Sweats,S.O.B.,66,0.699,0.579,1,-6.504,1,0.0416,0.26700,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# this removes ~10% of both train/test data
print(len(df_train))
print(len(df_test))

14778
2581


below cells are related to the tensorflow issue involving lyric + audio data.  leaving until resolved (do not run)

In [None]:
#test = df_train_audio_normalized.iloc[:5,:-1].copy()
#test1 = [np.array(x) for x in train_tokens_prebuilt[0:5]]
#test1 = train_tokens_prebuilt[0:5]
#print(test)

In [None]:
#ls = list([list(test.iloc[num]) for num in range(len(test))])

#print(ls)

In [None]:
#final_array = list([list(x) for x in zip(ls, test1)])
#print(final_array[0])

end issue section

In [30]:
# let's create a df that is all of the normalized audio features we want to use
df_train_audio_normalized = df_train[['danceability', 'energy', 'loudness', 'acousticness', 'speechiness', 'instrumentalness', 'valence', 'tempo','duration_ms']].copy()
df_train_audio_normalized = (df_train_audio_normalized-df_train_audio_normalized.mean())/df_train_audio_normalized.std()

df_test_audio_normalized = df_test[['danceability', 'energy', 'loudness', 'acousticness', 'speechiness', 'instrumentalness', 'valence', 'tempo','duration_ms']].copy()
df_test_audio_normalized = (df_test_audio_normalized-df_test_audio_normalized.mean())/df_test_audio_normalized.std()

In [None]:
# looks good
df_train_audio_normalized

Unnamed: 0,danceability,energy,loudness,acousticness,speechiness,instrumentalness,valence,tempo,duration_ms
16303,0.083890,0.541561,0.082481,-0.663296,-0.419475,-0.421705,0.281600,-0.045805,-0.635516
8721,1.777120,0.379438,-0.018027,-0.311507,0.535964,-0.420469,0.204008,-0.455375,-1.530281
11930,0.162224,0.202576,0.573121,-0.635356,-0.672419,-0.422246,-0.485701,-0.726473,-0.476165
7945,0.867234,-0.475394,-1.048238,-0.368260,2.019137,-0.423698,-0.856420,0.194771,-0.346021
15504,-0.536761,-0.534348,0.497078,-0.535344,-0.632178,2.799775,-1.059022,0.425864,1.866022
...,...,...,...,...,...,...,...,...,...
7303,0.626205,-0.401702,-0.481556,0.156409,3.410330,-0.424316,1.363581,1.375829,-0.792824
9125,1.572245,0.615253,-1.054850,0.763627,-0.007866,-0.415252,1.962766,0.043757,0.134579
5125,1.801223,0.163273,-0.034888,0.751721,-0.057305,-0.418003,0.919581,-0.827523,-0.473074
15805,-0.386117,-1.477611,-2.291367,0.287377,-0.662071,0.707498,-0.826245,1.420558,0.838621


# 2. Embedding Creation, get token and label data ready

In [None]:
# get word2vec model
word2vec_sample = str(find('models/word2vec_sample/pruned.word2vec.txt'))
model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_sample, binary=False)

In [None]:
# how big does our embedding matrix need to be
print(len(model.key_to_index.items()))

In [None]:
#construct embedding matrix w/ prebuilt embedding
vocab_dict = model.key_to_index.copy()
embedding_matrix = np.zeros((43982,300))
for word,index in model.key_to_index.items():
    embedding_matrix[index] = model[word]

In [None]:
# text cleaning function - this is just part 1
def text_cleaner(text_data):
    return_data = []
    for text in text_data:
        final_text = []
        new_text = text.lower()
        new_text = new_text.replace('\n',' ')
        new_text = re.sub(r"[,.;@#?!&$]+\ *", " ", new_text)
        new_text = new_text.replace('   ',' ')
        new_text = new_text.replace('  ',' ')
        new_text = new_text.split()
        for word in new_text:
            if word not in stopWords:
                final_text.append(word)
        return_data.append(final_text)
    return return_data

In [None]:
train_lang_clean = text_cleaner(df_train['Lyrics'])
test_lang_clean = text_cleaner(df_test['Lyrics'])
#train_lang_clean[0]

In [None]:
# this is a messy implementation, but this basically is a step 2 text cleaner that is slightly different based off the format of the text
# if i have time, i will combine into a single function, but for now this runs and does what it intends to do

def more_cleaning_if_brackets(song):
    include_word = False
    clean_song = []
    final_song_counter = 0
    
    for word in song:
        final_song_counter += 1
        if ']' in word and '[' in word:
            include_word = True
            continue
        
        if include_word == False:
            if ']' in word:
                    include_word = True
                    continue
                    
        else: # if include_word == True
            if '[' in word:
                    include_word = False
                    
        if include_word == True and final_song_counter != len(song):
            new_word = word.replace('(','').replace(')','')
            new_word = new_word.lower()
            clean_song.append(new_word)
        elif include_word == True and final_song_counter == len(song):
            try:
                r = re.compile("([a-zA-Z]+)([0-9]+)")
                clean_song.append(r.match(word).groups()[0])
            except:
                pass
            
    return(clean_song)

def more_cleaning_if_no_brackets(song):
    include_word = False
    clean_song = []
    final_song_counter = 0
    
    for word in song:
        final_song_counter += 1
        if ']' in word and '[' in word:
            continue
        
        if include_word == False:
            if 'lyrics' in word:
                    include_word = True
                    continue
                    
        else: # if include_word == True
            if '[' in word:
                    include_word = False
                    
        if include_word == True and final_song_counter != len(song):
            new_word = word.replace('(','').replace(')','')
            new_word = new_word.lower()
            clean_song.append(new_word)
        elif include_word == True and final_song_counter == len(song):
            try:
                r = re.compile("([a-zA-Z]+)([0-9]+)")
                clean_song.append(r.match(word).groups()[0])
            except:
                pass
            
    return(clean_song)

In [None]:
train_lang_clean_post_function = []
for song in train_lang_clean:
    if '[' in ''.join(song):
        new_song = more_cleaning_if_brackets(song)
        train_lang_clean_post_function.append(new_song)
    else:
        new_song = more_cleaning_if_no_brackets(song)
        train_lang_clean_post_function.append(new_song)
        
test_lang_clean_post_function = []
for song in test_lang_clean:
    if '[' in ''.join(song):
        new_song = more_cleaning_if_brackets(song)
        test_lang_clean_post_function.append(new_song)
    else:
        new_song = more_cleaning_if_no_brackets(song)
        test_lang_clean_post_function.append(new_song)

In [None]:
#checks
print(len(test_lang_clean_post_function))
#train_lang_clean_post_function[2]

In [None]:
# add cleaned lyrics to df
df_train['Lyrics'] = train_lang_clean_post_function
df_test['Lyrics'] = test_lang_clean_post_function

In [None]:
# this takes our cleaned text and converts it to word2vec tokens

def text_to_index_post_cleaning(text_data,mapping,max_size):
    return_data = []
    for text in text_data:
        mapped_text = []
        for token in text:
            try:
                mapped_text.append(mapping[token])
            except:
                mapped_text.append(len(mapping))
        
        if len(mapped_text) > max_size:
            mapped_text = mapped_text[:max_size]
        else:
            while len(mapped_text) < max_size:
                mapped_text.append(len(mapping))
                
        return_data.append(mapped_text)
    
    return return_data

one thing i would like to do once we are confident data is in a good place is experiment with prebuilt embedding size - 1000 seems pretty long

In [None]:
# tokenize lyrics - for the prebuilt embedding models these are our X
train_tokens_prebuilt_new = text_to_index_post_cleaning(df_train['Lyrics'],vocab_dict,1000)
test_tokens_prebuilt_new = text_to_index_post_cleaning(df_test['Lyrics'],vocab_dict,1000)
df_train['Lyric_Tokens'] = train_tokens_prebuilt_new
df_test['Lyric_Tokens'] = test_tokens_prebuilt_new

In [21]:
# get labels "Y"
train_labels = df_train['Major Genre']
test_labels = df_test['Major Genre']

In [23]:
# create mapper so we can use numeric labels in our networks
mapping = {}
count = 0
for label in train_labels.unique():
    mapping[label] = count
    count = count + 1
print(mapping)

{'Rock': 0, 'Indie': 1, 'Alternative': 2, 'Hip Hop': 3, 'Metal': 4, 'Pop': 5, 'Blues': 6}


In [None]:
# want to keep the functions consistent across notebooks, so defining this so i can use the DAN and WAN models as-is
embedding_matrix_custom = None

# 3. Lyric-Only Models

DAN Model

In [None]:
# i want to try letting these run and using early stopping to see how high we get
es = EarlyStopping(monitor='val_accuracy', mode='max', patience = 3, verbose=1, restore_best_weights = True)

In [None]:
def create_dan_model(retrain_embeddings=False, 
                     max_sequence_length=1000,
                     embedding_matrix=embedding_matrix_custom, 
                     hidden_dim=[100,100,100],
                     dropout_rate=0.3,
                     hidden_layer_activation = 'relu',
                     output_layer_size = 4,
                     output_activation = 'softmax',
                     learning_rate=0.001):
    """
    Construct the DAN model including the compilation and return it. Parametrize it using the arguments.
    retrain_embeddings: bool, indicates whether embeddings are retrainable
    max_sequence_length: Number of token IDs to expect in a given input
    embedding_matrix: initialize embedding layer with embedding matrix, specifying weights
    hidden_dim = number of neurons in hidden layers
    dropout = dropout rate
    output_layer_size = # of neurons in output layer corresponding to # of classes, each neuron predicts P(class K | x)
    output_activation = activation function for output layer
    learning_rate = learning rate for gradient descent for finding model params to optimize loss
    """
    
    #Specify Embedding Layer, including shape, intialize with weights, expected input length, and whether it is trainable
    dan_embedding_layer = Embedding(embedding_matrix.shape[0],
                                  embedding_matrix.shape[1],
                                  weights = [embedding_matrix],
                                  input_length=max_sequence_length,
                                  trainable=retrain_embeddings,
                                   name = 'embedding_layer')
    
    
    #Input Layer, sequence of max_sequence_length tokens
    dan_input_layer = tf.keras.layers.Input(shape=(max_sequence_length,), dtype='int64',name='input')
    #Inputs go into embedding layer, form max_sequence_length x embedding dim matrix
    dan_embeddings = dan_embedding_layer(dan_input_layer)
    #Embeddings are averaged, forming single vector represenation of size embedding matrix
    dan_avg_input_embeddings = tf.keras.layers.Lambda(lambda x: K.mean(x, axis=1), name='averaging')(dan_embeddings)
    
    #input into hidden layers
    x = dan_avg_input_embeddings #hidden layer initial input
    count = 1
    for layer in hidden_dim:
        hidden = tf.keras.layers.Dense(layer,activation = hidden_layer_activation,name='hidden_' + str(count))(x)
        dropout = tf.keras.layers.Dropout(dropout_rate,name='dropout_' + str(count))(hidden)
        count = count + 1
        x = dropout
        
    #dan_hidden_out_1 = tf.keras.layers.Dense(hidden_dim, activation='relu', name='hidden_1')(dan_avg_input_embeddings)
    #dan_hidden_out_1 = tf.keras.layers.Dropout(dropout)(dan_hidden_out_1)
    dan_classification = tf.keras.layers.Dense(output_layer_size, activation='softmax', name='dan_classification')(x)
    dan_model = tf.keras.models.Model(inputs=dan_input_layer, outputs=[dan_classification])
    dan_model.compile(loss=keras.losses.SparseCategoricalCrossentropy(),
                  optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate,
                                                beta_1=0.9,
                                                beta_2=0.999,
                                                epsilon=1e-07,
                                                amsgrad=False,
                                                name='Adam'),
                 metrics='accuracy')
    
    print(dan_model.summary())

    return dan_model

In [None]:
dan_model_sorted = create_dan_model(embedding_matrix = embedding_matrix, output_layer_size = 7)
dan_sorted_history = dan_model_sorted.fit(np.array(train_tokens_prebuilt_new),
                        np.array(train_labels.map(mapping)),
                        validation_data=(np.array(test_tokens_prebuilt_new), np.array(test_labels.map(mapping))),
                        batch_size=8,
                        epochs=100,
                        shuffle=True,
                        use_multiprocessing=True,workers=multiprocessing.cpu_count() - 1,
                        callbacks = [es])

Model: "model_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input (InputLayer)          [(None, 1000)]            0         
                                                                 
 embedding_layer (Embedding)  (None, 1000, 300)        13194600  
                                                                 
 averaging (Lambda)          (None, 300)               0         
                                                                 
 hidden_1 (Dense)            (None, 100)               30100     
                                                                 
 dropout_1 (Dropout)         (None, 100)               0         
                                                                 
 hidden_2 (Dense)            (None, 100)               10100     
                                                                 
 dropout_2 (Dropout)         (None, 100)               0   

40% validation accuracy after 10 epochs - so not very good!  but the model is learning a little, at least

Lets see if a WAN will perform any better

In [None]:
def create_wan_model(retrain_embeddings=False, 
                     max_sequence_length=1000,
                     embedding_matrix=embedding_matrix_custom,
                     num_attention = 1,
                     hidden_dim=[100,100,100],
                     dropout_rate=0.3,
                     hidden_layer_activation = 'relu',
                     output_layer_size = 4,
                     output_activation = 'softmax',
                     learning_rate=0.001,
                     loss = keras.losses.SparseCategoricalCrossentropy()):
    """
    Construct the WAN model including the compilation and return it. Parametrize it using the arguments.
    retrain_embeddings: bool, indicates whether embeddings are retrainable
    max_sequence_length: Number of token IDs to expect in a given input
    embedding_matrix: initialize embedding layer with embedding matrix, specifying weights
    num_attention = number of parallel attention computations that learn how to balance embeddings into a single
    vector representation, final attention layer weights prior attention based representations
    hidden_dim = number of neurons in hidden layers
    dropout = dropout rate
    output_layer_size = # of neurons in output layer corresponding to # of classes, each neuron predicts P(class K | x)
    output_activation = activation function for output layer
    learning_rate = learning rate for gradient descent for finding model params to optimize loss
    """
    
    #Specify Embedding Layer, including shape, intialize with weights, expected input length, and whether it is trainable
    wan_embedding_layer = Embedding(embedding_matrix.shape[0],
                                  embedding_matrix.shape[1],
                                  weights = [embedding_matrix],
                                  input_length=max_sequence_length,
                                  trainable=retrain_embeddings,
                                   name = 'embedding_layer')
    
    
    #Input Layer, sequence of max_sequence_length tokens
    wan_input_layer = tf.keras.layers.Input(shape=(max_sequence_length,), dtype='int64',name='input')
    #Inputs go into embedding layer, form max_sequence_length x embedding dim matrix
    wan_embeddings = wan_embedding_layer(wan_input_layer)
    
    #Create attention based single vector representations of words according to alternative query vectors
    attention_embeddings = []
    for num in range(num_attention):
        #Apply Query Vector to words in embeddings, returning a max_sequence_length x 1 tensor
        l1_query = tf.keras.layers.Dense(1,activation='linear',use_bias=False,name='attention_query' + str(num+1))(wan_embeddings)
        #reshape to 1 x max_sequence_length
        l1_reshape_query = tf.keras.layers.Reshape((1,max_sequence_length))(l1_query)
        #Softmax over query * key (words) to obtain weights
        l1_weights = tf.keras.layers.Lambda(lambda x:tf.keras.activations.softmax(x),
                                            name='attention_weights' + str(num+1))(l1_reshape_query)
        #weight embeddings according to weights
        l1_attention = tf.keras.layers.Flatten()(tf.keras.layers.Dot((1,2))((wan_embeddings,l1_weights)))
        attention_embeddings.append(l1_attention)
    
    concat_attention = tf.keras.layers.Concatenate()(attention_embeddings)
    concat_attention = tf.keras.layers.Reshape((num_attention,embedding_matrix.shape[1]))(concat_attention)
    
    #Apply Query Vector to attention based representations, returning a num_attention x 1 tensor
    wan_query = tf.keras.layers.Dense(1,activation='linear',use_bias=False,name='attention_query')(concat_attention)
    #reshape to 1 x num_attention
    reshaped_query = tf.keras.layers.Reshape((1,num_attention))(wan_query)
    #Softmax over query * key (words) to obtain weights
    wan_weights = tf.keras.layers.Lambda(lambda x:tf.keras.activations.softmax(x),
                                        name='attention_weights')(reshaped_query)
    #weight attention embeddings according to weights, learning how to balance attention based vector representations 
    #from prior layer
    wan_attention = tf.keras.layers.Flatten()(tf.keras.layers.Dot((1,2))((concat_attention,wan_weights)))
    
    #input into hidden layers
    x = wan_attention #hidden layer initial input
    count = 1
    for layer in hidden_dim:
        hidden = tf.keras.layers.Dense(layer,activation = hidden_layer_activation,name='hidden_' + str(count))(x)
        dropout = tf.keras.layers.Dropout(dropout_rate,name='dropout_' + str(count))(hidden)
        count = count + 1
        x = dropout
        
    #wan_hidden_out_1 = tf.keras.layers.Dense(hidden_dim, activation='relu', name='hidden_1')(wan_avg_input_embeddings)
    #wan_hidden_out_1 = tf.keras.layers.Dropout(dropout)(wan_hidden_out_1)
    wan_classification = tf.keras.layers.Dense(output_layer_size, activation=output_activation, name='wan_classification')(x)
    wan_model = tf.keras.models.Model(inputs=wan_input_layer, outputs=[wan_classification])
    wan_model.compile(loss=loss,
                  optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate,
                                                beta_1=0.9,
                                                beta_2=0.999,
                                                epsilon=1e-07,
                                                amsgrad=False,
                                                name='Adam'),
                 metrics='accuracy')
    
    print(wan_model.summary())

    return wan_model

In [None]:
wan_model_sorted = create_wan_model(embedding_matrix=embedding_matrix, output_layer_size = 7,
                                   num_attention=1)
wan_sorted_history = wan_model_sorted.fit(np.array(train_tokens_prebuilt_new),
                        np.array(train_labels.map(mapping)),
                        validation_data=(np.array(test_tokens_prebuilt_new), np.array(test_labels.map(mapping))),
                        batch_size=8,
                        epochs=100,
                        shuffle=True,
                        use_multiprocessing=True,workers=multiprocessing.cpu_count() - 1,
                        callbacks = [es],
                        class_weight = class_weights)

Model: "model_17"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input (InputLayer)             [(None, 1000)]       0           []                               
                                                                                                  
 embedding_layer (Embedding)    (None, 1000, 300)    13194600    ['input[0][0]']                  
                                                                                                  
 attention_query1 (Dense)       (None, 1000, 1)      300         ['embedding_layer[0][0]']        
                                                                                                  
 reshape (Reshape)              (None, 1, 1000)      0           ['attention_query1[0][0]']       
                                                                                           

42%...so not much better - but it seems like it was steadily improving

both of those were performed with embedding lengths of 1000 over 10 epochs.  what happens if we experiment with embedding size?

# 3A Prebuilt Embeddings, Size Experimentation

In [None]:
# let's start by splitting our data up so we can easily get length
def clean_and_split(text_data):
    return_data = []
    for text in text_data:
        new_text = text.lower()
        new_text = text.replace('\n',' ')
        new_text = text.replace('  ',' ')
        new_text = new_text.split()
        return_data.append(new_text)
    return(return_data)

In [None]:
list_lyrics = clean_and_split(df_train['Lyrics'])

In [None]:
len_list = []

for lyric in new_lyrics:
    len_list.append(len(lyric))

len_list = np.array(len_list)
print('Mean length = ',np.mean(len_list))
print('Stdev = ',np.std(len_list))

In [None]:
bins = np.linspace(math.ceil(min(len_list)), 
                   math.floor(max(len_list)),
                   20) # fixed number of bins

plt.xlim([min(len_list)-5, max(len_list)+5])

plt.hist(len_list, bins=bins, alpha=0.5)
plt.title('Song Length')
plt.xlabel('Length')
plt.ylabel('Count')

plt.show()

i dont have a great sense for what embedding size will be good, so lets try a few different options

In [None]:
embedding_sizes = [300,450,600,750,900]

In [None]:
best_dan_score, best_wan_score  = 0,0
best_dan_emb_size, best_wan_emb_size = None, None

for embedding_size in embedding_sizes:
    
    train_tokens_prebuilt = text_to_index_post_cleaning(df_train['Lyrics'],vocab_dict,embedding_size)
    test_tokens_prebuilt = text_to_index_post_cleaning(df_test['Lyrics'],vocab_dict,embedding_size)
    
    dan_model_sorted = create_dan_model(embedding_matrix = embedding_matrix, output_layer_size = 7, max_sequence_length=embedding_size)
    dan_sorted_history = dan_model_sorted.fit(np.array(train_tokens_prebuilt),
                        np.array(train_labels.map(mapping)),
                        validation_data=(np.array(test_tokens_prebuilt), np.array(test_labels.map(mapping))),
                        batch_size=8,
                        epochs=100,
                        shuffle=True,
                        use_multiprocessing=True,workers=multiprocessing.cpu_count() - 1,
                        callbacks = [es])
    
    
    if max(dan_sorted_history.history['val_accuracy']) > best_dan_score:
            best_dan_score = max(dan_sorted_history.history['val_accuracy'])
            best_dan_emb_size = embedding_size
            
    wan_model_sorted = create_wan_model(embedding_matrix=embedding_matrix, output_layer_size = 7, max_sequence_length=embedding_size,
                                   num_attention=1)
    wan_sorted_history = wan_model_sorted.fit(np.array(train_tokens_prebuilt),
                        np.array(train_labels.map(mapping)),
                        validation_data=(np.array(test_tokens_prebuilt), np.array(test_labels.map(mapping))),
                        batch_size=8,
                        epochs=100,
                        shuffle=True,
                        use_multiprocessing=True,workers=multiprocessing.cpu_count() - 1,
                        callbacks = [es])
    
    
    if max(wan_sorted_history.history['val_accuracy']) > best_wan_score:
            best_wan_score = max(wan_sorted_history.history['val_accuracy'])
            best_wan_emb_size = embedding_size

print('best dan performance ', best_dan_score)
print('best dan embedding size ', best_dan_emb_size)
print('best wan performance ', best_wan_score)
print('best wan embedding size ', best_wan_emb_size)

Model: "model_15"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input (InputLayer)          [(None, 300)]             0         
                                                                 
 embedding_layer (Embedding)  (None, 300, 300)         13194600  
                                                                 
 averaging (Lambda)          (None, 300)               0         
                                                                 
 hidden_1 (Dense)            (None, 100)               30100     
                                                                 
 dropout_1 (Dropout)         (None, 100)               0         
                                                                 
 hidden_2 (Dense)            (None, 100)               10100     
                                                                 
 dropout_2 (Dropout)         (None, 100)               0  

Total params: 13,246,207
Trainable params: 51,607
Non-trainable params: 13,194,600
__________________________________________________________________________________________________
None
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 17: early stopping
Model: "model_17"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input (InputLayer)          [(None, 450)]             0         
                                                                 
 embedding_layer (Embedding)  (None, 450, 300)         13194600  
                                                                 
 averaging (Lambda)          (None, 300)               0         
                                                                 
 hidden_1 (Dense)            (

 dot_16 (Dot)                   (None, 300, 1)       0           ['embedding_layer[0][0]',        
                                                                  'attention_weights1[0][0]']     
                                                                                                  
 flatten_16 (Flatten)           (None, 300)          0           ['dot_16[0][0]']                 
                                                                                                  
 concatenate_8 (Concatenate)    (None, 300)          0           ['flatten_16[0][0]']             
                                                                                                  
 reshape_25 (Reshape)           (None, 1, 300)       0           ['concatenate_8[0][0]']          
                                                                                                  
 attention_query (Dense)        (None, 1, 1)         300         ['reshape_25[0][0]']             
          

Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 15: early stopping
Model: "model_20"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input (InputLayer)             [(None, 600)]        0           []                               
                                                                                                  
 embedding_layer (Embedding)    (None, 600, 300)     13194600    ['input[0][0]']                  
                                                                                                  
 attention_query1 (Dense)       (None, 600, 1)       300         ['embedding_layer[0][0]']        
                                                                                                  
 reshape_27 (Reshape)      

Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 22: early stopping
Model: "model_21"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input (InputLayer)          [(None, 750)]             0         
                                                                 
 embedding_layer (Embedding)  (None, 750, 300)         13194600  
                                                                 
 averaging (Lambda)          (None, 300)               0         
                                                                 
 hidden_1 (Dense)            (None, 100)               30100     
                                                                 
 dropout_1 (Dropout)         (None, 100)               0         
                                                                 
 hidden_2

                                                                                                  
 hidden_1 (Dense)               (None, 100)          30100       ['flatten_21[0][0]']             
                                                                                                  
 dropout_1 (Dropout)            (None, 100)          0           ['hidden_1[0][0]']               
                                                                                                  
 hidden_2 (Dense)               (None, 100)          10100       ['dropout_1[0][0]']              
                                                                                                  
 dropout_2 (Dropout)            (None, 100)          0           ['hidden_2[0][0]']               
                                                                                                  
 hidden_3 (Dense)               (None, 100)          10100       ['dropout_2[0][0]']              
          

Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 23: early stopping
best dan performance  0.425416499376297
best dan embedding size  450
best wan performance  0.4436264932155609
best wan embedding size  600


# 4 Custom Embeddings

Both of those models used prebuilt embeddings, what happens if we use custom ones?

In [None]:
df_train['Lyrics_String']=df_train['Lyrics'].apply(lambda x: " ".join(x))
df_test['Lyrics_String']=df_test['Lyrics'].apply(lambda x: " ".join(x))
vectorizer = CountVectorizer()

In [None]:
df_train['Lyrics_String']

16303    go take life dreams fire go take day time hour...
8721     bitch one click ruin life trip yea take pic bi...
11930    eyes like face bit different bit fucked guess ...
7945     ugh ugh what what ugh what what ugh ugh behind...
15504    age darkness light appears wards away ancient ...
                               ...                        
7303     saucey genius aztro cut put magnum bottom gloc...
9125     i’ve loved / i’ve done months made feel young ...
5125     another it's kel p vibes wanna give everything...
15805    woke mornin' understand means give life one ma...
2952     läppar döljer dina tänder och din tunga är så ...
Name: Lyrics_String, Length: 14778, dtype: object

In [None]:
vectorizer.fit(df_train['Lyrics_String'])

CountVectorizer()

In [None]:
len(vectorizer.get_feature_names())

96406

In [None]:
embedding_matrix_cust = np.random.random((len(vectorizer.get_feature_names()) + 1) * 300).reshape((len(vectorizer.get_feature_names()) + 1,300))

In [None]:
embedding_matrix_cust[-1] = 0

In [None]:
mapping_dict = {}
i = 0
for feature_name in vectorizer.get_feature_names():
    mapping_dict[feature_name] = i
    i += 1

In [None]:
def get_unique_words(dataset, mapping_dict, seq_size = 1000):
    mapped_lyrics = []
    for song in dataset:
        song_tokens = []
        for word in song.split():
            try:
                song_tokens.append(mapping_dict[word])
            except:
                song_tokens.append(len(mapping_dict))
        if len(song_tokens) > seq_size:
            song_tokens = song_tokens[:seq_size]
        elif len(song_tokens) < seq_size:
            while len(song_tokens) < seq_size:
                song_tokens.append(len(mapping_dict))
                    
        mapped_lyrics.append(song_tokens)
    return np.array(mapped_lyrics)
    

In [None]:
mapped_lyrics_train = get_unique_words(df_train['Lyrics_String'], mapping_dict, seq_size = 1000)
mapped_lyrics_test = get_unique_words(df_test['Lyrics_String'], mapping_dict, seq_size = 1000)

In [None]:
mapping

{'Rock': 0,
 'Indie': 1,
 'Alternative': 2,
 'Hip Hop': 3,
 'Metal': 4,
 'Pop': 5,
 'Blues': 6}

In [35]:
train_labels.value_counts()

Rock           2406
Indie          1196
Pop            1057
Metal           934
Alternative     720
Hip Hop         675
Blues           420
Name: Major Genre, dtype: int64

In [24]:
weights = 4720/train_labels.value_counts()
class_weights = {}
for num in range(len(weights)):
    class_weights[mapping[weights.index[num]]] = weights.iloc[num]

In [25]:
class_weights

{0: 1.9617622610141314,
 1: 3.9464882943143813,
 2: 6.555555555555555,
 3: 6.992592592592593,
 4: 5.053533190578158,
 5: 4.465468306527909,
 6: 11.238095238095237}

In [None]:
dan_model_sorted = create_dan_model(embedding_matrix = embedding_matrix_cust, output_layer_size = 7)
dan_sorted_history = dan_model_sorted.fit(np.array(mapped_lyrics_train),
                        np.array(train_labels.map(mapping)),
                        validation_data=(np.array(mapped_lyrics_test), np.array(test_labels.map(mapping))),
                        batch_size=8,
                        epochs=100,
                        shuffle=True,
                        use_multiprocessing=True,workers=multiprocessing.cpu_count() - 2,
                        callbacks = [es],
                        class_weight = class_weights                  
                        )

Model: "model_25"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input (InputLayer)          [(None, 1000)]            0         
                                                                 
 embedding_layer (Embedding)  (None, 1000, 300)        28922100  
                                                                 
 averaging (Lambda)          (None, 300)               0         
                                                                 
 hidden_1 (Dense)            (None, 100)               30100     
                                                                 
 dropout_1 (Dropout)         (None, 100)               0         
                                                                 
 hidden_2 (Dense)            (None, 100)               10100     
                                                                 
 dropout_2 (Dropout)         (None, 100)               0  

In [None]:
wan_model_sorted = create_wan_model(embedding_matrix=embedding_matrix_cust, output_layer_size = 7,
                                   num_attention=1)
wan_sorted_history = wan_model_sorted.fit(np.array(mapped_lyrics_train),
                        np.array(train_labels.map(mapping)),
                        validation_data=(np.array(mapped_lyrics_test), np.array(test_labels.map(mapping))),
                        batch_size=8,
                        epochs=100,
                        shuffle=True,
                        use_multiprocessing=True,workers=multiprocessing.cpu_count() - 1,
                        callbacks = [es],
                        class_weight = class_weights)

Model: "model_26"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input (InputLayer)             [(None, 1000)]       0           []                               
                                                                                                  
 embedding_layer (Embedding)    (None, 1000, 300)    28922100    ['input[0][0]']                  
                                                                                                  
 attention_query1 (Dense)       (None, 1000, 1)      300         ['embedding_layer[0][0]']        
                                                                                                  
 reshape_12 (Reshape)           (None, 1, 1000)      0           ['attention_query1[0][0]']       
                                                                                           

best result .29

lets experiment with embedding size and see if we find anything interesting

In [None]:
best_dan_score, best_wan_score  = 0,0
best_dan_emb_size, best_wan_emb_size = None, None
embedding_sizes_cust = [100,200,300,400,500,600,700,800,900]

for embedding_size in embedding_sizes_cust:
    
    mapped_lyrics_train = get_unique_words(df_train['Lyrics_String'], mapping_dict, seq_size = embedding_size)
    mapped_lyrics_test = get_unique_words(df_test['Lyrics_String'], mapping_dict, seq_size = embedding_size)
    
    dan_model_sorted = create_dan_model(embedding_matrix = embedding_matrix_cust, output_layer_size = 7, max_sequence_length=embedding_size)
    dan_sorted_history = dan_model_sorted.fit(np.array(mapped_lyrics_train),
                        np.array(train_labels.map(mapping)),
                        validation_data=(np.array(mapped_lyrics_test), np.array(test_labels.map(mapping))),
                        batch_size=8,
                        epochs=100,
                        shuffle=True,
                        use_multiprocessing=True,workers=multiprocessing.cpu_count() - 2,
                        callbacks = [es],
                        class_weight = class_weights)
    
    
    if max(dan_sorted_history.history['val_accuracy']) > best_dan_score:
            best_dan_score = max(dan_sorted_history.history['val_accuracy'])
            best_dan_emb_size = embedding_size
            
    wan_model_sorted = create_wan_model(embedding_matrix=embedding_matrix_cust, output_layer_size = 7, max_sequence_length=embedding_size,
                                   num_attention=1)
    wan_sorted_history = wan_model_sorted.fit(np.array(mapped_lyrics_train),
                        np.array(train_labels.map(mapping)),
                        validation_data=(np.array(mapped_lyrics_test), np.array(test_labels.map(mapping))),
                        batch_size=8,
                        epochs=100,
                        shuffle=True,
                        use_multiprocessing=True,workers=multiprocessing.cpu_count() - 2,
                        callbacks = [es],
                        class_weight = class_weights)
    
    
    if max(wan_sorted_history.history['val_accuracy']) > best_wan_score:
            best_wan_score = max(wan_sorted_history.history['val_accuracy'])
            best_wan_emb_size = embedding_size

print('best dan performance ', best_dan_score)
print('best dan embedding size ', best_dan_emb_size)
print('best wan performance ', best_wan_score)
print('best wan embedding size ', best_wan_emb_size)

Model: "model_30"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input (InputLayer)          [(None, 100)]             0         
                                                                 
 embedding_layer (Embedding)  (None, 100, 300)         28922100  
                                                                 
 averaging (Lambda)          (None, 300)               0         
                                                                 
 hidden_1 (Dense)            (None, 100)               30100     
                                                                 
 dropout_1 (Dropout)         (None, 100)               0         
                                                                 
 hidden_2 (Dense)            (None, 100)               10100     
                                                                 
 dropout_2 (Dropout)         (None, 100)               0  

Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 5: early stopping
Model: "model_32"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input (InputLayer)          [(None, 200)]             0         
                                                                 
 embedding_layer (Embedding)  (None, 200, 300)         28922100  
                                                                 
 averaging (Lambda)          (None, 300)               0         
                                                                 
 hidden_1 (Dense)            (None, 100)               30100     
                                                                 
 dropout_1 (Dropout)         (None, 100)               0         
                                                                 
 hidden_2 (Dense)            (None, 100)               10100     
                                                                

None
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 9: early stopping
Model: "model_34"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input (InputLayer)          [(None, 300)]             0         
                                                                 
 embedding_layer (Embedding)  (None, 300, 300)         28922100  
                                                                 
 averaging (Lambda)          (None, 300)               0         
                                                                 
 hidden_1 (Dense)            (None, 100)               30100     
                                                                 
 dropout_1 (Dropout)         (None, 100)               0         
                                                                 
 hidden_2 (Dense)            (None, 100)             

                                                                                                  
 hidden_3 (Dense)               (None, 100)          10100       ['dropout_2[0][0]']              
                                                                                                  
 dropout_3 (Dropout)            (None, 100)          0           ['hidden_3[0][0]']               
                                                                                                  
 wan_classification (Dense)     (None, 7)            707         ['dropout_3[0][0]']              
                                                                                                  
Total params: 28,973,707
Trainable params: 51,607
Non-trainable params: 28,922,100
__________________________________________________________________________________________________
None
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 6: early stopping
Model: "model_36"
______

 flatten_19 (Flatten)           (None, 300)          0           ['dot_19[0][0]']                 
                                                                                                  
 hidden_1 (Dense)               (None, 100)          30100       ['flatten_19[0][0]']             
                                                                                                  
 dropout_1 (Dropout)            (None, 100)          0           ['hidden_1[0][0]']               
                                                                                                  
 hidden_2 (Dense)               (None, 100)          10100       ['dropout_1[0][0]']              
                                                                                                  
 dropout_2 (Dropout)            (None, 100)          0           ['hidden_2[0][0]']               
                                                                                                  
 hidden_3 

                                                                                                  
 reshape_31 (Reshape)           (None, 1, 300)       0           ['concatenate_10[0][0]']         
                                                                                                  
 attention_query (Dense)        (None, 1, 1)         300         ['reshape_31[0][0]']             
                                                                                                  
 reshape_32 (Reshape)           (None, 1, 1)         0           ['attention_query[0][0]']        
                                                                                                  
 attention_weights (Lambda)     (None, 1, 1)         0           ['reshape_32[0][0]']             
                                                                                                  
 dot_21 (Dot)                   (None, 300, 1)       0           ['reshape_31[0][0]',             
          

                                                                                                  
 reshape_33 (Reshape)           (None, 1, 600)       0           ['attention_query1[0][0]']       
                                                                                                  
 attention_weights1 (Lambda)    (None, 1, 600)       0           ['reshape_33[0][0]']             
                                                                                                  
 dot_22 (Dot)                   (None, 300, 1)       0           ['embedding_layer[0][0]',        
                                                                  'attention_weights1[0][0]']     
                                                                                                  
 flatten_22 (Flatten)           (None, 300)          0           ['dot_22[0][0]']                 
                                                                                                  
 concatena

 Layer (type)                   Output Shape         Param #     Connected to                     
 input (InputLayer)             [(None, 700)]        0           []                               
                                                                                                  
 embedding_layer (Embedding)    (None, 700, 300)     28922100    ['input[0][0]']                  
                                                                                                  
 attention_query1 (Dense)       (None, 700, 1)       300         ['embedding_layer[0][0]']        
                                                                                                  
 reshape_36 (Reshape)           (None, 1, 700)       0           ['attention_query1[0][0]']       
                                                                                                  
 attention_weights1 (Lambda)    (None, 1, 700)       0           ['reshape_36[0][0]']             
          

Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 10: early stopping
Model: "model_45"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input (InputLayer)             [(None, 800)]        0           []                               
                                                                                                  
 embedding_layer (Embedding)    (None, 800, 300)     28922100    ['input[0][0]']                  
                                                                                                  
 attention_query1 (Dense)       (None, 800, 1)       300         ['embedding_layer[0][0]']        
                                                                                                  
 reshape_39 (Reshape)           (None, 1, 800)       0           ['a

 hidden_1 (Dense)            (None, 100)               30100     
                                                                 
 dropout_1 (Dropout)         (None, 100)               0         
                                                                 
 hidden_2 (Dense)            (None, 100)               10100     
                                                                 
 dropout_2 (Dropout)         (None, 100)               0         
                                                                 
 hidden_3 (Dense)            (None, 100)               10100     
                                                                 
 dropout_3 (Dropout)         (None, 100)               0         
                                                                 
 dan_classification (Dense)  (None, 7)                 707       
                                                                 
Total params: 28,973,107
Trainable params: 51,007
Non-trainable params: 28,9

currently:

best dan performance  0.3304920494556427
best dan embedding size  700
best wan performance  0.3138318359851837
best wan embedding size  300
    

# 5 Audio Feature DL

lets look at the audio features we have too

In [None]:
# reminder, what does our data look like?
df_train_audio_normalized

Unnamed: 0,danceability,energy,loudness,acousticness,speechiness,instrumentalness,valence,tempo,duration_ms
16303,0.083890,0.541561,0.082481,-0.663296,-0.419475,-0.421705,0.281600,-0.045805,-0.635516
8721,1.777120,0.379438,-0.018027,-0.311507,0.535964,-0.420469,0.204008,-0.455375,-1.530281
11930,0.162224,0.202576,0.573121,-0.635356,-0.672419,-0.422246,-0.485701,-0.726473,-0.476165
7945,0.867234,-0.475394,-1.048238,-0.368260,2.019137,-0.423698,-0.856420,0.194771,-0.346021
15504,-0.536761,-0.534348,0.497078,-0.535344,-0.632178,2.799775,-1.059022,0.425864,1.866022
...,...,...,...,...,...,...,...,...,...
7303,0.626205,-0.401702,-0.481556,0.156409,3.410330,-0.424316,1.363581,1.375829,-0.792824
9125,1.572245,0.615253,-1.054850,0.763627,-0.007866,-0.415252,1.962766,0.043757,0.134579
5125,1.801223,0.163273,-0.034888,0.751721,-0.057305,-0.418003,0.919581,-0.827523,-0.473074
15805,-0.386117,-1.477611,-2.291367,0.287377,-0.662071,0.707498,-0.826245,1.420558,0.838621


In [None]:
# audio features should help too: lets see what results we get from
# a standard feed-forward network
# note: audio features have been normalized

model = keras.Sequential([
    keras.layers.Dense(100,activation='relu'),
    keras.layers.Dense(100,activation='relu'),
    keras.layers.Dense(7,activation='softmax')
])

#Compile the model, specifying loss function, optimizer, and performance metric
model.compile(loss = keras.losses.SparseCategoricalCrossentropy(),
             optimizer = keras.optimizers.Adam(learning_rate=0.01),
             metrics=['accuracy'],
             )

model.fit(x = np.array(df_train_audio_normalized),y = train_labels.map(mapping),batch_size=8,epochs=10,
         validation_data = (np.array(df_test_audio_normalized) ,test_labels.map(mapping)),
         use_multiprocessing=True,workers=multiprocessing.cpu_count() - 1, class_weight = class_weights)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fc8fd72aaf0>

In [44]:
#predictions = model.predict(np.array(df_test_audio_normalized))
#predictions = model.predict([np.array(test_term_df), np.array(df_test_audio_normalized)])
predictions = model.predict(np.array(test_term_df))

In [45]:
predictions_ = [x.argmax() for x in predictions]

In [46]:
conf = sklearn.metrics.confusion_matrix(test_labels.map(mapping), predictions_)
conf = pd.DataFrame(conf, index = mapping.keys(), columns = mapping.keys())
conf

Unnamed: 0,Rock,Indie,Alternative,Hip Hop,Metal,Pop,Blues
Rock,781,0,0,0,0,0,0
Indie,427,0,0,0,0,0,0
Alternative,239,0,0,0,0,0,0
Hip Hop,272,0,0,0,0,0,0
Metal,353,0,0,0,0,0,0
Pop,346,0,0,0,0,0,0
Blues,163,0,0,0,0,0,0


write up some precision/recall stuff on the final conf

# 6. Lyrics + Audio DL

In [None]:
def create_multimodal_genre_FFN(max_sequence_length = 1000, retrain_embeddings = True, learning_rate = 0.01):
    audio_inputs = keras.layers.Input(shape = (8,), dtype = 'float32', name = 'audio_input')
    lyric_inputs = keras.layers.Input(shape = (1000,), dtype='int64',name='lyric_input')
                                    
    ffn_embedding_layer = Embedding(embedding_matrix.shape[0],
                                  embedding_matrix.shape[1],
                                  weights = [embedding_matrix],
                                  input_length=max_sequence_length,
                                  trainable=retrain_embeddings,
                                   name = 'embedding_layer')
    
    #Input Layer, sequence of max_sequence_length tokens
    #ffn_input_layer = tf.keras.layers.Input(shape=(max_sequence_length,), dtype='int64',name='input')
    #Inputs go into embedding layer, form max_sequence_length x embedding dim matrix
    ffn_embeddings = ffn_embedding_layer(lyric_inputs)
    ffn_avg_input_embeddings = tf.keras.layers.Lambda(lambda x: K.mean(x, axis=1), name='averaging')(ffn_embeddings)
    concat_ffn = tf.keras.layers.Concatenate()([ffn_avg_input_embeddings,audio_inputs])
    hidden = keras.layers.Dense(100,activation='relu')(concat_ffn)
    classification = keras.layers.Dense(7,activation='softmax')(hidden)
    
    ffn_model = tf.keras.models.Model(inputs=[audio_inputs,lyric_inputs], outputs=[classification])
    ffn_model.compile(loss=keras.losses.SparseCategoricalCrossentropy(),
                  optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate,
                                                beta_1=0.9,
                                                beta_2=0.999,
                                                epsilon=1e-07,
                                                amsgrad=False,
                                                name='Adam'),
                 metrics='accuracy',
                     run_eagerly = True)
    
    print(ffn_model.summary())

    return ffn_model
  
    

In [None]:
# define two sets of inputs
inputA = Input(shape=(1000,))
inputB = Input(shape=(9,))
# the first branch operates on the first input
x = Dense(8, activation="relu")(inputA)
x = Dense(4, activation="relu")(x)
x = Model(inputs=inputA, outputs=x)
# the second branch opreates on the second input
y = Dense(64, activation="relu")(inputB)
y = Dense(32, activation="relu")(y)
y = Dense(4, activation="relu")(y)
y = Model(inputs=inputB, outputs=y)
# combine the output of the two branches
combined = concatenate([x.output, y.output])
# apply a FC layer and then a classification prediction on the
# combined outputs
z = Dense(2, activation="relu")(combined)
z = Dense(7, activation="softmax")(z)
#z = Dense(1, activation="linear")(z)
# our model will accept the inputs of the two branches and
# then output a single value
model = Model(inputs=[x.input, y.input], outputs=z)

In [None]:
model.compile(loss=keras.losses.SparseCategoricalCrossentropy(),
                  optimizer=tf.keras.optimizers.Adam(learning_rate=.001,
                                                beta_1=0.9,
                                                beta_2=0.999,
                                                epsilon=1e-07,
                                                amsgrad=False,
                                                name='Adam'),
             metrics='accuracy',
             )

In [None]:
model.fit(x=[np.array(train_tokens_prebuilt_new), np.array(df_train_audio_normalized)], y=train_labels.map(mapping), validation_data=([np.array(test_tokens_prebuilt_new), np.array(df_test_audio_normalized)], test_labels.map(mapping)), epochs=10, batch_size=8,
             class_weight = class_weights)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fc046908160>

In [None]:
# -------------------------------Model that accepts input and creates embedding matrix-------------------------------
#input_layer = tf.keras.layers.Input(shape=(1000,))
lyric_input = tf.keras.layers.Input(shape=(1000,))
audio_input = tf.keras.layers.Input(shape=(9,))
#Specify Embedding Layer, including shape, intialize with weights, expected input length, and whether it is trainable
embedding_layer = Embedding(embedding_matrix.shape[0],
                            embedding_matrix.shape[1],
                            weights = [embedding_matrix],
                            input_length=1000,
                            trainable=True,
                            name = 'embedding_layer')

embeddings = embedding_layer(lyric_input)
embedding_model = tf.keras.Model(inputs = [lyric_input],outputs=[embeddings])


# -----------------------------------------------------AUDIO FFN MODEL-----------------------------------------------------

# AUDIO
#audio_layer = Dense(300, activation="relu")(audio_input)
# add more layers?
audio_model = tf.keras.Model(inputs = [audio_input],outputs=[audio_input])

# -----------------------------------------------------WAN Model-----------------------------------------------------
# LYRICS
#Apply Query Vector to attention based representations, returning a num_attention x 1 tensor
query = tf.keras.layers.Dense(1,activation='linear',use_bias=False,name='attention_query')(embedding_model.output)
#reshape to 1 x num_attention
reshaped_query = tf.keras.layers.Reshape((1,1000))(query)
#Softmax over query * key (words) to obtain weights
weights = tf.keras.layers.Lambda(lambda x:tf.keras.activations.softmax(x),
                                    name='attention_weights')(reshaped_query)
#weight attention embeddings according to weights, learning how to balance attention based vector representations 
#from prior layer
wan_embedding = tf.keras.layers.Flatten()(tf.keras.layers.Dot((1,2))((embedding_model.output,weights)))
wan_embedding = tf.keras.Model(inputs=[embedding_model.input],outputs=[wan_embedding])

# WAN Model that uses an attention layer with a single node to learn how to combine WAN/DAN embeddings into single representation
dual_embedding = tf.keras.layers.concatenate([audio_model.output,wan_embedding.output])
#dual_embedding = tf.keras.layers.Reshape((2,embedding_matrix.shape[1]))(dual_embedding)
#query = tf.keras.layers.Dense(1,activation='linear',use_bias=False)(dual_embedding)
#reshaped_query = tf.keras.layers.Reshape((1,2))(query)
#weights = tf.keras.layers.Lambda(lambda x:tf.keras.activations.softmax(x))(reshaped_query)
#embedding = tf.keras.layers.Flatten()(tf.keras.layers.Dot((1,2))((dual_embedding,weights)))
hidden = tf.keras.layers.Dense(100,activation='relu')(dual_embedding)
output = tf.keras.layers.Dense(7,activation='softmax')(hidden)
final_model = tf.keras.Model(inputs=[audio_model.input, embedding_model.input],outputs=[output])

final_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
                            loss=tf.keras.losses.SparseCategoricalCrossentropy(), 
                            metrics='accuracy') 

In [None]:
final_model.fit(x=[np.array(df_train_audio_normalized), np.array(train_tokens_prebuilt_new)], y=train_labels.map(mapping), validation_data=([np.array(df_test_audio_normalized), np.array(test_tokens_prebuilt_new)], test_labels.map(mapping)), epochs=10, batch_size=8,
             class_weight = class_weights)

Epoch 1/10
 393/1848 [=====>........................] - ETA: 7:46 - loss: 3.5846 - accuracy: 0.3302

In [None]:
tf.executing_eagerly()

True

In [None]:
final_model.fit(x=[np.array(train_tokens_prebuilt_new), np.array(df_train_audio_normalized)], y=train_labels.map(mapping), validation_data=([np.array(test_tokens_prebuilt_new), np.array(df_test_audio_normalized)], test_labels.map(mapping)), epochs=10, batch_size=8,
             class_weight = class_weights)

Epoch 1/10


ValueError: in user code:

    File "/opt/anaconda3/lib/python3.8/site-packages/keras/engine/training.py", line 1051, in train_function  *
        return step_function(self, iterator)
    File "/opt/anaconda3/lib/python3.8/site-packages/keras/engine/training.py", line 1040, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/opt/anaconda3/lib/python3.8/site-packages/keras/engine/training.py", line 1030, in run_step  **
        outputs = model.train_step(data)
    File "/opt/anaconda3/lib/python3.8/site-packages/keras/engine/training.py", line 889, in train_step
        y_pred = self(x, training=True)
    File "/opt/anaconda3/lib/python3.8/site-packages/keras/utils/traceback_utils.py", line 67, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "/opt/anaconda3/lib/python3.8/site-packages/keras/engine/input_spec.py", line 264, in assert_input_compatibility
        raise ValueError(f'Input {input_index} of layer "{layer_name}" is '

    ValueError: Input 0 of layer "model_48" is incompatible with the layer: expected shape=(None, 9), found shape=(None, 1000)


In [None]:
# -------------------------------Model that accepts input and creates embedding matrix-------------------------------
#input_layer = tf.keras.layers.Input(shape=(1000,))
lyric_input = tf.keras.layers.Input(shape=(1000,))
audio_input = tf.keras.layers.Input(shape=(9,))
#Specify Embedding Layer, including shape, intialize with weights, expected input length, and whether it is trainable
embedding_layer = Embedding(embedding_matrix.shape[0],
                            embedding_matrix.shape[1],
                            weights = [embedding_matrix],
                            input_length=1000,
                            trainable=True,
                            name = 'embedding_layer')

audio_layer = Dense(100, activation="relu")(audio_input)

embeddings = embedding_layer(lyric_input)
embedding_model = tf.keras.Model(inputs = [lyric_input],outputs=[embeddings])

audio_model = tf.keras.Model(inputs = [audio_input],outputs=[audio_layer])

# -----------------------------------------------------DAN MODEL-----------------------------------------------------
# AUDIO
avg_embedding_audio = tf.keras.layers.Lambda(lambda x:K.mean(x,axis=1))(audio_model.output)
avg_embedding_audio = tf.keras.layers.Reshape((1,embedding_matrix.shape[1]))(avg_embedding_audio)
avg_embedding_audio = tf.keras.Model(inputs = [audio_model.input], outputs = [avg_embedding_audio])


# -----------------------------------------------------WAN Model-----------------------------------------------------
# LYRICS
#Apply Query Vector to attention based representations, returning a num_attention x 1 tensor
query = tf.keras.layers.Dense(1,activation='linear',use_bias=False,name='attention_query')(embedding_model.output)
#reshape to 1 x num_attention
reshaped_query = tf.keras.layers.Reshape((1,1000))(query)
#Softmax over query * key (words) to obtain weights
weights = tf.keras.layers.Lambda(lambda x:tf.keras.activations.softmax(x),
                                    name='attention_weights')(reshaped_query)
#weight attention embeddings according to weights, learning how to balance attention based vector representations 
#from prior layer
wan_embedding = tf.keras.layers.Flatten()(tf.keras.layers.Dot((1,2))((embedding_model.output,weights)))
wan_embedding = tf.keras.Model(inputs=[embedding_model.input],outputs=[wan_embedding])

# WAN Model that uses an attention layer with a single node to learn how to combine WAN/DAN embeddings into single representation
dual_embedding = tf.keras.layers.concatenate([audio_reshaped.output,wan_embedding.output])
dual_embedding = tf.keras.layers.Reshape((2,embedding_matrix_custom.shape[1]))(dual_embedding)
query = tf.keras.layers.Dense(1,activation='linear',use_bias=False)(dual_embedding)
reshaped_query = tf.keras.layers.Reshape((1,2))(query)
weights = tf.keras.layers.Lambda(lambda x:tf.keras.activations.softmax(x))(reshaped_query)
embedding = tf.keras.layers.Flatten()(tf.keras.layers.Dot((1,2))((dual_embedding,weights)))
hidden = tf.keras.layers.Dense(100,activation='relu')(embedding)
output = tf.keras.layers.Dense(11,activation='softmax')(hidden)
final_model = tf.keras.Model(inputs=[embedding_model.input],outputs=[output])

final_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
                            loss=tf.keras.losses.SparseCategoricalCrossentropy(), 
                            metrics='accuracy') 

ValueError: Exception encountered when calling layer "reshape_4" (type Reshape).

total size of new array must be unchanged, input_shape = [], output_shape = [1, 300]

Call arguments received by layer "reshape_4" (type Reshape):
  • inputs=tf.Tensor(shape=(None,), dtype=float32)

In [None]:
print(embedding_matrix.shape[0], embedding_matrix.shape[1])

43982 300


# 7. Subgenre Prediction

once we have predicted a genre, let's see if we can predict the correct subgenres a song fits into

In [None]:
df_train['lyric_token_array'] = [np.array(song) for song in train_tokens_prebuilt_new]
df_test['lyric_token_array'] = [np.array(song) for song in test_tokens_prebuilt_new]

def run_subgenre_model(major_genre, sub_genre_label, df_train, df_test, model_type = 'dan'):
    # create df for major genre
    df_train_major_genre = df_train[df_train['Major Genre'] == major_genre]
    df_test_major_genre = df_test[df_test['Major Genre'] == major_genre]
    
    # now get an array of the tokens
    train_tokens_prebuilt_major_genre = df_train_major_genre['lyric_token_array'].to_numpy()
    test_tokens_prebuilt_major_genre  = df_test_major_genre['lyric_token_array'].to_numpy()
    
    # convert those tokens to a tensor (not sure why i have to do this, but its the only way i can get the model to run)
    tensor_train_major_genre = tf.convert_to_tensor(np.array([np.array(song) for song in train_tokens_prebuilt_major_genre]))
    tensor_test_major_genre = tf.convert_to_tensor(np.array([np.array(song) for song in test_tokens_prebuilt_major_genre]))
    
    # run the model
    if model_type == 'dan':
        dan_model_sorted = create_dan_model(embedding_matrix = embedding_matrix, output_activation = 'sigmoid', output_layer_size = 2)
        dan_sorted_history = dan_model_sorted.fit(tensor_train_major_genre,
                        np.array(df_train_major_genre[sub_genre_label]),
                        validation_data=(tensor_test_major_genre, np.array(df_test_major_genre[sub_genre_label])),
                        batch_size=8,
                        epochs=100,
                        shuffle=True,
                        use_multiprocessing=True,workers=multiprocessing.cpu_count() - 1,
                        callbacks = [es])
        return(dan_model_sorted.predict(tensor_test_pop))
        
    elif model_type == 'wan':
        wan_model_sorted = create_wan_model(embedding_matrix=embedding_matrix, output_layer_size = 1, output_activation = 'sigmoid',
                                   num_attention=1, loss = tf.keras.losses.BinaryCrossentropy())
        wan_sorted_history = wan_model_sorted.fit(tensor_train_major_genre,
                        np.array(df_train_major_genre[sub_genre_label]),
                        validation_data=(tensor_test_major_genre, np.array(df_test_major_genre[sub_genre_label])),
                        batch_size=8,
                        epochs=100,
                        shuffle=True,
                        use_multiprocessing=True,workers=multiprocessing.cpu_count() - 1,
                        callbacks = [es])
        return(wan_model_sorted.predict(tensor_test_pop))
        

In [None]:
run_subgenre_model('Pop', 'Sub-Genre: electropop', df_train, df_test, 'wan')

Model: "model_21"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input (InputLayer)             [(None, 1000)]       0           []                               
                                                                                                  
 embedding_layer (Embedding)    (None, 1000, 300)    13194600    ['input[0][0]']                  
                                                                                                  
 attention_query1 (Dense)       (None, 1000, 1)      300         ['embedding_layer[0][0]']        
                                                                                                  
 reshape_9 (Reshape)            (None, 1, 1000)      0           ['attention_query1[0][0]']       
                                                                                           

In [None]:
pop_subgenres = ['Sub-Genre: electropop', 'Sub-Genre: new rave', 'Sub-Genre: post-teen pop', 'Sub-Genre: art pop', 'Sub-Genre: dance pop', 'Sub-Genre: pop', 'Sub-Genre: pop rap', 'Sub-Genre: pop rock', 'Sub-Genre: indie pop']

In [None]:
predicted_subgenres = []
for sub_genre in pop_subgenres:
    predictions = run_subgenre_model('Pop', sub_genre, df_train, df_test)
    for pred in predictions:
        single_pred = []
        if pred[1] > pred[0]:
            single_pred.append(sub_genre)
        predicted_
    

Model: "model_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input (InputLayer)          [(None, 1000)]            0         
                                                                 
 embedding_layer (Embedding)  (None, 1000, 300)        13194600  
                                                                 
 averaging (Lambda)          (None, 300)               0         
                                                                 
 hidden_1 (Dense)            (None, 100)               30100     
                                                                 
 dropout_1 (Dropout)         (None, 100)               0         
                                                                 
 hidden_2 (Dense)            (None, 100)               10100     
                                                                 
 dropout_2 (Dropout)         (None, 100)               0   

 dropout_1 (Dropout)         (None, 100)               0         
                                                                 
 hidden_2 (Dense)            (None, 100)               10100     
                                                                 
 dropout_2 (Dropout)         (None, 100)               0         
                                                                 
 hidden_3 (Dense)            (None, 100)               10100     
                                                                 
 dropout_3 (Dropout)         (None, 100)               0         
                                                                 
 dan_classification (Dense)  (None, 2)                 202       
                                                                 
Total params: 13,245,102
Trainable params: 50,502
Non-trainable params: 13,194,600
_________________________________________________________________
None
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 

                                                                 
Total params: 13,245,102
Trainable params: 50,502
Non-trainable params: 13,194,600
_________________________________________________________________
None
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 4: early stopping
Model: "model_15"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input (InputLayer)          [(None, 1000)]            0         
                                                                 
 embedding_layer (Embedding)  (None, 1000, 300)        13194600  
                                                                 
 averaging (Lambda)          (None, 300)               0         
                                                                 
 hidden_1 (Dense)            (None, 100)               30100     
                                                                 
 dropout_1 (Dropout)         (

In [None]:
predictions = dan_model_sorted.predict(tensor_test_pop)
rounded_predictions = []
for prediction in predictions:
    if prediction[0] > prediction[1]:
        rounded_predictions.append(0)
    else:
        rounded_predictions.append(1)

print(rounded_predictions)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [None]:
subgenre_predictions = []
for pred in rounded_predictions:
    single_pred = []
    if pred == 1:
        single_pred.append('subgenre')
    subgenre_predictions.append(single_pred)

# 8. BERT Models

In [33]:
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [34]:
#df_train['Lyrics_String']
df_train['final_modified_lyrics']

KeyError: ignored

In [36]:
#train_bert_ids = bert_tokenizer(list(df_train['Lyrics_String']),
 #                              max_length=512,truncation=True,padding='max_length', return_tensors='tf')['input_ids']
#test_bert_ids = bert_tokenizer(list(df_test['Lyrics_String']),
 #                            max_length=512,truncation=True,padding='max_length', return_tensors='tf')['input_ids']

train_bert_ids = bert_tokenizer(list(df_train['final_modified_lyrics']),
                               max_length=512,truncation=True,padding='max_length', return_tensors='tf')['input_ids']
test_bert_ids = bert_tokenizer(list(df_test['final_modified_lyrics']),
                             max_length=512,truncation=True,padding='max_length', return_tensors='tf')['input_ids']

In [37]:
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

Downloading:   0%|          | 0.00/511M [00:00<?, ?B/s]

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [38]:
def create_bert_model(train_layers=-1,
                      embedding_dim=768,
                      token = 'cls', # 'cls' or 'pooled' or 'avg'
                      num_attention = 0,
                      hidden_dim=[10,10,10],
                      dropout_rate=0.3,
                      hidden_layer_activation = 'relu',
                      output_layer_size = 4,
                      output_activation = 'softmax',
                      learning_rate=0.001):
    """
    Build a simple classification model with BERT.
    """
    
    # Load BERT
    bert_model = TFBertModel.from_pretrained('bert-base-uncased')

    #restrict training to the train_layers outer transformer layers (SPECIFY WHICH BERT LAYERS ARE TRAINABLE)
    if not train_layers == -1:

            retrain_layers = []

            for retrain_layer_number in range(train_layers):

                layer_code = '_' + str(11 - retrain_layer_number)
                retrain_layers.append(layer_code)

            for w in bert_model.weights:
                if not any([x in w.name for x in retrain_layers]):
                    w._trainable = False
    
    #Input Layer
    input_ids = tf.keras.layers.Input(shape = (512,),dtype=tf.int64, name='input_ids_layer') 
    #Get Contextual Embeddings + Single Vector Representations of Input (CLS or Pooled)
    bert_out = bert_model(input_ids) 
    
    if token == 'cls':
        token = bert_out[0][:,0] #Get CLS Tokens
    elif token == 'pooled':
        token = bert_out[1] #Pooled Token
    elif token == 'avg':
        token = tf.math.reduce_mean(bert_out[0][:,1:-1],axis=1)
    elif token == 'word_embeddings':
        token = bert_out[0][:,1:-1]
    
    # Attention to Combine CLS/Pooled Tokens into single representation in the event of chunking text for single example
    if num_attention == 0: # Single CLS/Pooled Token
        embedding = token
    elif num_attention == 1:
        #Apply Query Vector to BERT Token, returning a num_attention x 1 tensor
        query = tf.keras.layers.Dense(1,activation='linear',use_bias=False,name='attention_query')(token)
        if token.shape == (None,768):
            reshaped_query = tf.keras.layers.Reshape((1,1))(query)
            token = tf.keras.layers.Reshape((1,token.shape[1]))(token)
        else:
            reshaped_query = tf.keras.layers.Reshape((1,token.shape[1]))(query)
        #Softmax over query * key (words) to obtain weights
        weights = tf.keras.layers.Lambda(lambda x:tf.keras.activations.softmax(x),
                                            name='attention_weights')(reshaped_query)
        #weight attention embeddings according to weights
        embedding = tf.keras.layers.Flatten()(tf.keras.layers.Dot((1,2))((token,weights)))
    else:
        #Create attention based single vector representations of words according to alternative query vectors
        attention_embeddings = []
        for num in range(num_attention):
            #Apply Query Vector to words in embeddings, returning a embedding_dim x 1 tensor
            l1_query = tf.keras.layers.Dense(1,activation='linear',use_bias=False,name='attention_query_l' + str(num+1))(token)
            if token.shape == (None,768):
                l1_reshaped_query = tf.keras.layers.Reshape((1,1))(l1_query)
                l1_token = tf.keras.layers.Reshape((1,token.shape[1]))(token)
            else:
                l1_reshaped_query = tf.keras.layers.Reshape((1,token.shape[1]))(l1_query)
                l1_token = token
                
            #Softmax over query * key (words) to obtain weights
            l1_weights = tf.keras.layers.Lambda(lambda x:tf.keras.activations.softmax(x),
                                                name='attention_weights_l' + str(num+1))(l1_reshaped_query)
            
            #weight attention embeddings according to weights
            l1_attention = tf.keras.layers.Flatten()(tf.keras.layers.Dot((1,2))((l1_token,l1_weights)))
            attention_embeddings.append(l1_attention)

        concat_attention = tf.keras.layers.Concatenate()(attention_embeddings)
        concat_attention = tf.keras.layers.Reshape((num_attention,embedding_dim))(concat_attention)
        
        #Apply Query Vector to BERT Embeddings with Various Attention-Based representations, returning a num_attention x 1 tensor
        query = tf.keras.layers.Dense(1,activation='linear',use_bias=False,name='attention_query')(concat_attention)
        #reshape to 1 x num_attention
        reshaped_query = tf.keras.layers.Reshape((1,num_attention))(query)
        #Softmax over query * key (words) to obtain weights
        weights = tf.keras.layers.Lambda(lambda x:tf.keras.activations.softmax(x),
                                            name='attention_weights')(reshaped_query)
        #weight attention embeddings according to weights
        embedding = tf.keras.layers.Flatten()(tf.keras.layers.Dot((1,2))((concat_attention,weights)))
        
    x = embedding
    count = 1
    for layer in hidden_dim:
        hidden = tf.keras.layers.Dense(layer,activation = hidden_layer_activation,name='hidden_' + str(count))(x)
        dropout = tf.keras.layers.Dropout(dropout_rate,name='dropout_' + str(count))(hidden)
        count = count + 1
        x = dropout

    bert_classification = tf.keras.layers.Dense(output_layer_size, activation=output_activation,name='classification_layer')(x)
    
    bert_model = tf.keras.Model(inputs=[input_ids], outputs=[bert_classification])
    
    bert_model.compile(loss=keras.losses.SparseCategoricalCrossentropy(),
                  optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate,
                                                beta_1=0.9,
                                                beta_2=0.999,
                                                epsilon=1e-07,
                                                amsgrad=False,
                                                name='Adam'),
                 metrics=['accuracy'],
                     run_eagerly=True) 
    
    print(bert_model.summary())
    
    return bert_model

In [None]:
cls_bert_model = create_bert_model(learning_rate=0.0005,output_layer_size=7,num_attention=2,token='word_embeddings')
                        
cls_bert_model.fit(train_bert_ids[:100], df_train['Major Genre'].map(mapping).iloc[:100], 
                   validation_data=(test_bert_ids[:100],df_test['Major Genre'].map(mapping).iloc[:100]),
                   batch_size=8, epochs=2) 

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids_layer (InputLayer)   [(None, 512)]        0           []                               
                                                                                                  
 tf_bert_model_1 (TFBertModel)  TFBaseModelOutputWi  109482240   ['input_ids_layer[0][0]']        
                                thPoolingAndCrossAt                                               
                                tentions(last_hidde                                               
                                n_state=(None, 512,                                               
                                 768),                                                            
                                 pooler_output=(Non                                         

























# 9 Extra Cleaning Functions

In [None]:
df_train['Lyrics']

16303    [go, take, life, dreams, fire, go, take, day, ...
8721     [bitch, one, click, ruin, life, trip, yea, tak...
11930    [eyes, like, face, bit, different, bit, fucked...
7945     [ugh, ugh, what, what, ugh, what, what, ugh, u...
15504    [age, darkness, light, appears, wards, away, a...
                               ...                        
7303     [saucey, genius, aztro, cut, put, magnum, bott...
9125     [i’ve, loved, /, i’ve, done, months, made, fee...
5125     [another, it's, kel, p, vibes, wanna, give, ev...
15805    [woke, mornin', understand, means, give, life,...
2952     [läppar, döljer, dina, tänder, och, din, tunga...
Name: Lyrics, Length: 14778, dtype: object

In [13]:
def split_text_into_regions(text):
    string = text
   
    #mark line breaks
    string = string.replace('\n','[]')
    string = string.replace('embed','')
    #find language indicators of song sections
    splits = re.findall('\[.*?\]',string)
    #find ad libs to remove
    ad_libs = re.findall('\(.*?\)',string)
   
    #remove ad libs
    if len(ad_libs) > 0:
        for ad_lib in ad_libs:
            string = string.replace(ad_lib,'')
        string = string.replace('  ',' ')
   
    #If there is no splitting criteria, single string is entire song without any additional groupings
    if len(splits) == 0:
        string = [string]
    else:
        #replace split criteria with makers for splitting
        for delim in splits:
            string = string.replace(delim,'[]')
        string = string.split('[]')
   
    #Identify sections of song, made up of groups of lyrics
    sections = []
    section = []
    last_part = ''
    for part in string:
        if part == '' and last_part != '':
            sections.append(section)
            section = []
        elif part != '':
            section.append(part)
       
        last_part = part
   
    try:
        if section != sections[-1]:
            sections.append(section)
    except:
        sections.append(section)
   
    return sections    


def single_text_lyrics(group_of_lyrics):
    lyrics = ''
    for group in group_of_lyrics:
        lyrics = lyrics + ' ' + ' '.join(group)
    return lyrics.strip() 

# 10 Term Density

In [2]:
term_freq = CountVectorizer()

In [3]:
df_test = pkl.load(open('genre_sub_genre_test.pkl', 'rb'))
df_train = pkl.load(open('genre_sub_genre_train.pkl', 'rb'))

In [4]:
df_train = df_train[:8250]

In [5]:
# filter out results
df_train.drop(df_train[df_train['Lyrics'].str.len() > 5000].index, inplace=True)
df_train[df_train['Artist Name'].str.contains("ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ") == False]
df_train[df_train['Track Name'].str.contains("ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ") == False]

df_test.drop(df_test[df_test['Lyrics'].str.len() > 5000].index, inplace=True)
df_test[df_test['Artist Name'].str.contains("ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ") == False]
df_test[df_test['Track Name'].str.contains("ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ") == False]

Unnamed: 0,Artist Name,Track Name,Popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,...,Sub-Genre: modern alternative rock,Sub-Genre: southern hip hop,Sub-Genre: nu metal,Sub-Genre: israeli mediterranean,Sub-Genre: thrash metal,Sub-Genre: pop rock,Sub-Genre: chicago blues,Sub-Genre: indie pop,Sub-Genre: classic rock,Sub-Genre: hardcore hip hop
5,Stevie Ray Vaughan,Life By The Drop,51,0.659,0.163,6,-11.864,0,0.0388,0.76600,...,0,0,0,0,0,0,0,0,1,0
6,DARKSIDE,Paper Trails,55,0.947,0.419,8,-13.043,0,0.0578,0.77800,...,0,0,0,0,0,0,0,0,0,0
11,"Christone ""Kingfish"" Ingram",Outside Of This Town,48,0.418,0.866,11,-4.033,0,0.0513,0.00381,...,0,0,0,0,0,0,0,0,0,0
28,Jesse Cook,I Put A Spell On You,34,0.420,0.373,1,-9.302,0,0.0320,0.92200,...,0,0,0,0,0,0,0,0,0,0
31,"Christone ""Kingfish"" Ingram",Before I'm Old,41,0.534,0.649,2,-5.526,1,0.0410,0.04380,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19316,Woolbright,Tuesday,23,0.514,0.819,11,-6.713,0,0.0375,0.01220,...,0,0,0,0,0,0,0,0,0,0
19321,Runnin' Wild,How You Want It Done,27,0.614,0.953,9,-3.539,1,0.0517,0.07710,...,0,0,0,0,0,0,0,0,0,0
19344,Four Year Strong,Go Down in History,48,0.505,0.985,5,-4.401,1,0.1190,0.00006,...,0,0,0,0,0,0,0,0,0,0
19355,Nathaniel Rateliff & The Night Sweats,S.O.B.,66,0.699,0.579,1,-6.504,1,0.0416,0.26700,...,0,0,0,0,0,0,0,0,0,0


In [6]:
train_labels = df_train['Major Genre']
test_labels = df_test['Major Genre']
print(train_labels.value_counts())

Rock           2406
Indie          1196
Pop            1057
Metal           934
Alternative     720
Hip Hop         675
Blues           420
Name: Major Genre, dtype: int64


In [35]:
print(test_labels.value_counts())

Rock           781
Indie          427
Metal          353
Pop            346
Hip Hop        272
Alternative    239
Blues          163
Name: Major Genre, dtype: int64


In [7]:
# create mapper so we can use numeric labels in our networks
mapping = {}
count = 0
for label in train_labels.unique():
    mapping[label] = count
    count = count + 1
print(mapping)

{'Rock': 0, 'Indie': 1, 'Alternative': 2, 'Hip Hop': 3, 'Metal': 4, 'Pop': 5, 'Blues': 6}


In [8]:
weights = 2406/train_labels.value_counts()
class_weights = {}
for num in range(len(weights)):
    class_weights[mapping[weights.index[num]]] = weights.iloc[num]
class_weights

{0: 1.0,
 1: 2.011705685618729,
 2: 3.341666666666667,
 3: 3.5644444444444443,
 4: 2.576017130620985,
 5: 2.2762535477767267,
 6: 5.728571428571429}

In [9]:
df_train['Lyrics']

16303    Unbreakable Lyrics[Intro]\nGo take it all\nYou...
8721     NOBODY LyricsTake a bitch\nThat I have in one ...
11930    Worth It Lyrics[Verse 1]\nYour eyes are just l...
7945     Bloodrush Lyrics[Intro: Denzel Curry]\nUgh\nUg...
15504    Age of Man Lyrics[Intro]\nIn an age of darknes...
                               ...                        
3883     Kid Milli & dress - Kitty ft. MIYEON (Romanize...
2531     Earthless Lyrics[Verse 1]\nDescending through ...
1133     We Get By Lyrics[Verse 1: Mavis Staples and Be...
14500    Fly Away LyricsI wish that I could fly\nInto t...
4249     Shney Yeladim Ba’olam - שני ילדים בעולם Lyrics...
Name: Lyrics, Length: 7408, dtype: object

In [10]:
df_train['modified_lyrics'] = df_train['Lyrics'].apply(lambda x: ' '.join(str(x).split('Lyrics')[1:]).lower())
df_test['modified_lyrics'] = df_test['Lyrics'].apply(lambda x: ' '.join(str(x).split('Lyrics')[1:]).lower())

In [14]:
df_train['modified_lyrics'] = df_train['modified_lyrics'].apply(lambda x: split_text_into_regions(x))
df_test['modified_lyrics'] = df_test['modified_lyrics'].apply(lambda x: split_text_into_regions(x))

In [15]:
df_train['modified_lyrics']

16303    [[go take it all, your life, your dreams, your...
8721     [[take a bitch, that i have in one click, ruin...
11930    [[your eyes are just like his, but your face i...
7945     [[ugh, ugh,  ugh,  ugh, ugh], [behind every sm...
15504    [[in an age of darkness light appears, and it ...
                               ...                        
3883     [[i don't tryna be a good boy, nae saenggage, ...
2531     [[descending through the mouth, engulfing teet...
1133     [[we get by on love and faith, we get by with ...
14500    [[i wish that i could fly, into the sky so ver...
4249     [[איך הכל ממהר לי פתאום, רציתי לראות את השמיים...
Name: modified_lyrics, Length: 7408, dtype: object

In [16]:
df_train['final_modified_lyrics'] = df_train['modified_lyrics'].apply(lambda x: single_text_lyrics(x))
df_test['final_modified_lyrics'] = df_test['modified_lyrics'].apply(lambda x: single_text_lyrics(x))

In [17]:
df_train['final_modified_lyrics']

16303    go take it all your life, your dreams, your fi...
8721     take a bitch that i have in one click ruin my ...
11930    your eyes are just like his but your face is a...
7945     ugh ugh  ugh  ugh ugh behind every smile, it b...
15504    in an age of darkness light appears and it war...
                               ...                        
3883     i don't tryna be a good boy nae saenggage you ...
2531     descending through the mouth engulfing teeth p...
1133     we get by on love and faith we get by with a s...
14500    i wish that i could fly into the sky so very h...
4249     איך הכל ממהר לי פתאום רציתי לראות את השמיים שנ...
Name: final_modified_lyrics, Length: 7408, dtype: object

In [18]:
term_freq.fit(df_train['final_modified_lyrics'])

CountVectorizer()

In [19]:
train_term_df = pd.DataFrame(term_freq.transform(df_train['final_modified_lyrics']).todense(), columns = term_freq.get_feature_names())



In [20]:
train_term_df.shape

(7408, 66198)

In [21]:
train_term_sums = np.array(train_term_df.sum(axis = 1)).astype('float16')

In [22]:
print(train_term_sums[0])

189.0


In [23]:
train_term_sums_reshaped = train_term_sums.repeat(len(term_freq.get_feature_names())).reshape(train_term_df.shape)



In [24]:
# bigger numbers = better words for that song
train_term_df = (train_term_df / train_term_sums_reshaped).astype('float16')

In [25]:
train_term_df.shape
df_train.shape

(7408, 74)

In [26]:
test_term_df = pd.DataFrame(term_freq.transform(df_test['final_modified_lyrics']).todense(), columns = term_freq.get_feature_names())
test_term_sums = np.array(test_term_df.sum(axis = 1)).astype('float16')
test_term_sums_reshaped = test_term_sums.repeat(len(term_freq.get_feature_names())).reshape(test_term_df.shape)
test_term_df = (test_term_df / test_term_sums_reshaped).astype('float16')



In [32]:
print(train_labels.map(mapping)[0:5])
print(train_labels[0:5])

16303    0
8721     1
11930    2
7945     3
15504    0
Name: Major Genre, dtype: int64
16303           Rock
8721           Indie
11930    Alternative
7945         Hip Hop
15504           Rock
Name: Major Genre, dtype: object


In [39]:
print(test_labels.map(mapping)[0:5])
print(test_labels[0:5])

5     6
6     6
11    6
28    6
31    6
Name: Major Genre, dtype: int64
5     Blues
6     Blues
11    Blues
28    Blues
31    Blues
Name: Major Genre, dtype: object


In [47]:
test_term_df.iloc[0].sort_values()

00              0.000000
thes            0.000000
thesaurus       0.000000
thescrivener    0.000000
these           0.000000
                  ...   
how             0.027176
that            0.032623
living          0.043488
the             0.054352
it              0.054352
Name: 0, Length: 66198, dtype: float16

In [48]:
test_term_df.iloc[1].sort_values()

00              0.000000
thesaurus       0.000000
thescrivener    0.000000
these           0.000000
thesis          0.000000
                  ...   
go              0.040649
the             0.040649
on              0.048767
to              0.065063
you             0.081299
Name: 1, Length: 66198, dtype: float16

In [49]:
# first lets see if we can do a basic FFN with just the dfs
# a standard feed-forward network
# note: audio features have been normalized

model = keras.Sequential([
    keras.layers.Dense(50,activation='leaky_relu'),
    keras.layers.Dense(50,activation='leaky_relu'),
    keras.layers.Dense(10,activation='leaky_relu'),
    keras.layers.Dense(7,activation='softmax')
])



#Compile the model, specifying loss function, optimizer, and performance metric
model.compile(loss = keras.losses.SparseCategoricalCrossentropy(),
             optimizer = keras.optimizers.Adam(learning_rate=0.0001),
             metrics=['accuracy'],
             )


#model.fit(x = np.array(train_term_df),y = train_labels.map(mapping),batch_size=20,epochs=3,
  #       validation_data = (np.array(test_term_df), test_labels.map(mapping)),
   #      use_multiprocessing=True,workers=multiprocessing.cpu_count() - 1)

model.fit(x = np.array(train_term_df),y = train_labels.map(mapping),batch_size=8,epochs=10,
         validation_data = (np.array(test_term_df) ,test_labels.map(mapping)),
         use_multiprocessing=True,workers=multiprocessing.cpu_count() - 1, class_weight = class_weights)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f7a0573b210>

In [27]:
# let's try with audio features too...this will be a big model
# define two sets of inputs
inputA = Input(shape=(66198,))
inputB = Input(shape=(9,))
# the first branch operates on the first input
x = Dense(1000, activation="relu")(inputA)
x = Dense(1000, activation="relu")(x)
x = Dense(100, activation="relu")(x)
x = Model(inputs=inputA, outputs=x)
# the second branch opreates on the second input
y = Dense(10, activation="relu")(inputB)
y = Dense(50, activation="relu")(y)
y = Dense(100, activation="relu")(y)
y = Model(inputs=inputB, outputs=y)
# combine the output of the two branches
combined = concatenate([x.output, y.output])
# apply a FC layer and then a classification prediction on the
# combined outputs
z = Dense(50, activation="relu")(combined)
z = Dense(20, activation="relu")(z)
z = Dense(7, activation="softmax")(z)
#z = Dense(1, activation="linear")(z)
# our model will accept the inputs of the two branches and
# then output a single value
model = Model(inputs=[x.input, y.input], outputs=z)

In [28]:
model.compile(loss=keras.losses.SparseCategoricalCrossentropy(),
                  optimizer=tf.keras.optimizers.Adam(learning_rate=.001,
                                                beta_1=0.9,
                                                beta_2=0.999,
                                                epsilon=1e-07,
                                                amsgrad=False,
                                                name='Adam'),
             metrics='accuracy',
             )

In [31]:
#model.fit(x=[np.array(train_term_df), np.array(df_train_audio_normalized)], y=train_labels.map(mapping), validation_data=([np.array(test_term_df), np.array(df_test_audio_normalized)], test_labels.map(mapping)), epochs=10, batch_size=8,
 #            class_weight = class_weights)

model.fit(x=[np.array(train_term_df), np.array(df_train_audio_normalized)], y=train_labels.map(mapping), validation_data=([np.array(test_term_df), np.array(df_test_audio_normalized)], test_labels.map(mapping)), epochs=10, batch_size=8)

Epoch 1/10
Epoch 2/10
 66/926 [=>............................] - ETA: 3:06 - loss: nan - accuracy: 0.2803

KeyboardInterrupt: ignored

In [None]:
# concatenate with audio features