In [1]:
import tensorflow as tf
from tensorflow import keras
from keras.layers import Embedding
import keras.backend as K

from sklearn.feature_extraction.text import CountVectorizer

import pickle as pkl
import pandas as pd
import numpy as np
import multiprocessing

import gensim
import nltk
#nltk.download('word2vec_sample')
from nltk.data import find

In [2]:
# import train and test data
df_test = pkl.load(open('Train_Test_Data/genre_sub_genre_test.pkl', 'rb'))
df_train = pkl.load(open('Train_Test_Data/genre_sub_genre_train.pkl', 'rb'))

In [3]:
# let's create a column that is all of the normalized audio features we want to use
df_train_audio_normalized = df_train[['danceability', 'energy', 'loudness', 'acousticness', 'speechiness', 'instrumentalness', 'valence', 'tempo','duration_ms']].copy()
df_train_audio_normalized = (df_train_audio_normalized-df_train_audio_normalized.mean())/df_train_audio_normalized.std()

df_test_audio_normalized = df_test[['danceability', 'energy', 'loudness', 'acousticness', 'speechiness', 'instrumentalness', 'valence', 'tempo','duration_ms']].copy()
df_test_audio_normalized = (df_test_audio_normalized-df_test_audio_normalized.mean())/df_test_audio_normalized.std()

df_train['normalized_audio_feature_array'] = df_train_audio_normalized.to_numpy()
df_test['normalized_audio_feature_array'] = df_test_audio_normalized.to_numpy()

#df_train['normalized_audio_feature_array'] = df_train_audio_normalized[['danceability', 'energy', 'loudness', 'acousticness', 'speechiness', 'instrumentalness', 'valence', 'tempo','duration_ms']].to_numpy()
#df_test['normalized_audio_feature_array'] = df_test_audio_normalized[['danceability', 'energy', 'loudness', 'acousticness', 'speechiness', 'instrumentalness', 'valence', 'tempo','duration_ms']].to_numpy()

#df_train['normalized_audio_feature_array'] = np.array(df_train_audio_normalized[['danceability', 'energy', 'loudness', 'acousticness', 'speechiness', 'instrumentalness', 'valence', 'tempo','duration_ms']].values)
#df_test['normalized_audio_feature_array'] = np.array(df_test_audio_normalized[['danceability', 'energy', 'loudness', 'acousticness', 'speechiness', 'instrumentalness', 'valence', 'tempo','duration_ms']].values)

#df_train['normalized_audio_feature_array'] = df_train_audio_normalized[['danceability', 'energy', 'loudness', 'acousticness', 'speechiness', 'instrumentalness', 'valence', 'tempo','duration_ms']].values.tolist()
#df_test['normalized_audio_feature_array'] = df_test_audio_normalized[['danceability', 'energy', 'loudness', 'acousticness', 'speechiness', 'instrumentalness', 'valence', 'tempo','duration_ms']].values.tolist()

#df_train['normalized_audio_feature_array'] = df_train['normalized_audio_feature_array'].apply(lambda x: np.array(x, dtype=np.float64))
#df_test['normalized_audio_feature_array'] = df_test['normalized_audio_feature_array'].apply(lambda x: np.array(x, dtype=np.float64))

#df_train['normalized_audio_feature_array'] = np.array(df_train['normalized_audio_feature_array'])
#df_test['normalized_audio_feature_array'] = np.array(df_test['normalized_audio_feature_array'])

#df_train['normalized_audio_feature_array'] = np.array(df_train_audio_normalized[['danceability', 'energy', 'loudness', 'acousticness', 'speechiness', 'instrumentalness', 'valence', 'tempo','duration_ms']].values.tolist())
#df_test['normalized_audio_feature_array'] = np.array(df_test_audio_normalized[['danceability', 'energy', 'loudness', 'acousticness', 'speechiness', 'instrumentalness', 'valence', 'tempo','duration_ms']].values.tolist())

ValueError: Wrong number of items passed 9, placement implies 1

In [None]:
df_train['normalized_audio_feature_array']

In [4]:
# get word2vec model
word2vec_sample = str(find('models/word2vec_sample/pruned.word2vec.txt'))
model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_sample, binary=False)

In [5]:
# how big does our embedding matrix need to be
print(len(model.key_to_index.items()))

43981


In [6]:
#construct embedding matrix w/ prebuilt embedding
vocab_dict = model.key_to_index.copy()
embedding_matrix = np.zeros((43982,300))
for word,index in model.key_to_index.items():
    embedding_matrix[index] = model[word]

In [7]:
# bringing in lyric tokenizer function
def text_to_index(text_data,mapping,max_size):
    return_data = []
    for text in text_data:
        new_text = text.lower()
        new_text = text.replace('\n',' ')
        new_text = text.replace('  ',' ')
        new_text = new_text.split()
        mapped_text = []
        for token in new_text:
            try:
                mapped_text.append(mapping[token])
            except:
                mapped_text.append(len(mapping))
        
        if len(mapped_text) > max_size:
            mapped_text = mapped_text[:max_size]
        else:
            while len(mapped_text) < max_size:
                mapped_text.append(len(mapping))
                
        return_data.append(mapped_text)
    
    return return_data

In [8]:
# tokenize train/test lyrics - "get X"
train_tokens_prebuilt = text_to_index(df_train['Lyrics'],vocab_dict,1000)
test_tokens_prebuilt = text_to_index(df_test['Lyrics'],vocab_dict,1000)

In [9]:
# get labels "Y"
train_labels = df_train['Major Genre']
test_labels = df_test['Major Genre']

In [10]:
# create mapper so we can use numeric labels in our networks
mapping = {}
count = 0
for label in train_labels.unique():
    mapping[label] = count
    count = count + 1

In [11]:
# want to keep the functions consistent across notebooks, so defining this so i can use the DAN and WAN models as-is
embedding_matrix_custom = None

DAN Model

In [12]:
def create_dan_model(retrain_embeddings=False, 
                     max_sequence_length=1000,
                     embedding_matrix=embedding_matrix_custom, 
                     hidden_dim=[100,100,100],
                     dropout_rate=0.3,
                     hidden_layer_activation = 'relu',
                     output_layer_size = 4,
                     output_activation = 'softmax',
                     learning_rate=0.001):
    """
    Construct the DAN model including the compilation and return it. Parametrize it using the arguments.
    retrain_embeddings: bool, indicates whether embeddings are retrainable
    max_sequence_length: Number of token IDs to expect in a given input
    embedding_matrix: initialize embedding layer with embedding matrix, specifying weights
    hidden_dim = number of neurons in hidden layers
    dropout = dropout rate
    output_layer_size = # of neurons in output layer corresponding to # of classes, each neuron predicts P(class K | x)
    output_activation = activation function for output layer
    learning_rate = learning rate for gradient descent for finding model params to optimize loss
    """
    
    #Specify Embedding Layer, including shape, intialize with weights, expected input length, and whether it is trainable
    dan_embedding_layer = Embedding(embedding_matrix.shape[0],
                                  embedding_matrix.shape[1],
                                  weights = [embedding_matrix],
                                  input_length=max_sequence_length,
                                  trainable=retrain_embeddings,
                                   name = 'embedding_layer')
    
    
    #Input Layer, sequence of max_sequence_length tokens
    dan_input_layer = tf.keras.layers.Input(shape=(max_sequence_length,), dtype='int64',name='input')
    #Inputs go into embedding layer, form max_sequence_length x embedding dim matrix
    dan_embeddings = dan_embedding_layer(dan_input_layer)
    #Embeddings are averaged, forming single vector represenation of size embedding matrix
    dan_avg_input_embeddings = tf.keras.layers.Lambda(lambda x: K.mean(x, axis=1), name='averaging')(dan_embeddings)
    
    #input into hidden layers
    x = dan_avg_input_embeddings #hidden layer initial input
    count = 1
    for layer in hidden_dim:
        hidden = tf.keras.layers.Dense(layer,activation = hidden_layer_activation,name='hidden_' + str(count))(x)
        dropout = tf.keras.layers.Dropout(dropout_rate,name='dropout_' + str(count))(hidden)
        count = count + 1
        x = dropout
        
    #dan_hidden_out_1 = tf.keras.layers.Dense(hidden_dim, activation='relu', name='hidden_1')(dan_avg_input_embeddings)
    #dan_hidden_out_1 = tf.keras.layers.Dropout(dropout)(dan_hidden_out_1)
    dan_classification = tf.keras.layers.Dense(output_layer_size, activation='softmax', name='dan_classification')(x)
    dan_model = tf.keras.models.Model(inputs=dan_input_layer, outputs=[dan_classification])
    dan_model.compile(loss=keras.losses.SparseCategoricalCrossentropy(),
                  optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate,
                                                beta_1=0.9,
                                                beta_2=0.999,
                                                epsilon=1e-07,
                                                amsgrad=False,
                                                name='Adam'),
                 metrics='accuracy')
    
    print(dan_model.summary())

    return dan_model

In [13]:
dan_model_sorted = create_dan_model(embedding_matrix = embedding_matrix, output_layer_size = 7)
dan_sorted_history = dan_model_sorted.fit(np.array(train_tokens_prebuilt),
                        np.array(train_labels.map(mapping)),
                        validation_data=(np.array(test_tokens_prebuilt), np.array(test_labels.map(mapping))),
                        batch_size=8,
                        epochs=10,
                        shuffle=True,
                        use_multiprocessing=True,workers=multiprocessing.cpu_count() - 1)

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input (InputLayer)          [(None, 1000)]            0         
                                                                 
 embedding_layer (Embedding)  (None, 1000, 300)        13194600  
                                                                 
 averaging (Lambda)          (None, 300)               0         
                                                                 
 hidden_1 (Dense)            (None, 100)               30100     
                                                                 
 dropout_1 (Dropout)         (None, 100)               0         
                                                                 
 hidden_2 (Dense)            (None, 100)               10100     
                                                                 
 dropout_2 (Dropout)         (None, 100)               0     

40% validation accuracy after 10 epochs - so not very good!  but the model is learning a little, at least

Lets see if a WAN will perform any better

In [14]:
def create_wan_model(retrain_embeddings=False, 
                     max_sequence_length=1000,
                     embedding_matrix=embedding_matrix_custom,
                     num_attention = 1,
                     hidden_dim=[100,100,100],
                     dropout_rate=0.3,
                     hidden_layer_activation = 'relu',
                     output_layer_size = 4,
                     output_activation = 'softmax',
                     learning_rate=0.001):
    """
    Construct the WAN model including the compilation and return it. Parametrize it using the arguments.
    retrain_embeddings: bool, indicates whether embeddings are retrainable
    max_sequence_length: Number of token IDs to expect in a given input
    embedding_matrix: initialize embedding layer with embedding matrix, specifying weights
    num_attention = number of parallel attention computations that learn how to balance embeddings into a single
    vector representation, final attention layer weights prior attention based representations
    hidden_dim = number of neurons in hidden layers
    dropout = dropout rate
    output_layer_size = # of neurons in output layer corresponding to # of classes, each neuron predicts P(class K | x)
    output_activation = activation function for output layer
    learning_rate = learning rate for gradient descent for finding model params to optimize loss
    """
    
    #Specify Embedding Layer, including shape, intialize with weights, expected input length, and whether it is trainable
    wan_embedding_layer = Embedding(embedding_matrix.shape[0],
                                  embedding_matrix.shape[1],
                                  weights = [embedding_matrix],
                                  input_length=max_sequence_length,
                                  trainable=retrain_embeddings,
                                   name = 'embedding_layer')
    
    
    #Input Layer, sequence of max_sequence_length tokens
    wan_input_layer = tf.keras.layers.Input(shape=(max_sequence_length,), dtype='int64',name='input')
    #Inputs go into embedding layer, form max_sequence_length x embedding dim matrix
    wan_embeddings = wan_embedding_layer(wan_input_layer)
    
    #Create attention based single vector representations of words according to alternative query vectors
    attention_embeddings = []
    for num in range(num_attention):
        #Apply Query Vector to words in embeddings, returning a max_sequence_length x 1 tensor
        l1_query = tf.keras.layers.Dense(1,activation='linear',use_bias=False,name='attention_query' + str(num+1))(wan_embeddings)
        #reshape to 1 x max_sequence_length
        l1_reshape_query = tf.keras.layers.Reshape((1,max_sequence_length))(l1_query)
        #Softmax over query * key (words) to obtain weights
        l1_weights = tf.keras.layers.Lambda(lambda x:tf.keras.activations.softmax(x),
                                            name='attention_weights' + str(num+1))(l1_reshape_query)
        #weight embeddings according to weights
        l1_attention = tf.keras.layers.Flatten()(tf.keras.layers.Dot((1,2))((wan_embeddings,l1_weights)))
        attention_embeddings.append(l1_attention)
    
    concat_attention = tf.keras.layers.Concatenate()(attention_embeddings)
    concat_attention = tf.keras.layers.Reshape((num_attention,embedding_matrix.shape[1]))(concat_attention)
    
    #Apply Query Vector to attention based representations, returning a num_attention x 1 tensor
    wan_query = tf.keras.layers.Dense(1,activation='linear',use_bias=False,name='attention_query')(concat_attention)
    #reshape to 1 x num_attention
    reshaped_query = tf.keras.layers.Reshape((1,num_attention))(wan_query)
    #Softmax over query * key (words) to obtain weights
    wan_weights = tf.keras.layers.Lambda(lambda x:tf.keras.activations.softmax(x),
                                        name='attention_weights')(reshaped_query)
    #weight attention embeddings according to weights, learning how to balance attention based vector representations 
    #from prior layer
    wan_attention = tf.keras.layers.Flatten()(tf.keras.layers.Dot((1,2))((concat_attention,wan_weights)))
    
    #input into hidden layers
    x = wan_attention #hidden layer initial input
    count = 1
    for layer in hidden_dim:
        hidden = tf.keras.layers.Dense(layer,activation = hidden_layer_activation,name='hidden_' + str(count))(x)
        dropout = tf.keras.layers.Dropout(dropout_rate,name='dropout_' + str(count))(hidden)
        count = count + 1
        x = dropout
        
    #wan_hidden_out_1 = tf.keras.layers.Dense(hidden_dim, activation='relu', name='hidden_1')(wan_avg_input_embeddings)
    #wan_hidden_out_1 = tf.keras.layers.Dropout(dropout)(wan_hidden_out_1)
    wan_classification = tf.keras.layers.Dense(output_layer_size, activation='softmax', name='wan_classification')(x)
    wan_model = tf.keras.models.Model(inputs=wan_input_layer, outputs=[wan_classification])
    wan_model.compile(loss=keras.losses.SparseCategoricalCrossentropy(),
                  optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate,
                                                beta_1=0.9,
                                                beta_2=0.999,
                                                epsilon=1e-07,
                                                amsgrad=False,
                                                name='Adam'),
                 metrics='accuracy')
    
    print(wan_model.summary())

    return wan_model

In [15]:
wan_model_sorted = create_wan_model(embedding_matrix=embedding_matrix, output_layer_size = 7,
                                   num_attention=1)
wan_sorted_history = wan_model_sorted.fit(np.array(train_tokens_prebuilt),
                        np.array(train_labels.map(mapping)),
                        validation_data=(np.array(test_tokens_prebuilt), np.array(test_labels.map(mapping))),
                        batch_size=8,
                        epochs=10,
                        shuffle=True,
                        use_multiprocessing=True,workers=multiprocessing.cpu_count() - 1)

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input (InputLayer)             [(None, 1000)]       0           []                               
                                                                                                  
 embedding_layer (Embedding)    (None, 1000, 300)    13194600    ['input[0][0]']                  
                                                                                                  
 attention_query1 (Dense)       (None, 1000, 1)      300         ['embedding_layer[0][0]']        
                                                                                                  
 reshape (Reshape)              (None, 1, 1000)      0           ['attention_query1[0][0]']       
                                                                                            

41%...so not much better

Both of those models used prebuilt embeddings, what happens if we use custom ones?

In [16]:
# initialize vectorizer with preprocessing
def preprocess_text(text):
    text = text.lower()
    text = text.replace('\n', ' ')
    text = text.replace('  ',' ')
    return text

vectorizer = CountVectorizer(preprocessor=preprocess_text)

DO NOT RUN BELOW CELL - it will crash the kernel.  need to figure out a workaround

In [17]:
#Vectorize Train Lyrics
#train_lyrics = vectorizer.fit_transform(df_train['Lyrics'])
#train_lyrics = pd.DataFrame(train_lyrics.todense(),columns = vectorizer.get_feature_names())
#train_lyrics_token_count = train_lyrics.sum(axis=1)
#train_lyrics = train_lyrics/np.array(train_lyrics_token_count.repeat(len(train_lyrics.columns))).reshape(train_lyrics.shape)

#Vectorize Test Lyrics
#test_lyrics = vectorizer.transform(df_test['Lyrics'])
#test_lyrics = pd.DataFrame(test_lyrics.todense(),columns = vectorizer.get_feature_names())
#test_lyrics_token_count = test_lyrics.sum(axis=1)
#test_lyrics = test_lyrics/np.array(test_lyrics_token_count.repeat(len(test_lyrics.columns))).reshape(test_lyrics.shape)



In [19]:
#If we can get vectorizer working, use this to make custom embedding matrix

#vocab_dict_custom = {}
#count = 0
#for word in vectorizer.get_feature_names():
#    vocab_dict_custom[word] = count
#    count = count + 1
#embedding_matrix_custom = np.random.random((len(vectorizer.get_feature_names()) + 1,300))
#embedding_matrix_custom[-1] = 0

lets look at the audio features we have too

In [42]:
# audio features should help too: lets see what results we get from
# a standard feed-forward network
# note: audio features have been normalized

train_normalized_audio_features = np.array(df_train['normalized_audio_feature_array'])
test_normalized_audio_features = np.array(df_test['normalized_audio_feature_array'])

model = keras.Sequential([
    keras.layers.Dense(100,activation='relu'),
    keras.layers.Dense(100,activation='relu'),
    keras.layers.Dense(7,activation='softmax')
])

#Compile the model, specifying loss function, optimizer, and performance metric
model.compile(loss = keras.losses.SparseCategoricalCrossentropy(),
             optimizer = keras.optimizers.Adam(learning_rate=0.01),
             metrics=['accuracy'],
             )

model.fit(x = train_normalized_audio_features,y = train_labels.map(mapping),batch_size=8,epochs=10,
         validation_data = (test_normalized_audio_features,test_labels.map(mapping)),
         use_multiprocessing=True,workers=multiprocessing.cpu_count() - 1)

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type list).

In [43]:
print(train_normalized_audio_features)

[list([0.08667260380184501, 0.5543217870346929, 0.1220273049840551, -0.674109859950688, -0.41543317804459345, -0.440099973618899, 0.27353173919337254, -0.04503025561781331, -0.63357693290532])
 list([1.769134455639869, 0.3955264896487572, 0.02601600085691811, -0.3318981833522548, 0.5337352175503375, -0.4389348974290065, 0.19657635700507578, -0.45322352624784257, -1.4860406055792426])
 list([0.16450891723207395, 0.22229525613682674, 0.5907140659204748, -0.6469305932172383, -0.6667172298266449, -0.4406102259648373, -0.48747148466867257, -0.7234102089486819, -0.4817594483694913])
 ...
 list([1.793084090541478, 0.18379942646750885, 0.009908841282957543, 0.7023810152118274, -0.055640103902110626, -0.4366089971521044, 0.9062759927415893, -0.8241204656775679, -0.4788146847200311])
 list([-0.3803452767795279, -1.4234014622265112, -2.145608101702935, 0.250680133420123, -0.6564374277082883, 0.6247158823995809, -0.8252201064950858, 1.4164032781183464, 0.7708677674537697])
 list([0.254320048113107