## Part 1.2 - Neural Network Models
This notebook contains all RNN models, variants and embeddings for Genre prediction using 2 input feature: song lyrics and artist.

### Data preprocessing

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import random
import string
from nltk.corpus import stopwords

# import file
music_df = pd.read_csv("p1_data/sample_train.csv", index_col=False, sep=",", quotechar='"')

# function to lowercase, remove punctuation & stopwords
def preprocess_text(text):
    text = text.lower()
    text = text.strip()
    text = text.translate(str.maketrans('', '', string.punctuation))
    stop_words = set(stopwords.words('english'))
    filtered_text = ' '.join([word for word in text.split() if word not in stop_words])
    return filtered_text

# preprocess lyrics
music_df["Prsd_Lyrics"] = music_df["Lyrics"].apply(preprocess_text)

# preprocess artist
music_df["Prsd_Artist"] = music_df["Artist"].apply(preprocess_text)

# extract labels & convert to one-hot encoded vectors
labels = music_df["Genre"]
label_dict = {label: i for i, label in enumerate(labels.unique())}
labels_encoded = labels.map(label_dict)
labels_categorical = tf.keras.utils.to_categorical(labels_encoded)

# tokenize & pad lyrics and artist
tokenizer_lyrics = Tokenizer(char_level=True)
tokenizer_artist = Tokenizer()
tokenizer_lyrics.fit_on_texts(music_df['Prsd_Lyrics'])
sequences_lyrics = tokenizer_lyrics.texts_to_sequences(music_df['Prsd_Lyrics'])
tokenizer_artist.fit_on_texts(music_df['Prsd_Artist'])
sequences_artist = tokenizer_artist.texts_to_sequences(music_df['Prsd_Artist'])

max_lyric_length = 4000 # chosen based on distribution above, excluding extreme values
max_artist_length = 25 # chosen based on distribution above, excluding extreme values

# combine input features
X_lyrics = pad_sequences(sequences_lyrics, maxlen=max_lyric_length)
X_artist = pad_sequences(sequences_artist, maxlen=max_artist_length)
X = np.concatenate((X_lyrics, X_artist), axis=1)

# randomly shuffle data indices of dataframe
data_indices = list(range(len(music_df)))
random.shuffle(data_indices)

# split data into train/test using indices
split_ratio = 0.8
split_index = int(len(music_df) * split_ratio)
train_indices = data_indices[:split_index]
test_indices = data_indices[split_index:]

X_train = X[train_indices]
X_test = X[test_indices]
y_train = labels_categorical[train_indices]
y_test = labels_categorical[test_indices]

# reshape for RNN & CNN architecture compatibility
X_train = X_train[..., np.newaxis]
X_test = X_test[..., np.newaxis]
X_train = X_train.astype(np.float32)
X_test = X_test.astype(np.float32)

### Basic Models
Exploring Simple RNN, LSTM and dense layers

In [21]:
# early stopping
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# learning rate
opt = tf.keras.optimizers.Adam(learning_rate=0.001)

#### Simple RNN

In [None]:
# simple rnn model
rnn_model = tf.keras.Sequential([
    tf.keras.layers.SimpleRNN(units=64, return_sequences=False),
    tf.keras.layers.Dense(units=64, activation="relu"),
    tf.keras.layers.Dense(units=len(label_dict), activation="softmax") # final output layer
])

# compile the model
rnn_model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy']) 

# train & evaluate the model
rnn_model.fit(X_train, y_train, epochs=10, validation_split=0.2, callbacks=[early_stopping])
rnn_loss, rnn_accuracy = rnn_model.evaluate(X_test, y_test)
print("Test Accuracy:", rnn_accuracy)

In [None]:
rnn_model.summary()

#### RNN + LSTM

In [None]:
# LSTM rnn model
lstm_model = tf.keras.Sequential([
    tf.keras.layers.SimpleRNN(units=8, return_sequences=True),
    tf.keras.layers.LSTM(16),
    tf.keras.layers.Dense(units=len(label_dict), activation="softmax") # final output layer
])

# compile the model
lstm_model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy']) 

# train & evaluate the model
lstm_model.fit(X_train, y_train, epochs=10, validation_split=0.2, callbacks=[early_stopping])
lstm_loss, lstm_accuracy = lstm_model.evaluate(X_test, y_test)
print("Test Accuracy:", lstm_accuracy)

In [None]:
lstm_model.summary()

### In-model Embedding
Variations of "on-the-job" embedding with different NN layers

In [14]:
# redefine test/train data
X_train_lyric = X_lyrics[train_indices]
X_train_artist = X_artist[train_indices]
y_train = labels_categorical[train_indices]

X_test_lyric = X_lyrics[test_indices]
X_test_artist = X_artist[test_indices]
y_test = labels_categorical[test_indices]

# define vocab size & embedding dimension
embedding_dim = 100 
artist_vocab_size = len(tokenizer_artist.word_index) + 1 # number of unique words
lyric_vocab_size = len(tokenizer_lyrics.word_index) + 1 # number of unique words

# define input layers
lyric_input = tf.keras.layers.Input(shape=(max_lyric_length,))
artist_input = tf.keras.layers.Input(shape=(max_artist_length,))

# embedding layer
lyric_embedding = tf.keras.layers.Embedding(input_dim=lyric_vocab_size, output_dim=embedding_dim, input_length=max_lyric_length)(lyric_input)
lyric_flatten = tf.keras.layers.Flatten()(lyric_embedding)
artist_embedding = tf.keras.layers.Embedding(input_dim=artist_vocab_size, output_dim=embedding_dim, input_length=max_artist_length)(artist_input)
artist_flatten = tf.keras.layers.Flatten()(artist_embedding)
concatenated = tf.keras.layers.concatenate([lyric_flatten, artist_flatten])

# to reshape LSTM & RNN to 1 timestep
reshaped = tf.keras.layers.Reshape((1, -1))(concatenated)

#### LSTM + Embedding

In [None]:
# LSTM layer
lstm_layer = tf.keras.layers.LSTM(units=64)(reshaped)

# output layer
output1 = tf.keras.layers.Dense(len(label_dict), activation='softmax')(lstm_layer)

# define model
emb_model1 = tf.keras.Model(inputs=[lyric_input, artist_input], outputs=output1)

# compile , train & evaluate the model
emb_model1.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy']) 
emb_model1.fit([X_train_lyric, X_train_artist], y_train, epochs=10, validation_split=0.2, callbacks=[early_stopping])
emb_loss1, emb_accuracy1 = emb_model1.evaluate([X_test_lyric, X_test_artist], y_test)
print("Test Accuracy:", emb_accuracy1)

In [None]:
emb_model1.summary()

#### RNN + Embedding


In [None]:
# simple RNN layers
rnn_layer1 = tf.keras.layers.SimpleRNN(units=64)(reshaped)

# output layer
output2 = tf.keras.layers.Dense(len(label_dict), activation='softmax')(rnn_layer1)

# define model
emb_model2 = tf.keras.Model(inputs=[lyric_input, artist_input], outputs=output2)

# compile , train & evaluate the model
emb_model2.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy']) 
emb_model2.fit([X_train_lyric, X_train_artist], y_train, epochs=10, validation_split=0.2, callbacks=[early_stopping])
emb_loss2, emb_accuracy2 = emb_model2.evaluate([X_test_lyric, X_test_artist], y_test)
print("Test Accuracy:", emb_accuracy2)

In [None]:
emb_model2.summary()

#### Dense + Embedding

In [None]:
# dense layer
dense1 = tf.keras.layers.Dense(units=64)(concatenated)

# output layer
output3 = tf.keras.layers.Dense(len(label_dict), activation='softmax')(dense1)

# define model
emb_model3 = tf.keras.Model(inputs=[lyric_input, artist_input], outputs=output3)

# compile , train & evaluate the model
emb_model3.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy']) 
emb_model3.fit([X_train_lyric, X_train_artist], y_train, epochs=10, validation_split=0.2, callbacks=[early_stopping])
emb_loss3, emb_accuracy3 = emb_model3.evaluate([X_test_lyric, X_test_artist], y_test)
print("Test Accuracy:", emb_accuracy3)

In [None]:
emb_model3.summary()

### Pre-trained embeddings
Using the gensim library, I will pretrain a Word2Vec model on the lyric data and train same set of models to compare with in-model embedding

In [2]:
import gensim
from gensim.models import Word2Vec

# select lyrics & artist, convert to list, split into sentences & concatenate
lyrics_list = music_df["Prsd_Lyrics"].tolist()
artist_list = music_df["Prsd_Artist"].tolist()
combined_texts = [lyrics + " " + artist for lyrics, artist in zip(lyrics_list, artist_list)]
sentences_combined = [text.split() for text in combined_texts]

# train the Word2Vec model
w2v_pmodel = Word2Vec(sentences=sentences_combined, window=5, min_count=1, workers=4)

# save model
w2v_pmodel.save("models_p1/word2vec_model2")

In [3]:
# define vocab_size & embedding dimension
embedding_dim = 100 
w2v_lyric_vocab_size = len(w2v_pmodel.wv.key_to_index)
w2v_artist_vocab_size = len(w2v_pmodel.wv.key_to_index)
vocab_size1 = max(w2v_lyric_vocab_size, w2v_artist_vocab_size)

# load pre-trained Word2vec embedding weights
word2vec_model = gensim.models.Word2Vec.load("models_p1/word2vec_model")
w2v_embedding_matrix = np.zeros((vocab_size1, embedding_dim))

# define input layers
w2v_lyric_input = tf.keras.layers.Input(shape=(max_lyric_length,))
w2v_artist_input = tf.keras.layers.Input(shape=(max_artist_length,))

# define embedding layer
w2v_lyric_embedding = tf.keras.layers.Embedding(input_dim=w2v_lyric_vocab_size, output_dim=embedding_dim, input_length=max_lyric_length, weights=[w2v_embedding_matrix])(w2v_lyric_input)
w2v_lyric_flatten = tf.keras.layers.Flatten()(w2v_lyric_embedding)

w2v_artist_embedding = tf.keras.layers.Embedding(input_dim=w2v_artist_vocab_size, output_dim=embedding_dim, input_length=max_artist_length, weights=[w2v_embedding_matrix])(w2v_artist_input)
w2v_artist_flatten = tf.keras.layers.Flatten()(w2v_artist_embedding)

w2v_concatenated = tf.keras.layers.concatenate([w2v_lyric_flatten, w2v_artist_flatten])

# to reshape LSTM & RNN to 1 timestep
w2v_reshaped = tf.keras.layers.Reshape((1, -1))(w2v_concatenated)

#### LSTM + Pre-trained Word2Vec embedding

In [16]:
# LSTM layer
lstm_layer2 = tf.keras.layers.LSTM(units=64)(w2v_reshaped)

# output layer
w2v_output1 = tf.keras.layers.Dense(len(label_dict), activation='softmax')(lstm_layer2)

# define model
w2v_model1 = tf.keras.Model(inputs=[w2v_lyric_input, w2v_artist_input], outputs=w2v_output1)

# compile , train & evaluate the model
w2v_model1.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy']) 
w2v_model1.fit([X_train_lyric, X_train_artist], y_train, epochs=10, validation_split=0.2, callbacks=[early_stopping])
w2v_loss1, w2v_accuracy1 = w2v_model1.evaluate([X_test_lyric, X_test_artist], y_test)
print("Test Accuracy:", w2v_accuracy1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Test Accuracy: 0.6830318570137024


In [17]:
w2v_model1.summary()
w2v_model1.save("models_p11/emb/w2v_lstm.keras")

Model: "model_4"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 4000)]       0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 25)]         0           []                               
                                                                                                  
 embedding_1 (Embedding)        (None, 4000, 100)    4527200     ['input_1[0][0]']                
                                                                                                  
 embedding_2 (Embedding)        (None, 25, 100)      4527200     ['input_2[0][0]']                
                                                                                            

#### RNN + Pre-trained Word2Vec embedding

In [18]:
# simple RNN layer
rnn_layer2 = tf.keras.layers.SimpleRNN(units=64)(w2v_reshaped)

# output layer
w2v_output2 = tf.keras.layers.Dense(len(label_dict), activation='softmax')(rnn_layer2)

# define model
w2v_model2 = tf.keras.Model(inputs=[w2v_lyric_input, w2v_artist_input], outputs=w2v_output2)

# compile , train & evaluate the model
w2v_model2.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy']) 
w2v_model2.fit([X_train_lyric, X_train_artist], y_train, epochs=10, validation_split=0.2, callbacks=[early_stopping])
w2v_loss2, w2v_accuracy2 = w2v_model2.evaluate([X_test_lyric, X_test_artist], y_test)
print("Test Accuracy:", w2v_accuracy2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Test Accuracy: 0.403962105512619


#### Dense + Pre-trained Word2Vec embedding

In [19]:
# dense layer
w2v_flattened = tf.keras.layers.Flatten()(w2v_concatenated) # to convert multi-D data into 1D tensor
dense2 = tf.keras.layers.Dense(units=64)(w2v_flattened) 

# output layer
w2v_output3 = tf.keras.layers.Dense(len(label_dict), activation='softmax')(dense2)

# pre_trained embedding model
w2v_model3 = tf.keras.Model(inputs=[w2v_lyric_input, w2v_artist_input], outputs=w2v_output3)

# compile , train & evaluate the model
w2v_model3.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy']) 
w2v_model3.fit([X_train_lyric, X_train_artist], y_train, epochs=10, validation_split=0.2, callbacks=[early_stopping])
w2v_loss3, w2v_accuracy3 = w2v_model3.evaluate([X_test_lyric, X_test_artist], y_test)
print("Test Accuracy:", w2v_accuracy3)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Test Accuracy: 0.6778638958930969


### Convolutional layers
Experimenting with convolutional layers-only models and mixing in LSTM

#### Basic CNN

In [25]:
# define CNN model
cnn_model = tf.keras.Sequential([
    tf.keras.layers.Conv1D(filters=32, kernel_size=3, activation='relu'),
    tf.keras.layers.MaxPooling1D(pool_size=2),
    tf.keras.layers.Conv1D(filters=64, kernel_size=5, activation='relu'),
    tf.keras.layers.MaxPooling1D(pool_size=2),
    tf.keras.layers.Conv1D(filters=128, kernel_size=7, activation='relu'),
    tf.keras.layers.MaxPooling1D(pool_size=2),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(units=128, activation="relu"),
    tf.keras.layers.Dense(units=len(label_dict), activation="softmax")
])

# compile model
cnn_model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])

# train & evaluate model
cnn_model.fit(X_train, y_train, epochs=10, validation_split=0.2, callbacks=[early_stopping])
loss, accuracy = cnn_model.evaluate(X_test, y_test)
print("Test Accuracy:", accuracy)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Test Accuracy: 0.49440136551856995


#### CNN + LSTM

In [27]:
# define CNN + LSTM model
cnnlstm_model = tf.keras.Sequential([
    tf.keras.layers.Conv1D(filters=32, kernel_size=3, activation='relu'),
    tf.keras.layers.MaxPooling1D(pool_size=2),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Reshape((-1, 32)), # reshape for LSTM input
    tf.keras.layers.LSTM(units=64, return_sequences=False),
    tf.keras.layers.Dense(units=len(label_dict), activation="softmax")
])

# compile model
cnnlstm_model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])

# train & evalutate model
cnnlstm_model.fit(X_train, y_train, epochs=10, validation_split=0.2, callbacks=[early_stopping])
loss1, accuracy1 = cnnlstm_model.evaluate(X_test, y_test)
print("Test Accuracy:", accuracy1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Test Accuracy: 0.42721790075302124
