## Part 1.1 - Neural Network Models
This notebook contains preliminary data exploration, data preprocessing and training for all RNN & CNN models, their variants and embeddings, for genre prediction from a single input feature: song lyrics.

### Data exploration

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# import file
music_df = pd.read_csv("p1_data/sample_train.csv", index_col=False, sep=",", quotechar='"')

# view class distribution
print(music_df.groupby(["Genre"])["Genre"].count())

# lyric length calculation (number of characters per lyric)
music_df["Lyric_Length"] = music_df["Lyrics"].apply(len)
lyric_length_distribution = music_df["Lyric_Length"].describe()

# artist length calculation (number of characters per artist name)
music_df["Artist_Length"] = music_df["Artist"].apply(len)
artist_length_distribution = music_df["Artist_Length"].describe()

# the original cell stored both summaries under this one name, so the lyric
# statistics were overwritten; the name is kept (with its final value) for compatibility
text_length_distribution = artist_length_distribution

# show both summaries so they can be read next to the histograms
print(lyric_length_distribution)
print(artist_length_distribution)

# visualize both distributions
fig, axs = plt.subplots(1, 2, figsize=(12, 5))

# lyric length distribution
axs[0].hist(music_df["Lyric_Length"])
axs[0].set_xlabel("Text Length")
axs[0].set_ylabel("Number of Songs")
axs[0].set_title("Distribution of Text Lengths in Lyrics")

# artist length distribution
axs[1].hist(music_df["Artist_Length"], color="green")
axs[1].set_xlabel("Text Length")
axs[1].set_ylabel("Number of Songs")
axs[1].set_title("Distribution of Text Lengths in Artist")

plt.tight_layout()
plt.show()


### Data preprocessing

In [None]:
import numpy as np
import tensorflow as tf
import random
import string
from nltk.corpus import stopwords

# function to lowercase, remove punctuation & stopwords

# translation table that deletes every ASCII punctuation character (built once)
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# cache for the NLTK stop-word set so the corpus is read once, not once per lyric
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return the NLTK English stop-word set, loading it on first use only."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def preprocess_text(text, stop_words=None):
    """Lowercase `text`, drop punctuation and remove stop words.

    Punctuation is removed before the stop-word filter runs, so a contraction
    such as "don't" is compared against the stop list as "dont".

    Args:
        text: the raw string to clean.
        stop_words: optional collection of words to remove. When omitted, the
            NLTK English stop-word list is used (loaded once and cached).

    Returns:
        The remaining words joined by single spaces.
    """
    if stop_words is None:
        stop_words = _english_stop_words()
    cleaned = text.lower().strip().translate(_PUNCTUATION_TABLE)
    return ' '.join(word for word in cleaned.split() if word not in stop_words)

# clean every lyric with preprocess_text and keep the result in a new column
music_df["Prsd_Lyrics"] = music_df["Lyrics"].apply(preprocess_text)

# map each genre to an integer id (in order of first appearance), then one-hot encode
labels = music_df["Genre"]
genre_names = labels.unique()
label_dict = dict(zip(genre_names, range(len(genre_names))))
labels_encoded = labels.map(label_dict)
labels_categorical = tf.keras.utils.to_categorical(labels_encoded)

# character-level tokenizer (char_level=True): every character becomes one token id
tokenizer = tf.keras.preprocessing.text.Tokenizer(char_level=True)
processed_lyrics = music_df['Prsd_Lyrics']
tokenizer.fit_on_texts(processed_lyrics)
sequences = tokenizer.texts_to_sequences(processed_lyrics)

# number of distinct tokens (characters here) plus one
vocab_size = 1 + len(tokenizer.word_index)

# pad / truncate every sequence to a fixed length
max_text_length = 4000 # chosen based on distribution above, excluding extreme values
X = tf.keras.preprocessing.sequence.pad_sequences(
    sequences,
    maxlen=max_text_length,
)

# randomly shuffle data indices of dataframe
# a dedicated, seeded generator makes the split identical on every run and
# leaves the global `random` state untouched
SPLIT_SEED = 42
data_indices = list(range(len(music_df)))
random.Random(SPLIT_SEED).shuffle(data_indices)

# split data into train/test using indices
split_ratio = 0.8
split_index = int(len(music_df) * split_ratio)
train_indices = data_indices[:split_index]
test_indices = data_indices[split_index:]

# rows of X and labels_categorical are selected with the same index lists,
# so features and labels stay aligned
X_train = X[train_indices]
X_test = X[test_indices]
y_train = labels_categorical[train_indices]
y_test = labels_categorical[test_indices]

# append a trailing axis of size 1 and cast to float32 so the arrays have the
# extra feature dimension the RNN & CNN layers are fed with
X_train = np.expand_dims(X_train, axis=-1).astype(np.float32)
X_test = np.expand_dims(X_test, axis=-1).astype(np.float32)

### Basic Models
Exploring Simple RNN, LSTM and dense layers

In [None]:
# Adam optimizer with a fixed learning rate
opt = tf.keras.optimizers.Adam(learning_rate=0.001)

# early stopping against overfitting: watch the validation loss, allow 3 epochs
# without improvement, then restore the best weights seen
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)

#### Simple RNN

In [None]:
# simple rnn model: two stacked SimpleRNN layers followed by a softmax classifier
rnn_model = tf.keras.Sequential([
    tf.keras.layers.SimpleRNN(units=64, return_sequences=True),
    tf.keras.layers.SimpleRNN(units=16, dropout=0.2),
    tf.keras.layers.Dense(units=len(label_dict), activation="softmax") # final output layer
])

# an optimizer instance keeps state tied to the variables it has trained, so it
# must not be shared between models; build a fresh one from `opt`'s configuration
rnn_opt = type(opt).from_config(opt.get_config())

# compile the model
rnn_model.compile(optimizer=rnn_opt, loss='categorical_crossentropy', metrics=['accuracy'])

# train & evaluate the model
rnn_model.fit(X_train, y_train, epochs=10, validation_split=0.2, callbacks=[early_stopping])
rnn_loss, rnn_accuracy = rnn_model.evaluate(X_test, y_test)
print("Test Accuracy:", rnn_accuracy)

In [None]:
# show the architecture of the trained simple RNN model
rnn_model.summary()

# make sure the target directory exists before saving (no-op when it already does)
tf.io.gfile.makedirs("models_p1/rnn")
rnn_model.save("models_p1/rnn/rnn.keras")

#### LSTM

In [None]:
# LSTM model: a single LSTM layer followed by a softmax classifier
lstm_model = tf.keras.Sequential([
    tf.keras.layers.LSTM(units=64),
    tf.keras.layers.Dense(units=len(label_dict), activation="softmax") # final output layer
])

# an optimizer instance keeps state tied to the variables it has trained, so it
# must not be shared between models; build a fresh one from `opt`'s configuration
lstm_opt = type(opt).from_config(opt.get_config())

# compile the model
lstm_model.compile(optimizer=lstm_opt, loss='categorical_crossentropy', metrics=['accuracy'])

# train & evaluate the model
lstm_model.fit(X_train, y_train, epochs=10, validation_split=0.2, callbacks=[early_stopping])
lstm_loss, lstm_accuracy = lstm_model.evaluate(X_test, y_test)
print("Test Accuracy:", lstm_accuracy)

In [None]:
# show the architecture summary of the LSTM model
lstm_model.summary()

#### RNN + LSTM

In [None]:
# LSTM RNN model: a SimpleRNN layer feeding an LSTM layer, then a softmax classifier
lstm_rnn_model = tf.keras.Sequential([
    tf.keras.layers.SimpleRNN(units=8, return_sequences=True),
    tf.keras.layers.LSTM(units=16),
    tf.keras.layers.Dense(units=len(label_dict), activation="softmax")
])

# an optimizer instance keeps state tied to the variables it has trained, so it
# must not be shared between models; build a fresh one from `opt`'s configuration
lstm_rnn_opt = type(opt).from_config(opt.get_config())

# compile the model
lstm_rnn_model.compile(optimizer=lstm_rnn_opt, loss='categorical_crossentropy', metrics=['accuracy'])

# train & evaluate the model
lstm_rnn_model.fit(X_train, y_train, epochs=10, validation_split=0.2, callbacks=[early_stopping])
lstm_rnn_loss, lstm_rnn_accuracy = lstm_rnn_model.evaluate(X_test, y_test)
print("Test Accuracy:", lstm_rnn_accuracy)

In [None]:
# show the architecture summary of the RNN + LSTM model
lstm_rnn_model.summary()

### In-model Embedding
Below are variations of in-model embedding with different NN layers

In [None]:
# size of each learned embedding vector
embedding_dim = 50

# one token id per position, max_text_length positions per sample
input_layer = tf.keras.layers.Input(shape=(max_text_length,))

# trainable embedding applied to the input
# NOTE: `embedding_layer` and `reshaped` are reused by the model cells below, so
# the models built on them share a single set of embedding weights
in_model_embedding = tf.keras.layers.Embedding(
    input_dim=vocab_size,
    output_dim=embedding_dim,
    input_length=max_text_length,
)
embedding_layer = in_model_embedding(input_layer)

# collapse all embedded positions into one timestep for the LSTM & RNN layers
single_timestep = tf.keras.layers.Reshape((1, -1))
reshaped = single_timestep(embedding_layer)

#### LSTM + Embedding

In [None]:
# LSTM layer on top of the reshaped embedding output
lstm_layer = tf.keras.layers.LSTM(units=64)(reshaped)

# output layer
output1 = tf.keras.layers.Dense(len(label_dict), activation='softmax')(lstm_layer)

# define model
emb_model1 = tf.keras.Model(inputs=input_layer, outputs=output1)

# an optimizer instance keeps state tied to the variables it has trained, so it
# must not be shared between models; build a fresh one from `opt`'s configuration
emb_opt1 = type(opt).from_config(opt.get_config())

# compile, train & evaluate the model
emb_model1.compile(optimizer=emb_opt1, loss='categorical_crossentropy', metrics=['accuracy'])
emb_model1.fit(X_train, y_train, epochs=10, validation_split=0.2, callbacks=[early_stopping])
emb_loss1, emb_accuracy1 = emb_model1.evaluate(X_test, y_test)
print("Test Accuracy:", emb_accuracy1)

In [None]:
# show the architecture of the trained LSTM + embedding model
emb_model1.summary()

# make sure the target directory exists before saving (no-op when it already does)
tf.io.gfile.makedirs("models_p1/emb")
emb_model1.save("models_p1/emb/emb_lstm.keras")

#### RNN + Embedding

In [None]:
# simple RNN layer on top of the reshaped embedding output
rnn_layer = tf.keras.layers.SimpleRNN(units=64)(reshaped)

# output layer
output2 = tf.keras.layers.Dense(len(label_dict), activation='softmax')(rnn_layer)

# define model
emb_model2 = tf.keras.Model(inputs=input_layer, outputs=output2)

# an optimizer instance keeps state tied to the variables it has trained, so it
# must not be shared between models; build a fresh one from `opt`'s configuration
emb_opt2 = type(opt).from_config(opt.get_config())

# compile, train & evaluate the model
emb_model2.compile(optimizer=emb_opt2, loss='categorical_crossentropy', metrics=['accuracy'])
emb_model2.fit(X_train, y_train, epochs=10, validation_split=0.2, callbacks=[early_stopping])
emb_loss2, emb_accuracy2 = emb_model2.evaluate(X_test, y_test)
print("Test Accuracy:", emb_accuracy2)

In [None]:
# show the architecture summary of the RNN + embedding model
emb_model2.summary()

#### Dense + Embedding

In [None]:
# dense layer
flattened = tf.keras.layers.Flatten()(embedding_layer) # to convert multi-D data into 1D tensor
dense1 = tf.keras.layers.Dense(units=64)(flattened)

# output layer
output3 = tf.keras.layers.Dense(len(label_dict), activation='softmax')(dense1)

# define model
emb_model3 = tf.keras.Model(inputs=input_layer, outputs=output3)

# an optimizer instance keeps state tied to the variables it has trained, so it
# must not be shared between models; build a fresh one from `opt`'s configuration
emb_opt3 = type(opt).from_config(opt.get_config())

# compile, train & evaluate the model
emb_model3.compile(optimizer=emb_opt3, loss='categorical_crossentropy', metrics=['accuracy'])
emb_model3.fit(X_train, y_train, epochs=10, validation_split=0.2, callbacks=[early_stopping])
emb_loss3, emb_accuracy3 = emb_model3.evaluate(X_test, y_test)
print("Test Accuracy:", emb_accuracy3)

In [None]:
# show the architecture summary of the Dense + embedding model
emb_model3.summary()

### Pre-trained embeddings
Using the gensim library, I will pre-train a Word2Vec model on the lyric data and train the same set of models to compare against the in-model embeddings.

In [None]:
import gensim
from gensim.models import Word2Vec

# select lyrics & turn each one into a list of tokens
# the tokens come from `tokenizer` (character-level, the one X was built with),
# so the Word2Vec vocabulary lines up with `tokenizer.word_index`; splitting on
# whitespace would give word tokens that the tokenizer's index does not contain
lyrics_list = music_df["Prsd_Lyrics"].tolist()
sentences = [
    [tokenizer.index_word[token_id] for token_id in token_ids]
    for token_ids in tokenizer.texts_to_sequences(lyrics_list)
]

# train the Word2Vec model
w2v_pmodel = Word2Vec(sentences=sentences, window=5, min_count=1, workers=4)

# save model (create the target directory first; no-op when it already exists)
tf.io.gfile.makedirs("models_p1")
w2v_pmodel.save("models_p1/word2vec_model")

# load pre-trained Word2vec embedding weights
word2vec_model = gensim.models.Word2Vec.load("models_p1/word2vec_model")

# define vocab_size & embedding dimension
# the dimension is read from the trained model instead of being hard-coded, so
# the matrix always matches the vectors (this was a literal 100 before)
embedding_dim = word2vec_model.wv.vector_size
vocab_size = len(tokenizer.word_index) + 1

# fill row i with the vector of the token whose tokenizer index is i; rows with
# no trained vector stay zero (row 0 is presumably the padding id: no entry of
# word_index maps to it)
w2v_embedding_matrix = np.zeros((vocab_size, embedding_dim))
for token, token_id in tokenizer.word_index.items():
    if token in word2vec_model.wv:
        w2v_embedding_matrix[token_id] = word2vec_model.wv[token]

In [None]:
# one token id per position, max_text_length positions per sample
w2v_input_layer = tf.keras.layers.Input(shape=(max_text_length,))

# embedding whose initial weights are taken from w2v_embedding_matrix
w2v_embedding = tf.keras.layers.Embedding(
    input_dim=vocab_size,
    output_dim=embedding_dim,
    input_length=max_text_length,
    weights=[w2v_embedding_matrix],
)
w2v_embedding_layer = w2v_embedding(w2v_input_layer)

# collapse all embedded positions into one timestep for the LSTM & RNN layers
w2v_single_timestep = tf.keras.layers.Reshape((1, -1))
w2v_reshaped = w2v_single_timestep(w2v_embedding_layer)


#### LSTM + Pre-trained Word2Vec embedding

In [None]:
# LSTM layer on top of the reshaped Word2Vec embedding output
lstm_layer2 = tf.keras.layers.LSTM(units=64)(w2v_reshaped)

# output layer
w2v_output1 = tf.keras.layers.Dense(len(label_dict), activation='softmax')(lstm_layer2)

# define model
w2v_model1 = tf.keras.Model(inputs=w2v_input_layer, outputs=w2v_output1)

# an optimizer instance keeps state tied to the variables it has trained, so it
# must not be shared between models; build a fresh one from `opt`'s configuration
w2v_opt1 = type(opt).from_config(opt.get_config())

# compile, train & evaluate the model
w2v_model1.compile(optimizer=w2v_opt1, loss='categorical_crossentropy', metrics=['accuracy'])
w2v_model1.fit(X_train, y_train, epochs=10, validation_split=0.2, callbacks=[early_stopping])
w2v_loss1, w2v_accuracy1 = w2v_model1.evaluate(X_test, y_test)
print("Test Accuracy:", w2v_accuracy1)

In [None]:
# show the architecture of the trained LSTM + Word2Vec embedding model
w2v_model1.summary()

# make sure the target directory exists before saving (no-op when it already does)
tf.io.gfile.makedirs("models_p1/emb")
w2v_model1.save("models_p1/emb/w2v_lstm.keras")

#### RNN + Pre-trained Word2Vec embedding

In [None]:
# simple RNN layer on top of the reshaped Word2Vec embedding output
rnn_layer2 = tf.keras.layers.SimpleRNN(units=64)(w2v_reshaped)

# output layer
w2v_output2 = tf.keras.layers.Dense(len(label_dict), activation='softmax')(rnn_layer2)

# define model
w2v_model2 = tf.keras.Model(inputs=w2v_input_layer, outputs=w2v_output2)

# an optimizer instance keeps state tied to the variables it has trained, so it
# must not be shared between models; build a fresh one from `opt`'s configuration
w2v_opt2 = type(opt).from_config(opt.get_config())

# compile, train & evaluate the model
w2v_model2.compile(optimizer=w2v_opt2, loss='categorical_crossentropy', metrics=['accuracy'])
w2v_model2.fit(X_train, y_train, epochs=10, validation_split=0.2, callbacks=[early_stopping])
w2v_loss2, w2v_accuracy2 = w2v_model2.evaluate(X_test, y_test)
print("Test Accuracy:", w2v_accuracy2)

In [None]:
# show the architecture summary of the RNN + Word2Vec embedding model
w2v_model2.summary()

#### Dense + Pre-trained Word2Vec embedding

In [None]:
# dense layer
w2v_flattened = tf.keras.layers.Flatten()(w2v_embedding_layer) # to convert multi-D data into 1D tensor
dense2 = tf.keras.layers.Dense(units=64)(w2v_flattened)

# output layer
w2v_output3 = tf.keras.layers.Dense(len(label_dict), activation='softmax')(dense2)

# pre_trained embedding model
w2v_model3 = tf.keras.Model(inputs=w2v_input_layer, outputs=w2v_output3)

# an optimizer instance keeps state tied to the variables it has trained, so it
# must not be shared between models; build a fresh one from `opt`'s configuration
w2v_opt3 = type(opt).from_config(opt.get_config())

# compile, train & evaluate the model
w2v_model3.compile(optimizer=w2v_opt3, loss='categorical_crossentropy', metrics=['accuracy'])
w2v_model3.fit(X_train, y_train, epochs=10, validation_split=0.2, callbacks=[early_stopping])
w2v_loss3, w2v_accuracy3 = w2v_model3.evaluate(X_test, y_test)
print("Test Accuracy:", w2v_accuracy3)

In [None]:
# show the architecture summary of the Dense + Word2Vec embedding model
w2v_model3.summary()

### Convolutional layers
Experimenting with convolutional layers-only models and mixing in LSTM

#### Basic CNN

In [None]:
# define CNN model: three Conv1D + max-pooling stages, then a dense classifier head
cnn_model = tf.keras.Sequential([
    tf.keras.layers.Conv1D(filters=32, kernel_size=3, activation='relu'),
    tf.keras.layers.MaxPooling1D(pool_size=2),
    tf.keras.layers.Conv1D(filters=64, kernel_size=5, activation='relu'),
    tf.keras.layers.MaxPooling1D(pool_size=2),
    tf.keras.layers.Conv1D(filters=128, kernel_size=7, activation='relu'),
    tf.keras.layers.MaxPooling1D(pool_size=2),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(units=128, activation="relu"),
    tf.keras.layers.Dense(units=len(label_dict), activation="softmax")
])

# an optimizer instance keeps state tied to the variables it has trained, so it
# must not be shared between models; build a fresh one from `opt`'s configuration
cnn_opt = type(opt).from_config(opt.get_config())

# compile model
cnn_model.compile(optimizer=cnn_opt, loss='categorical_crossentropy', metrics=['accuracy'])

# train & evaluate model
cnn_model.fit(X_train, y_train, epochs=10, validation_split=0.2, callbacks=[early_stopping])
loss, accuracy = cnn_model.evaluate(X_test, y_test)
print("Test Accuracy:", accuracy)

#### CNN + LSTM

In [None]:
# define CNN + LSTM model: one Conv1D + max-pooling stage feeding two stacked LSTMs
cnnlstm_model = tf.keras.Sequential([
    tf.keras.layers.Conv1D(filters=32, kernel_size=3, activation='relu'),
    tf.keras.layers.MaxPooling1D(pool_size=2),
    # Flatten followed by Reshape((-1, 32)) restores the (steps, 32) layout the
    # pooling layer already produced; both layers are kept as in the original stack
    tf.keras.layers.Flatten(),
    tf.keras.layers.Reshape((-1, 32)), # reshape for LSTM input
    tf.keras.layers.LSTM(units=16, return_sequences=True),
    tf.keras.layers.LSTM(units=32),
    tf.keras.layers.Dense(units=len(label_dict), activation="softmax")
])

# an optimizer instance keeps state tied to the variables it has trained, so it
# must not be shared between models; build a fresh one from `opt`'s configuration
cnnlstm_opt = type(opt).from_config(opt.get_config())

# compile model
cnnlstm_model.compile(optimizer=cnnlstm_opt, loss='categorical_crossentropy', metrics=['accuracy'])

# train & evaluate model
cnnlstm_model.fit(X_train, y_train, epochs=10, validation_split=0.2, callbacks=[early_stopping])
loss1, accuracy1 = cnnlstm_model.evaluate(X_test, y_test)
print("Test Accuracy:", accuracy1)