# Part 1: Lyric genre prediction task
This notebook evaluates the two best models, Model L and Model H, which were selected based on their performance during training and validation. Both models take two input features: lyrics and artist.

## Testing on original test data

### Preprocessing data

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import string
from nltk.corpus import stopwords

# Read the test set from disk: a comma-separated file whose text fields are
# wrapped in double quotes. index_col=False stops pandas from using the
# first column as the row index.
music_test = pd.read_csv(
    "data/test.csv",
    sep=",",
    quotechar='"',
    index_col=False,
)

# Shared state for preprocess_text, built once instead of once per row:
# the punctuation-stripping table and the (lazily loaded) NLTK stop-word set.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
_DEFAULT_STOP_WORDS = None


def _default_stop_words():
    """Return the NLTK English stop-word set, loading it only on first use."""
    global _DEFAULT_STOP_WORDS
    if _DEFAULT_STOP_WORDS is None:
        _DEFAULT_STOP_WORDS = frozenset(stopwords.words('english'))
    return _DEFAULT_STOP_WORDS


def preprocess_text(text, stop_words=None):
    """Lowercase a text, strip punctuation and drop stop words.

    Parameters
    ----------
    text : str
        Raw text (lyrics or artist name). Missing values (``None`` / NaN)
        are treated as empty text; other non-string values are converted
        with ``str``.
    stop_words : collection of str, optional
        Words to remove. Defaults to the NLTK English stop-word list, which
        is loaded once and cached.

    Returns
    -------
    str
        The remaining words joined by single spaces.
    """
    if not isinstance(text, str):
        # NaN is the only value that is not equal to itself.
        if text is None or text != text:
            return ""
        text = str(text)

    if stop_words is None:
        stop_words = _default_stop_words()

    # Punctuation is removed BEFORE the stop-word lookup, so contractions such
    # as "don't" become "dont" and are kept. This order is intentional here:
    # it is left unchanged so the text is cleaned the same way as before.
    cleaned = text.lower().strip().translate(_PUNCTUATION_TABLE)
    return ' '.join(word for word in cleaned.split() if word not in stop_words)

# preprocess lyrics
music_test["Prsd_Lyrics"] = music_test["Lyrics"].apply(preprocess_text)

# preprocess artist
music_test["Prsd_Artist"] = music_test["Artist"].apply(preprocess_text)

# extract labels & convert to one-hot encoded vectors
# NOTE(review): the genre -> index mapping follows the order in which genres
# first appear in the test file. Confirm it matches the mapping the models
# were trained with, otherwise the metrics are computed on permuted labels.
labels = music_test["Genre"]
label_dict = {label: i for i, label in enumerate(labels.unique())}
labels_encoded = labels.map(label_dict)
labels_categorical = tf.keras.utils.to_categorical(labels_encoded, num_classes=len(label_dict))

# tokenize lyrics (character level) and artist (word level)
# NOTE(review): both tokenizers are fitted on the test data in this notebook.
# Confirm the resulting token ids agree with the vocabulary used in training.
tokenizer_lyrics = Tokenizer(char_level=True)
tokenizer_artist = Tokenizer()
tokenizer_lyrics.fit_on_texts(music_test['Prsd_Lyrics'])
sequences_lyrics = tokenizer_lyrics.texts_to_sequences(music_test['Prsd_Lyrics'])
tokenizer_artist.fit_on_texts(music_test['Prsd_Artist'])
sequences_artist = tokenizer_artist.texts_to_sequences(music_test['Prsd_Artist'])

# Sequence lengths fed to the models. The lyric cap is described as chosen
# from the lyric-length distribution, excluding extreme values; that analysis
# is not part of this notebook (presumably done during training - confirm).
max_lyric_length = 4000
max_artist_length = 25

# filter out OOV tokens from input features
# The sequences hold integer ids, so membership is checked against index_word
# (id -> token). word_index is keyed by the token strings; testing integer ids
# against it never matches and would discard every token, leaving all-zero
# inputs after padding.
filtered_sequences_lyrics = [[token for token in seq if token in tokenizer_lyrics.index_word] for seq in sequences_lyrics]
filtered_sequences_artist = [[token for token in seq if token in tokenizer_artist.index_word] for seq in sequences_artist]

# Pad (or truncate) the filtered sequences to the fixed input lengths
X_lyrics_filtered = pad_sequences(filtered_sequences_lyrics, maxlen=max_lyric_length)
X_artist_filtered = pad_sequences(filtered_sequences_artist, maxlen=max_artist_length)

# Single matrix with the lyric columns followed by the artist columns.
# The evaluation cells pass the two padded arrays separately as a list.
X_test = np.concatenate((X_lyrics_filtered, X_artist_filtered), axis=1)
y_test = labels_categorical

### Evaluate top models from training & validation

In [None]:
# function to evaluate top models

def evaluate_model(model, X_test, y_test):
    """Evaluate a classifier and report loss, accuracy, precision and recall.

    Precision and recall are macro-averaged over the classes that occur in
    the true or the predicted labels. They are computed from a confusion
    matrix because ``tf.keras.metrics.Precision`` / ``Recall`` are binary
    metrics: given argmax class indices they treat class 0 as "negative" and
    every other class as "positive", which is not a multi-class score.

    Parameters
    ----------
    model : object
        Must provide ``evaluate(X, y)`` returning ``(loss, accuracy)`` and
        ``predict(X)`` returning per-class scores of shape
        ``(n_samples, n_classes)``.
        Assumes the model was compiled with accuracy as its only metric -
        TODO confirm.
    X_test : array-like or list of array-like
        Model input(s), forwarded unchanged to ``evaluate`` and ``predict``.
    y_test : array-like
        One-hot encoded true labels of shape ``(n_samples, n_classes)``.

    Returns
    -------
    tuple
        ``(loss, accuracy, precision, recall)``; precision and recall are
        Python floats.
    """
    loss, accuracy = model.evaluate(X_test, y_test)
    y_pred = np.asarray(model.predict(X_test))
    y_true = np.asarray(y_test)

    true_classes = y_true.argmax(axis=1)
    pred_classes = y_pred.argmax(axis=1)
    num_classes = max(y_true.shape[1], y_pred.shape[1])

    # confusion[i, j] = number of samples of true class i predicted as class j
    confusion = np.zeros((num_classes, num_classes), dtype=np.int64)
    np.add.at(confusion, (true_classes, pred_classes), 1)

    true_positives = np.diag(confusion).astype(float)
    predicted_counts = confusion.sum(axis=0)  # column sums: times each class was predicted
    actual_counts = confusion.sum(axis=1)     # row sums: times each class truly occurs

    # Classes with a zero denominator score 0 instead of dividing by zero.
    precision_per_class = np.divide(
        true_positives, predicted_counts,
        out=np.zeros(num_classes), where=predicted_counts > 0,
    )
    recall_per_class = np.divide(
        true_positives, actual_counts,
        out=np.zeros(num_classes), where=actual_counts > 0,
    )

    # Average only over classes that actually appear, so label columns that
    # are unused in this evaluation set do not drag the averages down.
    observed = (predicted_counts + actual_counts) > 0
    if observed.any():
        precision_value = float(precision_per_class[observed].mean())
        recall_value = float(recall_per_class[observed].mean())
    else:
        precision_value = 0.0
        recall_value = 0.0

    return loss, accuracy, precision_value, recall_value


#### Model H - In-training embedding

In [None]:
#if gdown not installed
!pip install gdown

In [None]:
import os

import gdown  # only needed for the model downloads; installed by the cell above

# download model from drive (skipped when the file is already present)
# gdown needs the direct-download form of the link: uc?id=<file id>
emb_url = 'https://drive.google.com/uc?id=1K0TRAstgjqjvwsEpUnCPV_tdvkfbo5-x'
emb = 'emb_dense.keras'
if not os.path.exists(emb):
    gdown.download(emb_url, emb, quiet=False)

# load model from the path it was downloaded to
emb_model = tf.keras.models.load_model(emb)

# evaluate model on the two separate inputs (lyrics, artist)
emb_loss, emb_accuracy, emb_precision_value, emb_recall_value = evaluate_model(emb_model, [X_lyrics_filtered, X_artist_filtered], y_test)

print("Loss:", emb_loss)
print("Accuracy:", emb_accuracy)
print("Precision:", emb_precision_value)
print("Recall:", emb_recall_value)


#### Model L - Pre-trained embedding

In [None]:
import os

import gdown  # only needed for the model downloads; installed via pip earlier in the notebook

# download model from drive (skipped when the file is already present)
# gdown needs the direct-download form of the link: uc?id=<file id>
w2v_url = 'https://drive.google.com/uc?id=1f42AjyMPQNISW1opPp_BSX8TeZnu5XXo'
w2v = 'w2v_lstm.keras'
if not os.path.exists(w2v):
    gdown.download(w2v_url, w2v, quiet=False)

# load model
w2v_model = tf.keras.models.load_model(w2v)

# evaluate model on the two separate inputs (lyrics, artist)
w2v_loss, w2v_accuracy, w2v_precision_value, w2v_recall_value = evaluate_model(w2v_model, [X_lyrics_filtered, X_artist_filtered], y_test)

print("Loss:", w2v_loss)
print("Accuracy:", w2v_accuracy)
print("Precision:", w2v_precision_value)
print("Recall:", w2v_recall_value)

## Testing on downsampled test data

### Preprocess data

In [None]:
# draw a reproducible 2% random sample of the test set
newlen = int(len(music_test) * 0.02)
sample_test = music_test.sample(n=newlen, random_state=53).copy()

# preprocess lyrics & artist
sample_test["Prsd_Lyrics"] = sample_test["Lyrics"].apply(preprocess_text)
sample_test["Prsd_Artist"] = sample_test["Artist"].apply(preprocess_text)

# extract labels & convert to one-hot encoded vectors
# Reuse the genre -> index mapping built for the full test set. Rebuilding it
# from the sample would number the genres by their order of appearance in the
# sample, and a genre missing from the sample would shrink the one-hot width.
labels_b = sample_test["Genre"]
label_dict_b = label_dict
labels_encoded_b = labels_b.map(label_dict_b)
labels_categorical_b = tf.keras.utils.to_categorical(labels_encoded_b, num_classes=len(label_dict_b))

# tokenize & pad lyrics and artist
# Reuse the tokenizers fitted on the full test set so a given character/word
# maps to the same id in both evaluations.
tokenizer_lyrics_b = tokenizer_lyrics
tokenizer_artist_b = tokenizer_artist
sequences_lyrics_b = tokenizer_lyrics_b.texts_to_sequences(sample_test['Prsd_Lyrics'])
sequences_artist_b = tokenizer_artist_b.texts_to_sequences(sample_test['Prsd_Artist'])

# filter out OOV tokens from input features
# Sequences hold integer ids, so membership is checked against index_word
# (id -> token); word_index is keyed by token strings and never matches an id.
filtered_sequences_lyrics_b = [[token for token in seq if token in tokenizer_lyrics_b.index_word] for seq in sequences_lyrics_b]
filtered_sequences_artist_b = [[token for token in seq if token in tokenizer_artist_b.index_word] for seq in sequences_artist_b]

# Pad (or truncate) the filtered sequences to the fixed input lengths
X_lyrics_filtered_b = pad_sequences(filtered_sequences_lyrics_b, maxlen=max_lyric_length)
X_artist_filtered_b = pad_sequences(filtered_sequences_artist_b, maxlen=max_artist_length)

# Single matrix with the lyric columns followed by the artist columns.
# The evaluation cells pass the two padded arrays separately as a list.
X_test_b = np.concatenate((X_lyrics_filtered_b, X_artist_filtered_b), axis=1)
y_test_b = labels_categorical_b

### Evaluate top models from training & validation

#### Model H - In-training embedding

In [None]:
# Score Model H on the downsampled test inputs (lyrics and artist passed separately)
emb_results_b = evaluate_model(
    emb_model,
    [X_lyrics_filtered_b, X_artist_filtered_b],
    y_test_b,
)
emb_loss_b, emb_accuracy_b, emb_precision_value_b, emb_recall_value_b = emb_results_b

# Report the four metrics in the order they are returned
for metric_label, metric_value in zip(("Loss:", "Accuracy:", "Precision:", "Recall:"), emb_results_b):
    print(metric_label, metric_value)


#### Model L - Pre-trained embedding

In [None]:
# Score Model L on the downsampled test inputs (lyrics and artist passed separately)
w2v_results_b = evaluate_model(
    w2v_model,
    [X_lyrics_filtered_b, X_artist_filtered_b],
    y_test_b,
)
w2v_loss_b, w2v_accuracy_b, w2v_precision_value_b, w2v_recall_value_b = w2v_results_b

# Report the four metrics in the order they are returned
for metric_label, metric_value in zip(("Loss:", "Accuracy:", "Precision:", "Recall:"), w2v_results_b):
    print(metric_label, metric_value)