## Predicting Lyric Genre Using Natural Language Model 

#### Introduction: Packages and Data

In [2]:
#import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
keras.utils.set_random_seed(42)

In [5]:
#data
train_df = pd.read_csv("lyric_genre_train.csv")
test_df = pd.read_csv("lyric_genre_test.csv")
val_df = pd.read_csv("lyric_genre_val.csv")

In [6]:
train_df.head()

Unnamed: 0.1,Unnamed: 0,Lyric,Genre
0,0,"Oh, girl. I can't get ready (Can't get ready f...",Pop
1,1,We met on a rainy evening in the summertime. D...,Pop
2,2,We carried you in our arms. On Independence Da...,Rock
3,3,I know he loved you. A long time ago. I ain't ...,Pop
4,4,Paralysis through analysis. Yellow moral uncle...,Rock


In [7]:
train_df.tail()

Unnamed: 0.1,Unnamed: 0,Lyric,Genre
48986,48986,"[Hook]. Beamer, Benz, Or Bentley. Beamer, Benz...",Hip Hop
48987,48987,You never listen to me. I know I'm better off ...,Pop
48988,48988,Things have come to a pretty pass. Our romance...,Pop
48989,48989,"Little baby, on my shoulder. I could fall into...",Pop
48990,48990,Music : Rudolf Schenker. Lyrics: Klaus Meine. ...,Rock


In [8]:
train_df.shape

(48991, 3)

In [9]:
test_df.shape

(21774, 3)

In [10]:
val_df.shape

(16331, 3)

In [12]:
#frequencies for each category
train_df['Genre'].value_counts()/train_df.shape[0]

Genre
Rock       0.549448
Pop        0.295136
Hip Hop    0.155416
Name: count, dtype: float64

In [32]:
#create dummies for the dependent variable Genre in each dataset by one hot encoding them
y_train = pd.get_dummies(train_df['Genre']).to_numpy()
y_val = pd.get_dummies(val_df['Genre']).to_numpy()
y_test = pd.get_dummies(test_df['Genre']).to_numpy()

In [14]:
y_train

array([[False,  True, False],
       [False,  True, False],
       [False, False,  True],
       ...,
       [False,  True, False],
       [False,  True, False],
       [False, False,  True]])

#### Model 1: Bag of Words

In [15]:
#text vextorization with multi hot encoding
max_tokens = 5000
text_vectorization = keras.layers.TextVectorization(
    max_tokens = max_tokens,
    output_mode= 'multi_hot'
)

In [16]:
#define the vocabolary to be indexed
text_vectorization.adapt(train_df['Lyric'])

In [20]:
#check out the least common 10 words
text_vectorization.get_vocabulary()[-10:]

[np.str_('stickin'),
 np.str_('rumble'),
 np.str_('rug'),
 np.str_('pam'),
 np.str_('os'),
 np.str_('ooohh'),
 np.str_('motto'),
 np.str_('marshall'),
 np.str_('loyalty'),
 np.str_('legacy')]

In [22]:
#create the input variables from the text_vectorization layer for each dataset
x_train = text_vectorization(train_df['Lyric'])
x_test = text_vectorization(test_df['Lyric'])
x_val = text_vectorization(val_df['Lyric'])


In [26]:
x_train[-30:]

<tf.Tensor: shape=(30, 5000), dtype=int64, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]])>

In [28]:
#build simple 1-hidden layer model
inputs = keras.Input(shape = (max_tokens, ))
x = keras.layers.Dense(8, activation = "relu")(inputs)
outputs = keras.layers.Dense(3, activation = "softmax")(x)

model = keras.Model(inputs, outputs)

model.summary()


In [29]:
#optimize model
model.compile(optimizer = 'adam',
             loss = 'categorical_crossentropy',
             metrics = ["accuracy"])

In [30]:
#fit model
model.fit(x= x_train, y = y_train,
         validation_data = (x_val, y_val),
         epochs = 10,
         batch_size = 32)

Epoch 1/10
[1m1531/1531[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 7ms/step - accuracy: 0.6962 - loss: 0.6988 - val_accuracy: 0.7513 - val_loss: 0.5734
Epoch 2/10
[1m1531/1531[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 7ms/step - accuracy: 0.7671 - loss: 0.5334 - val_accuracy: 0.7502 - val_loss: 0.5769
Epoch 3/10
[1m1531/1531[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 7ms/step - accuracy: 0.7830 - loss: 0.5013 - val_accuracy: 0.7475 - val_loss: 0.5866
Epoch 4/10
[1m1531/1531[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 6ms/step - accuracy: 0.7939 - loss: 0.4779 - val_accuracy: 0.7452 - val_loss: 0.5990
Epoch 5/10
[1m1531/1531[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 6ms/step - accuracy: 0.8041 - loss: 0.4569 - val_accuracy: 0.7421 - val_loss: 0.6162
Epoch 6/10
[1m1531/1531[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 6ms/step - accuracy: 0.8135 - loss: 0.4374 - val_accuracy: 0.7398 - val_loss: 0.6394
Epoch 7/10

<keras.src.callbacks.history.History at 0x1e92b44ff10>

In [33]:
#evaluate model
model.evaluate(x = x_test, y = y_test)

[1m681/681[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.7230 - loss: 0.7545


[0.766316831111908, 0.7204004526138306]

#### Model 2: Word Embedings

In [35]:
# glove embeding trained with 2014 english wikipedia will be used here.
#It includes 100-dimensional emmbeding vectors of 400 thousand words

import requests

url = "http://nlp.stanford.edu/data/glove.6B.zip"
response = requests.get(url)

with open("glove.6B.zip", "wb") as f:
    f.write(response.content)


In [36]:
import zipfile

with zipfile.ZipFile("glove.6B.zip", "r") as zip_ref:
    zip_ref.extractall()

In [38]:
#create a dictionary to hold pre-trained embeding vectors
embedding_dim = 100
path_to_glove_file = f"glove.6B.{embedding_dim}d.txt"

embeddings_index = {}
with open(path_to_glove_file, encoding="utf8") as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print(f"Found {len(embeddings_index)} word vectors.")

Found 400000 word vectors.


In [42]:
embeddings_index['power']

array([-4.0623e-02,  1.2911e-01,  9.2652e-01, -7.2253e-02,  4.3828e-01,
       -3.7762e-01, -2.7500e-01, -9.7944e-02, -1.7680e-01,  3.8279e-01,
        9.2663e-03,  9.0631e-03, -3.1502e-01,  6.2815e-02, -2.2111e-01,
       -9.9742e-01, -1.8360e-01,  3.9113e-01, -9.2952e-02, -1.2779e-01,
        4.8426e-01, -4.9320e-01,  5.0948e-01, -4.0813e-01,  6.3657e-01,
       -3.5722e-01, -2.9193e-01,  3.8334e-01,  5.3071e-01,  3.5986e-01,
        9.4441e-01,  5.1081e-01, -2.8931e-01, -1.8275e-01, -6.4469e-01,
        1.7839e-03,  2.9478e-01, -1.6024e-01, -3.6157e-01, -3.5547e-01,
       -1.7029e-01, -3.6866e-01,  2.1928e-01, -8.1945e-01, -9.6375e-02,
       -9.4109e-02,  3.1669e-01, -5.9285e-01,  5.9422e-01, -3.3568e-01,
       -5.5049e-01,  5.8094e-02, -2.0299e-02,  1.5526e+00,  1.0057e+00,
       -2.2807e+00,  9.0735e-02,  2.5548e-01,  1.9764e+00,  1.9240e-01,
        2.1717e-01, -5.1021e-01, -5.1359e-01,  5.1908e-01,  1.0555e+00,
        5.8991e-01,  3.1111e-01, -6.3756e-01,  8.6152e-02,  3.56

In [43]:
#incorporate GloVe embedings into the model to train it
max_length = 300
max_tokens = 5000
text_vectorization = keras.layers.TextVectorization(
    max_tokens = max_tokens,
    output_mode = 'int',
    output_sequence_length = max_length
)

In [45]:
# use text vectorization on our train data
text_vectorization.adapt(train_df['Lyric'])

In [46]:
#create input variables
x_train = text_vectorization(train_df['Lyric'])
x_test = text_vectorization(test_df['Lyric'])
x_val = text_vectorization(val_df['Lyric'])

In [47]:
x_train

<tf.Tensor: shape=(48991, 300), dtype=int64, numpy=
array([[  40,   83,    4, ...,   22,  729,    3],
       [  20,  649,   13, ...,    0,    0,    0],
       [  20, 2872,    3, ...,    0,    0,    0],
       ...,
       [ 153,   66,   62, ...,    0,    0,    0],
       [ 119,   51,   13, ...,    0,    0,    0],
       [ 358,    1,    1, ...,    0,    0,    0]])>

In [50]:
#create an embedding matrix to use in the NN model 
vocabulary = text_vectorization.get_vocabulary()
word_index = dict(zip(vocabulary, range(len(vocabulary))))

counter = 0
embedding_matrix = np.zeros((max_tokens, embedding_dim))

for word, i in word_index.items():
    if i < max_tokens:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
        else:
            counter += 1

In [51]:
embedding_matrix.shape

(5000, 100)

In [None]:
#create embedding layer


In [52]:
embedding_layer = keras.layers.Embedding(
    max_tokens,
    embedding_dim,
    embeddings_initializer = keras.initializers.Constant(embedding_matrix),
    trainable = False
)

In [62]:
#create a NN model that includes the embedding layer
inputs = keras.Input(shape = (max_length, ))
embedded = embedding_layer(inputs)
embedded = keras.layers.GlobalAveragePooling1D()(embedded)
x = keras.layers.Dense(8)(embedded)
x = keras.layers.Dropout(0.5)(x)
outputs = keras.layers.Dense(3, activation = "softmax")(x)

model = keras.Model(inputs, outputs)

model.summary()


In [63]:
#omptimize model
model.compile(optimizer = "adam",
             loss = "categorical_crossentropy",
             metrics = ["accuracy"])

In [64]:
#fit model
model.fit(x=x_train, y=y_train,
          validation_data=(x_val, y_val),
          epochs=10,
          batch_size=32,)

Epoch 1/10
[1m1531/1531[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 5ms/step - accuracy: 0.5431 - loss: 0.9695 - val_accuracy: 0.6371 - val_loss: 0.7952
Epoch 2/10
[1m1531/1531[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 5ms/step - accuracy: 0.6361 - loss: 0.8133 - val_accuracy: 0.6605 - val_loss: 0.7649
Epoch 3/10
[1m1531/1531[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 5ms/step - accuracy: 0.6450 - loss: 0.8002 - val_accuracy: 0.6630 - val_loss: 0.7621
Epoch 4/10
[1m1531/1531[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 6ms/step - accuracy: 0.6448 - loss: 0.7935 - val_accuracy: 0.6554 - val_loss: 0.7596
Epoch 5/10
[1m1531/1531[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 5ms/step - accuracy: 0.6496 - loss: 0.7858 - val_accuracy: 0.6552 - val_loss: 0.7592
Epoch 6/10
[1m1531/1531[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 5ms/step - accuracy: 0.6516 - loss: 0.7873 - val_accuracy: 0.6669 - val_loss: 0.7524
Epoch 7/10
[1

<keras.src.callbacks.history.History at 0x1ebf297d030>

In [65]:
model.evaluate(x = x_test, y = y_test)

[1m681/681[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.6764 - loss: 0.7489


[0.7584152221679688, 0.6693763136863708]

It performs worst than model 1 above because we set the trainable_parameters above to False. That means that it has very few trainable parameters. When we change it below, it performs much better. 

In [66]:
embedding_layer = keras.layers.Embedding(
    max_tokens,
    embedding_dim,
    embeddings_initializer = keras.initializers.Constant(embedding_matrix),
    trainable = True
)

inputs = keras.Input(shape = (max_length, ))
embedded = embedding_layer(inputs)
embedded = keras.layers.GlobalAveragePooling1D()(embedded)
x = keras.layers.Dense(8)(embedded)
x= keras.layers.Dropout(0.5)(x)
outputs = keras.layers.Dense(3,activation = "softmax")(x)

model = keras.Model(inputs, outputs)
model.summary()

In [67]:
#compile model
model.compile(optimizer = "adam",
             loss = "categorical_crossentropy",
             metrics = ['accuracy'])

In [68]:
#fit model
model.fit(x= x_train, y = y_train,
         validation_data = (x_val, y_val),
         epochs = 10,
         batch_size = 32)

Epoch 1/10
[1m1531/1531[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 10ms/step - accuracy: 0.6346 - loss: 0.8350 - val_accuracy: 0.7218 - val_loss: 0.6521
Epoch 2/10
[1m1531/1531[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 10ms/step - accuracy: 0.7144 - loss: 0.6828 - val_accuracy: 0.7299 - val_loss: 0.6286
Epoch 3/10
[1m1531/1531[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 9ms/step - accuracy: 0.7274 - loss: 0.6483 - val_accuracy: 0.7333 - val_loss: 0.6195
Epoch 4/10
[1m1531/1531[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 9ms/step - accuracy: 0.7354 - loss: 0.6275 - val_accuracy: 0.7310 - val_loss: 0.6190
Epoch 5/10
[1m1531/1531[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 9ms/step - accuracy: 0.7404 - loss: 0.6166 - val_accuracy: 0.7312 - val_loss: 0.6206
Epoch 6/10
[1m1531/1531[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 9ms/step - accuracy: 0.7444 - loss: 0.6095 - val_accuracy: 0.7322 - val_loss: 0.6256
Epoch 7/

<keras.src.callbacks.history.History at 0x1ebf2d01ab0>

In [69]:
model.evaluate(x = x_test, y = y_test)

[1m681/681[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.7302 - loss: 0.6407


[0.6435191035270691, 0.731330931186676]

In [70]:
#lets see how it works with actuall lyrics 
def lyric_predict(phrase):
    raw_text_data = tf.convert_to_tensor([[phrase],])

    vect_data = text_vectorization(raw_text_data)
    predictions = model.predict(vect_data) 
    predictions
    print(f"{float(predictions[0,0] * 100):.2f} % Hip-Hop")
    print(f"{float(predictions[0,1] * 100):.2f} % Pop")
    print(f"{float(predictions[0,2] * 100):.2f} % Rock")

In [71]:
phrase = '''I grew up on the crime side, the New York Times side
Stayin' alive was no jive
Had secondhands, Mom's bounced on old man
So then we moved to Shaolin land
A young youth, yo, rockin' the gold tooth, 'Lo goose
Only way I begin the G off was drug loot
And let's start it like this, son
Rollin' with this one and that one, pullin' out gats for fun
But it was just a dream for the teen
Who was a fiend, started smokin' woolies at 16
And runnin' up in gates and doin' hits for high stakes
Makin' my way on fire escapes
No question, I would speed for cracks and weed
The combination made my eyes bleed
No question, I would flow off and try to get the dough all
Stickin' up white boys in ball courts
My life got no better, same damn 'Lo sweater
Times is rough and tough like leather
Figured out I went the wrong route
So I got with a sick-ass clique and went all out
Catchin' keys from 'cross seas
Rollin' in MPV's, every week we made forty G's
Yo, ****, respect mine, or here go the TEC-9
Ch-chick-pow! Move from the gate now'''

lyric_predict(phrase)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 160ms/step
43.99 % Hip-Hop
16.03 % Pop
39.98 % Rock


In [75]:
phrase = '''It still feels like our first night together
Feels like the first kiss
It's getting better baby
No one can better this
Still holding on
You're still the one
First time our eyes met
Same feeling I get
Only feels much stronger
I want to love you longer
Do you still turn the fire on?
So if you're feeling lonely, don't
You're the only one I'll ever want
I only want to make it good
So if I love you, a little more than I should
Please forgive me, I know not what I do
Please forgive me, I can't stop loving you
Don't deny me, this pain I'm going through
Please forgive me, if I need you like I do
Please believe me, every word I say is true
Please forgive me, I can't stop loving you
Still feels like our best times are together
Feels like the first touch
We're still getting closer baby
Can't get closer enough
Still holding on
You're still number one
I remember the smell of your skin
I remember everything
I remember all your moves
I remember you yeah
I remember the nights, you know I still do
So if you're feeling lonely, don't
You're the only one I'll ever want
I only want to make it good
So if I love you a little more than I should
Please forgive me, I know not what I do
Please forgive me, I can't stop loving you
Don't deny me, this pain I'm going through
Please forgive me, if I need you like I do
Please believe me, every word I say is true
Please forgive me, I can't stop loving you
The one thing I'm sure of
Is the way we make love
The one thing I depend on
Is for us to stay strong
With every word and every breath I'm praying
That's why I'm saying,
Please forgive me, I know not what I do
Please forgive me, I can't stop loving you
Don't deny me, this pain I'm going through
Please forgive me, if I need you like I do
Babe believe it, every word I say is true
Please forgive me, if I can't stop loving you
No, believe me, I don't know what I do
Please forgive me, I can't stop loving you
I can't stop, loving you'''

lyric_predict(phrase)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step
4.01 % Hip-Hop
56.56 % Pop
39.43 % Rock
