In [1]:
import ast

import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

In [2]:
# Load the Word2Vec vectors from the text-format file and materialise a plain
# dict (word -> vector) that later cells use for membership tests and lookups.
w2v_model = KeyedVectors.load_word2vec_format("w2v_embeddings.txt", binary=False)
# gensim >= 4.0 removed `KeyedVectors.vocab` in favour of `key_to_index`;
# fall back to `.vocab` so the cell still runs on gensim 3.x.
w2v_vocabulary = getattr(w2v_model, "key_to_index", None)
if w2v_vocabulary is None:
    w2v_vocabulary = w2v_model.vocab
w2v_word_to_embeddings = {word: w2v_model[word] for word in w2v_vocabulary}

In [3]:
# Load the GloVe vectors and build a plain dict (word -> vector) for later
# lookups. The file is read with the word2vec text loader, so it is presumably
# already converted to that format -- TODO confirm.
glove_model = KeyedVectors.load_word2vec_format("glove_embeddings.txt", binary=False)
# gensim >= 4.0 removed `KeyedVectors.vocab` in favour of `key_to_index`;
# fall back to `.vocab` so the cell still runs on gensim 3.x.
glove_vocabulary = getattr(glove_model, "key_to_index", None)
if glove_vocabulary is None:
    glove_vocabulary = glove_model.vocab
glove_word_to_embeddings = {word: glove_model[word] for word in glove_vocabulary}

In [4]:
# Read the processed lyric table; the relative path is resolved against the
# kernel's working directory.
dataset = pd.read_csv(filepath_or_buffer="Processed_Lyric_Dataset.csv")

In [5]:
# Discard the CSV's index column and its own "LyricsList" column; the latter is
# rebuilt below from "LyricProcessed".
dataset = dataset.drop(columns=["Unnamed: 0", "LyricsList"])
# "LyricProcessed" holds the textual form of a nested list (lines -> words).
# Parse it with ast.literal_eval rather than eval: literal_eval only accepts
# Python literals, so a crafted CSV cell cannot execute arbitrary code.
dataset["LyricsList"] = dataset["LyricProcessed"].map(ast.literal_eval)
# Keep only the columns used downstream (song name, genre, parsed lyrics).
dataset = dataset.drop(columns=["Artist", "Lyric", "LyricProcessed"])
dataset.head(5)

Unnamed: 0,SName,Genre,LyricsList
0,Careless Whisper,pop,"[[feel, unsure], [take, hand, lead, dance, flo..."
1,Could You Be Loved / Citação Musical do Rap: S...,pop,"[[let, fool, ya], [even, try, school, ya, oh],..."
2,Cruisin' (Part. Saulo),pop,"[[baby, let, cruise, away], [confuse, way, cle..."
3,Easy,pop,"[[know, sound, funny], [cant, stand, pain], [g..."
4,For Your Babies (The Voice cover),pop,"[[get, look], [one, hop, lad], [face, beam], [..."


In [6]:
# Number of songs per genre label, most frequent first.
dataset.loc[:, "Genre"].value_counts()

rock           29916
metal          19420
pop            12812
hip-hop        10333
indie           6020
electronic      5866
rap             4535
rnb             4271
soul            3739
punk            3440
country         3135
jazz            2969
folk            2724
alternative     1994
blues           1838
reggae          1697
Name: Genre, dtype: int64

In [7]:
# Build one fixed-length feature vector per song by averaging the Word2Vec
# embeddings of its words; the genre is collected as the matching label.
w2v_training_data = []
w2v_training_labels = []

# Take the vector width from the loaded model instead of hard-coding 200, so
# the cell keeps working if the embeddings file is regenerated at another size.
w2v_dim = w2v_model.vector_size

# Iterate only the two columns that are needed; this avoids the per-row Series
# that DataFrame.iterrows() constructs.
for label, lyrics in zip(dataset["Genre"], dataset["LyricsList"]):
    feature_vector = np.zeros((w2v_dim,))
    song_length = 0
    for line in lyrics:
        song_length += len(line)
        for word in line:
            embedding = w2v_word_to_embeddings.get(word)
            if embedding is not None:
                feature_vector += embedding
    # NOTE: song_length counts every token, including words that have no
    # embedding, so out-of-vocabulary words pull the average towards zero.
    # Songs with no tokens keep the all-zero vector (avoids division by zero).
    if song_length:
        feature_vector = feature_vector / song_length
    w2v_training_data.append(feature_vector)
    w2v_training_labels.append(label)

In [8]:
# Build one fixed-length feature vector per song by averaging the GloVe
# embeddings of its words; the genre is collected as the matching label.
glove_training_data = []
glove_training_labels = []

# Take the vector width from the loaded model instead of hard-coding 100, so
# the cell keeps working if the embeddings file is regenerated at another size.
glove_dim = glove_model.vector_size

# Iterate only the two columns that are needed; this avoids the per-row Series
# that DataFrame.iterrows() constructs.
for label, lyrics in zip(dataset["Genre"], dataset["LyricsList"]):
    feature_vector = np.zeros((glove_dim,))
    song_length = 0
    for line in lyrics:
        song_length += len(line)
        for word in line:
            embedding = glove_word_to_embeddings.get(word)
            if embedding is not None:
                feature_vector += embedding
    # NOTE: song_length counts every token, including words that have no
    # embedding, so out-of-vocabulary words pull the average towards zero.
    # Songs with no tokens keep the all-zero vector (avoids division by zero).
    if song_length:
        feature_vector = feature_vector / song_length
    glove_training_data.append(feature_vector)
    glove_training_labels.append(label)

In [9]:
# Hold out 20% of the Word2Vec feature vectors for evaluation; the fixed seed
# makes the split reproducible.
(
    word2vec_X_train,
    word2vec_X_test,
    word2vec_y_train,
    word2vec_y_test,
) = train_test_split(
    w2v_training_data,
    w2v_training_labels,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Hold out 20% of the GloVe feature vectors for evaluation; the fixed seed
# makes the split reproducible.
(
    glove_X_train,
    glove_X_test,
    glove_y_train,
    glove_y_test,
) = train_test_split(
    glove_training_data,
    glove_training_labels,
    test_size=0.2,
    random_state=42,
)

In [11]:
# Sanity check: show the first ten components and the label of the first
# training example for each embedding type.
for sample_features, sample_labels in (
    (word2vec_X_train, word2vec_y_train),
    (glove_X_train, glove_y_train),
):
    print(sample_features[0][:10], sample_labels[0])

[-0.11428045 -0.2247762  -0.16993003  0.10288301 -0.00463599  0.16851274
 -0.31411238  0.31439397  0.17510919  0.11866679] pop
[-0.02343596  0.11188111  0.42710342  0.15697903 -0.00969928  0.1432713
 -0.02187555  0.03734092  0.20610544  0.09226854] pop


In [12]:
# Fit Gaussian Naive Bayes on the Word2Vec training split. fit() returns the
# estimator itself, so construction and fitting are chained; the bare name on
# the last line keeps the estimator repr as the cell output.
word2vec_gaussian_nb = GaussianNB().fit(word2vec_X_train, word2vec_y_train)
word2vec_gaussian_nb

GaussianNB()

In [13]:
# Score the fitted Gaussian NB model on the held-out Word2Vec split.
word2vec_gaussian_y_pred = word2vec_gaussian_nb.predict(word2vec_X_test)
word2vec_gaussian_accuracy = accuracy_score(
    y_true=word2vec_y_test,
    y_pred=word2vec_gaussian_y_pred,
)
print(
    "Accuracy of the Gaussian Naive Bayes Classifier with Word2Vec embeddings is:",
    word2vec_gaussian_accuracy,
)

Accuracy of the Gaussian Naive Bayes Classifier with Word2Vec embeddings is: 0.26440589312178536


In [14]:
# Multiclass logistic regression on the averaged Word2Vec features.
# `multi_class` is deprecated since scikit-learn 1.5 and scheduled for removal;
# with the default lbfgs solver a multiclass target is already fitted with the
# multinomial (softmax) loss, so the argument is omitted without changing the
# model. max_iter is raised from the default to give the solver room to converge.
word2vec_logistic_classifier = LogisticRegression(max_iter=1000)

word2vec_logistic_classifier.fit(word2vec_X_train, word2vec_y_train)

LogisticRegression(max_iter=1000, multi_class='multinomial')

In [15]:
# Score the fitted logistic regression model on the held-out Word2Vec split.
word2vec_logistic_y_pred = word2vec_logistic_classifier.predict(word2vec_X_test)
word2vec_logistic_accuracy = accuracy_score(
    y_true=word2vec_y_test,
    y_pred=word2vec_logistic_y_pred,
)
print(
    "Accuracy of the Multiclass Logistic Regression with Word2Vec embeddings is:",
    word2vec_logistic_accuracy,
)

Accuracy of the Multiclass Logistic Regression with Word2Vec embeddings is: 0.4301281492459245


In [16]:
# Fit Gaussian Naive Bayes on the GloVe training split. fit() returns the
# estimator itself, so construction and fitting are chained; the bare name on
# the last line keeps the estimator repr as the cell output.
glove_gaussian_nb = GaussianNB().fit(glove_X_train, glove_y_train)
glove_gaussian_nb

GaussianNB()

In [17]:
# Score the fitted Gaussian NB model on the held-out GloVe split.
glove_y_pred = glove_gaussian_nb.predict(glove_X_test)
glove_gaussian_accuracy = accuracy_score(
    y_true=glove_y_test,
    y_pred=glove_y_pred,
)
print(
    "Accuracy of the Gaussian Naive Bayes Classifier with GLoVe embeddings is:",
    glove_gaussian_accuracy,
)

Accuracy of the Gaussian Naive Bayes Classifier with GLoVe embeddings is: 0.1391334670037486


In [18]:
# Multiclass logistic regression on the averaged GloVe features.
# `multi_class` is deprecated since scikit-learn 1.5 and scheduled for removal;
# with the default lbfgs solver a multiclass target is already fitted with the
# multinomial (softmax) loss, so the argument is omitted without changing the
# model. max_iter is raised from the default to give the solver room to converge.
glove_logistic_classifier = LogisticRegression(max_iter=1000)

glove_logistic_classifier.fit(glove_X_train, glove_y_train)

LogisticRegression(max_iter=1000, multi_class='multinomial')

In [19]:
# Score the fitted logistic regression model on the held-out GloVe split.
glove_logistic_y_pred = glove_logistic_classifier.predict(glove_X_test)
glove_logistic_accuracy = accuracy_score(
    y_true=glove_y_test,
    y_pred=glove_logistic_y_pred,
)
print(
    "Accuracy of the Multiclass Logistic Regression with GLoVe embeddings is:",
    glove_logistic_accuracy,
)

Accuracy of the Multiclass Logistic Regression with GLoVe embeddings is: 0.32704210618080376
