# SVM implementation for Music Genre Classification using song lyrics

### Common code to import libraries and read dataset

In [8]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import numpy as np
from gensim.models import KeyedVectors
import ast

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score,precision_score, recall_score
from sklearn.model_selection import GridSearchCV

In [9]:
# Load the pre-processed lyrics dataset; the relative path is resolved against
# the notebook's working directory.
df = pd.read_csv('Processed_Lyric_Dataset (1).csv')

In [10]:
# Preview the first 10 rows to check which columns were loaded
df.head(10)

Unnamed: 0.1,Unnamed: 0,Artist,SName,Lyric,Genre,LyricProcessed,LyricsList
0,0,Ivete Sangalo,Careless Whisper,I feel so unsure\nAs I take your hand and lead...,pop,"[['feel', 'unsure'], ['take', 'hand', 'lead', ...","[['feel', 'unsure'], ['take', 'hand', 'lead', ..."
1,1,Ivete Sangalo,Could You Be Loved / Citação Musical do Rap: S...,"Don't let them fool, ya\nOr even try to school...",pop,"[['let', 'fool', 'ya'], ['even', 'try', 'schoo...","[['let', 'fool', 'ya'], ['even', 'try', 'schoo..."
2,2,Ivete Sangalo,Cruisin' (Part. Saulo),"Baby, let's cruise, away from here\nDon't be c...",pop,"[['baby', 'let', 'cruise', 'away'], ['confuse'...","[['baby', 'let', 'cruise', 'away'], ['confuse'..."
3,3,Ivete Sangalo,Easy,"Know it sounds funny\nBut, I just can't stand ...",pop,"[['know', 'sound', 'funny'], ['cant', 'stand',...","[['know', 'sound', 'funny'], ['cant', 'stand',..."
4,4,Ivete Sangalo,For Your Babies (The Voice cover),You've got that look again\nThe one I hoped I ...,pop,"[['get', 'look'], ['one', 'hop', 'lad'], ['fac...","[['get', 'look'], ['one', 'hop', 'lad'], ['fac..."
5,5,Ivete Sangalo,Human Nature,Looking out\nAcross the night time\nThe city w...,pop,"[['look'], ['across', 'night', 'time'], ['city...","[['look'], ['across', 'night', 'time'], ['city..."
6,6,Ivete Sangalo,Losing Control (Miss Cady feat. Ivete Sangalo),"Uh, yeah.\nGo, go, go.\nUh, yeah.\nUh, Uh, Uhh...",pop,"[['uh', 'yeah..', 'go', 'go', 'go..', 'uh', 'y...","[['uh', 'yeah..', 'go', 'go', 'go..', 'uh', 'y..."
7,7,Ivete Sangalo,Master Blaster (Jammin'),Everyone's feeling pretty\nIt's hotter than Ju...,pop,"[['everyones', 'feel', 'pretty'], ['hotter', '...","[['everyones', 'feel', 'pretty'], ['hotter', '..."
8,8,Ivete Sangalo,More Than Words,Saying 'I Love you'\nIs not the words I want t...,pop,"[['say', 'love'], ['word', 'want', 'hear'], ['...","[['say', 'love'], ['word', 'want', 'hear'], ['..."
9,10,Ivete Sangalo,Where It Begins (feat. Nelly Furtado),"When you're alone and you don't know how,\nTo ...",pop,"[['alone', 'know'], ['fill', 'hole', 'inside',...","[['alone', 'know'], ['fill', 'hole', 'inside',..."


In [11]:
# Performing label encoding on the Genre column
# LabelEncoder assigns an integer id to each distinct Genre string; the ids are
# stored next to the original column as Genre_Encoded.
# NOTE(review): no later cell visible in this notebook reads Genre_Encoded; the
# SVM cells take their labels from the raw Genre column - confirm this is intended.
label_encoder = LabelEncoder()
df['Genre_Encoded'] = label_encoder.fit_transform(df['Genre'])
df.head(5)

Unnamed: 0.1,Unnamed: 0,Artist,SName,Lyric,Genre,LyricProcessed,LyricsList,Genre_Encoded
0,0,Ivete Sangalo,Careless Whisper,I feel so unsure\nAs I take your hand and lead...,pop,"[['feel', 'unsure'], ['take', 'hand', 'lead', ...","[['feel', 'unsure'], ['take', 'hand', 'lead', ...",9
1,1,Ivete Sangalo,Could You Be Loved / Citação Musical do Rap: S...,"Don't let them fool, ya\nOr even try to school...",pop,"[['let', 'fool', 'ya'], ['even', 'try', 'schoo...","[['let', 'fool', 'ya'], ['even', 'try', 'schoo...",9
2,2,Ivete Sangalo,Cruisin' (Part. Saulo),"Baby, let's cruise, away from here\nDon't be c...",pop,"[['baby', 'let', 'cruise', 'away'], ['confuse'...","[['baby', 'let', 'cruise', 'away'], ['confuse'...",9
3,3,Ivete Sangalo,Easy,"Know it sounds funny\nBut, I just can't stand ...",pop,"[['know', 'sound', 'funny'], ['cant', 'stand',...","[['know', 'sound', 'funny'], ['cant', 'stand',...",9
4,4,Ivete Sangalo,For Your Babies (The Voice cover),You've got that look again\nThe one I hoped I ...,pop,"[['get', 'look'], ['one', 'hop', 'lad'], ['fac...","[['get', 'look'], ['one', 'hop', 'lad'], ['fac...",9


In [12]:
# LyricsList arrives from the CSV as the string repr of a nested list
# (lines -> words); parse it back into real Python lists.
# Values that are already lists are passed through unchanged so that re-running
# this cell does not raise (ast.literal_eval rejects a list argument).
# Anything else still goes through literal_eval and fails loudly, as before.
df['LyricsList'] = df['LyricsList'].apply(
    lambda x: x if isinstance(x, list) else ast.literal_eval(x)
)

### Using Word2Vec embeddings

In [None]:
# load word embedding files
def read_embeddings(training_file):
    '''Load word vectors from a text-format word2vec file.

    Parameters:
        training_file: path to an embeddings file in non-binary word2vec format.

    Returns:
        dict mapping every vocabulary word to its embedding vector,
        e.g. {'the': [0....], ...}
    '''
    keyed_vectors = KeyedVectors.load_word2vec_format(training_file, binary=False)

    # Walk the vocabulary once and look up the vector stored for each word.
    lookup = {}
    for token in keyed_vectors.key_to_index:
        lookup[token] = keyed_vectors[token]

    return lookup

In [None]:
# Load the Word2Vec vectors as a word -> vector dict; the averaging cell below
# looks words up in this table.
word_to_embeddings = read_embeddings('w2v_embeddings (1).txt')

## SVM model without any tuning: song vectors built by averaging word embeddings

In [None]:
# Convert each song's lyrics into one fixed-size feature vector: the sum of the
# embeddings of its in-vocabulary words divided by the song's total word count.
data = []
labels = []

# Size the feature vector from the loaded embeddings instead of hard-coding it;
# 200 is kept as the fallback for an empty embedding table.
first_embedding = next(iter(word_to_embeddings.values()), None)
embedding_dim = 200 if first_embedding is None else len(first_embedding)

# Iterate over just the two needed columns; df.iterrows() builds a Series per
# row and is considerably slower.
for label, lyrics in zip(df["Genre"], df["LyricsList"]):
  song_length = sum(len(line) for line in lyrics)
  if song_length == 0:
    # No tokens at all: dividing by zero would give an all-NaN vector (and a
    # NumPy RuntimeWarning), so leave the song out of both lists.
    continue

  feature_vector = np.zeros((embedding_dim,))
  for line in lyrics:
    for word in line:
      embedding = word_to_embeddings.get(word)
      # Only out-of-vocabulary words are skipped; any other problem (for
      # example an embedding whose shape does not match) is allowed to surface.
      if embedding is not None:
        feature_vector += embedding

  # Out-of-vocabulary words still count towards song_length.
  data.append(feature_vector / song_length)
  labels.append(label)

  data.append(feature_vector/song_length)


In [None]:
# check for nan values in data
# if present, remove that row in data and labels
# (NaN can arise when a vector sum is divided by a zero word count.)
index_to_remove = {i for i, vector in enumerate(data) if np.isnan(vector).any()}

# Rebuild both lists in a single pass so they stay aligned, instead of popping
# one index at a time from each list.
data = [vector for i, vector in enumerate(data) if i not in index_to_remove]
labels = [label for i, label in enumerate(labels) if i not in index_to_remove]

In [None]:
# Hold out 20% of the samples for testing; the fixed random_state keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    labels,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Baseline classifier: linear-kernel SVM with C=1.0, fitted on the training split
svm_model = SVC(kernel='linear', C=1.0)
svm_model.fit(X_train, y_train)

In [None]:
# Predict genres for the held-out split and report plain accuracy.
# y_pred is reused by the precision/recall cell that follows.
y_pred = svm_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)

Accuracy: 0.4182721645889635


In [None]:
# Precision and recall computed with average='weighted' on the same predictions
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')

print("Precision:", precision)
print("Recall:", recall)

### Tune the SVM model using the parameter C

In [None]:

# Candidate values for the regularisation parameter C, wrapped in the
# {parameter name: values} mapping that GridSearchCV expects.
C_range = [0.1, 1, 10]
param_grid = dict(C=C_range)

# Linear-kernel SVM whose C is filled in by the search
svm = SVC(kernel='linear')

# Try every C with 2-fold cross-validation on the training split;
# verbose=3 logs each fold's score and fit time.
grid = GridSearchCV(estimator=svm, param_grid=param_grid, cv=2, verbose=3)
grid.fit(X_train, y_train)

# Report the winning C; the printed "Accuracy" is its mean cross-validated score
print("Best C:", grid.best_params_['C'])
print("Accuracy:", grid.best_score_)


Fitting 2 folds for each of 3 candidates, totalling 6 fits
[CV 1/2] END .............................C=0.1;, score=0.412 total time=10.1min
[CV 2/2] END .............................C=0.1;, score=0.410 total time= 9.8min
[CV 1/2] END ...............................C=1;, score=0.421 total time= 9.6min
[CV 2/2] END ...............................C=1;, score=0.419 total time= 9.7min
[CV 1/2] END ..............................C=10;, score=0.422 total time=11.7min
[CV 2/2] END ..............................C=10;, score=0.421 total time=11.7min
Best C: 10
Accuracy: 0.42146328705620817


### Read GloVe embeddings

In [None]:
# Rebind word_to_embeddings to the GloVe vectors; this replaces the Word2Vec
# table loaded earlier, so the cells below average GloVe embeddings instead.
# NOTE(review): read_embeddings loads with load_word2vec_format(binary=False),
# so this file presumably already carries a word2vec-style header - confirm.
word_to_embeddings = read_embeddings('GloVe_Embeddings (1).txt')

## Using GloVe embeddings

In [None]:
# Convert each song's lyrics into one fixed-size feature vector: the sum of the
# embeddings of its in-vocabulary words divided by the song's total word count.
data = []
labels = []

# Size the feature vector from the loaded embeddings instead of hard-coding it;
# 200 is kept as the fallback for an empty embedding table.
first_embedding = next(iter(word_to_embeddings.values()), None)
embedding_dim = 200 if first_embedding is None else len(first_embedding)

# Iterate over just the two needed columns; df.iterrows() builds a Series per
# row and is considerably slower.
for label, lyrics in zip(df["Genre"], df["LyricsList"]):
  song_length = sum(len(line) for line in lyrics)
  if song_length == 0:
    # No tokens at all: dividing by zero would give an all-NaN vector (and a
    # NumPy RuntimeWarning), so leave the song out of both lists.
    continue

  feature_vector = np.zeros((embedding_dim,))
  for line in lyrics:
    for word in line:
      embedding = word_to_embeddings.get(word)
      # Only out-of-vocabulary words are skipped; any other problem (for
      # example an embedding whose shape does not match) is allowed to surface
      # rather than silently leaving the vector at zero.
      if embedding is not None:
        feature_vector += embedding

  # Out-of-vocabulary words still count towards song_length.
  data.append(feature_vector / song_length)
  labels.append(label)

  data.append(feature_vector/song_length)


In [None]:
# check for nan values in data
# if present, remove that row in data and labels
# (NaN can arise when a vector sum is divided by a zero word count.)
index_to_remove = {i for i, vector in enumerate(data) if np.isnan(vector).any()}

# Rebuild both lists in a single pass so they stay aligned, instead of popping
# one index at a time from each list.
data = [vector for i, vector in enumerate(data) if i not in index_to_remove]
labels = [label for i, label in enumerate(labels) if i not in index_to_remove]

In [None]:
# split data into train and test (20% held out; fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=42)

# Same baseline as the Word2Vec run: linear-kernel SVM with C=1.0
svm_model = SVC(kernel='linear', C=1.0)
svm_model.fit(X_train, y_train)

y_pred = svm_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)

# Some genres may never be predicted, which makes their per-class precision
# undefined. zero_division=0 scores those classes as 0 - the same value the
# default 'warn' setting uses - without emitting the undefined-metric warning.
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)

print("Precision:", precision)
print("Recall:", recall)

Accuracy: 0.25664719728009766
Precision: 0.06586778387172937
Recall: 0.25664719728009766


  _warn_prf(average, modifier, msg_start, len(result))


### Reading improved GloVe embeddings

In [13]:
# load word embedding files
def read_embeddings(training_file):
    '''Read a text-format word2vec embeddings file into a plain dict.

    Parameters:
        training_file: path to an embeddings file in non-binary word2vec format.

    Returns:
        dict of word -> embedding vector, one entry per vocabulary word.
    '''
    vectors = KeyedVectors.load_word2vec_format(training_file, binary=False)

    # Pair each vocabulary word with the vector stored for it.
    vocabulary = list(vectors.key_to_index)
    return dict((token, vectors[token]) for token in vocabulary)

In [17]:
# The original cell converted this file with glove2word2vec, i.e. it is a raw
# GloVe text file without the "<vocab size> <dim>" header line. gensim 4 can
# read such a file directly with no_header=True, so the deprecated
# glove2word2vec conversion and its temporary file are not needed.
# KeyedVectors is already imported in the notebook's import cell.
glove_file = "glove.6B.200d.txt"
glove_model = KeyedVectors.load_word2vec_format(glove_file, binary=False, no_header=True)

# Map each vocabulary word to its embedding vector
glove_word_to_embeddings = {word: glove_model[word] for word in glove_model.key_to_index}

  _ = glove2word2vec(glove_file, tmp_file)


In [18]:
# Convert each song's lyrics into one fixed-size feature vector: the sum of the
# embeddings of its in-vocabulary words divided by the song's total word count.
data = []
labels = []

# Size the feature vector from the loaded embeddings instead of hard-coding it;
# 200 is kept as the fallback for an empty embedding table.
first_embedding = next(iter(glove_word_to_embeddings.values()), None)
embedding_dim = 200 if first_embedding is None else len(first_embedding)

# Iterate over just the two needed columns; df.iterrows() builds a Series per
# row and is considerably slower.
for label, lyrics in zip(df["Genre"], df["LyricsList"]):
  song_length = sum(len(line) for line in lyrics)
  if song_length == 0:
    # No tokens at all: dividing by zero would give an all-NaN vector (and a
    # NumPy RuntimeWarning), so leave the song out of both lists.
    continue

  feature_vector = np.zeros((embedding_dim,))
  for line in lyrics:
    for word in line:
      embedding = glove_word_to_embeddings.get(word)
      # Only out-of-vocabulary words are skipped; any other problem (for
      # example an embedding whose shape does not match) is allowed to surface.
      if embedding is not None:
        feature_vector += embedding

  # Out-of-vocabulary words still count towards song_length.
  data.append(feature_vector / song_length)
  labels.append(label)

  data.append(feature_vector/song_length)


In [19]:
# check for nan values in data
# if present, remove that row in data and labels
# (NaN can arise when a vector sum is divided by a zero word count.)
index_to_remove = {i for i, vector in enumerate(data) if np.isnan(vector).any()}

# Rebuild both lists in a single pass so they stay aligned, instead of popping
# one index at a time from each list.
data = [vector for i, vector in enumerate(data) if i not in index_to_remove]
labels = [label for i, label in enumerate(labels) if i not in index_to_remove]

In [21]:
# split data into train and test (20% held out; fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=42)

# Same baseline as the earlier runs: linear-kernel SVM with C=1.0
svm_model = SVC(kernel='linear', C=1.0)
svm_model.fit(X_train, y_train)

y_pred = svm_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)

# Some genres may never be predicted, which makes their per-class precision
# undefined. zero_division=0 scores those classes as 0 - the same value the
# default 'warn' setting uses - without emitting the undefined-metric warning.
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)

print("Precision:", precision)
print("Recall:", recall)

Accuracy: 0.3987010722692006
Precision: 0.3005364445028348
Recall: 0.3987010722692006


  _warn_prf(average, modifier, msg_start, len(result))
