# Predicting the Genre (Country vs. all other genres) of a Song by Its Lyrics

<a id = "2"></a>
# 2 - Importing

<a id = "3"></a>
##  Importing the Libraries

In [1]:
# these will be the libraries we will use
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import nltk
import os
import re

<a id = "4"></a>
##  Importing the Data

In [2]:
pip install openpyxl

Note: you may need to restart the kernel to use updated packages.


In [3]:
# Read the lyrics dataset from a path relative to the notebook's working
# directory, then display the resulting frame.
data = pd.read_csv(filepath_or_buffer='../data/dataset.csv')
data

Unnamed: 0,genre,lyrics,SongInfo
0,Christian,"Who am I, that the Lord of all the earth Woul...",CASTING CROWNS - WHO AM I LYRICS
1,Christian,Glory Revealed By His Wounds He was pierced ...,GLORY REVEALED - BY HIS WOUNDS LYRICS
2,Christian,Lord of heaven and earth Lord of all creation...,CAEDMON'S CALL - GOD OF WONDERS LYRICS
3,Christian,I can only imagine what it will be like When ...,MERCYME - I CAN ONLY IMAGINE LYRICS
4,Christian,I am not skilled to understand What God has w...,AARON SHUST - MY SAVIOR MY GOD LYRICS
...,...,...,...
553,R&B,"Ha I dont care ha, about your past I just wan...",JAMES BROWN - COLD SWEAT LYRICS
554,R&B,Hoverin by my suitcase Tryin to find a warm ...,BROOK BENTON - RAINY NIGHT IN GEORGIA LYRICS
555,R&B,I dont know why I love you like I do After a...,AL GREEN - TAKE ME TO THE RIVER LYRICS
556,R&B,"C. C. Rider Elvis Presley Well now see., C. ...",CHUCK WILLIS - C.C. RIDER LYRICS


In [4]:
# Distinct genre labels present in the dataset.
data.genre.unique()

array(['Christian', 'Country', 'Hip-Hop', 'Pop', 'Rock', 'R&B'],
      dtype=object)

<a id = "3"></a>
# 3 -  PART I : Data Preprocessing

<a id = "6"></a>
## Removing Numbers and Punctuation, and Lowercasing the Words

In [5]:
def rid_of_specials(words):
    """Lowercase text and replace every non-letter character with a space.

    Each character outside ``A-Z``/``a-z`` (digits, punctuation, whitespace,
    non-ASCII characters) becomes exactly one space, so runs of such
    characters are not collapsed: ``"I, that"`` becomes ``"i  that"``.

    Parameters
    ----------
    words : str, sequence of str, None or NaN
        The raw text. A non-string sequence is cleaned element by element
        (runs of non-letters inside one element collapse to a single space)
        and the pieces are concatenated. Missing values (``None`` or a float
        NaN) are treated as empty text.

    Returns
    -------
    str
        The cleaned, lowercased text.
    """
    # Missing lyrics would otherwise crash; NaN is the only float that is
    # not equal to itself.
    if words is None or (isinstance(words, float) and words != words):
        return ''
    if isinstance(words, str):
        # One pass over the whole string instead of one regex call (and one
        # string concatenation) per character; the result is the same.
        return re.sub('[^A-Za-z]', ' ', words).lower()
    # Non-string sequences: clean each element, then concatenate.
    return ''.join(re.sub('[^A-Za-z]+', ' ', piece).lower() for piece in words)

<a id = "7"></a>
### a) Removing special characters

In [6]:
# Overwrite the raw lyrics with their cleaned (letters-only, lowercase) form
# and preview the first ten songs.
data["lyrics"] = data.lyrics.apply(rid_of_specials)
data.lyrics.head(n=10)

0     who am i  that the lord of all the earth woul...
1     glory revealed  by his wounds he was pierced ...
2     lord of heaven and earth lord of all creation...
3     i can only imagine what it will be like when ...
4     i am not skilled to understand what god has w...
5     this is the air i breathe this is the air i b...
6     we do not have the lyrics for how great is ou...
7     everyone needs compassion a love thats never ...
8     blessed be your name in the land that is plen...
9     we do not have the lyrics for holy is the lor...
Name: lyrics, dtype: object

In [7]:
# Show the genre label of every song.
data["genre"]

0      Christian
1      Christian
2      Christian
3      Christian
4      Christian
         ...    
553          R&B
554          R&B
555          R&B
556          R&B
557          R&B
Name: genre, Length: 558, dtype: object

<a id = "9"></a>
## Removing Stopwords

In [8]:
# Cleaned lyrics as they stand before stop-word removal.
data.lyrics

0       who am i  that the lord of all the earth woul...
1       glory revealed  by his wounds he was pierced ...
2       lord of heaven and earth lord of all creation...
3       i can only imagine what it will be like when ...
4       i am not skilled to understand what god has w...
                             ...                        
553     ha i dont care ha  about your past i just wan...
554     hoverin by my suitcase  tryin to find a warm ...
555     i dont know why i love you like i do  after a...
556     c  c  rider elvis presley  well now see   c  ...
557     cynthia get up and dance to the music   get o...
Name: lyrics, Length: 558, dtype: object

In [9]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# The stop-word corpus is not shipped with the nltk package itself. Fetch it
# once if it is missing so this cell also runs on a fresh environment.
try:
    sw_nltk = stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
    sw_nltk = stopwords.words('english')

# A set makes the per-token membership test in the filtering step cheap.
# NOTE(review): punctuation was stripped from the lyrics earlier, so stop
# words that contain an apostrophe can presumably no longer match (the
# filtered output still contains "dont") - consider cleaning this list the
# same way as the lyrics.
stop_words = set(sw_nltk)

In [10]:
def remove_sw(x):
    """Drop stop words from a space-separated string.

    The text is split on single spaces, every piece that appears in the
    module-level ``stop_words`` set is discarded, and the remaining pieces
    (including empty ones produced by consecutive spaces) are joined back
    together with single spaces.
    """
    kept = []
    for token in x.split(' '):
        if token in stop_words:
            continue
        kept.append(token)
    return ' '.join(kept)
# Filter the stop words out of every song; the cleaned lyrics column itself
# is left as it is.
stopped = data.lyrics.apply(remove_sw)
stopped

0        lord earth would care know name would care f...
1       glory revealed  wounds pierced transgressions...
2       lord heaven earth lord creation lord heaven e...
3       imagine like walk side imagine eyes see face ...
4       skilled understand god willed  god planned kn...
                             ...                        
553     ha dont care ha  past want ho  love last  uh ...
554     hoverin suitcase  tryin find warm place spend...
555     dont know love like  changes put  stole money...
556     c  c  rider elvis presley  well see   c  c  r...
557     cynthia get dance music   get dance  music   ...
Name: lyrics, Length: 558, dtype: object

<a id = "15"></a>
## Vectorization

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words: learn the vocabulary from the stop-word-filtered lyrics and
# count how often each term occurs in each song.
# NOTE(review): the vocabulary is fitted on all songs, i.e. before the
# train/test split further down.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(stopped)

# Dense, labelled view of the count matrix: one row per song, one column per
# vocabulary term.
feature_names = vectorizer.get_feature_names_out()
lyrics_vectorized = pd.DataFrame(data=X.toarray(), columns=feature_names)


In [12]:
# Inspect the bag-of-words count table (rows are songs, columns are vocabulary terms).
lyrics_vectorized

Unnamed: 0,aa,aaa,aaaaoooo,aaalll,aaaooo,aah,aai,aay,abandoned,ability,...,zippers,zips,zircon,zone,zones,zoo,zoom,zooms,zorro,zulu
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
553,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
554,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
555,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
556,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
from sklearn.feature_extraction.text import TfidfTransformer

# Re-weight the raw term counts with TF-IDF and store the dense result in X.
# The counts are recomputed from `vectorizer` and `stopped` instead of being
# read back from X: this cell overwrites X, so reading X here would apply
# TF-IDF on top of already-weighted values whenever the cell is re-run.
tfidfconverter = TfidfTransformer()
X = tfidfconverter.fit_transform(vectorizer.transform(stopped)).toarray()
X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [14]:
# Dimensions of the TF-IDF matrix: (number of songs, vocabulary size).
X.shape

(558, 8442)

In [15]:
# splitwords = [ nltk.word_tokenize( str(sentence) ) for sentence in  data['Lyrics_Processed'] ]
# print(splitwords)

<a id = "19"></a>
## Target Preparation

In [16]:
# The genre column is the basis of our prediction target; it holds six distinct genre labels.

data.genre

0      Christian
1      Christian
2      Christian
3      Christian
4      Christian
         ...    
553          R&B
554          R&B
555          R&B
556          R&B
557          R&B
Name: genre, Length: 558, dtype: object

In [17]:
# List the distinct genre labels that need to be mapped to a numeric target.
data["genre"].unique()

array(['Christian', 'Country', 'Hip-Hop', 'Pop', 'Rock', 'R&B'],
      dtype=object)

In [18]:
# Binary (one-vs-rest) target: "Country" songs are labelled 1 and the songs of
# every other genre are labelled 0.
genre_array = data["genre"].map(
    {
        "Christian": 0,
        "Country": 1,
        'Hip-Hop': 0,
        'Pop': 0,
        'Rock': 0,
        'R&B': 0,
    }
)
genre_array

0      0
1      0
2      0
3      0
4      0
      ..
553    0
554    0
555    0
556    0
557    0
Name: genre, Length: 558, dtype: int64

In [19]:
# Display the bag-of-words count table again.
lyrics_vectorized

Unnamed: 0,aa,aaa,aaaaoooo,aaalll,aaaooo,aah,aai,aay,abandoned,ability,...,zippers,zips,zircon,zone,zones,zoo,zoom,zooms,zorro,zulu
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
553,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
554,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
555,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
556,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


<a id = "32"></a>
# 6 -  PART IV : Train - Test Splitting

In [20]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the songs for testing; the fixed seed keeps the split
# reproducible.
# NOTE(review): the split is not stratified although the classes are
# imbalanced (the evaluation report shows 95 vs. 17 test songs).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    genre_array,
    test_size=0.2,
    random_state=42,
)

<a id = "33"></a>
# 7 -  PART V : Modelling

<a id = "34"></a>
## Naive Bayes

In [21]:
from sklearn.naive_bayes import GaussianNB

# Fit a Gaussian Naive Bayes classifier on the training TF-IDF features.
nb = GaussianNB()
nb.fit(X=X_train, y=y_train)

In [22]:
# Score of the Naive Bayes model on the held-out test set.
print("Naive Bayes Score: ", nb.score(X=X_test, y=y_test))

Naive Bayes Score:  0.8571428571428571


<a id = "35"></a>
## Decision Trees

In [23]:
from sklearn.tree import DecisionTreeClassifier

# Fit a decision tree on the training features; the fixed seed keeps the
# result reproducible.
dt = DecisionTreeClassifier(random_state=0)
dt.fit(X=X_train, y=y_train)

In [24]:
# Score of the decision tree on the held-out test set.
print("Decision Tree Score: ", dt.score(X=X_test, y=y_test))

Decision Tree Score:  0.7946428571428571


<a id = "36"></a>
## Random Forest Algorithm

In [25]:
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest of 550 trees (depth capped at 300) on the training
# features; the fixed seed keeps the result reproducible.
classifier = RandomForestClassifier(
    n_estimators=550,
    max_depth=300,
    random_state=0,
)
classifier.fit(X=X_train, y=y_train)

In [27]:
# Random-forest predictions for the held-out test songs.
y_pred = classifier.predict(X=X_test)

<a id = "37"></a>
# 8 -  PART VI : Model Evaluation

In [28]:
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    accuracy_score,
)

# Evaluate the random-forest predictions against the true test labels.
print(confusion_matrix(y_test, y_pred))
# zero_division=1: a metric whose denominator is zero (e.g. the precision of a
# class that is never predicted) is reported as 1 instead of raising a warning.
print(classification_report(y_test, y_pred, zero_division=1))
print(accuracy_score(y_test, y_pred))

[[95  0]
 [17  0]]
              precision    recall  f1-score   support

           0       0.85      1.00      0.92        95
           1       1.00      0.00      0.00        17

    accuracy                           0.85       112
   macro avg       0.92      0.50      0.46       112
weighted avg       0.87      0.85      0.78       112

0.8482142857142857


In [29]:
# Side-by-side test-set scores of the three models.
print("Naive Bayes Score: ", nb.score(X=X_test, y=y_test))
print("Decision Tree Score: ", dt.score(X=X_test, y=y_test))
print("Random Forest Score: ", accuracy_score(y_true=y_test, y_pred=y_pred))

Naive Bayes Score:  0.8571428571428571
Decision Tree Score:  0.7946428571428571
Random Forest Score:  0.8482142857142857


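Naive Bayes, Decision Tree and Random Forest reach test accuracies of about 0.86, 0.79 and 0.85, respectively. Note that the test set is imbalanced (95 songs of class 0 vs. 17 of class 1), and the confusion matrix shows that the Random Forest predicts class 0 for every song, so its 0.85 accuracy is simply the majority-class baseline (95/112).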