In [121]:
import pandas as pd
import numpy as np
import string
import nltk
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

In [183]:
lyrics = pd.read_csv('lyrics-data.csv')
genres = pd.read_csv('artists-data.csv')

lyrics = lyrics.dropna()

lyrics = lyrics[["ALink", "Lyric"]]
genres = genres[["Link", "Genre"]]

new_data = pd.merge(genres, lyrics, left_on="Link", right_on="ALink")
new_data = new_data[["Genre", "Lyric"]]
new_data = new_data.sample(frac = 0.3).reset_index(drop = True)

display(new_data)

(209522, 5) (3242, 6)


Unnamed: 0,Genre,Lyric
0,Rock,"Jogava a rede pra arrastar,. Teus olhos para a..."
1,Rock,You had a girlfriend. That wasn't good enough ...
2,Rock,I've done everything that can be done to heal ...
3,Rock,Carpinteiro do universo inteiro eu sou (2x). N...
4,Pop,sabes aprovechemos esta noche. para pedir por ...
...,...,...
67194,Pop,No escurinho do cinema. Chupando drops de anis...
67195,Rock,"Fourteen years, thirty minutes. Fifteen second..."
67196,Pop,Cuando estás cerca de mí. Siempre buscan maner...
67197,Rock,Loose and guilty and whipped. Sterility persec...


In [184]:
#preprocessing

#convert text to lowercase
new_data["Lyric_lower"] = new_data["Lyric"].str.lower()

#remove special characters, punctuations
new_data["Lyric_no_spec"] = new_data["Lyric_lower"].str.replace(r'[^a-zA-Z\s]+', '')

display(new_data)

Unnamed: 0,Genre,Lyric,Lyric_lower,Lyric_no_spec
0,Rock,"Jogava a rede pra arrastar,. Teus olhos para a...","jogava a rede pra arrastar,. teus olhos para a...",jogava a rede pra arrastar teus olhos para a m...
1,Rock,You had a girlfriend. That wasn't good enough ...,you had a girlfriend. that wasn't good enough ...,you had a girlfriend that wasnt good enough fo...
2,Rock,I've done everything that can be done to heal ...,i've done everything that can be done to heal ...,ive done everything that can be done to heal t...
3,Rock,Carpinteiro do universo inteiro eu sou (2x). N...,carpinteiro do universo inteiro eu sou (2x). n...,carpinteiro do universo inteiro eu sou x no se...
4,Pop,sabes aprovechemos esta noche. para pedir por ...,sabes aprovechemos esta noche. para pedir por ...,sabes aprovechemos esta noche para pedir por u...
...,...,...,...,...
67194,Pop,No escurinho do cinema. Chupando drops de anis...,no escurinho do cinema. chupando drops de anis...,no escurinho do cinema chupando drops de anis ...
67195,Rock,"Fourteen years, thirty minutes. Fifteen second...","fourteen years, thirty minutes. fifteen second...",fourteen years thirty minutes fifteen seconds ...
67196,Pop,Cuando estás cerca de mí. Siempre buscan maner...,cuando estás cerca de mí. siempre buscan maner...,cuando ests cerca de m siempre buscan maneras ...
67197,Rock,Loose and guilty and whipped. Sterility persec...,loose and guilty and whipped. sterility persec...,loose and guilty and whipped sterility persecu...


In [185]:
#tokenization

#remove stop words
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    return " ".join([word for word in str(text).split() if word not in stop_words])

new_data["Lyric_no_stop"] = new_data["Lyric_no_spec"].apply(lambda text: remove_stopwords(text))
display(new_data)

Unnamed: 0,Genre,Lyric,Lyric_lower,Lyric_no_spec,Lyric_no_stop
0,Rock,"Jogava a rede pra arrastar,. Teus olhos para a...","jogava a rede pra arrastar,. teus olhos para a...",jogava a rede pra arrastar teus olhos para a m...,jogava rede pra arrastar teus olhos para minha...
1,Rock,You had a girlfriend. That wasn't good enough ...,you had a girlfriend. that wasn't good enough ...,you had a girlfriend that wasnt good enough fo...,girlfriend wasnt good enough younger old enoug...
2,Rock,I've done everything that can be done to heal ...,i've done everything that can be done to heal ...,ive done everything that can be done to heal t...,ive done everything done heal wound left years...
3,Rock,Carpinteiro do universo inteiro eu sou (2x). N...,carpinteiro do universo inteiro eu sou (2x). n...,carpinteiro do universo inteiro eu sou x no se...,carpinteiro universo inteiro eu sou x sei por ...
4,Pop,sabes aprovechemos esta noche. para pedir por ...,sabes aprovechemos esta noche. para pedir por ...,sabes aprovechemos esta noche para pedir por u...,sabes aprovechemos esta noche para pedir por u...
...,...,...,...,...,...
67194,Pop,No escurinho do cinema. Chupando drops de anis...,no escurinho do cinema. chupando drops de anis...,no escurinho do cinema chupando drops de anis ...,escurinho cinema chupando drops de anis longe ...
67195,Rock,"Fourteen years, thirty minutes. Fifteen second...","fourteen years, thirty minutes. fifteen second...",fourteen years thirty minutes fifteen seconds ...,fourteen years thirty minutes fifteen seconds ...
67196,Pop,Cuando estás cerca de mí. Siempre buscan maner...,cuando estás cerca de mí. siempre buscan maner...,cuando ests cerca de m siempre buscan maneras ...,cuando ests cerca de siempre buscan maneras de...
67197,Rock,Loose and guilty and whipped. Sterility persec...,loose and guilty and whipped. sterility persec...,loose and guilty and whipped sterility persecu...,loose guilty whipped sterility persecutes plen...


In [187]:
#Stemming
from nltk.stem import SnowballStemmer

stemmer = SnowballStemmer('english')
def stemming(text):
    return " ".join([stemmer.stem(word) for word in text.split()])

new_data["cleaned_lyrics"] = new_data["Lyric_no_stop"].apply(lambda text: stemming(text))
display(new_data)

Unnamed: 0,Genre,Lyric,Lyric_lower,Lyric_no_spec,Lyric_no_stop,cleaned_lyrics
0,Rock,"Jogava a rede pra arrastar,. Teus olhos para a...","jogava a rede pra arrastar,. teus olhos para a...",jogava a rede pra arrastar teus olhos para a m...,jogava rede pra arrastar teus olhos para minha...,jogava rede pra arrastar teus olho para minha ...
1,Rock,You had a girlfriend. That wasn't good enough ...,you had a girlfriend. that wasn't good enough ...,you had a girlfriend that wasnt good enough fo...,girlfriend wasnt good enough younger old enoug...,girlfriend wasnt good enough younger old enoug...
2,Rock,I've done everything that can be done to heal ...,i've done everything that can be done to heal ...,ive done everything that can be done to heal t...,ive done everything done heal wound left years...,ive done everyth done heal wound left year ive...
3,Rock,Carpinteiro do universo inteiro eu sou (2x). N...,carpinteiro do universo inteiro eu sou (2x). n...,carpinteiro do universo inteiro eu sou x no se...,carpinteiro universo inteiro eu sou x sei por ...,carpinteiro universo inteiro eu sou x sei por ...
4,Pop,sabes aprovechemos esta noche. para pedir por ...,sabes aprovechemos esta noche. para pedir por ...,sabes aprovechemos esta noche para pedir por u...,sabes aprovechemos esta noche para pedir por u...,sabe aprovechemo esta noch para pedir por un m...
...,...,...,...,...,...,...
67194,Pop,No escurinho do cinema. Chupando drops de anis...,no escurinho do cinema. chupando drops de anis...,no escurinho do cinema chupando drops de anis ...,escurinho cinema chupando drops de anis longe ...,escurinho cinema chupando drop de ani long de ...
67195,Rock,"Fourteen years, thirty minutes. Fifteen second...","fourteen years, thirty minutes. fifteen second...",fourteen years thirty minutes fifteen seconds ...,fourteen years thirty minutes fifteen seconds ...,fourteen year thirti minut fifteen second ive ...
67196,Pop,Cuando estás cerca de mí. Siempre buscan maner...,cuando estás cerca de mí. siempre buscan maner...,cuando ests cerca de m siempre buscan maneras ...,cuando ests cerca de siempre buscan maneras de...,cuando est cerca de siempr buscan manera de se...
67197,Rock,Loose and guilty and whipped. Sterility persec...,loose and guilty and whipped. sterility persec...,loose and guilty and whipped sterility persecu...,loose guilty whipped sterility persecutes plen...,loos guilti whip steril persecut plenti bruis ...


In [188]:
new_data = new_data[["cleaned_lyrics", "Genre"]]
new_data.columns = ["Lyrics", "Genre"]

display(new_data)

Unnamed: 0,Lyrics,Genre
0,jogava rede pra arrastar teus olho para minha ...,Rock
1,girlfriend wasnt good enough younger old enoug...,Rock
2,ive done everyth done heal wound left year ive...,Rock
3,carpinteiro universo inteiro eu sou x sei por ...,Rock
4,sabe aprovechemo esta noch para pedir por un m...,Pop
...,...,...
67194,escurinho cinema chupando drop de ani long de ...,Pop
67195,fourteen year thirti minut fifteen second ive ...,Rock
67196,cuando est cerca de siempr buscan manera de se...,Pop
67197,loos guilti whip steril persecut plenti bruis ...,Rock


In [209]:
train, test = train_test_split(new_data, test_size = 0.2)

In [38]:
count_vector = CountVectorizer(strip_accents="ascii", stop_words="english", analyzer="word", ngram_range=(1, 1))

count_vector.fit(train["Lyrics"].values)

train_words = count_vector.transform(train["Lyrics"].values)
test_words = count_vector.transform(test["Lyrics"].values)

print(train_words.shape)
print(test_words.shape)

(2012, 21375)
(224, 21375)


In [39]:
reg = LogisticRegression(solver="lbfgs", max_iter=200, multi_class="auto", random_state=0).fit(train_words.toarray(), train["Genre"].values)
pred = reg.predict(test_words.toarray())



In [40]:
print(accuracy_score(test["Genre"].values, pred))
print(classification_report(test["Genre"].values, pred))
print(confusion_matrix(test["Genre"].values, pred))

0.5
              precision    recall  f1-score   support

Funk Carioca       0.40      0.25      0.31         8
     Hip Hop       0.53      0.48      0.51        33
         Pop       0.39      0.27      0.32        67
        Rock       0.62      0.70      0.65        76
       Samba       0.17      0.15      0.16        13
   Sertanejo       0.47      0.78      0.58        27

    accuracy                           0.50       224
   macro avg       0.43      0.44      0.42       224
weighted avg       0.48      0.50      0.48       224

[[ 2  1  2  1  0  2]
 [ 0 16 10  2  3  2]
 [ 1 10 18 27  2  9]
 [ 1  2 15 53  3  2]
 [ 0  1  0  1  2  9]
 [ 1  0  1  2  2 21]]


In [210]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier

text_clf = Pipeline([
    ('count_vector', CountVectorizer()),
    ('tfidf_trans', TfidfTransformer()),
    ('model',LinearSVC())])

text_clf.fit(train["Lyrics"].values, train["Genre"].values)

predicted = text_clf.predict(test["Lyrics"].values)

In [211]:
print(accuracy_score(test["Genre"], predicted))
print(classification_report(test["Genre"], predicted))
print(confusion_matrix(test["Genre"], predicted))

0.6397321428571429
              precision    recall  f1-score   support

Funk Carioca       0.56      0.45      0.50       462
     Hip Hop       0.71      0.59      0.65      1764
         Pop       0.52      0.46      0.49      3632
        Rock       0.67      0.76      0.71      4549
       Samba       0.67      0.53      0.59       904
   Sertanejo       0.69      0.82      0.75      2129

    accuracy                           0.64     13440
   macro avg       0.64      0.60      0.62     13440
weighted avg       0.63      0.64      0.63     13440

[[ 210   27   60   21    9  135]
 [  30 1048  421  192   26   47]
 [  55  273 1657 1346   70  231]
 [   9  106  803 3454   56  121]
 [  11   24   72   74  483  240]
 [  57    6  150   93   77 1746]]
