# Load data

In [1]:
import pandas as pd
import string
import numpy as np

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer

import plotly.express as px
import plotly.io as pio

In [2]:
df = pd.read_csv('./dataset/lyrics.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,artist,seq,song,label
0,0,Elijah Blake,"No, no\r\nI ain't ever trapped out the bando\r...",Everyday,0.626
1,1,Elijah Blake,"The drinks go down and smoke goes up, I feel m...",Live Till We Die,0.63
2,2,Elijah Blake,She don't live on planet Earth no more\r\nShe ...,The Otherside,0.24
3,3,Elijah Blake,"Trippin' off that Grigio, mobbin', lights low\...",Pinot,0.536
4,4,Elijah Blake,"I see a midnight panther, so gallant and so br...",Shadows & Diamonds,0.371


In [3]:
print(df.isnull().sum())
df_test = df.dropna()

Unnamed: 0    0
artist        0
seq           0
song          0
label         0
dtype: int64


In [4]:
df = df.drop(['Unnamed: 0', 'song', 'artist'], axis=1)

In [5]:
#Removing unnecessary characters from lyrics
df['seq'] = df['seq'].str.replace("[\(\[].*?[\)\]]", '')
df['seq'] = df['seq'].str.replace("\n", ' ')
df['seq'] = df['seq'].str.replace("\r", ' ')
df['seq'] = df['seq'].str.lower()
df['seq'] = df['seq'].str.replace('[{}]'.format(string.punctuation), '')

  df['seq'] = df['seq'].str.replace("[\(\[].*?[\)\]]", '')
  df['seq'] = df['seq'].str.replace('[{}]'.format(string.punctuation), '')


In [6]:
def unique(list1):
   # intilize a null list
     unique_list = []
   # traverse for all elements
     for x in list1:
         # check if exists in unique_list or not
         if x not in unique_list:
              unique_list.append(x)
     return unique_list

In [7]:
nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()

comp_score = []
sentiment = []

for i in df_test.loc[:,'seq']:
    sentiment.append(sia.polarity_scores(i))


df.loc[:,'sent_scores'] = sentiment
df.loc[:,'comp_score'] = df.loc[:,'sent_scores'].apply(lambda x: x['compound'])
df.loc[:,'sentiment_vader'] = df.loc[:,'comp_score'].apply(lambda x: 'positive' if x>0.2 else 'negative' if x<-0.2 else 'neutral')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/pereira/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [8]:
df = df.rename(columns={'comp_score': 'label_vader'})

# Data Visualization

In [9]:
df

Unnamed: 0,seq,label,sent_scores,label_vader,sentiment_vader
0,no no i aint ever trapped out the bando but ...,0.626,"{'neg': 0.18, 'neu': 0.701, 'pos': 0.119, 'com...",-0.9819,negative
1,the drinks go down and smoke goes up i feel my...,0.630,"{'neg': 0.245, 'neu': 0.657, 'pos': 0.098, 'co...",-0.9983,negative
2,she dont live on planet earth no more she fou...,0.240,"{'neg': 0.158, 'neu': 0.813, 'pos': 0.029, 'co...",-0.9915,negative
3,trippin off that grigio mobbin lights low tri...,0.536,"{'neg': 0.173, 'neu': 0.733, 'pos': 0.094, 'co...",-0.9775,negative
4,i see a midnight panther so gallant and so bra...,0.371,"{'neg': 0.091, 'neu': 0.509, 'pos': 0.4, 'comp...",0.9964,positive
...,...,...,...,...,...
158348,and we live on borrowed time but this headsho...,0.737,"{'neg': 0.127, 'neu': 0.491, 'pos': 0.381, 'co...",0.9968,positive
158349,frozin in time forever carrying that torch fo...,0.482,"{'neg': 0.082, 'neu': 0.897, 'pos': 0.021, 'co...",-0.7599,negative
158350,hard to be a girl so nice to be a boy in m...,0.733,"{'neg': 0.168, 'neu': 0.661, 'pos': 0.172, 'co...",0.5326,positive
158351,i want to chose to die and be buried with a r...,0.361,"{'neg': 0.094, 'neu': 0.772, 'pos': 0.134, 'co...",0.3182,positive


In [10]:
#Creating a new column with the "sentiment" of the lyrics
def atribuir_sentimento(valor):
    if valor < 0.4:
        return 'negative'
    elif valor > 0.6:
        return 'positive'
    else:
        return 'neutral'

# Aplicar a função a cada valor da coluna 'label' e criar uma nova coluna 'sentiment'
df['sentiment'] = df['label'].apply(atribuir_sentimento)

df = df[['seq', 'sentiment', 'sentiment_vader']]

In [11]:
df

Unnamed: 0,seq,sentiment,sentiment_vader
0,no no i aint ever trapped out the bando but ...,positive,negative
1,the drinks go down and smoke goes up i feel my...,positive,negative
2,she dont live on planet earth no more she fou...,negative,negative
3,trippin off that grigio mobbin lights low tri...,neutral,negative
4,i see a midnight panther so gallant and so bra...,negative,positive
...,...,...,...
158348,and we live on borrowed time but this headsho...,positive,positive
158349,frozin in time forever carrying that torch fo...,neutral,negative
158350,hard to be a girl so nice to be a boy in m...,positive,positive
158351,i want to chose to die and be buried with a r...,negative,positive


In [12]:
from sklearn.metrics import accuracy_score

def calculate_accuracy(df):
    # Verificar se as colunas "sentiment" e "sentiment_vader" existem no DataFrame
    if 'sentiment' not in df.columns or 'sentiment_vader' not in df.columns:
        print("As colunas 'sentiment' e 'sentiment_vader' são necessárias no DataFrame.")
        return None
    
    # Obter os valores das colunas de sentimento real e sentimento previsto
    true_sentiment = df['sentiment']
    predicted_sentiment = df['sentiment_vader']
    
    # Calcular a precisão (accuracy)
    accuracy = accuracy_score(true_sentiment, predicted_sentiment)
    
    return accuracy

In [13]:
accuracy_score = calculate_accuracy(df)
print("Accuracy of Vader: {:.2f}%".format(accuracy_score * 100))

Accuracy of Vader: 37.64%
