# Load data

In [1]:
import pandas as pd
import string
import numpy as np

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer

import plotly.express as px
import plotly.io as pio


In [2]:
# Read the lyrics dataset from a CSV file located relative to the working directory.
df = pd.read_csv('original_data.csv')

In [3]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,artist,seq,song,label,sentiment
0,Elijah Blake,no no i aint ever trapped out the bando but ...,Everyday,0.626,positive
1,Elijah Blake,the drinks go down and smoke goes up i feel my...,Live Till We Die,0.63,positive
2,Elijah Blake,she dont live on planet earth no more she fou...,The Otherside,0.24,negative
3,Elijah Blake,trippin off that grigio mobbin lights low tri...,Pinot,0.536,neutral
4,Elijah Blake,i see a midnight panther so gallant and so bra...,Shadows & Diamonds,0.371,negative


In [4]:
# Report how many missing values each column holds, then keep only complete rows.
missing_per_column = df.isna().sum()
print(missing_per_column)
df = df.dropna(axis=0, how='any')

artist         0
seq          101
song           0
label          0
sentiment      0
dtype: int64


In [5]:
def unique(list1):
    """
    Return the distinct elements of list1, keeping first-seen order.

    parameters:
    list1: iterable of elements to de-duplicate (it is not modified)

    returns:
    a new list containing each distinct element once, in order of first appearance
    """
    # Materialize once so the input can be scanned twice if the fast path fails.
    items = list(list1)
    try:
        # Dict keys preserve insertion order, giving a linear-time de-duplication
        # when every element is hashable.
        return list(dict.fromkeys(items))
    except TypeError:
        # At least one element is unhashable (e.g. a list): fall back to the
        # equality-based scan, which works for any element type.
        unique_list = []
        for x in items:
            if x not in unique_list:
                unique_list.append(x)
        return unique_list

In [6]:
def lyrics_to_words(document):
    """
    Normalize a lyrics text: lowercase it, remove English stopwords, strip punctuation and lemmatize each remaining word.

    parameters:
    document: text to split to single words

    returns:
    the remaining lemmatized words joined by single spaces
    """
    stop_words = set(stopwords.words('english'))
    exclude = set(string.punctuation)
    lemma = WordNetLemmatizer()

    kept = []
    for token in document.lower().split():
        # Check the token as written and with surrounding punctuation trimmed,
        # so that "the," or "(and" are still recognised as stopwords instead of
        # slipping through and reappearing once punctuation is stripped.
        if token in stop_words or token.strip(string.punctuation) in stop_words:
            continue
        cleaned = ''.join(ch for ch in token if ch not in exclude)
        # Tokens made only of punctuation become empty and are dropped.
        if cleaned:
            kept.append(lemma.lemmatize(cleaned))
    return " ".join(kept)

In [7]:
# Fetch the VADER lexicon and build the analyzer used to score every lyric.
nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()

comp_score = []
# One polarity-score dict per lyric, in row order.
sentiment = [sia.polarity_scores(lyric) for lyric in df.loc[:, 'seq']]

# Keep the full score dict, extract its compound value, then bucket the
# compound value into a categorical label.
df.loc[:, 'sent_scores'] = sentiment
df.loc[:, 'comp_score'] = df.loc[:, 'sent_scores'].apply(lambda scores: scores['compound'])
df.loc[:, 'sentiment_vader'] = df.loc[:, 'comp_score'].apply(
    lambda compound: 'positive' if compound >= 0.6 else ('negative' if compound <= -0.4 else 'neutral')
)

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/pereira/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [8]:
# Drop the intermediate VADER columns and the original numeric label, keeping
# only the categorical 'sentiment_vader' prediction next to 'sentiment'.
# errors='ignore' lets this cell be re-run without failing on columns that
# were already removed.
# NOTE: to keep the compound score as 'label_vader', rename 'comp_score'
# before it is dropped; renaming it after the drop has no effect.
df = df.drop(columns=['sent_scores', 'label', 'comp_score'], errors='ignore')

# Data Visualization

In [9]:
# Display the frame after the column clean-up (the rendered output is truncated).
df

Unnamed: 0,artist,seq,song,sentiment,sentiment_vader
0,Elijah Blake,no no i aint ever trapped out the bando but ...,Everyday,positive,negative
1,Elijah Blake,the drinks go down and smoke goes up i feel my...,Live Till We Die,positive,negative
2,Elijah Blake,she dont live on planet earth no more she fou...,The Otherside,negative,negative
3,Elijah Blake,trippin off that grigio mobbin lights low tri...,Pinot,neutral,negative
4,Elijah Blake,i see a midnight panther so gallant and so bra...,Shadows & Diamonds,negative,positive
...,...,...,...,...,...
158348,Adam Green,and we live on borrowed time but this headsho...,Friends of Mine,positive,positive
158349,Adam Green,frozin in time forever carrying that torch fo...,Frozen in Time,neutral,negative
158350,Adam Green,hard to be a girl so nice to be a boy in m...,Hard to Be a Girl,positive,positive
158351,Adam Green,i want to chose to die and be buried with a r...,I Wanna Die,negative,neutral


In [10]:
from sklearn.metrics import accuracy_score

def calculate_accuracy(df):
    """
    Compute the accuracy of the VADER labels against the reference labels.

    parameters:
    df: DataFrame holding the reference labels in 'sentiment' and the predicted labels in 'sentiment_vader'

    returns:
    the result of sklearn's accuracy_score for the two columns, or None (after printing a message) when either column is missing
    """
    # Both label columns are required; report the problem and bail out otherwise.
    if 'sentiment' not in df.columns or 'sentiment_vader' not in df.columns:
        print("As colunas 'sentiment' e 'sentiment_vader' são necessárias no DataFrame.")
        return None

    # Bind the metric locally so that a notebook variable named
    # `accuracy_score` cannot shadow the sklearn function used below.
    from sklearn.metrics import accuracy_score

    # Reference labels versus predicted labels.
    true_sentiment = df['sentiment']
    predicted_sentiment = df['sentiment_vader']

    return accuracy_score(true_sentiment, predicted_sentiment)

In [11]:
# Store the result under its own name: rebinding `accuracy_score` here would
# replace the imported sklearn function with a float and break later calls.
vader_accuracy = calculate_accuracy(df)
# calculate_accuracy returns None when a required column is missing.
if vader_accuracy is not None:
    print("Accuracy of Vader: {:.2f}%".format(vader_accuracy * 100))

Accuracy of Vader: 36.82%
