# Imports

In [1]:
import pandas as pd
import time
import numpy as np
import string

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords

# Load/Data Exploration

In [2]:
#Read the data
start = time.time()
df = pd.read_csv("labeled_lyrics_cleaned.csv")
end = time.time()

#Print the time it took to load the data
print("Time to load the data: ", end - start)

Time to load the data:  1.2235801219940186


In [3]:
#Show the first 5 rows
df.head()

Unnamed: 0.1,Unnamed: 0,artist,seq,song,label
0,0,Elijah Blake,"No, no\r\nI ain't ever trapped out the bando\r...",Everyday,0.626
1,1,Elijah Blake,"The drinks go down and smoke goes up, I feel m...",Live Till We Die,0.63
2,2,Elijah Blake,She don't live on planet Earth no more\r\nShe ...,The Otherside,0.24
3,3,Elijah Blake,"Trippin' off that Grigio, mobbin', lights low\...",Pinot,0.536
4,4,Elijah Blake,"I see a midnight panther, so gallant and so br...",Shadows & Diamonds,0.371


In [4]:
#Show the last 5 rows
df.tail()

Unnamed: 0.1,Unnamed: 0,artist,seq,song,label
158348,158348,Adam Green,"And we live on borrowed time,\r\nBut this head...",Friends of Mine,0.737
158349,158349,Adam Green,Frozin in time forever\r\nCarrying that torch ...,Frozen in Time,0.482
158350,158350,Adam Green,Hard to be a girl. \r\nSo nice to be a boy. \r...,Hard to Be a Girl,0.733
158351,158351,Adam Green,"I want to chose to die,\r\nAnd be buried with ...",I Wanna Die,0.361
158352,158352,Adam Green,Musical ladders\r\nLeaning on mountains\r\nBat...,Musical Ladders,0.263


In [5]:
#Summary of the data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 158353 entries, 0 to 158352
Data columns (total 5 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Unnamed: 0  158353 non-null  int64  
 1   artist      158353 non-null  object 
 2   seq         158353 non-null  object 
 3   song        158353 non-null  object 
 4   label       158353 non-null  float64
dtypes: float64(1), int64(1), object(3)
memory usage: 6.0+ MB


In [6]:
#Describe statistics of the data
df.describe()

Unnamed: 0.1,Unnamed: 0,label
count,158353.0,158353.0
mean,79176.0,0.491052
std,45712.717926,0.249619
min,0.0,0.0
25%,39588.0,0.286
50%,79176.0,0.483
75%,118764.0,0.691
max,158352.0,0.998


In [7]:
#Dimensions of the data as a tuple
df.shape

(158353, 5)

In [8]:
#Columns names of the data
df.columns

Index(['Unnamed: 0', 'artist', 'seq', 'song', 'label'], dtype='object')

In [9]:
#Data types of each column
df.dtypes

Unnamed: 0      int64
artist         object
seq            object
song           object
label         float64
dtype: object

In [10]:
#Missing or null values in the data
df.isnull()

Unnamed: 0.1,Unnamed: 0,artist,seq,song,label
0,False,False,False,False,False
1,False,False,False,False,False
2,False,False,False,False,False
3,False,False,False,False,False
4,False,False,False,False,False
...,...,...,...,...,...
158348,False,False,False,False,False
158349,False,False,False,False,False
158350,False,False,False,False,False
158351,False,False,False,False,False


# Data Preparation

In [11]:
#Creating a new column with the "sentiment" of the lyrics
def atribuir_sentimento(valor):
    if valor <= 0.4:
        return 'negative'
    elif valor >= 0.6:
        return 'positive'
    else:
        return 'neutral'

# Aplicar a função a cada valor da coluna 'label' e criar uma nova coluna 'sentiment'
df['sentiment'] = df['label'].apply(atribuir_sentimento)

In [12]:
#Drop the column "Unnamed: 0"
df.drop(columns = ['Unnamed: 0'], inplace = True)

In [13]:
df#Removing unnecessary characters from lyrics
df['seq'] = df['seq'].str.replace("[\(\[].*?[\)\]]", '')
df['seq'] = df['seq'].str.replace("\n", ' ')
df['seq'] = df['seq'].str.replace("\r", ' ')
df['seq'] = df['seq'].str.lower()
df['seq'] = df['seq'].str.replace('[{}]'.format(string.punctuation), '')

  df['seq'] = df['seq'].str.replace("[\(\[].*?[\)\]]", '')
  df['seq'] = df['seq'].str.replace('[{}]'.format(string.punctuation), '')


In [15]:
df

Unnamed: 0,artist,seq,song,label,sentiment
0,Elijah Blake,no no i aint ever trapped out the bando but ...,Everyday,0.626,positive
1,Elijah Blake,the drinks go down and smoke goes up i feel my...,Live Till We Die,0.630,positive
2,Elijah Blake,she dont live on planet earth no more she fou...,The Otherside,0.240,negative
3,Elijah Blake,trippin off that grigio mobbin lights low tri...,Pinot,0.536,neutral
4,Elijah Blake,i see a midnight panther so gallant and so bra...,Shadows & Diamonds,0.371,negative
...,...,...,...,...,...
158348,Adam Green,and we live on borrowed time but this headsho...,Friends of Mine,0.737,positive
158349,Adam Green,frozin in time forever carrying that torch fo...,Frozen in Time,0.482,neutral
158350,Adam Green,hard to be a girl so nice to be a boy in m...,Hard to Be a Girl,0.733,positive
158351,Adam Green,i want to chose to die and be buried with a r...,I Wanna Die,0.361,negative


# Save in a new .csv

In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Dividir os dados em treinamento e dados temporários
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)
original_data = df.to_csv('original_data.csv', index=False)
test_data1 = test_data[['seq', 'sentiment']]

# Gravar o DataFrame com todas as colunas em um arquivo CSV
train_data.to_csv('train.csv', index=False)

# Gravar o DataFrame com apenas as colunas 'seq' e 'sentiment' em um arquivo CSV
test_data1.to_csv('test.csv', index=False)
