# Imports

In [1]:
import pandas as pd
import time
import numpy as np

# Data Exploration

In [2]:
#Read the data
start = time.time()
df_original = pd.read_csv("originalData.csv")
end = time.time()

#Print the time it took to load the data
print("Time to load the data: ", end - start)

Time to load the data:  131.07218599319458


In [3]:
n_rows = len(df_original)
n_rows_to_drop = int(0.95 * n_rows)  # Calculate the number of rows to drop

# Drop 80% of the rows randomly
df = df_original.sample(frac=1.0 - 0.95, random_state=42)  # Set a random state for reproducibility
df.reset_index(drop=True, inplace=True)  # Reset the index

# Verify the number of rows in the resulting DataFrame
print("Number of remaining rows:", len(df))

Number of remaining rows: 295671


In [4]:
#Show the first 5 rows
df.head()

Unnamed: 0,title,tag,artist,year,views,features,lyrics,id
0,Está Muy Bien,rock,David Lebn,1980,8,"{""David Lebón""}","[Letra de ""Está Muy Bien""]\n\n[Verso 1]\nMuy b...",4830984
1,Lil Peep - Runaway Türkçe Çeviri,rap,Genius Trke eviri,2017,5152,"{""Genius Türkçe Çeviri""}",[Intro]\nBuradan kaç\nHerkes çok sahtekâr\nHer...,3923651
2,Pas ce soir,rap,Oxmo Puccino,2012,3384,{},Passé la trentaine tu comprendras\nQue les bon...,89421
3,Überall Kirchen Session,pop,Die Hchste Eisenbahn,2020,77,"{""Die Höchste Eisenbahn""}",[Instrumental],5730279
4,Une certaine mixture,rap,Toni-L,1996,68,"{Linguist,""Frero (FRA)""}",[Intro:]\nLa mixture est dans la place\nToni l...,703031


In [5]:
#Show the last 5 rows
df.tail()

Unnamed: 0,title,tag,artist,year,views,features,lyrics,id
295666,Look It Up,pop,Ashton Shepherd,2011,353,{},"The word is faithful, look it up\nIt don't mea...",1723643
295667,Bailão,pop,Ju Faustino,2019,29,{},[Verso: Ju Faustino]\nEu não vou mais ficar at...,4754196
295668,I Should Care,pop,Nicola Arigliano,2004,22,{},I should care\nI should go around weeping\nI s...,1123173
295669,In the Name of Scotland,rock,Serenity,2022,55,{},"Rise against English Crown, until freedom will...",7869402
295670,Love Theme from St. Elmos Fire Instrumental,pop,David Foster,1985,364,{},[Instrumental],971662


In [6]:
#Summary of the data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 295671 entries, 0 to 295670
Data columns (total 8 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   title     295650 non-null  object
 1   tag       295671 non-null  object
 2   artist    295671 non-null  object
 3   year      295671 non-null  int64 
 4   views     295671 non-null  int64 
 5   features  295671 non-null  object
 6   lyrics    295555 non-null  object
 7   id        295671 non-null  int64 
dtypes: int64(3), object(5)
memory usage: 18.0+ MB


In [7]:
#Describe statistics of the data
df.describe()

Unnamed: 0,year,views,id
count,295671.0,295671.0,295671.0
mean,2010.683665,2638.833,4029159.0
std,45.115338,38292.86,2295500.0
min,1.0,0.0,1.0
25%,2010.0,16.0,1825062.0
50%,2016.0,64.0,4167588.0
75%,2019.0,353.0,6005609.0
max,2030.0,8351197.0,7882847.0


In [8]:
#Dimensions of the data as a tuple
df.shape

(295671, 8)

In [9]:
#Columns names of the data
df.columns

Index(['title', 'tag', 'artist', 'year', 'views', 'features', 'lyrics', 'id'], dtype='object')

In [10]:
#Data types of each column
df.dtypes

title       object
tag         object
artist      object
year         int64
views        int64
features    object
lyrics      object
id           int64
dtype: object

In [11]:
#Missing or null values in the data
df.isnull()

Unnamed: 0,title,tag,artist,year,views,features,lyrics,id
0,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...
295666,False,False,False,False,False,False,False,False
295667,False,False,False,False,False,False,False,False
295668,False,False,False,False,False,False,False,False
295669,False,False,False,False,False,False,False,False


# Data Preparation

In [12]:
#Drop columns "id", "views" and "features"
df.drop(["id", "views", "features"], axis=1, inplace=True)

In [13]:
#Check if the column "lyrics" has null values
null_values = df["lyrics"].isnull().sum()
print("Number of null values in the column lyrics: ", null_values)

#Drop rows with null values in the column "lyrics"
df.dropna(subset=['lyrics'], inplace=True)

Number of null values in the column lyrics:  116


In [17]:
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Set up NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Make a copy of the DataFrame
df_clean = df.copy()

def unique(list1):
    unique_list = []
    for x in list1:
        if x not in unique_list:
            unique_list.append(x)
    return unique_list

def lyrics_to_words(document):
    stop_words = set(stopwords.words('english'))
    exclude = set(string.punctuation)
    lemma = WordNetLemmatizer()
    stopword_removal = " ".join([i for i in document.lower().split() if i not in stop_words])
    punctuation_removal = ''.join(ch for ch in stopword_removal if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punctuation_removal.split())
    return normalized

# Create a new column 'clean_words' to store the unique words of each lyrics song
words = []

# Define batch size
batch_size = 10000

# Iterate over batches of data
for i in range(0, len(df_clean), batch_size):
    # Process a batch of data
    batch = df_clean['lyrics'].iloc[i:i+batch_size].tolist()
    batch_words = [unique(lyrics_to_words(lyric).split()) for lyric in batch]
    words.extend(batch_words)

# Assign the 'clean_words' column with the processed data
df_clean['clean_words'] = words

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/pereira/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/pereira/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/pereira/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


: 

: 