# Data Pre-processing

In [1]:
# Import libraries
import nltk
import pandas as pd
import spacy
from langdetect import DetectorFactory, LangDetectException
from langdetect import detect
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from textblob import TextBlob
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [2]:
# Load the raw song dataset from the Excel workbook: the first sheet row holds
# the column names and the first sheet column is used as the DataFrame index.
df = pd.read_excel('dataset.xlsx', header=0, index_col=0)
# Preview the first rows to sanity-check the load.
df.head()

Unnamed: 0,id,name,artist,disc,duration,episode,explicit,popularity,danceability,acousticness,energy,instrumentalness,liveness,loudness,mode,speechiness,tempo,time_signature,valence,lyrics
0,1hA4856JVAa0qqgKg6olJf,"The Messenger - Live from Las Vegas, 2011",Linkin Park,1,233280,False,False,37,0.325,0.726,0.409,1.3e-05,0.188,-5.711,1,0.0295,109.001,4,0.187,6 ContributorsThe Messenger (Live from Las Veg...
1,0GgN4MhR5GKn5IcKN0e0rG,Cancer,My Chemical Romance,1,142973,False,False,72,0.457,0.358,0.515,0.0,0.278,-4.31,1,0.0261,74.984,4,0.222,93 ContributorsCancer Lyrics\nTurn away\nIf yo...
2,10nyNJ6zNy2YVYLrcwLccB,No Surprises,Radiohead,1,229120,False,False,83,0.255,0.0577,0.393,0.00361,0.113,-10.654,1,0.0278,76.426,4,0.118,129 ContributorsTranslationsРусскийDeutschEspa...
3,7lRlq939cDG4SzWOF4VAnd,I'm Not Okay (I Promise),My Chemical Romance,1,186480,False,True,78,0.21,0.00602,0.94,0.0,0.269,-3.427,1,0.123,179.722,4,0.255,109 ContributorsTranslationsEspañolI’m Not Oka...
4,5u2FOoFhp495GIj5BJC77J,Hold On Till May,Pierce The Veil,1,278586,False,False,66,0.46,0.000585,0.91,0.00155,0.0735,-3.71,1,0.0525,73.019,4,0.327,58 ContributorsHold on Till May Lyrics\nShe si...


In [3]:
# Drop the identifier / flag columns that the rest of the notebook does not use
irrelevant_columns = ["id", "disc", "episode", "explicit", "mode", "time_signature"]
song_df = df.drop(columns=irrelevant_columns)
song_df.head()

Unnamed: 0,name,artist,duration,popularity,danceability,acousticness,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,lyrics
0,"The Messenger - Live from Las Vegas, 2011",Linkin Park,233280,37,0.325,0.726,0.409,1.3e-05,0.188,-5.711,0.0295,109.001,0.187,6 ContributorsThe Messenger (Live from Las Veg...
1,Cancer,My Chemical Romance,142973,72,0.457,0.358,0.515,0.0,0.278,-4.31,0.0261,74.984,0.222,93 ContributorsCancer Lyrics\nTurn away\nIf yo...
2,No Surprises,Radiohead,229120,83,0.255,0.0577,0.393,0.00361,0.113,-10.654,0.0278,76.426,0.118,129 ContributorsTranslationsРусскийDeutschEspa...
3,I'm Not Okay (I Promise),My Chemical Romance,186480,78,0.21,0.00602,0.94,0.0,0.269,-3.427,0.123,179.722,0.255,109 ContributorsTranslationsEspañolI’m Not Oka...
4,Hold On Till May,Pierce The Veil,278586,66,0.46,0.000585,0.91,0.00155,0.0735,-3.71,0.0525,73.019,0.327,58 ContributorsHold on Till May Lyrics\nShe si...


In [4]:
# Identify duplicates: shape of the rows that repeat an earlier row exactly
is_duplicate_row = song_df.duplicated()
song_df.loc[is_duplicate_row].shape

(38, 14)

In [5]:
# Remove exact duplicate rows, keeping the first occurrence of each
song_df = song_df.drop_duplicates(keep='first')

In [6]:
# Count missing values per column
print(song_df.isnull().sum())
# Every count in the output is 0, so there are no null cells to handle

name                0
artist              0
duration            0
popularity          0
danceability        0
acousticness        0
energy              0
instrumentalness    0
liveness            0
loudness            0
speechiness         0
tempo               0
valence             0
lyrics              0
dtype: int64


In [7]:
# Renumber the rows after the duplicate removal, discarding the old index
song_df = song_df.reset_index(drop=True)

In [8]:
# Count the songs whose lyrics field holds the 'lyrics not available' placeholder
missing_lyrics_mask = song_df['lyrics'].eq('lyrics not available')
int(missing_lyrics_mask.sum())

196

In [9]:
# Remove songs without lyrics.
# .copy() gives song_df its own data, so the column assignments made in later
# cells write to a real frame instead of a slice of the previous one
# (this is what otherwise triggers pandas' SettingWithCopyWarning).
song_df = song_df[song_df['lyrics'] != 'lyrics not available'].copy()

In [10]:
# Strip the header that precedes the actual lyrics, e.g. the leading
# "93 ContributorsCancer Lyrics" in the raw text.
def delete_text_before_lyrics(x):
    """Return ``x`` without everything up to and including the first "Lyrics".

    Parameters
    ----------
    x : str
        Raw lyrics text. Non-string values (e.g. NaN) are returned unchanged
        instead of raising.

    Returns
    -------
    str
        The text that follows the first occurrence of the word "Lyrics"
        (the word itself is excluded). If the word does not occur, ``x`` is
        returned as-is.
    """
    if not isinstance(x, str):
        return x
    # partition splits on the first occurrence only; an empty separator in the
    # result means the marker was not found.
    _, marker, remainder = x.partition("Lyrics")
    return remainder if marker else x

In [11]:
# Strip the header from every lyric using the helper defined in the previous cell
song_df['lyrics'] = song_df['lyrics'].apply(delete_text_before_lyrics)

In [12]:
# Summarise the row count, dtypes and non-null counts of the cleaned dataset
song_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 800 entries, 0 to 995
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   name              800 non-null    object 
 1   artist            800 non-null    object 
 2   duration          800 non-null    int64  
 3   popularity        800 non-null    int64  
 4   danceability      800 non-null    float64
 5   acousticness      800 non-null    float64
 6   energy            800 non-null    float64
 7   instrumentalness  800 non-null    float64
 8   liveness          800 non-null    float64
 9   loudness          800 non-null    float64
 10  speechiness       800 non-null    float64
 11  tempo             800 non-null    float64
 12  valence           800 non-null    float64
 13  lyrics            800 non-null    object 
dtypes: float64(9), int64(2), object(3)
memory usage: 93.8+ KB


In [13]:
# Detect the languages of the lyrics.
# langdetect's algorithm is randomised; pinning the seed makes the labels (and
# therefore the rows kept by the language filter) reproducible across runs.
DetectorFactory.seed = 0


def detect_language(text):
    """Return the langdetect language code for ``text``, or 'unknown'.

    'unknown' is returned for non-string values and for text langdetect cannot
    classify, so one bad lyric does not abort the whole ``apply``.
    """
    if not isinstance(text, str):
        return 'unknown'
    try:
        return detect(text)
    except LangDetectException:
        # Raised when the text has no usable features (empty, digits only, ...).
        return 'unknown'


song_df['language'] = song_df['lyrics'].apply(detect_language)
song_df.language.value_counts()

en    664
es     85
fr     38
ca      3
tl      2
it      2
pt      2
id      1
tr      1
so      1
sk      1
Name: language, dtype: int64

In [14]:
# Keep only the songs whose lyrics are in English, Spanish or French
supported_languages = ['en', 'es', 'fr']
is_supported = song_df['language'].isin(supported_languages)
song_df = song_df[is_supported]

In [15]:
# Split into one dataframe per language for language-specific processing
df_en, df_es, df_fr = (
    song_df[song_df['language'] == code] for code in ('en', 'es', 'fr')
)

In [16]:
# Create a plain list of lyrics for each language
lyrics_en, lyrics_es, lyrics_fr = (
    frame['lyrics'].tolist() for frame in (df_en, df_es, df_fr)
)

In [17]:
# Install the pre-trained spaCy language models (uncomment and run the next three cells once per environment)

In [18]:
#!python -m spacy download en_core_web_sm

In [19]:
#!python -m spacy download es_core_news_sm

In [20]:
#!python -m spacy download fr_core_news_sm

In [21]:
# Define function to process the lyrics with the NLP model
def process_lyrics(lyrics, language_model):
    """Lemmatise every lyric and drop the tokens the model flags as stop words.

    Parameters
    ----------
    lyrics : iterable of str
        Lyrics to process, one string per song.
    language_model : callable
        Called once per lyric; must return an iterable of tokens that expose
        ``lemma_`` and ``is_stop`` (e.g. a loaded spaCy pipeline).

    Returns
    -------
    list of list
        One list of lemmas per lyric, in input order.
    """
    def lemmas_without_stop_words(text):
        return [tok.lemma_ for tok in language_model(text) if not tok.is_stop]

    return [lemmas_without_stop_words(text) for text in lyrics]

# Define function to strip unwanted tokens from already-processed lyrics
def remove_unwanted_words(processed_lyrics, unwanted_words):
    """Filter every token list, dropping tokens found in ``unwanted_words``.

    Parameters
    ----------
    processed_lyrics : iterable of iterable
        One collection of tokens per song.
    unwanted_words : container
        Tokens to discard; membership is tested with ``in``.

    Returns
    -------
    list of list
        The token lists in input order, with the unwanted tokens removed.
    """
    return [
        [tok for tok in token_list if tok not in unwanted_words]
        for token_list in processed_lyrics
    ]

In [22]:
# Load the pre-trained spaCy pipeline for each supported language
# (each model has to be installed first; see the download commands above)
nlp_en = spacy.load('en_core_web_sm')
nlp_es = spacy.load('es_core_news_sm')
nlp_fr = spacy.load('fr_core_news_sm')

In [23]:
# Lemmatise and stop-word-filter the English lyrics with the English model
processed_lyrics_en = process_lyrics(lyrics_en, nlp_en)
# Same for the Spanish lyrics with the Spanish model
processed_lyrics_es = process_lyrics(lyrics_es, nlp_es)
# Same for the French lyrics with the French model
processed_lyrics_fr = process_lyrics(lyrics_fr, nlp_fr)

In [24]:
# Tokens to discard: line breaks and basic punctuation
unwanted_words = [
    "\n",
    "(",
    ")",
    ",",
]
# Strip those tokens from the processed lyrics of every language
cleaned_lyrics_en, cleaned_lyrics_es, cleaned_lyrics_fr = (
    remove_unwanted_words(tokens_by_song, unwanted_words)
    for tokens_by_song in (processed_lyrics_en, processed_lyrics_es, processed_lyrics_fr)
)

In [25]:
# Define function to obtain sentiment scores
def analyze_sentiment(lyrics, language_model):
    """Score each lyric with TextBlob after lemmatising and removing stop words.

    Parameters
    ----------
    lyrics : iterable of str
        Lyrics to score, one string per song.
    language_model : callable
        Called once per lyric; must return tokens exposing ``lemma_`` and
        ``is_stop`` (e.g. a loaded spaCy pipeline).

    Returns
    -------
    list
        One ``TextBlob(...).sentiment`` value per lyric, in input order.

    NOTE(review): TextBlob's default sentiment analyzer appears to be
    English-oriented, so scores for Spanish and French lyrics may be
    unreliable. Confirm before interpreting them.
    """
    def score(text):
        kept_lemmas = (tok.lemma_ for tok in language_model(text) if not tok.is_stop)
        return TextBlob(" ".join(kept_lemmas)).sentiment

    return [score(text) for text in lyrics]

In [26]:
# Obtain the sentiment score of each lyric
language_dataframes = {
    'en': df_en,
    'es': df_es,
    'fr': df_fr}
# Look the spaCy pipeline up by language code instead of chaining conditionals.
language_models = {
    'en': nlp_en,
    'es': nlp_es,
    'fr': nlp_fr}
new_dataframes = {}

# The loop variable is deliberately not called `df`: that name holds the raw
# dataset loaded at the top of the notebook, and overwriting it would break
# any re-run of the earlier cells.
for lang, lang_df in language_dataframes.items():
    sentiment_scores = analyze_sentiment(lang_df['lyrics'], language_model=language_models[lang])
    new_df = lang_df.copy()  # leave the per-language source frame untouched
    new_df['Sentiment'] = sentiment_scores
    new_dataframes[lang] = new_df

In [27]:
# Define function to extract Polarity and Subjectivity
def split_sentiment_column(df, sentiment_column_name, polarity_column_name, subjectivity_column_name):
    """Split a column of (polarity, subjectivity) pairs into two columns.

    The input frame is not modified; a new frame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``sentiment_column_name``.
    sentiment_column_name : str
        Column whose values are 2-item sequences (first item polarity,
        second item subjectivity).
    polarity_column_name : str
        Name of the column that receives the first item of each pair.
    subjectivity_column_name : str
        Name of the column that receives the second item of each pair.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the two new columns and without the sentiment
        column. An empty input frame yields an empty frame with both columns.
    """
    # Naming the columns up front keeps the frame two columns wide even when
    # there are no rows to infer the width from.
    scores = pd.DataFrame(
        df[sentiment_column_name].tolist(),
        index=df.index,
        columns=[polarity_column_name, subjectivity_column_name],
    )
    result = df.copy()  # never write into the caller's frame
    result[polarity_column_name] = scores.iloc[:, 0]
    result[subjectivity_column_name] = scores.iloc[:, 1]
    return result.drop(sentiment_column_name, axis=1)

In [28]:
# Replace the combined Sentiment column with separate Polarity / Subjectivity columns
for lang_code in ('en', 'es', 'fr'):
    new_dataframes[lang_code] = split_sentiment_column(
        new_dataframes[lang_code], 'Sentiment', 'Polarity', 'Subjectivity')

In [29]:
# Drop the raw lyrics column from every per-language frame
for lang_code in ('en', 'es', 'fr'):
    new_dataframes[lang_code].drop('lyrics', axis=1, inplace=True)

In [30]:
# Combine all dataframes into a single one (English rows, then Spanish, then French)
cleaned_df = pd.concat(
    [new_dataframes[lang_code] for lang_code in ('en', 'es', 'fr')],
    axis=0,
)

In [31]:
# Preview the combined, sentiment-scored dataset
cleaned_df.head()

Unnamed: 0,name,artist,duration,popularity,danceability,acousticness,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,language,Polarity,Subjectivity
0,"The Messenger - Live from Las Vegas, 2011",Linkin Park,233280,37,0.325,0.726,0.409,1.3e-05,0.188,-5.711,0.0295,109.001,0.187,en,0.141837,0.752551
1,Cancer,My Chemical Romance,142973,72,0.457,0.358,0.515,0.0,0.278,-4.31,0.0261,74.984,0.222,en,-0.06733,0.678125
2,No Surprises,Radiohead,229120,83,0.255,0.0577,0.393,0.00361,0.113,-10.654,0.0278,76.426,0.118,en,-0.036364,0.630303
3,I'm Not Okay (I Promise),My Chemical Romance,186480,78,0.21,0.00602,0.94,0.0,0.269,-3.427,0.123,179.722,0.255,en,0.259333,0.536667
4,Hold On Till May,Pierce The Veil,278586,66,0.46,0.000585,0.91,0.00155,0.0735,-3.71,0.0525,73.019,0.327,en,0.177059,0.499412


In [34]:
# Write the combined dataset to an Excel workbook without the DataFrame index column
cleaned_df.to_excel('clean_songs_en_fr_sp.xlsx', index=False, index_label=False)