In [150]:
import pandas as pd

import nltk, os, glob, re

from nltk.corpus import stopwords

from nltk.tokenize import word_tokenize

from nltk.stem import WordNetLemmatizer

from nltk.stem import PorterStemmer

# Lyrics Read and Preprocessing

## Workflow: In this notebook I will read the lyrics into a dataframe and prepare it for work.

## Information about the data: 
The data has been supplied by [Kaggle](https://www.kaggle.com/datasets/ishikajohari/taylor-swift-all-lyrics-30-albums/data) and the original file contains all the lyrics + cover art for each album and datasets containing song id and title. From everything I needed only the lyrics directory and this is why its the only thing provided here. In the lyrics directory however there are way to many albums than there are actually are. Since we only care about the lyrics I sampled only the lyrics from the albums and ignored any other versioning of the albums like remixes, or different sound editions of the same songs because they are not needed in the analysis.

## Reading and exploring the files

The approach I have chosen is to make a big dataset with all lyrics so I can have an initial exploration and and clean up. After that I will decide if I

In [2]:
def make_data_frame(path):
    """
    The following function will go through the albums directory and its subdirectories,
    collect the songs lyrics, the song and album name they belong to and create a 
    dataframe, which we then are going to use to analyze the lyrics.
    """
    data  = []
    for album_dir in glob.glob(os.path.join(path, "*")):
        album_name = os.path.basename(album_dir)
        for song_file in glob.glob(os.path.join(album_dir, "*.txt")):
            song_name = os.path.basename(song_file)
            with open(song_file, "r") as f:
                lyrics = f.read()
            data.append((album_name, song_name, lyrics))

    df = pd.DataFrame(data, columns=["album", "song", "lyrics"])
    
    return df

In [3]:
songs = make_data_frame("./data/albums")

In [4]:
songs

Unnamed: 0,album,song,lyrics
0,11.THETORTUREDPOETSDEPARTMENT_THEANTHOLOGY,HowDidItEnd_.txt,137 ContributorsTranslationsPortuguêsEspañolال...
1,11.THETORTUREDPOETSDEPARTMENT_THEANTHOLOGY,TheBolter.txt,93 ContributorsTranslationsالعربيةFrançaisفارس...
2,11.THETORTUREDPOETSDEPARTMENT_THEANTHOLOGY,Peter.txt,95 ContributorsTranslationsEspañolFrançaisDeut...
3,11.THETORTUREDPOETSDEPARTMENT_THEANTHOLOGY,imgonnagetyouback.txt,97 ContributorsTranslationsEspañolالعربيةFranç...
4,11.THETORTUREDPOETSDEPARTMENT_THEANTHOLOGY,DownBad.txt,117 ContributorsTranslationsTürkçePortuguêsEsp...
...,...,...,...
238,7.Lover,MissAmericana_TheHeartbreakPrince.txt,111 ContributorsTranslationsEnglishHrvatskiPor...
239,7.Lover,CruelSummer.txt,166 ContributorsTranslationsTürkçeEspañolHrvat...
240,7.Lover,LondonBoy.txt,106 ContributorsTranslationsEspañolHrvatskiPor...
241,7.Lover,FalseGod.txt,91 ContributorsTranslationsTürkçeEspañolHrvats...


After loading the data successfully it's time to clean it up.

## Data Preprocessing

### Cleaning the album titles

Albums with long names such as `The Tortured Poets Department` will get an abbreviation like `TTPD`. Empty spaces and decapitalized albums will be fixed. Since those are single cases there would not be any need for function implementation and will fix them by hand.

In [5]:
songs.album.unique()

array(['11.THETORTUREDPOETSDEPARTMENT_THEANTHOLOGY', '4.Red',
       '10.Midnights', '5.1989', '9. evermore', '6.Reputation',
       '1.TaylorSwift', '3.SpeakNow', '2.Fearless', '8.Folklore',
       '7.Lover'], dtype=object)

In [6]:
songs.album[songs.album == "11.THETORTUREDPOETSDEPARTMENT_THEANTHOLOGY"] = "11.TTPD"

In [7]:
songs.album[songs.album == "9. evermore"] = "9.Evermore"

In [8]:
songs.album.unique()

array(['11.TTPD', '4.Red', '10.Midnights', '5.1989', '9.Evermore',
       '6.Reputation', '1.TaylorSwift', '3.SpeakNow', '2.Fearless',
       '8.Folklore', '7.Lover'], dtype=object)

### Cleaning the song titles

I will trim the '.txt' from the names under song column. There are songs with suffixes like `_From_The_Vault_` `_Taylors_Version_` which are also redundant in that case.

In [9]:
def clean_song_name(song, characters = ["_",".txt"], patterns=["TaylorsVersion","FromTheVault"], replacement = ""):
    
    for character in characters:
        song = song.replace(character, replacement)

    for pattern in patterns:
        song = re.sub(pattern, replacement, song)

    return song

In [10]:
songs.song = songs.song.apply(clean_song_name).unique()

In [11]:
songs

Unnamed: 0,album,song,lyrics
0,11.TTPD,HowDidItEnd,137 ContributorsTranslationsPortuguêsEspañolال...
1,11.TTPD,TheBolter,93 ContributorsTranslationsالعربيةFrançaisفارس...
2,11.TTPD,Peter,95 ContributorsTranslationsEspañolFrançaisDeut...
3,11.TTPD,imgonnagetyouback,97 ContributorsTranslationsEspañolالعربيةFranç...
4,11.TTPD,DownBad,117 ContributorsTranslationsTürkçePortuguêsEsp...
...,...,...,...
238,7.Lover,MissAmericanaTheHeartbreakPrince,111 ContributorsTranslationsEnglishHrvatskiPor...
239,7.Lover,CruelSummer,166 ContributorsTranslationsTürkçeEspañolHrvat...
240,7.Lover,LondonBoy,106 ContributorsTranslationsEspañolHrvatskiPor...
241,7.Lover,FalseGod,91 ContributorsTranslationsTürkçeEspañolHrvats...


### Cleaning the lyrics

First things first I want to get rid of the precredits : "137 ContributorsTranslationsPortuguêsEspañolال..." since they do not carry any significant information about tendencies, trends or sentiment. Then I want to remove everything in [] like `[Intro]`, `[Verse 1]` and so on, commas and parenthesies.

In [12]:
def remove_credits(text):
    """
    Removes everything before and including "Lyrics" from a given text.
    """

    match = re.search(r"Lyrics", text, re.IGNORECASE)
    if match:
        return text[match.end():]
    else:
        return ""

In [13]:
songs.lyrics = songs.lyrics.apply(remove_credits)

In [15]:
songs["words"] = songs.lyrics.str.split('/s+')

In [16]:
songs

Unnamed: 0,album,song,lyrics,words
0,11.TTPD,HowDidItEnd,"[Intro]\n(Uh-oh, uh-oh)\n\n[Verse 1]\nWe hereb...","[[Intro]\n(Uh-oh, uh-oh)\n\n[Verse 1]\nWe here..."
1,11.TTPD,TheBolter,"[Verse 1]\nBy all accounts, she almost drowned...","[[Verse 1]\nBy all accounts, she almost drowne..."
2,11.TTPD,Peter,"[Verse 1]\nForgive me, Peter\nMy lost fearless...","[[Verse 1]\nForgive me, Peter\nMy lost fearles..."
3,11.TTPD,imgonnagetyouback,"[Intro]\nYeah\n\n[Verse 1]\nLilac short skirt,...",[[Intro]\nYeah\n\n[Verse 1]\nLilac short skirt...
4,11.TTPD,DownBad,[Verse 1]\nDid you really beam me up\nIn a clo...,[[Verse 1]\nDid you really beam me up\nIn a cl...
...,...,...,...,...
238,7.Lover,MissAmericanaTheHeartbreakPrince,"[Verse 1]\nYou know I adore you, I'm crazier f...","[[Verse 1]\nYou know I adore you, I'm crazier ..."
239,7.Lover,CruelSummer,"[Intro]\n(Yeah, yeah, yeah, yeah)\n\n[Verse 1]...","[[Intro]\n(Yeah, yeah, yeah, yeah)\n\n[Verse 1..."
240,7.Lover,LondonBoy,[Intro: Idris Elba & James Corden]\nWe can go ...,[[Intro: Idris Elba & James Corden]\nWe can go...
241,7.Lover,FalseGod,[Verse 1]\nWe were crazy to think\nCrazy to th...,[[Verse 1]\nWe were crazy to think\nCrazy to t...


It seems we have an outlier with length 0 which might cause exceptions later 

In [103]:
songs.words.apply(len).value_counts()

1    242
0      1
Name: words, dtype: int64

In [114]:
songs[songs['words'].str.len() == 0]

Unnamed: 0,album,song,lyrics,words,processed_words,joined_words
133,6.Reputation,ReputationMagazineVol1,5Embed,[],[],


In [116]:
songs.drop(133,inplace=True)

In [143]:
def clean_text(words,characters = [":",",","(",")","\n","-","\'","?","!","\""], patterns = [r"(\[.*?\])","\d{1,10}Embed"], replacement = ""):
    """
    Removes everything before and including the word "Lyrics" in a given text. 
    """
    cleaned_words = words.copy()

    for character in characters:
        
        if character == "\n":
            cleaned_words = [word.replace(character," ") for word in cleaned_words]

        cleaned_words = [word.replace(character,replacement) for word in cleaned_words]

    for pattern in patterns:
        cleaned_words = [re.sub(pattern,replacement,word) for word in cleaned_words]

    cleaned_words = [word for word in cleaned_words if word != "" ]

    return cleaned_words

In [144]:
clean_text(songs.words.loc[50])

[' took a deep breath in the mirror he didnt like it when i wore high heels but i do turn the lock and put my headphones on he always said he didnt get this song but i do i do   i walked in expecting youd be late but you got here early and you stand and wave i walk to you you pull my chair out and help me in and you dont know how nice that is but i do  and you throw your head back laughing like a little kid i think its strange that you think im funny cause he never did ive been spending the last eight months thinking all love ever does is break and burn and end but on a wеdnesday in a café i watched it begin again   you said you nеver met one girl who had as many james taylor records as you but i do we tell stories and you dont know why im coming off a little shy but i do   but you throw your head back laughing like a little kid i think its strange that you think im funny cause he never did ive been spending the last eight months thinking all love ever does is break and burn and end bu

In [145]:
songs.words = songs.words.apply(clean_text).apply(lambda lyrics: [word.lower() for word in lyrics])

In [146]:
songs

Unnamed: 0,album,song,lyrics,words,processed_words,joined_words
0,11.TTPD,HowDidItEnd,"[Intro]\n(Uh-oh, uh-oh)\n\n[Verse 1]\nWe hereb...",[ uhoh uhoh we hereby conduct this postmorte...,"[uhoh, uhoh, hereby, conduct, postmortem, hot,...",uhoh uhoh hereby conduct postmortem hot house ...
1,11.TTPD,TheBolter,"[Verse 1]\nBy all accounts, she almost drowned...",[ by all accounts she almost drowned when she ...,"[account, almost, drowned, six, frigid, water,...",account almost drowned six frigid water confir...
2,11.TTPD,Peter,"[Verse 1]\nForgive me, Peter\nMy lost fearless...",[ forgive me peter my lost fearless leader in ...,"[forgive, peter, lost, fearless, leader, close...",forgive peter lost fearless leader closet like...
3,11.TTPD,imgonnagetyouback,"[Intro]\nYeah\n\n[Verse 1]\nLilac short skirt,...",[ yeah lilac short skirt the one that fits m...,"[yeah, lilac, short, skirt, one, fit, like, sk...",yeah lilac short skirt one fit like skin resea...
4,11.TTPD,DownBad,[Verse 1]\nDid you really beam me up\nIn a clo...,[ did you really beam me up in a cloud of spar...,"[really, beam, cloud, sparkling, dust, experim...",really beam cloud sparkling dust experiment te...
...,...,...,...,...,...,...
238,7.Lover,MissAmericanaTheHeartbreakPrince,"[Verse 1]\nYou know I adore you, I'm crazier f...",[ you know i adore you im crazier for you than...,"[know, adore, im, crazier, 16, lost, film, sce...",know adore im crazier 16 lost film scene wavin...
239,7.Lover,CruelSummer,"[Intro]\n(Yeah, yeah, yeah, yeah)\n\n[Verse 1]...",[ yeah yeah yeah yeah fever dream high in th...,"[yeah, yeah, yeah, yeah, fever, dream, high, q...",yeah yeah yeah yeah fever dream high quiet nig...
240,7.Lover,LondonBoy,[Intro: Idris Elba & James Corden]\nWe can go ...,[ we can go drivin in on my scooter uh you kno...,"[go, drivin, scooter, uh, know, round, london,...",go drivin scooter uh know round london oh id— ...
241,7.Lover,FalseGod,[Verse 1]\nWe were crazy to think\nCrazy to th...,[ we were crazy to think crazy to think that t...,"[crazy, think, crazy, think, could, work, reme...",crazy think crazy think could work remember sa...


Now that we have the lyrics cleaned, trimmed and decapitalized its time to begin processing.

### Adapting the words column to be compatible for sentiment analysis.
To make the words from the lyrics understandable to the models we are about to use certain things must be done:

* Word tokenization : this is the process of splitting the lyrics string into separate strings, where each word (substring) will represent one token
* Stop Word Filtering : Just as any other text the song lyrics contain certain words that do not hold any relevant information for the analysis. Such words are called stop - words and need to be removed from the lyrics.
* Word Stemming/ Lemmatization: both methods are used to reduce the word to its root and remove suffixes like `-ing` etc. I chose to use lematization process simply because it is more accurate than the Stemming, where Stemming could produce words such as `runn` which is not correct.
  
Since I am going to use different Models and Methods to analyze the data each will have a separate notebook for that. Thus after cleaning and prepping the data for analysis I will save it in a csv file

In [122]:
songs.words[0]

[' uhoh uhoh   we hereby conduct this postmortem he was a hot house flower to my outdoorsman our maladies were such we could not cure them and so a touch that was my birthright became foreign   come one come all its happenin again the empathetic hunger descends well tell no one except all of our friends we must know how did it end uhoh uhoh   we were blind to unforeseen circumstances we learned thе right steps to different dancеs and fell victim to interlopers glances lost the game of chance what are the chances soon theyll go home to their husbands smug cause they know they can trust him then feverishly calling their cousins see taylor swift liveget tickets as low as $60you might also like guess who we ran into at the shops walking in circles like she was lost didnt you hear they called it all off one gasp and then how did it end   say it once again with feeling how the death rattle breathing silenced as the soul was leaving the deflation of our dreaming leaving me bereft and reeling 

In [123]:
word_tokenize("".join(songs.words[0]))

['uhoh',
 'uhoh',
 'we',
 'hereby',
 'conduct',
 'this',
 'postmortem',
 'he',
 'was',
 'a',
 'hot',
 'house',
 'flower',
 'to',
 'my',
 'outdoorsman',
 'our',
 'maladies',
 'were',
 'such',
 'we',
 'could',
 'not',
 'cure',
 'them',
 'and',
 'so',
 'a',
 'touch',
 'that',
 'was',
 'my',
 'birthright',
 'became',
 'foreign',
 'come',
 'one',
 'come',
 'all',
 'its',
 'happenin',
 'again',
 'the',
 'empathetic',
 'hunger',
 'descends',
 'well',
 'tell',
 'no',
 'one',
 'except',
 'all',
 'of',
 'our',
 'friends',
 'we',
 'must',
 'know',
 'how',
 'did',
 'it',
 'end',
 'uhoh',
 'uhoh',
 'we',
 'were',
 'blind',
 'to',
 'unforeseen',
 'circumstances',
 'we',
 'learned',
 'thе',
 'right',
 'steps',
 'to',
 'different',
 'dancеs',
 'and',
 'fell',
 'victim',
 'to',
 'interlopers',
 'glances',
 'lost',
 'the',
 'game',
 'of',
 'chance',
 'what',
 'are',
 'the',
 'chances',
 'soon',
 'theyll',
 'go',
 'home',
 'to',
 'their',
 'husbands',
 'smug',
 'cause',
 'they',
 'know',
 'they',
 'can',

In [125]:
def filter_stopwords(words):
    filtered_words = [word for word in words if word not in stopwords.words('english')]
    return filtered_words

In [126]:
lemmatizer = WordNetLemmatizer()

In [127]:
def lemmatize_words(lemmatizer,words):
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]

    return lemmatized_words

In [153]:
stemmer = PorterStemmer()

In [155]:
def stem_words(stemmer,words):
    stemmed_words = [stemmer.stem(word) for word in words]

    return stemmed_words

In [173]:
stemmer.stem('descends')

'descend'

In [174]:
stem_words(stemmer,songs.words[0])

[' uhoh uhoh   we hereby conduct this postmortem he was a hot house flower to my outdoorsman our maladies were such we could not cure them and so a touch that was my birthright became foreign   come one come all its happenin again the empathetic hunger descends well tell no one except all of our friends we must know how did it end uhoh uhoh   we were blind to unforeseen circumstances we learned thе right steps to different dancеs and fell victim to interlopers glances lost the game of chance what are the chances soon theyll go home to their husbands smug cause they know they can trust him then feverishly calling their cousins see taylor swift liveget tickets as low as $60you might also like guess who we ran into at the shops walking in circles like she was lost didnt you hear they called it all off one gasp and then how did it end   say it once again with feeling how the death rattle breathing silenced as the soul was leaving the deflation of our dreaming leaving me bereft and reeling 

In [172]:
lemmatize_words(lemmatizer,songs.words[0])

[' uhoh uhoh   we hereby conduct this postmortem he was a hot house flower to my outdoorsman our maladies were such we could not cure them and so a touch that was my birthright became foreign   come one come all its happenin again the empathetic hunger descends well tell no one except all of our friends we must know how did it end uhoh uhoh   we were blind to unforeseen circumstances we learned thе right steps to different dancеs and fell victim to interlopers glances lost the game of chance what are the chances soon theyll go home to their husbands smug cause they know they can trust him then feverishly calling their cousins see taylor swift liveget tickets as low as $60you might also like guess who we ran into at the shops walking in circles like she was lost didnt you hear they called it all off one gasp and then how did it end   say it once again with feeling how the death rattle breathing silenced as the soul was leaving the deflation of our dreaming leaving me bereft and reeling 

In [128]:
songs["processed_words"] = songs.words.apply(
    lambda text_arr: "".join(text_arr)).apply(word_tokenize).apply(filter_stopwords).apply(
    lambda words: lemmatize_words(lemmatizer, words))

Some models require passing of a string because they have an internal tokenization and trimming process. Thats why I am also going to save the processed words as text.

In [139]:
songs['joined_words'] = songs.processed_words.apply(lambda words: " ".join(words))

In [142]:
songs.processed_words[0]

['uhoh',
 'uhoh',
 'hereby',
 'conduct',
 'postmortem',
 'hot',
 'house',
 'flower',
 'outdoorsman',
 'malady',
 'could',
 'cure',
 'touch',
 'birthright',
 'became',
 'foreign',
 'come',
 'one',
 'come',
 'happenin',
 'empathetic',
 'hunger',
 'descends',
 'well',
 'tell',
 'one',
 'except',
 'friend',
 'must',
 'know',
 'end',
 'uhoh',
 'uhoh',
 'blind',
 'unforeseen',
 'circumstance',
 'learned',
 'thе',
 'right',
 'step',
 'different',
 'dancеs',
 'fell',
 'victim',
 'interloper',
 'glance',
 'lost',
 'game',
 'chance',
 'chance',
 'soon',
 'theyll',
 'go',
 'home',
 'husband',
 'smug',
 'cause',
 'know',
 'trust',
 'feverishly',
 'calling',
 'cousin',
 'see',
 'taylor',
 'swift',
 'liveget',
 'ticket',
 'low',
 '$',
 '60you',
 'might',
 'also',
 'like',
 'guess',
 'ran',
 'shop',
 'walking',
 'circle',
 'like',
 'lost',
 'didnt',
 'hear',
 'called',
 'one',
 'gasp',
 'end',
 'say',
 'feeling',
 'death',
 'rattle',
 'breathing',
 'silenced',
 'soul',
 'leaving',
 'deflation',
 'dre

In [135]:
songs[['album','song','processed_words','joined_words']].to_csv("./data/results/processed_lyrics", index=False)