In [1]:
import pandas as pd

import nltk, os, glob, re

from nltk.corpus import stopwords

from nltk.tokenize import word_tokenize

from nltk.stem import WordNetLemmatizer

from nltk.stem import PorterStemmer

# Lyrics Read and Preprocessing

## Workflow: In this notebook I will read the lyrics into a dataframe and prepare it for work.

## Information about the data: 


## Reading and exploring the files

The approach I have chosen is to make a big dataset with all lyrics so I can have an initial exploration and and clean up. After that I will decide if I

In [2]:
def make_data_frame(path):
    """
    The following function will go through the albums directory and its subdirectories,
    collect the songs lyrics, the song and album name they belong to and create a 
    dataframe, which we then are going to use to analyze the lyrics.
    """
    data  = []
    for album_dir in glob.glob(os.path.join(path, "*")):
        album_name = os.path.basename(album_dir)
        for song_file in glob.glob(os.path.join(album_dir, "*.txt")):
            song_name = os.path.basename(song_file)
            with open(song_file, "r") as f:
                lyrics = f.read()
            data.append((album_name, song_name, lyrics))

    df = pd.DataFrame(data, columns=["album", "song", "lyrics"])
    
    return df

In [3]:
songs = make_data_frame("../data/albums")

In [4]:
songs

Unnamed: 0,album,song,lyrics
0,11.THETORTUREDPOETSDEPARTMENT_THEANTHOLOGY,HowDidItEnd_.txt,137 ContributorsTranslationsPortuguêsEspañolال...
1,11.THETORTUREDPOETSDEPARTMENT_THEANTHOLOGY,TheBolter.txt,93 ContributorsTranslationsالعربيةFrançaisفارس...
2,11.THETORTUREDPOETSDEPARTMENT_THEANTHOLOGY,Peter.txt,95 ContributorsTranslationsEspañolFrançaisDeut...
3,11.THETORTUREDPOETSDEPARTMENT_THEANTHOLOGY,imgonnagetyouback.txt,97 ContributorsTranslationsEspañolالعربيةFranç...
4,11.THETORTUREDPOETSDEPARTMENT_THEANTHOLOGY,DownBad.txt,117 ContributorsTranslationsTürkçePortuguêsEsp...
...,...,...,...
238,7.Lover,MissAmericana_TheHeartbreakPrince.txt,111 ContributorsTranslationsEnglishHrvatskiPor...
239,7.Lover,CruelSummer.txt,166 ContributorsTranslationsTürkçeEspañolHrvat...
240,7.Lover,LondonBoy.txt,106 ContributorsTranslationsEspañolHrvatskiPor...
241,7.Lover,FalseGod.txt,91 ContributorsTranslationsTürkçeEspañolHrvats...


After loading the data successfully it's time to clean it up.

## Data Preprocessing

### Cleaning the album titles

Albums with long names such as `The Tortured Poets Department` will get an abbreviation like `TTPD`. Empty spaces and decapitalized albums will be fixed. Since those are single cases there would not be any need for function implementation and will fix them by hand.

In [5]:
songs.album.unique()

array(['11.THETORTUREDPOETSDEPARTMENT_THEANTHOLOGY', '4.Red',
       '10.Midnights', '5.1989', '9. evermore', '6.Reputation',
       '1.TaylorSwift', '3.SpeakNow', '2.Fearless', '8.Folklore',
       '7.Lover'], dtype=object)

In [6]:
songs.album[songs.album == "11.THETORTUREDPOETSDEPARTMENT_THEANTHOLOGY"] = "11.TTPD"

In [7]:
songs.album[songs.album == "9. evermore"] = "9.Evermore"

In [8]:
songs.album.unique()

array(['11.TTPD', '4.Red', '10.Midnights', '5.1989', '9.Evermore',
       '6.Reputation', '1.TaylorSwift', '3.SpeakNow', '2.Fearless',
       '8.Folklore', '7.Lover'], dtype=object)

### Cleaning the song titles

I will trim the '.txt' from the names under song column. There are songs with suffixes like `_From_The_Vault_` `_Taylors_Version_` which are also redundant in that case.

In [9]:
def clean_song_name(song, characters = ["_",".txt"], patterns=["TaylorsVersion","FromTheVault"], replacement = ""):
    
    for character in characters:
        song = song.replace(character, replacement)

    for pattern in patterns:
        song = re.sub(pattern, replacement, song)

    return song

In [10]:
songs.song = songs.song.apply(clean_song_name).unique()

In [11]:
songs

Unnamed: 0,album,song,lyrics
0,11.TTPD,HowDidItEnd,137 ContributorsTranslationsPortuguêsEspañolال...
1,11.TTPD,TheBolter,93 ContributorsTranslationsالعربيةFrançaisفارس...
2,11.TTPD,Peter,95 ContributorsTranslationsEspañolFrançaisDeut...
3,11.TTPD,imgonnagetyouback,97 ContributorsTranslationsEspañolالعربيةFranç...
4,11.TTPD,DownBad,117 ContributorsTranslationsTürkçePortuguêsEsp...
...,...,...,...
238,7.Lover,MissAmericanaTheHeartbreakPrince,111 ContributorsTranslationsEnglishHrvatskiPor...
239,7.Lover,CruelSummer,166 ContributorsTranslationsTürkçeEspañolHrvat...
240,7.Lover,LondonBoy,106 ContributorsTranslationsEspañolHrvatskiPor...
241,7.Lover,FalseGod,91 ContributorsTranslationsTürkçeEspañolHrvats...


### Cleaning the lyrics

First things first I want to get rid of the precredits : "137 ContributorsTranslationsPortuguêsEspañolال..." since they do not carry any significant information about tendencies, trends or sentiment. Then I want to remove everything in [] like `[Intro]`, `[Verse 1]` and so on, commas and parenthesies.

In [12]:
def remove_credits(text):
    """
    Removes everything before and including "Lyrics" from a given text.
    """

    match = re.search(r"Lyrics", text, re.IGNORECASE)
    if match:
        return text[match.end():]
    else:
        return ""

In [13]:
songs.lyrics = songs.lyrics.apply(remove_credits)

In [14]:
songs

Unnamed: 0,album,song,lyrics
0,11.TTPD,HowDidItEnd,"[Intro]\n(Uh-oh, uh-oh)\n\n[Verse 1]\nWe hereb..."
1,11.TTPD,TheBolter,"[Verse 1]\nBy all accounts, she almost drowned..."
2,11.TTPD,Peter,"[Verse 1]\nForgive me, Peter\nMy lost fearless..."
3,11.TTPD,imgonnagetyouback,"[Intro]\nYeah\n\n[Verse 1]\nLilac short skirt,..."
4,11.TTPD,DownBad,[Verse 1]\nDid you really beam me up\nIn a clo...
...,...,...,...
238,7.Lover,MissAmericanaTheHeartbreakPrince,"[Verse 1]\nYou know I adore you, I'm crazier f..."
239,7.Lover,CruelSummer,"[Intro]\n(Yeah, yeah, yeah, yeah)\n\n[Verse 1]..."
240,7.Lover,LondonBoy,[Intro: Idris Elba & James Corden]\nWe can go ...
241,7.Lover,FalseGod,[Verse 1]\nWe were crazy to think\nCrazy to th...


In [15]:
def remove_lyric_tags(text, patterns = [r"(\[.*?\])","\d{1,10}Embed"]):
    for pattern in patterns:
        text = re.sub(pattern,"",text)
    return text

In [16]:
songs.lyrics  = songs.lyrics.apply(remove_lyric_tags)

In [17]:
songs["words"] = songs.lyrics.str.split('\s+')

In [18]:
text = 'hi \na'

In [19]:
delimeters = "[\s+\n]"

In [20]:
re.split(delimeters, text)

['hi', '', 'a']

In [21]:
songs

Unnamed: 0,album,song,lyrics,words
0,11.TTPD,HowDidItEnd,"\n(Uh-oh, uh-oh)\n\n\nWe hereby conduct this p...","[, (Uh-oh,, uh-oh), We, hereby, conduct, this,..."
1,11.TTPD,TheBolter,"\nBy all accounts, she almost drowned\nWhen sh...","[, By, all, accounts,, she, almost, drowned, W..."
2,11.TTPD,Peter,"\nForgive me, Peter\nMy lost fearless leader\n...","[, Forgive, me,, Peter, My, lost, fearless, le..."
3,11.TTPD,imgonnagetyouback,"\nYeah\n\n\nLilac short skirt, the one that fi...","[, Yeah, Lilac, short, skirt,, the, one, that,..."
4,11.TTPD,DownBad,\nDid you really beam me up\nIn a cloud of spa...,"[, Did, you, really, beam, me, up, In, a, clou..."
...,...,...,...,...
238,7.Lover,MissAmericanaTheHeartbreakPrince,"\nYou know I adore you, I'm crazier for you\nT...","[, You, know, I, adore, you,, I'm, crazier, fo..."
239,7.Lover,CruelSummer,"\n(Yeah, yeah, yeah, yeah)\n\n\nFever dream hi...","[, (Yeah,, yeah,, yeah,, yeah), Fever, dream, ..."
240,7.Lover,LondonBoy,"\nWe can go drivin' in, on my scooter\nUh, you...","[, We, can, go, drivin', in,, on, my, scooter,..."
241,7.Lover,FalseGod,\nWe were crazy to think\nCrazy to think that ...,"[, We, were, crazy, to, think, Crazy, to, thin..."


It seems we have an outlier with length 0 which might cause exceptions later 

In [22]:
songs[songs.words.str.len() == 0]

Unnamed: 0,album,song,lyrics,words


In [23]:
songs.drop(133,inplace=True)

In [24]:
def clean_text(words,characters = [":",",","(",")","-","?","!","\""], patterns = [r"(\[.*?\])","\d{1,10}Embed"], replacement = ""):
    """
    Removes everything before and including the word "Lyrics" in a given text. 
    """
    cleaned_words = words.copy()

    for character in characters:

        cleaned_words = [word.replace(character,replacement) for word in cleaned_words]

    cleaned_words = [word.replace("\n"," ") for word in cleaned_words]

    for pattern in patterns:
        cleaned_words = [re.sub(pattern,replacement,word) for word in cleaned_words]

    cleaned_words = [word for word in cleaned_words if word != ""]

    return cleaned_words

In [25]:
songs.words = songs.words.apply(clean_text)

In [26]:
songs

Unnamed: 0,album,song,lyrics,words
0,11.TTPD,HowDidItEnd,"\n(Uh-oh, uh-oh)\n\n\nWe hereby conduct this p...","[Uhoh, uhoh, We, hereby, conduct, this, postmo..."
1,11.TTPD,TheBolter,"\nBy all accounts, she almost drowned\nWhen sh...","[By, all, accounts, she, almost, drowned, When..."
2,11.TTPD,Peter,"\nForgive me, Peter\nMy lost fearless leader\n...","[Forgive, me, Peter, My, lost, fearless, leade..."
3,11.TTPD,imgonnagetyouback,"\nYeah\n\n\nLilac short skirt, the one that fi...","[Yeah, Lilac, short, skirt, the, one, that, fi..."
4,11.TTPD,DownBad,\nDid you really beam me up\nIn a cloud of spa...,"[Did, you, really, beam, me, up, In, a, cloud,..."
...,...,...,...,...
238,7.Lover,MissAmericanaTheHeartbreakPrince,"\nYou know I adore you, I'm crazier for you\nT...","[You, know, I, adore, you, I'm, crazier, for, ..."
239,7.Lover,CruelSummer,"\n(Yeah, yeah, yeah, yeah)\n\n\nFever dream hi...","[Yeah, yeah, yeah, yeah, Fever, dream, high, i..."
240,7.Lover,LondonBoy,"\nWe can go drivin' in, on my scooter\nUh, you...","[We, can, go, drivin', in, on, my, scooter, Uh..."
241,7.Lover,FalseGod,\nWe were crazy to think\nCrazy to think that ...,"[We, were, crazy, to, think, Crazy, to, think,..."


### Adapting the words column to be compatible for sentiment analysis.
To make the words from the lyrics understandable to the models we are about to use certain things must be done:

* Word tokenization : this is the process of splitting the lyrics string into separate strings, where each word (substring) will represent one token
* Stop Word Filtering : Just as any other text the song lyrics contain certain words that do not hold any relevant information for the analysis. Such words are called stop - words and need to be removed from the lyrics.
* Word Stemming/ Lemmatization: both methods are used to reduce the word to its root and remove suffixes like `-ing` etc. I chose to use lematization process simply because it is more accurate than the Stemming, where Stemming could produce words such as `runn` which is not correct.

In [27]:
songs.words[0]

['Uhoh',
 'uhoh',
 'We',
 'hereby',
 'conduct',
 'this',
 'postmortem',
 'He',
 'was',
 'a',
 'hot',
 'house',
 'flower',
 'to',
 'my',
 'outdoorsman',
 'Our',
 'maladies',
 'were',
 'such',
 'we',
 'could',
 'not',
 'cure',
 'them',
 'And',
 'so',
 'a',
 'touch',
 'that',
 'was',
 'my',
 'birthright',
 'became',
 'foreign',
 'Come',
 'one',
 'come',
 'all',
 "it's",
 "happenin'",
 'again',
 'The',
 'empathetic',
 'hunger',
 'descends',
 "We'll",
 'tell',
 'no',
 'one',
 'except',
 'all',
 'of',
 'our',
 'friends',
 'We',
 'must',
 'know',
 'How',
 'did',
 'it',
 'end',
 'Uhoh',
 'uhoh',
 'We',
 'were',
 'blind',
 'to',
 'unforeseen',
 'circumstances',
 'We',
 'learned',
 'thе',
 'right',
 'steps',
 'to',
 'different',
 'dancеs',
 'And',
 'fell',
 'victim',
 'to',
 "interlopers'",
 'glances',
 'Lost',
 'the',
 'game',
 'of',
 'chance',
 'what',
 'are',
 'the',
 'chances',
 'Soon',
 "they'll",
 'go',
 'home',
 'to',
 'their',
 'husbands',
 'Smug',
 "'cause",
 'they',
 'know',
 'they',
 

In [28]:
songs.words = songs.words.apply(lambda words: [word.lower() for word in words])

In [29]:
songs

Unnamed: 0,album,song,lyrics,words
0,11.TTPD,HowDidItEnd,"\n(Uh-oh, uh-oh)\n\n\nWe hereby conduct this p...","[uhoh, uhoh, we, hereby, conduct, this, postmo..."
1,11.TTPD,TheBolter,"\nBy all accounts, she almost drowned\nWhen sh...","[by, all, accounts, she, almost, drowned, when..."
2,11.TTPD,Peter,"\nForgive me, Peter\nMy lost fearless leader\n...","[forgive, me, peter, my, lost, fearless, leade..."
3,11.TTPD,imgonnagetyouback,"\nYeah\n\n\nLilac short skirt, the one that fi...","[yeah, lilac, short, skirt, the, one, that, fi..."
4,11.TTPD,DownBad,\nDid you really beam me up\nIn a cloud of spa...,"[did, you, really, beam, me, up, in, a, cloud,..."
...,...,...,...,...
238,7.Lover,MissAmericanaTheHeartbreakPrince,"\nYou know I adore you, I'm crazier for you\nT...","[you, know, i, adore, you, i'm, crazier, for, ..."
239,7.Lover,CruelSummer,"\n(Yeah, yeah, yeah, yeah)\n\n\nFever dream hi...","[yeah, yeah, yeah, yeah, fever, dream, high, i..."
240,7.Lover,LondonBoy,"\nWe can go drivin' in, on my scooter\nUh, you...","[we, can, go, drivin', in, on, my, scooter, uh..."
241,7.Lover,FalseGod,\nWe were crazy to think\nCrazy to think that ...,"[we, were, crazy, to, think, crazy, to, think,..."


In [30]:
songs.words[0]

['uhoh',
 'uhoh',
 'we',
 'hereby',
 'conduct',
 'this',
 'postmortem',
 'he',
 'was',
 'a',
 'hot',
 'house',
 'flower',
 'to',
 'my',
 'outdoorsman',
 'our',
 'maladies',
 'were',
 'such',
 'we',
 'could',
 'not',
 'cure',
 'them',
 'and',
 'so',
 'a',
 'touch',
 'that',
 'was',
 'my',
 'birthright',
 'became',
 'foreign',
 'come',
 'one',
 'come',
 'all',
 "it's",
 "happenin'",
 'again',
 'the',
 'empathetic',
 'hunger',
 'descends',
 "we'll",
 'tell',
 'no',
 'one',
 'except',
 'all',
 'of',
 'our',
 'friends',
 'we',
 'must',
 'know',
 'how',
 'did',
 'it',
 'end',
 'uhoh',
 'uhoh',
 'we',
 'were',
 'blind',
 'to',
 'unforeseen',
 'circumstances',
 'we',
 'learned',
 'thе',
 'right',
 'steps',
 'to',
 'different',
 'dancеs',
 'and',
 'fell',
 'victim',
 'to',
 "interlopers'",
 'glances',
 'lost',
 'the',
 'game',
 'of',
 'chance',
 'what',
 'are',
 'the',
 'chances',
 'soon',
 "they'll",
 'go',
 'home',
 'to',
 'their',
 'husbands',
 'smug',
 "'cause",
 'they',
 'know',
 'they',
 

In [31]:
stopwords = stopwords.words('english').copy()

In [32]:
bonus_words = ['uhoh','we\'ll','ooh','radidididididididididada','yeah','woah']

In [33]:
for word in bonus_words:
    stopwords.append(word)

In [34]:
stopwords

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [35]:
words_no_stop = [word for word in songs.words[0] if word not in stopwords]

In [36]:
words_no_stop

['hereby',
 'conduct',
 'postmortem',
 'hot',
 'house',
 'flower',
 'outdoorsman',
 'maladies',
 'could',
 'cure',
 'touch',
 'birthright',
 'became',
 'foreign',
 'come',
 'one',
 'come',
 "happenin'",
 'empathetic',
 'hunger',
 'descends',
 'tell',
 'one',
 'except',
 'friends',
 'must',
 'know',
 'end',
 'blind',
 'unforeseen',
 'circumstances',
 'learned',
 'thе',
 'right',
 'steps',
 'different',
 'dancеs',
 'fell',
 'victim',
 "interlopers'",
 'glances',
 'lost',
 'game',
 'chance',
 'chances',
 'soon',
 "they'll",
 'go',
 'home',
 'husbands',
 'smug',
 "'cause",
 'know',
 'trust',
 'feverishly',
 'calling',
 'cousins',
 'see',
 'taylor',
 'swift',
 'liveget',
 'tickets',
 'low',
 '$60you',
 'might',
 'also',
 'like',
 'guess',
 'ran',
 'shops',
 'walking',
 'circles',
 'like',
 'lost',
 'hear',
 'called',
 'one',
 'gasp',
 'end',
 'say',
 'feeling',
 'death',
 'rattle',
 'breathing',
 'silenced',
 'soul',
 'leaving',
 'deflation',
 'dreaming',
 'leaving',
 'bereft',
 'reeling',


In [37]:
songs.words = songs.words.apply(lambda words: [word for word in words if word not in stopwords])

In [38]:
lemmatizer = WordNetLemmatizer()

In [39]:
words_lematized = [lemmatizer.lemmatize(word) for word in words_no_stop]

In [40]:
words_lematized

['hereby',
 'conduct',
 'postmortem',
 'hot',
 'house',
 'flower',
 'outdoorsman',
 'malady',
 'could',
 'cure',
 'touch',
 'birthright',
 'became',
 'foreign',
 'come',
 'one',
 'come',
 "happenin'",
 'empathetic',
 'hunger',
 'descends',
 'tell',
 'one',
 'except',
 'friend',
 'must',
 'know',
 'end',
 'blind',
 'unforeseen',
 'circumstance',
 'learned',
 'thе',
 'right',
 'step',
 'different',
 'dancеs',
 'fell',
 'victim',
 "interlopers'",
 'glance',
 'lost',
 'game',
 'chance',
 'chance',
 'soon',
 "they'll",
 'go',
 'home',
 'husband',
 'smug',
 "'cause",
 'know',
 'trust',
 'feverishly',
 'calling',
 'cousin',
 'see',
 'taylor',
 'swift',
 'liveget',
 'ticket',
 'low',
 '$60you',
 'might',
 'also',
 'like',
 'guess',
 'ran',
 'shop',
 'walking',
 'circle',
 'like',
 'lost',
 'hear',
 'called',
 'one',
 'gasp',
 'end',
 'say',
 'feeling',
 'death',
 'rattle',
 'breathing',
 'silenced',
 'soul',
 'leaving',
 'deflation',
 'dreaming',
 'leaving',
 'bereft',
 'reeling',
 'beloved',


In [41]:
songs.words = songs.words.apply(lambda words: [lemmatizer.lemmatize(word) for word in words])

In [42]:
songs

Unnamed: 0,album,song,lyrics,words
0,11.TTPD,HowDidItEnd,"\n(Uh-oh, uh-oh)\n\n\nWe hereby conduct this p...","[hereby, conduct, postmortem, hot, house, flow..."
1,11.TTPD,TheBolter,"\nBy all accounts, she almost drowned\nWhen sh...","[account, almost, drowned, six, frigid, water,..."
2,11.TTPD,Peter,"\nForgive me, Peter\nMy lost fearless leader\n...","[forgive, peter, lost, fearless, leader, close..."
3,11.TTPD,imgonnagetyouback,"\nYeah\n\n\nLilac short skirt, the one that fi...","[lilac, short, skirt, one, fit, like, skin, re..."
4,11.TTPD,DownBad,\nDid you really beam me up\nIn a cloud of spa...,"[really, beam, cloud, sparkling, dust, experim..."
...,...,...,...,...
238,7.Lover,MissAmericanaTheHeartbreakPrince,"\nYou know I adore you, I'm crazier for you\nT...","[know, adore, i'm, crazier, 16, lost, film, sc..."
239,7.Lover,CruelSummer,"\n(Yeah, yeah, yeah, yeah)\n\n\nFever dream hi...","[fever, dream, high, quiet, night, know, caugh..."
240,7.Lover,LondonBoy,"\nWe can go drivin' in, on my scooter\nUh, you...","[go, drivin', scooter, uh, know, 'round, londo..."
241,7.Lover,FalseGod,\nWe were crazy to think\nCrazy to think that ...,"[crazy, think, crazy, think, could, work, reme..."


Some models require passing of a string because they have an internal tokenization and trimming process. Thats why I am also going to save the processed words as text.

In [43]:
songs['joined_words'] = songs.words.apply(lambda words: " ".join(words))

In [44]:
songs[['album','song','words','joined_words']].to_csv("../data/results/processed_lyrics", index=False)