# Data processing

In [1]:
# Import libraries
import pandas as pd
import os
import json
import re
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import string
# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole session.
# NOTE(review): this only hides the warning; it does not make chained writes safe.
pd.options.mode.chained_assignment = None

In [2]:
# Load the cleaned all-time Billboard listing whose lyrics will be processed below.
# Optional: uncomment the next line to render every row and column of the frame.
# pd.set_option("display.max_rows", None, "display.max_columns", None)
songs = pd.read_csv(filepath_or_buffer='all_time_billboard_wrap_up_cleaned.csv')
display(songs)

Unnamed: 0,Position,Artist,Song Title,Year,JSON file,Split Names,Song Language
0,1,Gordon Jenkins and The Weavers,Goodnight Irene,1950,0.json,Gordon Jenkins,en
1,2,Nat King Cole,Mona Lisa,1950,1.json,Nat King Cole,en
2,4,Gary and Bing Crosby,Sam's Song,1950,3.json,Gary,en
3,5,Gary and Bing Crosby,Simple Melody,1950,4.json,Gary,en
4,6,Teresa Brewer,"Music, Music, Music",1950,5.json,Teresa Brewer,en
...,...,...,...,...,...,...,...
6297,96,Morgan Wallen,More Than My Hometown,2020,6675.json,Morgan Wallen,en
6298,97,Luke Combs,Lovin' On You,2020,6676.json,Luke Combs,en
6299,98,Moneybagg Yo,Said Sum,2020,6677.json,Moneybagg Yo,en
6300,99,H.E.R. Featuring YG,Slide,2020,6678.json,H.E.R.,en


### Tokenizing and lemmatizing

In [3]:
# One-time setup: uncomment and run the lines below if the NLTK resources are not
# installed yet. They download the Punkt tokenizer models, the part-of-speech
# tagger, the WordNet data used for lemmatizing, and the stopword lists.
# NOTE(review): newer NLTK releases may request differently named resources
# (e.g. 'punkt_tab'); follow the LookupError message if one is raised.
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('wordnet')
# nltk.download('stopwords')

In [4]:
# pd.set_option("display.max_rows", None, "display.max_columns", None)

# Create columns with lyrics, cleaned lyrics, tokenized lyrics, lemmatized words,
# part-of-speech tags and the words belonging to particular parts of speech.
#
# Results are gathered in plain Python lists and written to the frame one whole
# column at a time. Element-wise `songs[col][i] = ...` is chained indexing, which
# pandas does not guarantee to write through to the frame.
wd = os.getcwd()
# The trailing '' keeps a directory separator at the end of the path; os.path.join
# picks the right separator for the current platform.
jsons_path = os.path.join(wd, 'jsons', '')
stop = set(stopwords.words('english'))
# Punctuation is stripped from the lyrics, so "I'm" / "don't" show up as 'im' / 'dont'.
stop.update({'im', 'dont'})
lemmatizer = WordNetLemmatizer()

# Tag produced by nltk.pos_tag -> column that collects the words carrying exactly
# that tag. Only these exact tags are collected; other tags (e.g. 'VBD', which is
# visible in the tagged output) go into no word-list column.
pos_columns = {
    'JJ': 'Adjectives',
    'NN': 'Nouns',
    'RB': 'Adverbs',
    'PRP': 'Pronouns',
    'VB': 'Verbs',
}
punctuation_table = str.maketrans('', '', string.punctuation)


def _flatten_lyrics(raw_lyrics):
    """Return lower-cased lyrics as one line of space-separated words.

    Bracketed annotations such as [Verse 1] or [Intro] are removed, empty lines
    are dropped, the remaining lines are joined with single spaces and every
    character in string.punctuation is deleted. The result does not depend on
    the platform's line separator.
    """
    # https://stackoverflow.com/questions/14596884/remove-text-between-and
    without_tags = re.sub(r'[\[].*?[\]]', '', raw_lyrics.lower())
    # https://stackoverflow.com/questions/1140958/whats-a-quick-one-liner-to-remove-empty-lines-from-a-python-string
    non_empty_lines = [line for line in without_tags.splitlines() if line]
    return ' '.join(non_empty_lines).translate(punctuation_table)


base_columns = ('Lyrics', 'Cleaned Lyrics', 'Tokenized Lyrics',
                'Lemmatized Words', 'Parts of Speech')
new_columns = {name: [] for name in (*base_columns, *pos_columns.values())}

for json_name in songs['JSON file']:
    with open(os.path.join(jsons_path, json_name), 'r', encoding='UTF-8') as f:
        raw_lyrics = json.load(f)['lyrics']

    lyrics = _flatten_lyrics(raw_lyrics)
    cleaned_lyrics = ' '.join(word for word in lyrics.split() if word not in stop)
    tokenized_lyrics = word_tokenize(lyrics)
    lemmatized_words = [lemmatizer.lemmatize(word) for word in tokenized_lyrics]
    # NOTE(review): the tagger runs on the lemmas, not on the original tokens,
    # exactly as before; tagging the raw tokens first may be more accurate but
    # would change the results.
    tagged_words = nltk.pos_tag(lemmatized_words)

    words_by_column = {column: [] for column in pos_columns.values()}
    for word, tag in tagged_words:
        column = pos_columns.get(tag)
        if column is not None:
            words_by_column[column].append(word)

    new_columns['Lyrics'].append(lyrics)
    new_columns['Cleaned Lyrics'].append(cleaned_lyrics)
    new_columns['Tokenized Lyrics'].append(tokenized_lyrics)
    new_columns['Lemmatized Words'].append(lemmatized_words)
    new_columns['Parts of Speech'].append(tagged_words)
    for column, words in words_by_column.items():
        new_columns[column].append(words)

# One positional, whole-column assignment per new column (row order == songs order).
for name, values in new_columns.items():
    songs[name] = values
display(songs)

Unnamed: 0,Position,Artist,Song Title,Year,JSON file,Split Names,Song Language,Lyrics,Cleaned Lyrics,Tokenized Lyrics,Lemmatized Words,Parts of Speech,Adjectives,Nouns,Adverbs,Pronouns,Verbs
0,1,Gordon Jenkins and The Weavers,Goodnight Irene,1950,0.json,Gordon Jenkins,en,irene goodnight irene goodnight goodnight iren...,irene goodnight irene goodnight goodnight iren...,"[irene, goodnight, irene, goodnight, goodnight...","[irene, goodnight, irene, goodnight, goodnight...","[(irene, NN), (goodnight, VBD), (irene, JJ), (...","[irene, goodnight, last, saturday, im, irene, ...","[irene, goodnight, irene, irene, ill, dream, n...","[down, now, sometimes, sometimes, sometimes, l...","[you, me, me, you, you, you, you]","[take, jump, stop, stop]"
1,2,Nat King Cole,Mona Lisa,1950,1.json,Nat King Cole,en,mona lisa mona lisa men have named you youre s...,mona lisa mona lisa men named youre like lady ...,"[mona, lisa, mona, lisa, men, have, named, you...","[mona, lisa, mona, lisa, men, have, named, you...","[(mona, NN), (lisa, NN), (mona, NN), (lisa, VB...","[mystic, broken, many, warm, real, mona, cold,...","[mona, lisa, mona, lady, smile, youre, mona, s...","[so, only, lonely, just, there, just, lonely, ...","[you, it, they, you, you, they, they, you, you...","[smile, tempt, hide, lie, smile, tempt, hide, ..."
2,4,Gary and Bing Crosby,Sam's Song,1950,3.json,Gary,en,ah heres a happy tune youll love to croon they...,ah heres happy tune youll love croon call sams...,"[ah, heres, a, happy, tune, youll, love, to, c...","[ah, here, a, happy, tune, youll, love, to, cr...","[(ah, NN), (here, RB), (a, DT), (happy, JJ), (...","[happy, song, youre, grim, grand, smile, wrong...","[ah, tune, youll, love, song, melody, nothing,...","[here, then, only, soon, so, never, ah, here, ...","[they, it, it, they, it, you, it, you, you, yo...","[croon, call, be, make, get, do, agree, forget..."
3,5,Gary and Bing Crosby,Simple Melody,1950,4.json,Gary,en,wont you play some simple melody like my mothe...,wont play simple melody like mother sang one g...,"[wont, you, play, some, simple, melody, like, ...","[wont, you, play, some, simple, melody, like, ...","[(wont, NN), (you, PRP), (play, VBP), (some, D...","[simple, good, old, simple, musical, classical...","[wont, melody, mother, sang, harmony, melody, ...","[just, simply, na, just, simply, na, just, sim...","[you, me, you, me, you, you, me, you, you, me,...","[play, play, rag, play, rag, play, rag, rag, r..."
4,6,Teresa Brewer,"Music, Music, Music",1950,5.json,Teresa Brewer,en,put another nickel in in the nickelodeon all i...,put another nickel nickelodeon want music musi...,"[put, another, nickel, in, in, the, nickelodeo...","[put, another, nickel, in, in, the, nickelodeo...","[(put, VB), (another, DT), (nickel, NN), (in, ...","[youd, dear, youd, dear, old]","[nickel, nickelodeon, i, music, music, music, ...","[danciong, close, so, danciong, close, so]","[you, you, me, you, me, you, you, you, me, you...","[put, keep]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6297,96,Morgan Wallen,More Than My Hometown,2020,6675.json,Morgan Wallen,en,girl our mamas are best friends and so are we ...,girl mamas best friends whole towns rooting us...,"[girl, our, mamas, are, best, friends, and, so...","[girl, our, mama, are, best, friend, and, so, ...","[(girl, VB), (our, PRP$), (mama, NN), (are, VB...","[whole, likely, few, real, deep, em, real, spi...","[mama, friend, town, u, home, team, plant, roo...","[so, just, twentyone, yet, twentyone, yet, jus...","[we, we, you, you, you, you, you, you, you, yo...","[girl, settle, let, grow, stop, go, i, need, k..."
6298,97,Luke Combs,Lovin' On You,2020,6676.json,Luke Combs,en,dont get me wrong i like a bobber on the water...,get wrong like bobber water hookin em reelin e...,"[dont, get, me, wrong, i, like, a, bobber, on,...","[dont, get, me, wrong, i, like, a, bobber, on,...","[(dont, NN), (get, VB), (me, PRP), (wrong, JJ)...","[wrong, friday, rewind, strong, enough, twodoo...","[dont, bobber, water, hookin, em, reelin, em, ...","[again, soon, youre, right, back, soon, youre,...","[me, it, me, you, you, you, you, you, you, you...","[get, slow, hit, get, get, im, get]"
6299,98,Moneybagg Yo,Said Sum,2020,6677.json,Moneybagg Yo,en,turn me up yc huh what ah i thought a broke ni...,turn yc huh ah thought broke nigga said someth...,"[turn, me, up, yc, huh, what, ah, i, thought, ...","[turn, me, up, yc, huh, what, ah, i, thought, ...","[(turn, VB), (me, PRP), (up, RP), (yc, RB), (h...","[i, somethin, nothin, fed, come, ah, pussy, so...","[broke, nigga, ah, talkin, shit, aint, sayin, ...","[yc, still, just, really, luckily, back, forev...","[me, they, we, it, it, she, it, you, she, you,...","[turn, huh, trap, run, huh, go, make, make, kn..."
6300,99,H.E.R. Featuring YG,Slide,2020,6678.json,H.E.R.,en,you always wearin them glasses you dont wanna ...,always wearin glasses wanna let sucker look ey...,"[you, always, wearin, them, glasses, you, dont...","[you, always, wearin, them, glass, you, dont, ...","[(you, PRP), (always, RB), (wearin, VBP), (the...","[wan, attractive, i, same, same, i, dont, gett...","[glass, sucker, look, eye, huh, passion, ooh, ...","[always, na, just, tryna, only, so, too, baby,...","[you, them, you, you, them, you, you, you, he,...","[let, show, get, keep, let, know, be, movin, b..."


In [5]:
# Save the processed table next to the notebook. os.path.join is used instead of a
# hard-coded '\\' so the file lands in the working directory on every platform.
# Note: list-valued columns are written to the CSV as their text representation.
songs.to_csv(os.path.join(wd, 'lyrics_processed_data.csv'), index = False)