# Data processing

In [1]:
# Import libraries
import pandas as pd
import os
import json
import re
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import string
pd.options.mode.chained_assignment = None

In [2]:
# Load the CSV that lists the songs to process into a DataFrame and preview it.
# To preview every row and column instead of the truncated view, enable:
#   pd.set_option("display.max_rows", None, "display.max_columns", None)
songs_csv = 'all_time_billboard_wrap_up_cleaned.csv'
songs = pd.read_csv(filepath_or_buffer=songs_csv)
display(songs)

Unnamed: 0,Position,Artist,Song Title,Year,JSON file,Split Names,Song Language
0,1,Gordon Jenkins and The Weavers,Goodnight Irene,1950,0.json,Gordon Jenkins,en
1,2,Nat King Cole,Mona Lisa,1950,1.json,Nat King Cole,en
2,4,Gary and Bing Crosby,Sam's Song,1950,3.json,Gary,en
3,5,Gary and Bing Crosby,Simple Melody,1950,4.json,Gary,en
4,6,Teresa Brewer,"Music, Music, Music",1950,5.json,Teresa Brewer,en
...,...,...,...,...,...,...,...
6410,95,Cole Swindell,Single Saturday Night,2021,6774.json,Cole Swindell,en
6411,96,Lainey Wilson,Things A Man Oughta Know,2021,6775.json,Lainey Wilson,en
6412,97,BRS Kash,Throat Baby (Go Baby),2021,6776.json,BRS Kash,en
6413,98,Rod Wave,Tombstone,2021,6777.json,Rod Wave,en


### Tokenizing and lemmatizing

In [3]:
# Download the NLTK resources used below (Punkt tokenizer, part-of-speech tagger, WordNet and stop words); uncomment and run once in a new environment
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('wordnet')
# nltk.download('stopwords')

In [4]:
# pd.set_option("display.max_rows", None, "display.max_columns", None)

# Create columns with lyrics, cleaned (stop-word-free) lyrics, tokenized lyrics,
# lemmatized words, part-of-speech tags and per-part-of-speech word lists.
wd = os.getcwd()
# os.path.join keeps the path portable; the trailing '' keeps the trailing
# separator, so `jsons_path + file_name` keeps working and the value is the
# same as the former wd + '\\jsons\\' on Windows.
jsons_path = os.path.join(wd, 'jsons', '')
stop = stopwords.words('english')
stop_list = ['im', 'dont', 'lyrics', 'verse', 'intro', 'lyric']
stop.extend(stop_list)
# Set membership is O(1); the list would be scanned once per lyric word.
stop_words = set(stop)
lemmatizer = WordNetLemmatizer()
# NOTE(review): string.punctuation is ASCII-only, so characters such as the
# curly apostrophe in "sam’s" survive and end up as separate tokens.
punctuation_table = str.maketrans('', '', string.punctuation)
# Tag returned by nltk.pos_tag -> column collecting the words with that tag.
# Only these exact tags are collected; other tags (e.g. 'VBZ', 'NNP', 'DT')
# are not stored in any of the per-part-of-speech columns.
pos_columns = {
    'JJ': 'Adjectives',
    'NN': 'Nouns',
    'RB': 'Adverbs',
    'PRP': 'Pronouns',
    'VB': 'Verbs',
}


def read_flat_lyrics(json_path):
    """Return the lyrics stored in a song JSON file as one cleaned line.

    The value under the 'lyrics' key is lower-cased, bracketed annotations
    such as [Verse 1] or [Intro] are removed, empty lines are dropped, the
    remaining lines are joined with single spaces and ASCII punctuation is
    stripped.

    Parameters
    ----------
    json_path : str
        Path of the JSON file to read (opened as UTF-8).

    Returns
    -------
    str
        The flattened, punctuation-free, lower-case lyrics.

    Raises
    ------
    OSError
        If the file cannot be opened.
    KeyError
        If the JSON document has no 'lyrics' key.
    """
    with open(json_path, 'r', encoding = 'UTF-8') as f:
        data = json.load(f)['lyrics'].lower()
    # Delete comments in square brackets like [Verse 1], [Intro]
    # https://stackoverflow.com/questions/14596884/remove-text-between-and
    lyrics = re.sub(r'[\[].*?[\]]', '', data)
    # Drop blank lines and join with a space directly, instead of joining with
    # os.linesep and splitting on '\r\n', which only flattened on Windows.
    lines = [line for line in lyrics.splitlines() if line]
    return ' '.join(lines).translate(punctuation_table)


# Collect the values of every new column in plain lists and attach them to the
# frame once at the end. Writing cell by cell with songs[col][i] = value is
# chained assignment, which does not modify `songs` under pandas Copy-on-Write.
new_columns = {
    name: []
    for name in ['Lyrics', 'Cleaned Lyrics', 'Tokenized Lyrics',
                 'Lemmatized Words', 'Parts of Speech', *pos_columns.values()]
}
for json_file in songs['JSON file']:
    lyrics = read_flat_lyrics(jsons_path + json_file)
    cleaned_lyrics = ' '.join([word for word in lyrics.split() if word not in stop_words])
    tokenized_lyrics = word_tokenize(lyrics)
    # NOTE(review): lemmatize() is called without a part-of-speech argument and
    # the displayed results contain artifacts such as 'was' -> 'wa'; tagging
    # before lemmatizing may give better lemmas. Order kept as it was so the
    # produced columns stay comparable.
    lemmatized_words = [lemmatizer.lemmatize(word) for word in tokenized_lyrics]
    parts_of_speech = nltk.pos_tag(lemmatized_words)

    words_by_tag = {tag: [] for tag in pos_columns}
    for word, tag in parts_of_speech:
        if tag in words_by_tag:
            words_by_tag[tag].append(word)

    new_columns['Lyrics'].append(lyrics)
    new_columns['Cleaned Lyrics'].append(cleaned_lyrics)
    new_columns['Tokenized Lyrics'].append(tokenized_lyrics)
    new_columns['Lemmatized Words'].append(lemmatized_words)
    new_columns['Parts of Speech'].append(parts_of_speech)
    for tag, column in pos_columns.items():
        new_columns[column].append(words_by_tag[tag])

# dtype=object keeps each list as a single cell value; index=songs.index
# aligns the values with the rows in file order.
for name, values in new_columns.items():
    songs[name] = pd.Series(values, index = songs.index, dtype = object)
display(songs)

Unnamed: 0,Position,Artist,Song Title,Year,JSON file,Split Names,Song Language,Lyrics,Cleaned Lyrics,Tokenized Lyrics,Lemmatized Words,Parts of Speech,Adjectives,Nouns,Adverbs,Pronouns,Verbs
0,1,Gordon Jenkins and The Weavers,Goodnight Irene,1950,0.json,Gordon Jenkins,en,goodnight irene lyricsirene goodnight irene go...,goodnight irene lyricsirene goodnight irene go...,"[goodnight, irene, lyricsirene, goodnight, ire...","[goodnight, irene, lyricsirene, goodnight, ire...","[(goodnight, JJ), (irene, NN), (lyricsirene, N...","[goodnight, irene, goodnight, last, saturday, ...","[irene, lyricsirene, goodnight, irene, irene, ...","[down, now, sometimes, sometimes, sometimes, l...","[you, me, me, you, you, you, you]","[take, jump, stop, stop]"
1,2,Nat King Cole,Mona Lisa,1950,1.json,Nat King Cole,en,mona lisa lyrics mona lisa mona lisa men have ...,mona lisa mona lisa mona lisa men named youre ...,"[mona, lisa, lyrics, mona, lisa, mona, lisa, m...","[mona, lisa, lyric, mona, lisa, mona, lisa, me...","[(mona, NN), (lisa, VBZ), (lyric, JJ), (mona, ...","[lyric, mystic, broken, many, warm, real, mona...","[mona, mona, lisa, mona, lady, smile, youre, m...","[so, only, lonely, just, there, just, lonely, ...","[you, it, they, you, you, they, they, you, you...","[smile, tempt, hide, lie, smile, tempt, hide, ..."
2,4,Gary and Bing Crosby,Sam's Song,1950,3.json,Gary,en,sam’s song lyricsah heres a happy tune youll l...,sam’s song lyricsah heres happy tune youll lov...,"[sam, ’, s, song, lyricsah, heres, a, happy, t...","[sam, ’, s, song, lyricsah, here, a, happy, tu...","[(sam, NN), (’, NNP), (s, NN), (song, NN), (ly...","[happy, song, youre, grim, grand, smile, wrong...","[sam, s, song, lyricsah, tune, youll, love, so...","[here, then, only, soon, so, never, ah, here, ...","[they, it, it, they, it, you, it, you, you, yo...","[croon, call, be, make, get, do, agree, forget..."
3,5,Gary and Bing Crosby,Simple Melody,1950,4.json,Gary,en,play a simple melody lyricswont you play some ...,play simple melody lyricswont play simple melo...,"[play, a, simple, melody, lyricswont, you, pla...","[play, a, simple, melody, lyricswont, you, pla...","[(play, VB), (a, DT), (simple, JJ), (melody, N...","[simple, simple, good, old, simple, musical, c...","[melody, lyricswont, melody, mother, sang, har...","[just, simply, na, just, simply, na, just, sim...","[you, me, you, me, you, you, me, you, you, me,...","[play, play, play, rag, play, rag, play, rag, ..."
4,6,Teresa Brewer,"Music, Music, Music",1950,5.json,Teresa Brewer,en,music music music lyricsput another nickel in ...,music music music lyricsput another nickel nic...,"[music, music, music, lyricsput, another, nick...","[music, music, music, lyricsput, another, nick...","[(music, NN), (music, NN), (music, NN), (lyric...","[youd, dear, youd, dear, old]","[music, music, music, lyricsput, nickel, nicke...","[danciong, close, so, danciong, close, so]","[you, you, me, you, me, you, you, you, me, you...",[keep]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6410,95,Cole Swindell,Single Saturday Night,2021,6774.json,Cole Swindell,en,single saturday night lyrics i was out taking ...,single saturday night taking shots throwing sp...,"[single, saturday, night, lyrics, i, was, out,...","[single, saturday, night, lyric, i, wa, out, t...","[(single, JJ), (saturday, NN), (night, NN), (l...","[single, single, light, single, sittin, red, w...","[saturday, night, lyric, i, shot, spot, sheet,...","[just, not, there, nowhere, nowhere, pretty, l...","[me, them, they, me, me, you, they, me, they, ...","[catch, find, i, be, yeah, tell, i, wake, be, ..."
6411,96,Lainey Wilson,Things A Man Oughta Know,2021,6775.json,Lainey Wilson,en,things a man oughta know lyrics i can hook a t...,things man oughta know hook trailer twoinch hi...,"[things, a, man, oughta, know, lyrics, i, can,...","[thing, a, man, oughta, know, lyric, i, can, h...","[(thing, NN), (a, DT), (man, NN), (oughta, NN)...","[lyric, twoinch, fish, few, tough, good, late,...","[thing, man, oughta, i, trailer, hitch, i, sho...","[too, up, really, too, up, really, forever, ne...","[it, it, it, it, it, you, you, her, it, it, it...","[hook, shoot, catch, change, know, know, love,..."
6412,97,BRS Kash,Throat Baby (Go Baby),2021,6776.json,BRS Kash,en,throat baby go baby lyrics whats happenin chi ...,throat baby go baby whats happenin chi chi sex...,"[throat, baby, go, baby, lyrics, whats, happen...","[throat, baby, go, baby, lyric, whats, happeni...","[(throat, NN), (baby, NN), (go, VBP), (baby, N...","[lyric, chi, young, deep, throat, lil, young, ...","[throat, baby, baby, chi, sexy, lil, bitch, se...","[right, now, super, even, then, too, even]","[you, you, you, you, you, you, you, you, you, ...","[love, let, throat, love, let, throat, come, g..."
6413,98,Rod Wave,Tombstone,2021,6777.json,Rod Wave,en,tombstone lyrics damn this motherfucker too cr...,tombstone damn motherfucker crazy saucii let s...,"[tombstone, lyrics, damn, this, motherfucker, ...","[tombstone, lyric, damn, this, motherfucker, t...","[(tombstone, NN), (lyric, JJ), (damn, NN), (th...","[lyric, crazy, saucii, i, i, sad, bad, im, tru...","[tombstone, damn, motherfucker, let, shit, gun...","[too, ride, eighty8, so, so, so, just, em, nig...","[it, you, it, me, it, he, me, it, they, they, ...","[shit, keep, pop, keep, keep, spend, be, get, ..."


In [5]:
# Write the processed frame to the working directory captured in `wd`.
# os.path.join picks the separator of the current OS; the hard-coded '\\'
# is only a path separator on Windows.
# Note: list-valued columns are written to the CSV as their string form.
songs.to_csv(os.path.join(wd, 'lyrics_processed_data.csv'), index = False)