# Data processing

In [1]:
# Import libraries
import pandas as pd
import os
import json
import re
import numpy as np
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import string
pd.options.mode.chained_assignment = None

In [2]:
# Load the song metadata table that the following cells process, then show it
metadata_csv = 'lyrics_metadata.csv'
songs = pd.read_csv(metadata_csv)
display(songs)

Unnamed: 0,Position,Artist,Song Title,Year,JSON file,Split Names,Song Language
0,2,Nat King Cole,Mona Lisa,1950,1.json,Nat King Cole,en
1,4,Gary and Bing Crosby,Sam's Song,1950,3.json,Gary,en
2,5,Gary and Bing Crosby,Simple Melody,1950,4.json,Gary,en
3,6,Teresa Brewer,"Music, Music, Music",1950,5.json,Teresa Brewer,en
4,8,Red Foley,Chattanoogie Shoe Shine Boy,1950,7.json,Red Foley,en
...,...,...,...,...,...,...,...
6283,95,Cole Swindell,Single Saturday Night,2021,6774.json,Cole Swindell,en
6284,96,Lainey Wilson,Things A Man Oughta Know,2021,6775.json,Lainey Wilson,en
6285,97,BRS Kash,Throat Baby (Go Baby),2021,6776.json,BRS Kash,en
6286,98,Rod Wave,Tombstone,2021,6777.json,Rod Wave,en


### Tokenizing and lemmatizing

In [3]:
# Fetch the NLTK data packages requested by this notebook, with download messages silenced
for resource in ('punkt', 'averaged_perceptron_tagger', 'wordnet'):
    nltk.download(resource, quiet = True)
# Kept as the cell's final expression so its return value is still shown as the cell output
nltk.download('stopwords', quiet = True)

True

In [4]:
# Create columns with lyrics, tokenized lyrics, lemmatized words and corpus
def _clean_lyrics(raw):
    """Return the cleaned, single-line, lower-case lyric text for one song.

    Steps, in order:
      1. lower-case the text and blank out ``[...]`` annotations;
      2. remove the ``embed`` substring and any digits from the final word;
      3. drop empty lines and flatten the remaining lines into one line;
      4. discard everything up to and including the first ``lyrics`` marker
         (the "[song title] lyrics" header at the start of the text).

    Parameters
    ----------
    raw : object
        Value stored under the ``'lyrics'`` key of a song's JSON file.

    Returns
    -------
    str
        The cleaned lyrics, or ``''`` when ``raw`` is not a string or holds
        no words at all.
    """
    if not isinstance(raw, str):  # e.g. null / NaN lyrics
        return ''

    text = re.sub(r'[\[].*?[\]]', ' ', raw.lower())

    # Separate the final word using the SAME whitespace rule for both halves,
    # so no text is lost when the last word follows a newline instead of a space.
    parts = text.rsplit(None, 1)
    if not parts:  # empty or whitespace-only text
        return ''
    head = parts[0] if len(parts) == 2 else ''
    last_word = parts[-1].replace('embed', '')  # deleting "Embed" substring from last word
    last_word = ''.join(ch for ch in last_word if not ch.isdigit())  # deleting digit(s) from last word
    text = head + ' ' + last_word

    # Flatten to one line independently of the platform's line separator.
    text = ' '.join(line for line in text.splitlines() if line)

    # Removing initial pattern from json: "[song title] lyrics".
    # Only the FIRST marker is consumed, so a later "lyrics" inside the song
    # no longer truncates the text; if the marker is absent the text is kept.
    _, marker, body = text.partition('lyrics')
    if marker:
        text = body
    return text.strip()


wd = os.getcwd()
# Trailing '' keeps the trailing separator, so `jsons_path + name` still works.
jsons_path = os.path.join(wd, 'jsons', '')
stop = stopwords.words('english')
stop_list = ['im', 'dont', 'lyrics', 'verse', 'intro', 'lyric', 'aint', 'ill', 'ive', 't', 's', 'm', 'll', 're', 'd', 'don', 've', 'gon', 'don', 'wheres', 'whats', 'hows']
stop.extend(stop_list)
stop_set = set(stop)  # constant-time membership tests inside the loop
tokenizer = RegexpTokenizer(r'\w+')
lemmatizer = WordNetLemmatizer()

# Collect one entry per song, then assign whole columns at once. This avoids
# chained-indexing assignment (songs[col][i] = ...), which depends on the
# index labels and does not write through under pandas copy-on-write.
lyrics_col, tokens_col, lemmas_col, corpus_col = [], [], [], []

for json_name in songs['JSON file']:
    with open(os.path.join(jsons_path, json_name), 'r', encoding = 'UTF-8') as f:
        raw_lyrics = json.load(f)['lyrics']
    lyrics = _clean_lyrics(raw_lyrics)
    text_token = tokenizer.tokenize(lyrics)
    lemmas = [lemmatizer.lemmatize(word, 'v') for word in text_token]  # every token is lemmatized as a verb
    lyrics_col.append(lyrics)
    tokens_col.append(text_token)
    lemmas_col.append(lemmas)
    corpus_col.append(' '.join([word for word in lemmas if word not in stop_set]))

songs['Lyrics'] = lyrics_col
songs['Tokenized Lyrics'] = tokens_col
songs['Lemmatized Words'] = lemmas_col
songs['Corpus'] = corpus_col

# Drop songs for which no lyrics could be extracted.
songs = songs[songs['Lyrics'] != ''].copy()

display(songs)

Unnamed: 0,Position,Artist,Song Title,Year,JSON file,Split Names,Song Language,Lyrics,Tokenized Lyrics,Lemmatized Words,Corpus
0,2,Nat King Cole,Mona Lisa,1950,1.json,Nat King Cole,en,"mona lisa, mona lisa, men have named you you'r...","[mona, lisa, mona, lisa, men, have, named, you...","[mona, lisa, mona, lisa, men, have, name, you,...",mona lisa mona lisa men name like lady mystic ...
1,4,Gary and Bing Crosby,Sam's Song,1950,3.json,Gary,en,"ah, here's a happy tune, you'll love to croon ...","[ah, here, s, a, happy, tune, you, ll, love, t...","[ah, here, s, a, happy, tune, you, ll, love, t...",ah happy tune love croon call sam song catchy ...
2,5,Gary and Bing Crosby,Simple Melody,1950,4.json,Gary,en,won't you play some simple melody like my moth...,"[won, t, you, play, some, simple, melody, like...","[win, t, you, play, some, simple, melody, like...",win play simple melody like mother sing one go...
3,6,Teresa Brewer,"Music, Music, Music",1950,5.json,Teresa Brewer,en,put another nickel in in the nickelodeon all i...,"[put, another, nickel, in, in, the, nickelodeo...","[put, another, nickel, in, in, the, nickelodeo...",put another nickel nickelodeon want music musi...
4,8,Red Foley,Chattanoogie Shoe Shine Boy,1950,7.json,Red Foley,en,have you ever passed the corner of forth and g...,"[have, you, ever, passed, the, corner, of, for...","[have, you, ever, pass, the, corner, of, forth...",ever pass corner forth grand little ball rhyth...
...,...,...,...,...,...,...,...,...,...,...,...
6283,95,Cole Swindell,Single Saturday Night,2021,6774.json,Cole Swindell,en,i was out taking shots throwing down at the sp...,"[i, was, out, taking, shots, throwing, down, a...","[i, be, out, take, shots, throw, down, at, the...",take shots throw spot three sheet guy every si...
6284,96,Lainey Wilson,Things A Man Oughta Know,2021,6775.json,Lainey Wilson,en,i can hook a trailer on a two-inch hitch i can...,"[i, can, hook, a, trailer, on, a, two, inch, h...","[i, can, hook, a, trailer, on, a, two, inch, h...",hook trailer two inch hitch shoot shotgun catc...
6285,97,BRS Kash,Throat Baby (Go Baby),2021,6776.json,BRS Kash,en,"(what's happenin', chi chi?) sexy lil' bitch...","[what, s, happenin, chi, chi, sexy, lil, bitch...","[what, s, happenin, chi, chi, sexy, lil, bitch...",happenin chi chi sexy lil bitch sexy lil ho lo...
6286,98,Rod Wave,Tombstone,2021,6777.json,Rod Wave,en,"damn, this motherfucker too crazy, saucii let ...","[damn, this, motherfucker, too, crazy, saucii,...","[damn, this, motherfucker, too, crazy, saucii,...",damn motherfucker crazy saucii let shit ride e...


In [5]:
# Save the processed table in the working directory; os.path.join builds the
# path with the platform's separator instead of a hard-coded backslash.
songs.to_csv(os.path.join(wd, 'lyrics_processed_data.csv'), index = False)