Data gathering

In [70]:
import os
import pickle

import pandas as pd

def get_lyrics_from_file(file, encoding=None):
    '''Returns all the lyrics from a given file.

    Parameters:
        file: path of the text file to read.
        encoding: optional text encoding passed to open(); None (the default)
            keeps the platform default, i.e. the previous behaviour.

    Returns:
        The whole content of the file as a single string.
    '''
    # The context manager closes the handle even if read() raises; the
    # previous version left the file open.
    with open(file, 'r', encoding=encoding) as file_object:
        return file_object.read()

#Top artists of all time used in this analysis (the list holds 12 names)
artists = [
    'The Beatles',
    'The Rolling Stones',
    'Elton John',
    'Mariah Carey',
    'Madonna',
    'Michael Jackson',
    'Taylor Swift',
    'Stevie Wonder',
    'Whitney Houston',
    'Elvis Presley',
    'Drake',
    'Queen',
]
print(artists)


['The Beatles', 'The Rolling Stones', 'Elton John', 'Mariah Carey', 'Madonna', 'Michael Jackson', 'Taylor Swift', 'Stevie Wonder', 'Whitney Houston', 'Elvis Presley', 'Drake', 'Queen']


In [71]:
#Reading lyrics from every artist file
#File name = artist name, case-folded and with spaces removed, plus ".txt".
#os.path.join picks the separator of the current OS; the previous hard-coded
#"data\\lyrics\\" prefix only resolved to the lyrics folder on Windows.
artist_files = [
    os.path.join("data", "lyrics", artist.casefold().replace(' ', '') + ".txt")
    for artist in artists
]
artists_lyrics = [get_lyrics_from_file(artist_file) for artist_file in artist_files]

#Creating data dictionary (for every artist -> lyrics)
#zip pairs each artist with the lyrics read from its file (same order).
data = dict(zip(artists, artists_lyrics))

In [72]:
#Sanity check: the dictionary should hold one key per artist name
data.keys()

dict_keys(['The Beatles', 'The Rolling Stones', 'Elton John', 'Mariah Carey', 'Madonna', 'Michael Jackson', 'Taylor Swift', 'Stevie Wonder', 'Whitney Houston', 'Elvis Presley', 'Drake', 'Queen'])

Data cleaning

In [73]:
def combine_text(text):
    '''Joins the pieces of text into one string, without any separator.'''
    return ''.join(text)

In [74]:
#Wrap every artist's combined lyrics in a one-element list (artist -> [lyrics])
data_combined = {
    artist_name: [combine_text(artist_lyrics)]
    for artist_name, artist_lyrics in data.items()
}

In [75]:
pd.set_option('max_colwidth', 150)

#One row per artist: the dict keys become the index after transposing, the
#single value column (labelled 0) is renamed to 'lyrics', and the artist
#names are added as a regular column.
data_df = (
    pd.DataFrame(data_combined)
    .T
    .rename(columns={0: 'lyrics'})
    .assign(artist_name=artists)
)
data_df

Unnamed: 0,lyrics,artist_name
The Beatles,"It was twenty years ago today\nSergeant Pepper taught the band to play,\nThey've been going in and out of style,\nBut they're guaranteed to raise ...",The Beatles
The Rolling Stones,Oh yeah\nI hear you talking\nWhen I'm on the street\nYour mouth don't move\nBut I can hear you speak\nWhat's the matter with the boy?\nHe don't co...,The Rolling Stones
Elton John,The roses in the window box\nHave tilted to one side\nEverything about this house\nWas born to grow and die\nOh it doesn't seem a year ago\nTo thi...,Elton John
Mariah Carey,"Oh, now you can have me\nWhen you want me\nYou simply ask me to be there\nAnd you're the only one\nWho makes me come running\n'Cause what you got\...",Mariah Carey
Madonna,"Life is a mystery, everyone must stand alone\nI hear you call my name\nAnd it feels like home\nWhen you call my name it's like a little prayer\nI'...",Madonna
Michael Jackson,I said you wanna be startin' somethin'\nYou got to be startin' somethin'\nI said you wanna be startin' somethin'\nYou got to be startin' somethin'...,Michael Jackson
Taylor Swift,There's somethin' bout the way\nThe street looks when it's just rained\nThere's a glow off the pavement\nWalk me to the car\nAnd you know I wanna ...,Taylor Swift
Stevie Wonder,"Spoken by SW:\n""When you say that you kill in the\nname of God or in the name of Allah,\nyou are truly cursing God, for that is not of God.\nWhen ...",Stevie Wonder
Whitney Houston,I found out what I've been missing\nAlways on the run\nI've been looking for someone\nNow you're here like you've been before\nAnd you know just w...,Whitney Houston
Elvis Presley,"Well, it's one for the money,\nTwo for the show,\nThree to get ready,\nNow go, cat, go.\n\nBut don't you step on my blue suede shoes.\nYou can do ...",Elvis Presley


In [76]:
import re
import string

def clean_text(text):
    '''Make text lowercase, remove text in square brackets, remove punctuation and remove words containing numbers.

    Hyphens, asterisks and newlines are replaced by a space so the words on
    either side stay separate tokens; every other punctuation character is
    deleted.

    Parameters:
        text: the raw lyrics string.

    Returns:
        The cleaned string (may contain runs of several spaces).
    '''
    text = text.lower()
    # Bracketed annotations are dropped first, while the brackets still exist.
    text = re.sub(r'\[.*?\]', '', text)
    # Must run BEFORE the punctuation strip: '-' and '*' are part of
    # string.punctuation, so replacing them afterwards (as the previous
    # version did) never matched and hyphenated words were glued together.
    text = re.sub(r'[-*\n]', ' ', text)
    # Also removes apostrophes and parentheses, so no separate pass is needed.
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

# Kept as an alias because other cells call `cleaned_text`.
cleaned_text = clean_text

In [77]:
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts.

    The word is tagged on its own with nltk.pos_tag and the first letter of
    the resulting tag is looked up in a small table.

    Parameters:
        word: a single token.

    Returns:
        wordnet.ADJ, wordnet.NOUN, wordnet.VERB or wordnet.ADV; wordnet.NOUN
        when the tag's first letter is not one of J, N, V, R.
    """
    # Imported here so the function works on a fresh kernel: no cell imported
    # `wordnet`, and `nltk` was only imported in a later cell.
    import nltk
    from nltk.corpus import wordnet

    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)


In [None]:
import nltk
from nltk.stem import WordNetLemmatizer


lemmatizer = WordNetLemmatizer()

#Cleaned (not yet lemmatized) lyrics, one row per artist, saved to disk
data_clean = data_df.lyrics.apply(cleaned_text).to_frame()
data_clean.columns = ['lyrics']
data_clean['artist_name'] = artists
data_clean.to_pickle('data_clean_NL.pkl')

#Lemmatize every token of every artist's lyrics; each value is a one-element
#list holding the lemmas joined back together with single spaces
dict_clean = {
    artist_name: [
        ' '.join(
            lemmatizer.lemmatize(token, get_wordnet_pos(token))
            for token in nltk.word_tokenize(lyrics_text)
        )
    ]
    for artist_name, lyrics_text in data_clean.lyrics.items()
}
   

In [None]:
#Rebuild data_clean from the lemmatized lyrics: one row per artist, with the
#single value column (labelled 0 after transposing) renamed to 'lyrics'
data_clean = (
    pd.DataFrame(dict_clean)
    .T
    .rename(columns={0: 'lyrics'})
    .assign(artist_name=artists)
)
data_clean

In [68]:
#Persist the current data_clean frame to disk as the corpus
data_clean.to_pickle("corpus.pkl")

In [69]:
from sklearn.feature_extraction import text 
from sklearn.feature_extraction.text import CountVectorizer

#Extra stop words on top of scikit-learn's English list
#(a duplicated 'dont' entry was dropped; the resulting set is unchanged)
new_stop_words = ['aaah', 'aah', 'aaow', 'laa', 'ooh', 'ba', 'do', 'dodo', 'just', 'dont', 'gonna', 'bit', 'oh', 'im', 'like', 'little', 'cause', 'way',
                 'youre', 'youll', 'yeah', 'make', 'wanna', 'na', 'somethin', 'something',
                 'know', 'ill', 'ive', 'thats', 'youve', 'shes', 'aint', 'la', 'getting', 'got', 'theyve']

stop_words = text.ENGLISH_STOP_WORDS.union(new_stop_words)

#union() returns a frozenset, but recent scikit-learn releases validate that
#stop_words is 'english', a list or None, so a (sorted, deterministic) list
#is passed instead.
cv = CountVectorizer(stop_words=sorted(stop_words))

data_cv = cv.fit_transform(data_clean.lyrics)

#get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
#and fall back to the old name on releases that do not have it yet.
if hasattr(cv, "get_feature_names_out"):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

#Document-term matrix: one row per artist, one column per vocabulary term
data_dtm = pd.DataFrame(data_cv.toarray(), columns=feature_names)
data_dtm.index = data_clean.index

data_dtm.to_pickle("dtm.pkl")
data_clean.to_pickle('data_clean.pkl')
#Context manager so the pickle file is flushed and closed deterministically
with open("cv.pkl", "wb") as cv_file:
    pickle.dump(cv, cv_file)

#Reload the matrix and flip it to terms x artists for inspection
data = pd.read_pickle('dtm.pkl')
data = data.transpose()
data.head()


Unnamed: 0,The Beatles,The Rolling Stones,Elton John,Mariah Carey,Madonna,Michael Jackson,Taylor Swift,Stevie Wonder,Whitney Houston,Elvis Presley,Drake,Queen
aaron,0,0,0,0,0,0,0,0,0,0,1,0
abandonedly,0,0,0,4,0,0,0,0,0,0,0,0
abel,0,0,3,0,0,0,0,0,0,0,0,0
abigail,0,0,0,0,0,0,2,0,0,0,0,0
able,0,0,0,1,1,0,0,2,0,0,0,1
