In [1]:
# Auto Reload
%load_ext autoreload
%autoreload 2

In [2]:
# Import Modules
from collections import Counter
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import json
from pprint import pprint
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from math import log
import re


# NLTK Modules
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('tagsets')
from nltk import sent_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import chunk
from nltk.util import ngrams

# Import Custom Modules
from src.data_cleaner import *
from src.dummy_words import *

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\wesle\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\wesle\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\wesle\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package tagsets to
[nltk_data]     C:\Users\wesle\AppData\Roaming\nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


In [3]:
# Run Scraping Program
# !python src/web_scraper.py

# Intake Scraped Data

In [4]:
# Get Data
# data = pd.read_csv('data/data.csv')
data = pd.read_csv('data/all_data.csv')
data.head(3)

Unnamed: 0.1,Unnamed: 0,song,artist,featured,rank,year,lyrics,lyrics_state,song_id,lyrics_owner_id,primary_artist_url
0,0,Thrift Shop,Macklemore & Ryan Lewis,,1,2000,"""Hey, Macklemore, can we go thrift shopping?""\...",True,86538,3928,https://genius.com/artists/Macklemore-and-ryan...
1,1,Can't Hold Us,Macklemore & Ryan Lewis,,2,2000,"Hey, hey, hey\nGood to see you\nCome on, dude,...",True,57234,37383,https://genius.com/artists/Macklemore-and-ryan...
2,2,Holy Grail,Jay Z,,3,2000,You'd take the clothes off my back and I'd let...,True,177832,104344,https://genius.com/artists/Jay-z


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
Unnamed: 0            1000 non-null int64
song                  1000 non-null object
artist                1000 non-null object
featured              0 non-null float64
rank                  1000 non-null int64
year                  1000 non-null int64
lyrics                1000 non-null object
lyrics_state          1000 non-null bool
song_id               1000 non-null int64
lyrics_owner_id       1000 non-null int64
primary_artist_url    986 non-null object
dtypes: bool(1), float64(1), int64(5), object(4)
memory usage: 63.5+ KB


In [6]:
# data[data['lyrics'].isna()]

In [7]:
# Global Variables
song_idx = 0
data.iloc[song_idx]

Unnamed: 0                                                            0
song                                                        Thrift Shop
artist                                         Macklemore & Ryan Lewis 
featured                                                            NaN
rank                                                                  1
year                                                               2000
lyrics                "Hey, Macklemore, can we go thrift shopping?"\...
lyrics_state                                                       True
song_id                                                           86538
lyrics_owner_id                                                    3928
primary_artist_url    https://genius.com/artists/Macklemore-and-ryan...
Name: 0, dtype: object

# Text Cleaning

Creating pipeline for **indexing** song lyrics (document).

This will lead to **indexing** which creates a **signature** (vector) for each document.

Then, the **signatures** will be used for relating documents one to the other (and find out similar clusters of documents), or for mining underlying relations between concepts.

<img src="media/text-pipeline.png" width="100%"/>

### Bags of Words: Lyric String Example
> With the intake lyric data, it seems that there needs to be a couple things cleaned. Casing, punctuation, and new-lines

*Lyric String Passover 1*

In [8]:
sample = data['lyrics'][song_idx]
sample[:len(sample)//5]

'"Hey, Macklemore, can we go thrift shopping?"\nWhat what, what, what\nWhat what, what, what\nWhat what, what, what\nWhat what, what, what\nWhat what, what, what\nBada, bada, bada doo da\nWhat what, what, what\nBada, bada, bada doo da\nWhat what, what, what\nBada, bada, bada doo da\nBada, bada, bada doo da\nBada, bada, bada doo da\nBada, bada, bada doo da\nBada, bada, bada doo da\n\nI\'m gonna pop some tags\nOnly got 20 dollars in my pocket\nI\'m, I\'m, I\'m huntin\', lookin\' for a come up\nThis is fucking awesome\n\nNow\nWalk into the club like, "What up? I got a big cock"\nNah, I\'m just pumped, I bought some shit from a thrift shop\nIce on the fringe is so damn frosty\nThe people like, "Damn, that\'s a cold ass honkey"\nRollin\' in hella deep, headed to the mezzanine\nDressed in '

*Lyric String Passover 2*

In [9]:
# Clean New-line breaks, but preserve periods
data['lyrics'] = data['lyrics'].apply(lambda x: x.split('\n'))

In [10]:
sample = data['lyrics'][song_idx]
# Need to join due to splitting to list
# " ".join(sample[:len(sample)//5])
sample[:len(sample)//5]

['"Hey, Macklemore, can we go thrift shopping?"',
 'What what, what, what',
 'What what, what, what',
 'What what, what, what',
 'What what, what, what',
 'What what, what, what',
 'Bada, bada, bada doo da',
 'What what, what, what',
 'Bada, bada, bada doo da',
 'What what, what, what',
 'Bada, bada, bada doo da',
 'Bada, bada, bada doo da',
 'Bada, bada, bada doo da',
 'Bada, bada, bada doo da',
 'Bada, bada, bada doo da',
 '',
 "I'm gonna pop some tags",
 'Only got 20 dollars in my pocket',
 "I'm, I'm, I'm huntin', lookin' for a come up",
 'This is fucking awesome']

*Lyric String Passover 3*

In [11]:
# Use Custom Text Cleaning Function
data = clean_text(data, 'lyrics', 'lyrics')

In [12]:
sample = data['lyrics'][song_idx]
# Need to join due to splitting to list
# " ".join(sample[:len(sample)//5])
sample[:len(sample)//5]

['hey macklemore can we go thrift shopping',
 'what what what what',
 'what what what what',
 'what what what what',
 'what what what what',
 'what what what what',
 'bada bada bada doo da',
 'what what what what',
 'bada bada bada doo da',
 'what what what what',
 'bada bada bada doo da',
 'bada bada bada doo da',
 'bada bada bada doo da',
 'bada bada bada doo da',
 'bada bada bada doo da',
 '',
 'im gonna pop some tags',
 'only got 20 dollars in my pocket',
 'im im im huntin lookin for a come up',
 'this is fucking awesome']

### Create 'document' Feature

In [13]:
# Make Document Feature with Lyrics joined into one string (strips, negates whitespace)
data['clean_text'] = data['lyrics'].apply(lambda x: " ".join([i.strip() for i in x if i]))

In [14]:
sample = data['clean_text'][song_idx]
# Need to join due to splitting to list
sample[:len(sample)//5]

'hey macklemore can we go thrift shopping what what what what what what what what what what what what what what what what what what what what bada bada bada doo da what what what what bada bada bada doo da what what what what bada bada bada doo da bada bada bada doo da bada bada bada doo da bada bada bada doo da bada bada bada doo da im gonna pop some tags only got 20 dollars in my pocket im im im huntin lookin for a come up this is fucking awesome now walk into the club like what up i got a big cock nah im just pumped i bought some shit from a thrift shop ice on the fringe is so damn frosty the people like damn thats a cold ass honkey rollin in hella deep headed to the mezzanine dressed in all pin'

# Tokenization

# REMOVE EXPLETIVES

In [15]:
data['clean_text'] = data['clean_text'].apply(match_dummies)

In [16]:
# Create Sent Token Feature
data['sentences'] = data['clean_text'].apply(sent_tokenize)

In [17]:
# Show Sentences (truncated)
data['sentences'][song_idx][:10]

['hey macklemore can we go thrift shopping what what what what what what what what what what what what what what what what what what what what bada bada bada doo da what what what what bada bada bada doo da what what what what bada bada bada doo da bada bada bada doo da bada bada bada doo da bada bada bada doo da bada bada bada doo da im gonna pop some tags only got 20 dollars in my pocket im im im huntin lookin for a come up this is expletive_3 awesome now walk into the club like what up i got a big expletive_7 nah im just pumped i bought some expletive_4 from a thrift shop ice on the fringe is so damn frosty the people like damn thats a cold expletive_6 honkey rollin in hella deep headed to the mezzanine dressed in all pink cept my gator sexpletive_5 those are green draped in a leopard mink girl standin next to me probably shouldve washed this smells like r kelly sheets expletive_12 but expletive_4 it was 99 cents expletive_3 it coppin it washin it bout to go and get some compliments

In [18]:
# Create tokens for each song
data['tokens'] = data['clean_text'].apply(word_tokenize)
print(data['tokens'][song_idx][:20])

['hey', 'macklemore', 'can', 'we', 'go', 'thrift', 'shopping', 'what', 'what', 'what', 'what', 'what', 'what', 'what', 'what', 'what', 'what', 'what', 'what', 'what']


# Filtration (Stop-words, punctiation, etc)

In [19]:
# Filter Punctuation
data['tokens'] = data['tokens'].apply(lambda x: [i for i in x if i not in string.punctuation])
print(data['tokens'][song_idx][:20])

['hey', 'macklemore', 'can', 'we', 'go', 'thrift', 'shopping', 'what', 'what', 'what', 'what', 'what', 'what', 'what', 'what', 'what', 'what', 'what', 'what', 'what']


In [20]:
# Filter Stop Words
stop_words = set(stopwords.words('english'))
stop_words = re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", " ".join(stop_words)).split() + ['im', 'ill']
data['tokens_stop'] = data['tokens'].apply(lambda x: [i for i in x if i not in stop_words])
print(data['tokens_stop'][song_idx][:20])

['hey', 'macklemore', 'go', 'thrift', 'shopping', 'bada', 'bada', 'bada', 'doo', 'da', 'bada', 'bada', 'bada', 'doo', 'da', 'bada', 'bada', 'bada', 'doo', 'da']


### Compare Tokens to Lyric Unique Words (from whole document)

In [21]:
print("Tokens: ")
print(data['tokens'][song_idx][:20])
print("Set (from non tokens): ")
print(list(set(data['clean_text'][song_idx].split()))[:20])

Tokens: 
['hey', 'macklemore', 'can', 'we', 'go', 'thrift', 'shopping', 'what', 'what', 'what', 'what', 'what', 'what', 'what', 'what', 'what', 'what', 'what', 'what', 'what']
Set (from non tokens): 
['oh', 'dookie', 'a', 'road', 'expletive_7', 'zebra', 'but', 'about', 'big', 'wolf', 'gucci', 'yeah', 'been', 'next', 'other', 'tags', 'pink', 'thrift', 'leather', 'call']


# Stem or Lemmatize Words

In [22]:
# Create Porter, Snowball, WordNet Objects
porter = PorterStemmer()
snowball = SnowballStemmer('english')
wordnet = WordNetLemmatizer()

# Get functions from each object
porter_func = porter.stem
snowball_func = snowball.stem
wordnet_func = wordnet.lemmatize

# Create lambda func to easily apply func to each token
get_root = lambda tokens, func: [func(token) for token in tokens] 

In [23]:
# Get Tokens for each type of processor
porter_tokens = data['tokens'].apply(lambda x: get_root(x, porter_func)) 
snowball_tokens = data['tokens'].apply(lambda x: get_root(x, snowball_func)) 
wordnet_tokens = data['tokens'].apply(lambda x: get_root(x, wordnet_func)) 

### Results

In [24]:
## Print the stemmed and lemmatized words from the target document
print("%16s | %16s | %16s | %16s |" % ("WORD", "PORTER", "SNOWBALL", "LEMMATIZER"))
for i in range(min(len(porter_tokens[song_idx]), len(snowball_tokens[song_idx]), len(wordnet_tokens[song_idx]))):
    p, s, w = porter_tokens[song_idx][i], snowball_tokens[song_idx][i], wordnet_tokens[song_idx][i]
    if len(set((p, s, w))) != 1:
        print("%16s | %16s | %16s | %16s |" % (data['tokens'][song_idx][i], p, s, w))

            WORD |           PORTER |         SNOWBALL |       LEMMATIZER |
      macklemore |        macklemor |        macklemor |       macklemore |
        shopping |             shop |             shop |         shopping |
            only |             onli |             onli |             only |
            this |              thi |             this |             this |
         awesome |           awesom |           awesom |          awesome |
          pumped |             pump |             pump |           pumped |
          fringe |            fring |            fring |           fringe |
          frosty |           frosti |           frosti |           frosty |
          people |            peopl |            peopl |           people |
           thats |             that |             that |            thats |
          headed |             head |             head |           headed |
       mezzanine |         mezzanin |         mezzanin |        mezzanine |
         dre

In conclusion, the results show that using any type of stemmer or lemmatizer seemed to detract from the words rather than help center them. These methods of word procession are not able to account for the colloqualisms that come from the language of rap. Therefore we will not proceed with using this for any word processing.

# More Features Added

Now that the tokens are extracted from the lyric set, it's time to create a new feature with the SET of tokens for ease of use

In [25]:
# Create Token Set Feature
data['token_set'] = data['tokens'].apply(lambda x: list(set(x)))
data['token_set'][song_idx][:10]

['oh',
 'dookie',
 'a',
 'road',
 'expletive_7',
 'zebra',
 'but',
 'about',
 'big',
 'wolf']

# N-Grams

It might be useful to see if N-Grams would give us a better list of tokens, since most rap lyrics involve heavy use of consecutive and connected words

In [26]:
list(ngrams(data['tokens'][song_idx], 2))[:10]

[('hey', 'macklemore'),
 ('macklemore', 'can'),
 ('can', 'we'),
 ('we', 'go'),
 ('go', 'thrift'),
 ('thrift', 'shopping'),
 ('shopping', 'what'),
 ('what', 'what'),
 ('what', 'what'),
 ('what', 'what')]

# POS Tagging

In [27]:
# Minimalize Tag Functions
pos_tagger = nltk.pos_tag
explain_tag = nltk.help.upenn_tagset

# Get Sample Tags
tag_sample = np.array(pos_tagger(set(data['tokens'][song_idx]))[:10])
tag_sample

array([['oh', 'UH'],
       ['dookie', 'NN'],
       ['a', 'DT'],
       ['road', 'NN'],
       ['expletive_7', 'NN'],
       ['zebra', 'NN'],
       ['but', 'CC'],
       ['about', 'IN'],
       ['big', 'JJ'],
       ['wolf', 'NN']], dtype='<U11')

In [28]:
# Split Words and Tags
words, tags = tag_sample[:, 0], tag_sample[:, 1]

# Create DF to Groupby
tag_df = pd.DataFrame({'words':words, 'tags':tags})
grouped_tags = tag_df.groupby('tags')

for x in grouped_tags:
    word = x[1]['words']
    explain_tag(x[0])
    print(f'WORDS: {word.tolist()}')


CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
WORDS: ['but']
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
WORDS: ['a']
IN: preposition or conjunction, subordinating
    astride among uppon whether out inside pro despite on by throughout
    below within for towards near behind atop around if like until below
    next into if beside ...
WORDS: ['about']
JJ: adjective or numeral, ordinal
    third ill-mannered pre-war regrettable oiled calamitous first separable
    ectoplasmic battery-powered participatory fourth still-to-be-named
    multilingual multi-disciplinary ...
WORDS: ['big']
NN: noun, common, singular or mass
    common-carrier cabbage knuckle-duster Casino afghan shed thermostat
    investment slide humour falloff slick wind hyena override subhumanity
    machinist ...
WORDS: ['do

Using POS Tagging seems to give more insight to the words and what they represent, but similarly to the Stemmers/Lemmatizers, they seem to also miscategorize things. The word 'patek' is actually a brand reference to Patek Watches, a luxury watch brand, and is not a verb.

In [29]:
# Drop Extra Column
data = data.drop(columns=['Unnamed: 0'])

In [30]:
# data['tokens']

# Finally, export data to clean_data.pkl

In [31]:
# data.to_pickle('data/clean_data.pkl')

In [32]:
data.to_pickle('data/all_clean_data.pkl')