In [1]:
# Auto Reload
%load_ext autoreload
%autoreload 2

In [2]:
# Import Modules
from collections import Counter
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import json
from pprint import pprint
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from math import log
import re


# NLTK Modules
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('tagsets')
from nltk import sent_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import chunk
from nltk.util import ngrams

# Import Custom Modules
from src.data_cleaner import *
from src.dummy_words import *

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\wesle\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\wesle\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\wesle\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package tagsets to
[nltk_data]     C:\Users\wesle\AppData\Roaming\nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


In [3]:
# Run Scraping Program
# !python src/web_scraper.py

# Intake Scraped Data

In [4]:
# Get Data
data = pd.read_csv('data/data.csv')
data.head(3)

Unnamed: 0.1,Unnamed: 0,song,artist,featured,rank,year,lyrics,lyrics_state,song_id,lyrics_owner_id,primary_artist_url
0,0,See You Again,Wiz Khalifa,,1,2015,"It's been a long day without you, my friend\nA...",True,720401,341761,https://genius.com/artists/Wiz-khalifa
1,1,Trap Queen,Fetty Wap,,2,2015,"RGF productions\nRemy Boyz, yah-ah\n1738, ayy\...",True,496445,104344,https://genius.com/artists/Fetty-wap
2,2,Watch Me,Silento,,3,2015,"Whip, nae nae\nWhip, whip, nae nae\nWhip, nae ...",True,1743010,1696010,https://genius.com/artists/Silento


In [5]:
# Global Variables
song_idx = 0
data.iloc[song_idx]

Unnamed: 0                                                            0
song                                                      See You Again
artist                                                     Wiz Khalifa 
featured                                                            NaN
rank                                                                  1
year                                                               2015
lyrics                It's been a long day without you, my friend\nA...
lyrics_state                                                       True
song_id                                                          720401
lyrics_owner_id                                                  341761
primary_artist_url               https://genius.com/artists/Wiz-khalifa
Name: 0, dtype: object

# Text Cleaning

Creating pipeline for **indexing** song lyrics (document).

This will lead to **indexing** which creates a **signature** (vector) for each document.

Then, the **signatures** will be used for relating documents one to the other (and find out similar clusters of documents), or for mining underlying relations between concepts.

<img src="media/text-pipeline.png" width="100%"/>

### Bags of Words: Lyric String Example
> With the intake lyric data, it seems that there needs to be a couple things cleaned. Casing, punctuation, and new-lines

*Lyric String Passover 1*

In [6]:
sample = data['lyrics'][song_idx]
sample[:len(sample)//5]

"It's been a long day without you, my friend\nAnd I'll tell you all about it when I see you again\nWe've come a long way from where we began\nOh, I'll tell you all about it when I see you again\nWhen I see you again\n\nDamn, who knew?\nAll the planes we flew, good things we been through\nThat I'd be standing right here talking to you\n'Bout another path, I know we loved to hit the road and laugh\nBut something told me that it wouldn't last\nHad to switch"

*Lyric String Passover 2*

In [7]:
# Clean New-line breaks, but preserve periods
data['lyrics'] = data['lyrics'].apply(lambda x: x.split('\n'))

In [8]:
sample = data['lyrics'][song_idx]
# Need to join due to splitting to list
# " ".join(sample[:len(sample)//5])
sample[:len(sample)//5]

["It's been a long day without you, my friend",
 "And I'll tell you all about it when I see you again",
 "We've come a long way from where we began",
 "Oh, I'll tell you all about it when I see you again",
 'When I see you again',
 '',
 'Damn, who knew?',
 'All the planes we flew, good things we been through',
 "That I'd be standing right here talking to you",
 "'Bout another path, I know we loved to hit the road and laugh",
 "But something told me that it wouldn't last",
 'Had to switch up, look at things different, see the bigger picture']

*Lyric String Passover 3*

In [9]:
# Use Custom Text Cleaning Function
data = clean_text(data, 'lyrics', 'lyrics')

In [10]:
sample = data['lyrics'][song_idx]
# Need to join due to splitting to list
# " ".join(sample[:len(sample)//5])
sample[:len(sample)//5]

['its been a long day without you my friend',
 'and ill tell you all about it when i see you again',
 'weve come a long way from where we began',
 'oh ill tell you all about it when i see you again',
 'when i see you again',
 '',
 'damn who knew',
 'all the planes we flew good things we been through',
 'that id be standing right here talking to you',
 'bout another path i know we loved to hit the road and laugh',
 'but something told me that it wouldnt last',
 'had to switch up look at things different see the bigger picture']

### Create 'document' Feature

In [11]:
# Make Document Feature with Lyrics joined into one string (strips, negates whitespace)
data['clean_text'] = data['lyrics'].apply(lambda x: " ".join([i.strip() for i in x if i]))

In [12]:
sample = data['clean_text'][song_idx]
# Need to join due to splitting to list
sample[:len(sample)//5]

'its been a long day without you my friend and ill tell you all about it when i see you again weve come a long way from where we began oh ill tell you all about it when i see you again when i see you again damn who knew all the planes we flew good things we been through that id be standing right here talking to you bout another path i know we loved to hit the road and laugh but something told me that it wouldnt last had to '

# Tokenization

# REMOVE EXPLETIVES

In [13]:
data['clean_text'] = data['clean_text'].apply(match_dummies)

In [14]:
# Create Sent Token Feature
data['sentences'] = data['clean_text'].apply(sent_tokenize)

In [15]:
# Show Sentences (truncated)
data['sentences'][song_idx][:10]

['its been a long day without you my friend and ill tell you all about it when i see you again weve come a long way from where we began oh ill tell you all about it when i see you again when i see you again damn who knew all the planes we flew good things we been through that id be standing right here talking to you bout another path i know we loved to hit the road and laugh but something told me that it wouldnt last had to switch up look at things different see the bigger picture those were the days hard work forever pays now i see you in a better place see you in a better place uh how could we not talk about family when familys all that we got everything i went through you were standing there by my side and now you gon be with me for the last ride its been a long day without you my friend and ill tell you all about it when i see you again ill see you again weve come a long way yeah we came a long way from where we began you know we started oh ill tell you all about it when i see you 

In [16]:
# Create tokens for each song
data['tokens'] = data['clean_text'].apply(word_tokenize)
print(data['tokens'][song_idx][:20])

['its', 'been', 'a', 'long', 'day', 'without', 'you', 'my', 'friend', 'and', 'ill', 'tell', 'you', 'all', 'about', 'it', 'when', 'i', 'see', 'you']


# Filtration (Stop-words, punctiation, etc)

In [17]:
# Filter Punctuation
data['tokens'] = data['tokens'].apply(lambda x: [i for i in x if i not in string.punctuation])
print(data['tokens'][song_idx][:20])

['its', 'been', 'a', 'long', 'day', 'without', 'you', 'my', 'friend', 'and', 'ill', 'tell', 'you', 'all', 'about', 'it', 'when', 'i', 'see', 'you']


In [18]:
# Filter Stop Words
stop_words = set(stopwords.words('english'))
stop_words = re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", " ".join(stop_words)).split() + ['im', 'ill']
data['tokens_stop'] = data['tokens'].apply(lambda x: [i for i in x if i not in stop_words])
print(data['tokens_stop'][song_idx][:20])

['long', 'day', 'without', 'friend', 'tell', 'see', 'weve', 'come', 'long', 'way', 'began', 'oh', 'tell', 'see', 'see', 'damn', 'knew', 'planes', 'flew', 'good']


### Compare Tokens to Lyric Unique Words (from whole document)

In [19]:
print("Tokens: ")
print(data['tokens'][song_idx][:20])
print("Set (from non tokens): ")
print(list(set(data['clean_text'][song_idx].split()))[:20])

Tokens: 
['its', 'been', 'a', 'long', 'day', 'without', 'you', 'my', 'friend', 'and', 'ill', 'tell', 'you', 'all', 'about', 'it', 'when', 'i', 'see', 'you']
Set (from non tokens): 
['about', 'day', 'bigger', 'me', 'uh', 'how', 'came', 'place', 'ride', 'strong', 'get', 'is', 'friend', 'drawn', 'switch', 'always', 'take', 'picture', 'got', 'path']


# Stem or Lemmatize Words

In [20]:
# Create Porter, Snowball, WordNet Objects
porter = PorterStemmer()
snowball = SnowballStemmer('english')
wordnet = WordNetLemmatizer()

# Get functions from each object
porter_func = porter.stem
snowball_func = snowball.stem
wordnet_func = wordnet.lemmatize

# Create lambda func to easily apply func to each token
get_root = lambda tokens, func: [func(token) for token in tokens] 

In [21]:
# Get Tokens for each type of processor
porter_tokens = data['tokens'].apply(lambda x: get_root(x, porter_func)) 
snowball_tokens = data['tokens'].apply(lambda x: get_root(x, snowball_func)) 
wordnet_tokens = data['tokens'].apply(lambda x: get_root(x, wordnet_func)) 

### Results

In [22]:
## Print the stemmed and lemmatized words from the target document
print("%16s | %16s | %16s | %16s |" % ("WORD", "PORTER", "SNOWBALL", "LEMMATIZER"))
for i in range(min(len(porter_tokens[song_idx]), len(snowball_tokens[song_idx]), len(wordnet_tokens[song_idx]))):
    p, s, w = porter_tokens[song_idx][i], snowball_tokens[song_idx][i], wordnet_tokens[song_idx][i]
    if len(set((p, s, w))) != 1:
        print("%16s | %16s | %16s | %16s |" % (data['tokens'][song_idx][i], p, s, w))

            WORD |           PORTER |         SNOWBALL |       LEMMATIZER |
        standing |            stand |            stand |         standing |
         talking |             talk |             talk |          talking |
         another |            anoth |            anoth |          another |
           loved |             love |             love |            loved |
       something |           someth |           someth |        something |
       different |           differ |           differ |        different |
         picture |           pictur |           pictur |          picture |
         forever |            forev |            forev |          forever |
          family |           famili |           famili |           family |
         familys |           famili |           famili |           family |
      everything |          everyth |          everyth |       everything |
        standing |            stand |            stand |         standing |
         sta

In conclusion, the results show that using any type of stemmer or lemmatizer seemed to detract from the words rather than help center them. These methods of word procession are not able to account for the colloqualisms that come from the language of rap. Therefore we will not proceed with using this for any word processing.

# More Features Added

Now that the tokens are extracted from the lyric set, it's time to create a new feature with the SET of tokens for ease of use

In [23]:
# Create Token Set Feature
data['token_set'] = data['tokens'].apply(lambda x: list(set(x)))
data['token_set'][song_idx][:10]

['about',
 'day',
 'bigger',
 'me',
 'uh',
 'how',
 'came',
 'place',
 'ride',
 'strong']

# N-Grams

It might be useful to see if N-Grams would give us a better list of tokens, since most rap lyrics involve heavy use of consecutive and connected words

In [24]:
list(ngrams(data['tokens'][song_idx], 2))[:10]

[('its', 'been'),
 ('been', 'a'),
 ('a', 'long'),
 ('long', 'day'),
 ('day', 'without'),
 ('without', 'you'),
 ('you', 'my'),
 ('my', 'friend'),
 ('friend', 'and'),
 ('and', 'ill')]

# POS Tagging

In [25]:
# Minimalize Tag Functions
pos_tagger = nltk.pos_tag
explain_tag = nltk.help.upenn_tagset

# Get Sample Tags
tag_sample = np.array(pos_tagger(set(data['tokens'][song_idx]))[:10])
tag_sample

array([['about', 'RB'],
       ['day', 'NN'],
       ['bigger', 'JJR'],
       ['me', 'PRP'],
       ['uh', 'VB'],
       ['how', 'WRB'],
       ['came', 'VBD'],
       ['place', 'NN'],
       ['ride', 'NN'],
       ['strong', 'JJ']], dtype='<U6')

In [26]:
# Split Words and Tags
words, tags = tag_sample[:, 0], tag_sample[:, 1]

# Create DF to Groupby
tag_df = pd.DataFrame({'words':words, 'tags':tags})
grouped_tags = tag_df.groupby('tags')

for x in grouped_tags:
    word = x[1]['words']
    explain_tag(x[0])
    print(f'WORDS: {word.tolist()}')


JJ: adjective or numeral, ordinal
    third ill-mannered pre-war regrettable oiled calamitous first separable
    ectoplasmic battery-powered participatory fourth still-to-be-named
    multilingual multi-disciplinary ...
WORDS: ['strong']
JJR: adjective, comparative
    bleaker braver breezier briefer brighter brisker broader bumper busier
    calmer cheaper choosier cleaner clearer closer colder commoner costlier
    cozier creamier crunchier cuter ...
WORDS: ['bigger']
NN: noun, common, singular or mass
    common-carrier cabbage knuckle-duster Casino afghan shed thermostat
    investment slide humour falloff slick wind hyena override subhumanity
    machinist ...
WORDS: ['day', 'place', 'ride']
PRP: pronoun, personal
    hers herself him himself hisself it itself me myself one oneself ours
    ourselves ownself self she thee theirs them themselves they thou thy us
WORDS: ['me']
RB: adverb
    occasionally unabatingly maddeningly adventurously professedly
    stirringly prominently t

Using POS Tagging seems to give more insight to the words and what they represent, but similarly to the Stemmers/Lemmatizers, they seem to also miscategorize things. The word 'patek' is actually a brand reference to Patek Watches, a luxury watch brand, and is not a verb.

In [27]:
# Drop Extra Column
data = data.drop(columns=['Unnamed: 0'])

In [28]:
# data['tokens']

# Finally, export data to clean_data.pkl

In [29]:
data.to_pickle('data/clean_data.pkl')