In [None]:
# Auto Reload
# Load IPython's autoreload extension and switch it to mode 2, so that edits to
# imported project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Import Modules
from collections import Counter
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import json
from pprint import pprint
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from math import log
import re


# NLTK Modules
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk import sent_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords

# Import Custom Modules
from src.data_cleaner import *

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\wesle\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\wesle\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\wesle\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# Run Scraping Program
# !python src/web_scraper.py

# Intake Scraped Data

In [4]:
# Get Data
# Read the scraped song table from the CSV file and preview its first three rows.
# NOTE(review): the preview shows 'Unnamed: 0' and 'Unnamed: 0.1' columns, which look
# like index columns written out by earlier saves — confirm, and consider dropping
# them or writing the file with index=False.
data = pd.read_csv('data/data.csv')
data.head(3)

Unnamed: 0.1,Unnamed: 0,song,artist,featured,rank,year,lyrics,lyrics_state,song_id,lyrics_owner_id,primary_artist_url
0,0,See You Again,Wiz Khalifa,,1,2015,"It's been a long day without you, my friend\nA...",True,720401,341761,https://genius.com/artists/Wiz-khalifa
1,1,Trap Queen,Fetty Wap,,2,2015,"RGF productions\nRemy Boyz, yah-ah\n1738, ayy\...",True,496445,104344,https://genius.com/artists/Fetty-wap
2,2,Watch Me,Silento,,3,2015,"Whip, nae nae\nWhip, whip, nae nae\nWhip, nae ...",True,1743010,1696010,https://genius.com/artists/Silento


In [5]:
# Global Variables
# Example song used throughout the notebook to illustrate each processing step.
SONG_TITLE = "Rake It Up"

# Look the row up by title instead of hard-coding its position, so the example
# keeps pointing at the right song if data.csv is re-scraped or re-ordered.
song_rows = data[data['song'] == SONG_TITLE]
if song_rows.empty:
    raise KeyError("Example song %r not found in data['song']" % SONG_TITLE)

# Index label of the first matching row (116 in the recorded run).
song_idx = song_rows.index[0]
song_rows

Unnamed: 0.1,Unnamed: 0,song,artist,featured,rank,year,lyrics,lyrics_state,song_id,lyrics_owner_id,primary_artist_url
116,116,Rake It Up,Yo Gotti,,17,2017,"Ear Drummers\n30, you a fool for this one\nAh,...",True,3105241,250794,https://genius.com/artists/Yo-gotti


# Text Cleaning

### Lyric String Example
> The raw lyric data needs a few things cleaned up: casing, punctuation, and new-lines.

*Lyric String Passover 1*

In [6]:
# Preview the first fifth of the raw lyric string for the example song
sample = data.loc[song_idx, 'lyrics']
preview_len = len(sample) // 5
sample[:preview_len]

'Ear Drummers\n30, you a fool for this one\nAh, this the strip club anthem, nigga, what\'s up?\nYoung Money!\nYeah, me and Mike WiLL pull up to AOD back to back\nThem AMG 63\'s\nMike WiLL Made-It, nigga\n\nI tell all my hoes, "Rake it up\nBreak it down, bag it up"\nFuck it up, fuck it up (fuck it up, fuck it up)\nBack it up, back it up (back it up, back it up)\nRake it up, rake it up (rake it up, rake it up)\nBack it up, back it up (back it up, back it up)\nI tell all my hoes (what?) "Rake it up\nBreak it down, bag it up" (bag it up, bag it up)\nFuck it up, fuck it up (fuck it up)\nFuck it up, fuck it up (fuck it up)\nFuck it up, fuck it up (fuck it up)\nRake it up, rake it up (rake it up)\n\nI made love to a stripper (stripper), first I had to tip her (phrrr)\nTwenty thousand ones (woo), she said I\'m that nigga (I am)\nI said, "I\'m that nigga, bitch, I already know it" (I know it)\nI come with bad weather (ksh), they say I\'m a storm (ayy)\nVVS\'s in my cha'

*Lyric String Passover 2*

In [7]:
# Turn each song's lyric string into a list of lines by splitting on '\n'.
# Not re-runnable as-is: after one pass the column holds lists, which have no .split().
data['lyrics'] = data['lyrics'].map(lambda lyric_text: lyric_text.split('\n'))

In [8]:
# 'lyrics' now holds a list of lines per song; show the first fifth of the
# example song's lines
sample = data.loc[song_idx, 'lyrics']
line_count = len(sample) // 5
sample[:line_count]

['Ear Drummers',
 '30, you a fool for this one',
 "Ah, this the strip club anthem, nigga, what's up?",
 'Young Money!',
 'Yeah, me and Mike WiLL pull up to AOD back to back',
 "Them AMG 63's",
 'Mike WiLL Made-It, nigga',
 '',
 'I tell all my hoes, "Rake it up',
 'Break it down, bag it up"',
 'Fuck it up, fuck it up (fuck it up, fuck it up)',
 'Back it up, back it up (back it up, back it up)',
 'Rake it up, rake it up (rake it up, rake it up)',
 'Back it up, back it up (back it up, back it up)',
 'I tell all my hoes (what?) "Rake it up',
 'Break it down, bag it up" (bag it up, bag it up)',
 'Fuck it up, fuck it up (fuck it up)',
 'Fuck it up, fuck it up (fuck it up)',
 'Fuck it up, fuck it up (fuck it up)',
 'Rake it up, rake it up (rake it up)',
 '',
 'I made love to a stripper (stripper), first I had to tip her (phrrr)']

*Lyric String Passover 3*

In [9]:
# Use Custom Text Cleaning Function
# clean_text is not defined in this notebook; presumably it comes from the
# star-import of src.data_cleaner — verify.
# Judging by the before/after samples shown around this cell, it lower-cases each
# lyric line and strips punctuation — TODO confirm against the helper's source.
# NOTE(review): the arguments look like (frame, input column, output column);
# passing 'lyrics' twice would then overwrite the column — confirm.
data = clean_text(data, 'lyrics', 'lyrics')

In [10]:
# After cleaning, 'lyrics' is still a list of lines; show the first fifth of the
# example song's cleaned lines
sample = data.loc[song_idx, 'lyrics']
line_count = len(sample) // 5
sample[:line_count]

['ear drummers',
 '30 you a fool for this one',
 'ah this the strip club anthem nigga whats up',
 'young money',
 'yeah me and mike will pull up to aod back to back',
 'them amg 63s',
 'mike will madeit nigga',
 '',
 'i tell all my hoes rake it up',
 'break it down bag it up',
 'fuck it up fuck it up fuck it up fuck it up',
 'back it up back it up back it up back it up',
 'rake it up rake it up rake it up rake it up',
 'back it up back it up back it up back it up',
 'i tell all my hoes what rake it up',
 'break it down bag it up bag it up bag it up',
 'fuck it up fuck it up fuck it up',
 'fuck it up fuck it up fuck it up',
 'fuck it up fuck it up fuck it up',
 'rake it up rake it up rake it up',
 '',
 'i made love to a stripper stripper first i had to tip her phrrr']

### Create 'document' Feature

In [11]:
# Make Document Feature: join each song's cleaned lines into one string,
# separating lines with ". " so they can later be split back into sentences.
def join_lines(lines):
    """Return the stripped, non-blank entries of `lines` joined with '. '.

    Lines are stripped BEFORE the emptiness check so that whitespace-only lines
    are dropped as well; checking first would let them through and leave empty
    ". ." sentences in the document.
    """
    stripped = (line.strip() for line in lines)
    return ". ".join(line for line in stripped if line)


data['clean_text'] = data['lyrics'].apply(join_lines)

In [12]:
# 'clean_text' is a single string per song; show the first fifth of the
# example song's document
sample = data.loc[song_idx, 'clean_text']
preview_len = len(sample) // 5
sample[:preview_len]

'ear drummers. 30 you a fool for this one. ah this the strip club anthem nigga whats up. young money. yeah me and mike will pull up to aod back to back. them amg 63s. mike will madeit nigga. i tell all my hoes rake it up. break it down bag it up. fuck it up fuck it up fuck it up fuck it up. back it up back it up back it up back it up. rake it up rake it up rake it up rake it up. back it up back it up back it up back it up. i tell all my hoes what rake it up. break it down bag it up bag it up bag it up. fuck it up fuck it up fuck it up. fuck it up fuck it up fuck it up. fuck it up fuck it up fuck it up. rake it up rake it up rake it up. i made love to a stripper stripper first i had to tip her phrrr. twenty thousand ones woo she said im that nigga i am. i said im that nigga bitch i already know it i know it. i come with bad weather ksh they say im a storm ayy. vvss in my charm that'

# Tokenization

In [13]:
# Sentence-tokenize every song document into a new 'sentences' column
song_documents = data['clean_text']
data['sentences'] = song_documents.map(sent_tokenize)

In [14]:
# Peek at the first ten sentences of the example song
example_sentences = data.loc[song_idx, 'sentences']
example_sentences[:10]

['ear drummers.',
 '30 you a fool for this one.',
 'ah this the strip club anthem nigga whats up.',
 'young money.',
 'yeah me and mike will pull up to aod back to back.',
 'them amg 63s.',
 'mike will madeit nigga.',
 'i tell all my hoes rake it up.',
 'break it down bag it up.',
 'fuck it up fuck it up fuck it up fuck it up.']

In [15]:
# Word-tokenize each song document into a flat list of tokens
data['tokens'] = data['clean_text'].map(word_tokenize)

# Show the first 20 tokens of the example song
example_tokens = data.loc[song_idx, 'tokens']
print(example_tokens[:20])

['ear', 'drummers', '.', '30', 'you', 'a', 'fool', 'for', 'this', 'one', '.', 'ah', 'this', 'the', 'strip', 'club', 'anthem', 'nigga', 'whats', 'up']


# Filtration (Stop-words, punctuation, etc.)

In [16]:
# Filter Punctuation
# string.punctuation is a str, so `token in string.punctuation` is a substring
# test: it only catches tokens that are a contiguous slice of that string and
# lets multi-character punctuation tokens such as '...' or '``' through.
# Compare character by character against a set instead.
PUNCTUATION = frozenset(string.punctuation)


def drop_punctuation(tokens):
    """Return `tokens` without the empty and all-punctuation entries."""
    return [token for token in tokens if not PUNCTUATION.issuperset(token)]


data['tokens'] = data['tokens'].apply(drop_punctuation)
print(data['tokens'][song_idx][:20])

['ear', 'drummers', '30', 'you', 'a', 'fool', 'for', 'this', 'one', 'ah', 'this', 'the', 'strip', 'club', 'anthem', 'nigga', 'whats', 'up', 'young', 'money']


In [17]:
# Drop English stop words (NLTK's list) from every song's token list
stop_words = set(stopwords.words('english'))


def _without_stop_words(tokens):
    """Return the tokens that are not in `stop_words`, order preserved."""
    kept = []
    for token in tokens:
        if token not in stop_words:
            kept.append(token)
    return kept


data['tokens'] = data['tokens'].apply(_without_stop_words)
print(data['tokens'][song_idx][:20])

['ear', 'drummers', '30', 'fool', 'one', 'ah', 'strip', 'club', 'anthem', 'nigga', 'whats', 'young', 'money', 'yeah', 'mike', 'pull', 'aod', 'back', 'back', 'amg']


### Compare Tokens to Lyric Unique Words (from whole document)

In [18]:
# Compare the filtered tokens with the unique whitespace-separated words of the
# document. The set has no defined order, and its words come straight from
# str.split(), so they keep any attached '.'.
example_tokens = data['tokens'][song_idx]
unique_words = set(data['clean_text'][song_idx].split())

print("Tokens: ", example_tokens[:20], sep="\n")
print("Set (from non tokens): ", list(unique_words)[:20], sep="\n")

Tokens: 
['ear', 'drummers', '30', 'fool', 'one', 'ah', 'strip', 'club', 'anthem', 'nigga', 'whats', 'young', 'money', 'yeah', 'mike', 'pull', 'aod', 'back', 'back', 'amg']
Set (from non tokens): 
['punani', 'strip', 'her', 'still', 'throw', 'dough.', 'got', 'club', 'be', 'about.', 'big', 'pussy', 'lil', 'pull', 'drummers.', 'baby', 'social', 'thats', 'ima', 'statement']


# Stem or Lemmatize Words

In [19]:
# Instantiate the two stemmers and the lemmatizer under comparison
porter, snowball, wordnet = PorterStemmer(), SnowballStemmer('english'), WordNetLemmatizer()

# Bound methods that map a single token to its root form
porter_func, snowball_func, wordnet_func = porter.stem, snowball.stem, wordnet.lemmatize


def get_root(tokens, func):
    """Apply `func` to every token in `tokens` and return the results as a list."""
    return list(map(func, tokens))

In [20]:
# Run every song's token list through each processor; one Series of root-form
# token lists per processor (Porter, Snowball, WordNet — in that order)
porter_tokens, snowball_tokens, wordnet_tokens = (
    data['tokens'].apply(get_root, args=(root_func,))
    for root_func in (porter_func, snowball_func, wordnet_func)
)

### Results

In [21]:
## Print the stemmed and lemmatized words from the target document
ROW_FORMAT = "%16s | %16s | %16s | %16s |"
print(ROW_FORMAT % ("WORD", "PORTER", "SNOWBALL", "LEMMATIZER"))

original_tokens = data['tokens'][song_idx]
root_triples = zip(porter_tokens[song_idx], snowball_tokens[song_idx], wordnet_tokens[song_idx])
for position, (p, s, w) in enumerate(root_triples):
    # Rows where all three processors agree carry no information; skip them
    if p == s == w:
        continue
    print(ROW_FORMAT % (original_tokens[position], p, s, w))

            WORD |           PORTER |         SNOWBALL |       LEMMATIZER |
           whats |             what |             what |            whats |
             63s |               63 |              63s |              63s |
          twenty |           twenti |           twenti |           twenty |
         already |          alreadi |          alreadi |          already |
             ayy |              ayi |              ayi |              ayy |
           thats |             that |             that |            thats |
        phillipe |          phillip |          phillip |         phillipe |
          picked |             pick |             pick |           picked |
           pussy |            pussi |            pussi |            pussy |
           pussy |            pussi |            pussi |            pussy |
           pussy |            pussi |            pussi |            pussy |
           pussy |            pussi |            pussi |            pussy |
           a

In conclusion, the results show that applying any of the stemmers or the lemmatizer detracted from the words rather than helping to reduce them to a common root. These word-processing methods cannot account for the colloquialisms found in the language of rap. Therefore, we will not use them in any further word processing.

In [75]:
sentence = 'Whats good nigga you fucking bitch nigga pussy'

bad_prefixes = ['nig', 'bitc', 'puss', 'fuc']
replace_prefixes = ['nibba', 'binch', 'boos', 'duck']


def replace_prefixed_words(text, prefix, replacement):
    """Replace every whole word in `text` that starts with `prefix`.

    Parameters:
        text: string to search.
        prefix: literal word prefix; it is escaped, so regex metacharacters in
            it have no special meaning.
        replacement: literal string substituted for each matching word.

    Returns:
        A copy of `text` with each matching word replaced.

    The leading ``\\b`` anchors the prefix to the start of a word. Without it
    the prefix also matches in the middle of a word, e.g. "tonight" would be
    rewritten because it contains "nig".
    """
    pattern = r'\b{bad_prefix}\w*'.format(bad_prefix=re.escape(prefix))
    # A callable replacement keeps any backslashes in `replacement` literal.
    return re.sub(pattern, lambda _match: replacement, text)


# Each pass starts again from the original sentence, so every printed line shows
# the effect of a single prefix rule on its own.
for pre, rep in zip(bad_prefixes, replace_prefixes):
    res = replace_prefixed_words(sentence, pre, rep)
    print(res)

Whats good nibba you fucking bitch nibba pussy
Whats good nigga you fucking binch nigga pussy
Whats good nigga you fucking bitch nigga boos
Whats good nigga you duck bitch nigga pussy


In [57]:
# Flag every word of the sentence that starts with the prefix "nig".
# The pattern is the plain literal: in a pattern such as "nig*" the `*` would
# quantify only the final "g" (i.e. "ni" followed by zero or more "g"), which
# flags every word that merely starts with "ni", e.g. "nice".
for x in sentence.split():
    if re.match("nig", x):
        print("bad word regex")

bad word regex
