In [1]:
# Auto Reload
# Load IPython's autoreload extension; mode 2 lets edits to imported .py modules
# be picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Import Modules
from collections import Counter
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import json
from pprint import pprint
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from math import log
import re


# NLTK Modules
import nltk
# NLTK data used by the cells below: tokenizers (punkt), stop words, WordNet
# (lemmatizer), tag descriptions (tagsets) and the perceptron model behind
# nltk.pos_tag. The tagger model was previously never fetched, so the POS
# tagging cell raised LookupError on a machine that did not already have it.
# NOTE(review): recent NLTK releases may look for 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' instead - confirm against the installed version.
for resource in ('punkt', 'stopwords', 'wordnet', 'tagsets', 'averaged_perceptron_tagger'):
    nltk.download(resource)
from nltk import sent_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import chunk
from nltk.util import ngrams

# Import Custom Modules
from src.data_cleaner import *
from src.dummy_words import *

In [None]:
# Run Scraping Program
# !python src/web_scraper.py

# Intake Scraped Data

In [None]:
# Get Data: load the scraped songs and preview the first three rows
# (the smaller 'data/data.csv' was the earlier input)
DATA_CSV = 'data/all_data.csv'
data = pd.read_csv(DATA_CSV)
data.head(3)

In [None]:
# Column dtypes and non-null counts - a quick check for missing values before cleaning
data.info()

In [None]:
# data[data['lyrics'].isna()]

In [None]:
# Global Variables
# Index of the song used as the running example in the preview cells below
song_idx = 0
# Show the full record of the example song
data.iloc[song_idx]

# Text Cleaning

Creating a pipeline for **indexing** song lyrics (each song is one document).

**Indexing** creates a **signature** (vector) for each document.

Then, the **signatures** will be used to relate documents to one another (and to find clusters of similar documents), or to mine underlying relations between concepts.

<img src="media/text-pipeline.png" width="100%"/>

### Bags of Words: Lyric String Example
> Looking at the raw lyric data, a few things need to be cleaned: casing, punctuation, and newlines.

*Lyric String Passover 1*

In [None]:
# Preview the first fifth of the example song's raw lyric string
sample = data['lyrics'][song_idx]
preview_len = len(sample) // 5
sample[:preview_len]

*Lyric String Passover 2*

In [None]:
# Clean New-line breaks, but preserve periods: turn each lyric string into a list of lines.
# Only plain strings are split. A missing lyric (NaN) becomes an empty list and an
# already-split list is passed through, so the cell no longer raises AttributeError
# on null lyrics or when it is run a second time.
def _split_lines(lyrics):
    """Return `lyrics` as a list of lines."""
    if isinstance(lyrics, str):
        return lyrics.split('\n')
    if isinstance(lyrics, list):
        return lyrics
    return []

data['lyrics'] = data['lyrics'].apply(_split_lines)

In [None]:
# Preview the first fifth of the example song's lines (lyrics are now a list of strings)
sample = data['lyrics'][song_idx]
preview_len = len(sample) // 5
sample[:preview_len]

*Lyric String Passover 3*

In [None]:
# Use Custom Text Cleaning Function
# clean_text is not defined in this notebook; presumably it comes from one of the
# src star-imports at the top.
# NOTE(review): the two 'lyrics' arguments look like (source column, target column),
# i.e. the column is cleaned in place - confirm in src/data_cleaner.py
data = clean_text(data, 'lyrics', 'lyrics')

In [None]:
# Preview the first fifth of the example song's lyrics after cleaning
sample = data['lyrics'][song_idx]
preview_len = len(sample) // 5
sample[:preview_len]

### Create 'document' Feature

In [None]:
# Make Document Feature: join each song's lyric lines into one space-separated string.
# Empty entries are skipped, every line is stripped, and lines that are blank after
# stripping are dropped too, so whitespace-only lines no longer leave doubled spaces.
def _join_lines(lines):
    """Join the stripped, non-blank lines into a single string."""
    stripped = (line.strip() for line in lines if line)
    return " ".join(part for part in stripped if part)

data['clean_text'] = data['lyrics'].apply(_join_lines)

In [None]:
# Preview the first fifth of the example song's document string
# (already a single string here, so no join is needed)
sample = data['clean_text'][song_idx]
preview_len = len(sample) // 5
sample[:preview_len]

# Tokenization

# REMOVE EXPLETIVES

In [None]:
# Run every document through match_dummies. It is not defined in this notebook;
# presumably it comes from the src.dummy_words star-import. Per the section
# heading, this is the expletive-removal step.
# NOTE(review): the exact replacement behaviour is not visible here - confirm in src/dummy_words.py
data['clean_text'] = data['clean_text'].apply(match_dummies)

In [None]:
# Create Sent Token Feature: sentence-tokenise each document into a list of sentences
sentences_per_song = data['clean_text'].map(sent_tokenize)
data['sentences'] = sentences_per_song

In [None]:
# Show Sentences (truncated to the first 10 sentences of the example song)
data['sentences'][song_idx][:10]

In [None]:
# Word-tokenise every document, then peek at the example song's first 20 tokens
data['tokens'] = data['clean_text'].map(word_tokenize)
first_tokens = data['tokens'][song_idx][:20]
print(first_tokens)

# Filtration (Stop-words, punctuation, etc.)

In [None]:
# Filter Punctuation
# Drop every token that consists solely of punctuation characters. The previous
# `token not in string.punctuation` was a substring test, so it only caught single
# marks (and runs that happen to be adjacent in that constant); multi-character
# tokens such as '...', '--', "''" or '``' slipped through.
PUNCT_CHARS = frozenset(string.punctuation)

def _has_word_char(token):
    """Return True when the token contains at least one non-punctuation character."""
    return any(ch not in PUNCT_CHARS for ch in token)

data['tokens'] = data['tokens'].apply(lambda toks: [tok for tok in toks if _has_word_char(tok)])
print(data['tokens'][song_idx][:20])

In [None]:
# Filter Stop Words
# The regex removes every character that is not alphanumeric, space or tab, so
# NLTK entries such as "don't" become "dont" (presumably to match the cleaned
# lyrics, which have had punctuation removed). 'im' and 'ill' are added by hand.
# The result is kept as a set: the earlier version rebuilt a list here, which made
# every membership test below a linear scan over all stop words.
raw_stop_words = " ".join(stopwords.words('english'))
normalised_stop_words = re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", raw_stop_words)
stop_words = set(normalised_stop_words.split()) | {'im', 'ill'}
data['tokens_stop'] = data['tokens'].apply(lambda toks: [tok for tok in toks if tok not in stop_words])
print(data['tokens_stop'][song_idx][:20])

### Compare Tokens to Lyric Unique Words (from whole document)

In [None]:
# Tokenizer output next to the unique whitespace-split words of the same song
print("Tokens: ")
token_preview = data['tokens'][song_idx][:20]
print(token_preview)
print("Set (from non tokens): ")
unique_words = list(set(data['clean_text'][song_idx].split()))
print(unique_words[:20])

# Stem or Lemmatize Words

In [None]:
# One instance of each candidate normaliser: two stemmers and one lemmatizer
porter, snowball, wordnet = PorterStemmer(), SnowballStemmer('english'), WordNetLemmatizer()

# The bound single-word method of each, so all three can be called the same way
porter_func, snowball_func, wordnet_func = porter.stem, snowball.stem, wordnet.lemmatize

# Helper to run one word-level function over a whole token list
def get_root(tokens, func):
    """Return a new list holding func(token) for every token, in order."""
    roots = []
    for token in tokens:
        roots.append(func(token))
    return roots

In [None]:
# Run each normaliser over every song's tokens (one Series of token lists per method);
# the word-level function is handed to get_root through `args`
token_lists = data['tokens']
porter_tokens = token_lists.apply(get_root, args=(porter_func,))
snowball_tokens = token_lists.apply(get_root, args=(snowball_func,))
wordnet_tokens = token_lists.apply(get_root, args=(wordnet_func,))

### Results

In [None]:
## Table of the example song's words for which the three methods do not all agree
row_fmt = "%16s | %16s | %16s | %16s |"
print(row_fmt % ("WORD", "PORTER", "SNOWBALL", "LEMMATIZER"))
aligned_rows = zip(
    data['tokens'][song_idx],
    porter_tokens[song_idx],
    snowball_tokens[song_idx],
    wordnet_tokens[song_idx],
)
for word, p, s, w in aligned_rows:
    # skip rows where all three outputs are identical
    if not (p == s == w):
        print(row_fmt % (word, p, s, w))

In conclusion, the results show that every stemmer and lemmatizer tried here detracted from the words rather than helping to normalize them. These word-processing methods cannot account for the colloquialisms of rap language. Therefore, we will not use stemming or lemmatization in any further word processing.

# More Features Added

Now that the tokens are extracted from the lyric set, it's time to create a new feature with the SET of tokens for ease of use

In [None]:
# Create Token Set Feature: the unique tokens of each song.
# Stored sorted so the lists, the preview below and the exported pickle are
# identical on every run; the order of a plain list(set(...)) of strings changes
# between kernel sessions because string hashing is randomised per process.
data['token_set'] = data['tokens'].apply(lambda toks: sorted(set(toks)))
data['token_set'][song_idx][:10]

# N-Grams

It might be useful to see if N-Grams would give us a better list of tokens, since most rap lyrics involve heavy use of consecutive and connected words

In [None]:
# First 10 bigrams of the example song; 10 bigrams only need the first 11 tokens
NGRAM_SIZE, N_SHOWN = 2, 10
head_tokens = data['tokens'][song_idx][:N_SHOWN + NGRAM_SIZE - 1]
list(ngrams(head_tokens, NGRAM_SIZE))

# POS Tagging

In [None]:
# Minimalize Tag Functions: short aliases for the tagger and the tag-description helper
pos_tagger = nltk.pos_tag
explain_tag = nltk.help.upenn_tagset

# Get Sample Tags
# De-duplicate with dict.fromkeys, which keeps first-seen order, instead of set().
# Set iteration order for strings changes between kernel sessions, so the ten
# sampled words (and the sequence handed to the tagger) differed on every run.
unique_tokens = list(dict.fromkeys(data['tokens'][song_idx]))
tag_sample = np.array(pos_tagger(unique_tokens)[:10])
tag_sample

In [None]:
# Column 0 of the sample holds the words, column 1 their POS tags
words = tag_sample[:, 0]
tags = tag_sample[:, 1]

# Two-column frame so the sampled words can be grouped by tag
tag_df = pd.DataFrame({'words': words, 'tags': tags})
grouped_tags = tag_df.groupby('tags')

# For every tag: print NLTK's description of it, then the sampled words carrying it
for tag, group in grouped_tags:
    explain_tag(tag)
    tagged_words = group['words'].tolist()
    print(f'WORDS: {tagged_words}')


POS tagging seems to give more insight into the words and what they represent, but, like the stemmers/lemmatizers, it also miscategorizes some of them. For example, the word 'patek' is a reference to Patek Philippe, a luxury watch brand, and is not a verb.

In [None]:
# Drop Extra Column: 'Unnamed: 0' is not created anywhere in this notebook
# (presumably an index column that was saved into the CSV).
# errors='ignore' keeps the cell re-runnable: without it a second run, or an input
# file without that column, raises KeyError.
data = data.drop(columns=['Unnamed: 0'], errors='ignore')

# Have to drop 2000-2013 due to bad data

In [None]:
# Remove every song whose year falls in the bad range, using one vectorised filter.
# NOTE(review): range(2000, 2013) stops at 2012, while the heading above says
# 2000-2013 - confirm whether 2013 should be dropped as well.
bad_years = range(2000, 2013)
in_bad_year = data['year'].isin(bad_years)
data = data[~in_bad_year]

In [None]:
# Renumber the rows after filtering and remove the stale 'level_0' column.
# Written as one chained assignment instead of an in-place reset, with
# errors='ignore' on the drop, so the cell gives the same result when re-run
# ('level_0' is not created anywhere in this notebook, so it is only dropped if present).
data = data.reset_index(drop=True).drop(columns=['level_0'], errors='ignore')
data

# Finally, export data to `data/all_clean_data.pkl`

In [None]:
# data.to_pickle('data/clean_data.pkl')

In [None]:
# Persist the cleaned, tokenised frame for the downstream notebooks
pd.to_pickle(data, 'data/all_clean_data.pkl')