In [None]:
# Auto Reload
# Enable IPython's autoreload extension. Mode 2 re-imports modules before each
# cell runs, so edits to imported project modules are picked up without
# restarting the kernel (see the IPython autoreload documentation).
%load_ext autoreload
%autoreload 2

In [4]:
# Import Modules
from collections import Counter
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import json
from pprint import pprint
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from math import log
import re


# NLTK Modules
import nltk
# Make sure the four NLTK data packages are installed (no re-download when they
# are already up to date). quiet=True keeps the per-package progress log --
# which echoes the machine-specific nltk_data directory -- out of the saved
# cell output; NLTK still reports download errors.
for _nltk_package in ('punkt', 'stopwords', 'wordnet', 'tagsets'):
    nltk.download(_nltk_package, quiet=True)
from nltk import sent_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import chunk
from nltk.util import ngrams

# Import Custom Modules
from src.data_cleaner import *
from src.dummy_words import *

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\wesle\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\wesle\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\wesle\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package tagsets to
[nltk_data]     C:\Users\wesle\AppData\Roaming\nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


In [5]:
# Import Data
# Read the CSV at a path relative to the notebook's working directory.
# NOTE(review): the table dump further down shows 'Unnamed: 0' / 'Unnamed: 0.1'
# columns and list-like columns (lyrics, sentences, tokens, token_set) rendered
# as quoted strings -- presumably artifacts of round-tripping this CSV; confirm
# before treating those columns as real lists or dropping the index columns.
CLEAN_DATA_CSV = 'data/clean_data.csv'
clean_df = pd.read_csv(CLEAN_DATA_CSV)

# Vectorizing: Indexing Bag-of-Words

### Getting TERM FREQUENCY

How often a term occurs in a specific document, normalized by the total number of terms in that document:

$tf(term, document) = \frac{\#\ of\ times\ a\ term\ appears\ in\ a\ document}{\#\ of\ terms\ in\ the\ document}$

Unnamed: 0.1,Unnamed: 0,song,artist,featured,rank,year,lyrics,lyrics_state,song_id,lyrics_owner_id,primary_artist_url,clean_text,sentences,tokens,token_set
0,0,See You Again,Wiz Khalifa,,1,2015,"['its been a long day without you my friend', ...",True,720401,341761,https://genius.com/artists/Wiz-khalifa,its been a long day without you my friend. and...,"['its been a long day without you my friend.',...","['long', 'day', 'without', 'friend', 'ill', 't...","['light', 'memory', 'hit', 'laugh', 'things', ..."
1,1,Trap Queen,Fetty Wap,,2,2015,"['rgf productions', 'remy boyz yahah', '1738 a...",True,496445,104344,https://genius.com/artists/Fetty-wap,rgf productions. remy boyz yahah. 1738 ayy. im...,"['rgf productions.', 'remy boyz yahah.', '1738...","['rgf', 'productions', 'remy', 'boyz', 'yahah'...","['gritt', 'buy', 'roll', 'productions', 'hit',..."
2,2,Watch Me,Silento,,3,2015,"['whip nae nae', 'whip whip nae nae', 'whip na...",True,1743010,1696010,https://genius.com/artists/Silento,whip nae nae. whip whip nae nae. whip nae nae....,"['whip nae nae.', 'whip whip nae nae.', 'whip ...","['whip', 'nae', 'nae', 'whip', 'whip', 'nae', ...","['gon', 'already', 'superman', 'na', 'stank', ..."
3,3,679,Fetty Wap,,4,2015,"['yeaaah baby 17', 'remyboy lifestyle', 'ay ay...",True,696428,1483821,https://genius.com/artists/Fetty-wap,yeaaah baby 17. remyboy lifestyle. ay ay look....,"['yeaaah baby 17. remyboy lifestyle.', 'ay ay ...","['yeaaah', 'baby', '17.', 'remyboy', 'lifestyl...","['anywhere', 'shell', 'light', 'fettys', 'smok..."
4,4,Hotline Bling,Drake,,5,2015,"['you used to call me on my', 'you used to you...",True,2263723,744505,https://genius.com/artists/Drake,you used to call me on my. you used to you use...,"['you used to call me on my.', 'you used to yo...","['used', 'call', 'used', 'used', 'yeah', 'used...","['used', 'going', 'someone', 'things', 'reputa..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
245,245,Press,Cardi B,,46,2019,"['monstas gon tear it up', 'bardi', 'woo yeah'...",True,4191420,8117646,https://genius.com/artists/Cardi-b,monstas gon tear it up. bardi. woo yeah. bitch...,"['monstas gon tear it up.', 'bardi.', 'woo yea...","['monstas', 'gon', 'tear', 'bardi', 'woo', 'ye...","['greek', 'made', 'drop', 'put', 'quarter', 'l..."
246,246,Backin' It Up,Pardison Fontaine,,47,2019,"['ahhh', 'cardi', 'turn around fuck it all the...",True,3970189,104344,https://genius.com/artists/Pardison-fontaine,ahhh. cardi. turn around fuck it all the way u...,"['ahhh.', 'cardi.', 'turn around fuck it all t...","['ahhh', 'cardi', 'turn', 'around', 'fuck', 'w...","['bust', 'i.', 'beef', 'ran', 'forgot', 'someo..."
247,247,Twerk,City Girls,,48,2019,['i want a slim fine woman with some twerk wit...,True,4080406,1603328,https://genius.com/artists/City-girls,i want a slim fine woman with some twerk with ...,['i want a slim fine woman with some twerk wit...,"['want', 'slim', 'fine', 'woman', 'twerk', 'th...","['bounce', 'action', 'rude', 'gang', 'buy', 'h..."
248,248,Time,NF,,49,2019,"['even if we both break down tonight', 'and yo...",True,4693540,3375166,https://genius.com/artists/Nf,even if we both break down tonight. and you sa...,"['even if we both break down tonight.', 'and y...","['even', 'break', 'tonight', 'say', 'hate', 'g...","['overanalyzin', 'going', 'room', 'drive', 'ma..."
