In [1]:
import pandas as pd
import numpy as np
pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_rows', 500000)
logic_df = pd.read_csv('logic_df_final.csv', index_col=0)

In [2]:
logic_df.head()

Unnamed: 0,album,artist,lyric,song
0,Undeniable (2012),Young Sinatra,Yeah; pass the mic before I jack it like gore-tex,Disgusting
1,Undeniable (2012),Young Sinatra,"Bust like raw sex, rappers suck like vortex",Disgusting
2,Undeniable (2012),Young Sinatra,"The life of a Don- We living like kings, and killing our pawns",Disgusting
3,Undeniable (2012),Young Sinatra,"Boy, the seconds it's on - don't know where we going",Disgusting
4,Undeniable (2012),Young Sinatra,I'm flowing and killing this shit from dusk till dawn,Disgusting


In [3]:
# gensim
from gensim import corpora, models, similarities, matutils
# sklearn
from sklearn import datasets
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
# logging for gensim (set to INFO)
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# text preprocessing

In [4]:
logic_df.lyric = logic_df.lyric.str.replace(r'([^\s\w]|_)+', '')

In [5]:
logic_df.lyric = logic_df.lyric.str.lower()

In [6]:
logic_df.head()

Unnamed: 0,album,artist,lyric,song
0,Undeniable (2012),Young Sinatra,yeah pass the mic before i jack it like goretex,Disgusting
1,Undeniable (2012),Young Sinatra,bust like raw sex rappers suck like vortex,Disgusting
2,Undeniable (2012),Young Sinatra,the life of a don we living like kings and killing our pawns,Disgusting
3,Undeniable (2012),Young Sinatra,boy the seconds its on dont know where we going,Disgusting
4,Undeniable (2012),Young Sinatra,im flowing and killing this shit from dusk till dawn,Disgusting


In [7]:
logic_df.drop(["artist", "album", "song"], axis=1, inplace=True)

In [8]:
logic_df.head()

Unnamed: 0,lyric
0,yeah pass the mic before i jack it like goretex
1,bust like raw sex rappers suck like vortex
2,the life of a don we living like kings and killing our pawns
3,boy the seconds its on dont know where we going
4,im flowing and killing this shit from dusk till dawn


In [9]:
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

logic_df['tokenized_lyrics'] = logic_df.lyric.apply(word_tokenize) 
logic_df.head()

[nltk_data] Downloading package punkt to /Users/Benjamin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,lyric,tokenized_lyrics
0,yeah pass the mic before i jack it like goretex,"[yeah, pass, the, mic, before, i, jack, it, like, goretex]"
1,bust like raw sex rappers suck like vortex,"[bust, like, raw, sex, rappers, suck, like, vortex]"
2,the life of a don we living like kings and killing our pawns,"[the, life, of, a, don, we, living, like, kings, and, killing, our, pawns]"
3,boy the seconds its on dont know where we going,"[boy, the, seconds, its, on, dont, know, where, we, going]"
4,im flowing and killing this shit from dusk till dawn,"[im, flowing, and, killing, this, shit, from, dusk, till, dawn]"


In [10]:
logic_df.lyric

0       yeah pass the mic before i jack it like goretex                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             
1       bust like raw sex rappers suck like vortex                                                                                                                                                                                                                                                                                                                                                                                 

In [11]:
#create a list of all the unique words in the lyrics

unique_words = []
for line in logic_df.lyric:
        if line not in unique_words:
            unique_words.append(line)
        else:
            pass

In [12]:
from nltk.corpus import stopwords

In [13]:
stopwords = stopwords.words('english')+["woo","wizard", "rick", "morty", "life", "sinatra","ring", " hold", "wait", "tell", "whats", "take", "one", "come", "man", "around", "said", "yall", "bobby", "logic", "young sinatra", "go", "gotta", "ive", "feeling","back", "let", "ima", "wanna", "never", "make", "thats", "let", "cant", "cause", "got", "oh", "really", "want","get", "know", "way", "based", "regarding", "like", "yeah", "shit", "fuck", "im", "aint", "feel", "right", "em", "im", 'dont', "right"]

In [43]:
# Create a CountVectorizer for parsing/counting words
count_vectorizer = CountVectorizer(ngram_range=(1,2),  
                                   stop_words=stopwords, token_pattern="\\b[a-z][a-z]+\\b")
count_vectorizer.fit(unique_words)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None,
        stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs',...arding', 'like', 'yeah', 'shit', 'fuck', 'im', 'aint', 'feel', 'right', 'em', 'im', 'dont', 'right'],
        strip_accents=None, token_pattern='\\b[a-z][a-z]+\\b',
        tokenizer=None, vocabulary=None)

In [44]:
counts = count_vectorizer.transform(unique_words).transpose()

In [45]:
counts

<13795x4689 sparse matrix of type '<class 'numpy.int64'>'
	with 25228 stored elements in Compressed Sparse Column format>

In [46]:
# Convert sparse matrix of counts to a gensim corpus
corpus = matutils.Sparse2Corpus(counts)

In [47]:
id2word = dict((v, k) for k, v in count_vectorizer.vocabulary_.items())

In [48]:
count_vectorizer.vocabulary_.items()

dict_items([('pass', 8658), ('mic', 7568), ('jack', 6098), ('goretex', 4884), ('pass mic', 8660), ('mic jack', 7572), ('jack goretex', 6099), ('bust', 1524), ('raw', 9532), ('sex', 10465), ('rappers', 9472), ('suck', 11513), ('vortex', 12871), ('bust raw', 1531), ('raw sex', 9535), ('sex rappers', 10471), ('rappers suck', 9486), ('suck vortex', 11515), ('living', 6839), ('kings', 6342), ('killing', 6310), ('pawns', 8683), ('living kings', 6858), ('kings killing', 6343), ('killing pawns', 6314), ('boy', 1272), ('seconds', 10247), ('going', 4765), ('boy seconds', 1298), ('seconds going', 10248), ('flowing', 4189), ('dusk', 3178), ('till', 12150), ('dawn', 2562), ('flowing killing', 4191), ('killing dusk', 6312), ('dusk till', 3179), ('till dawn', 12156), ('middle', 7593), ('eastern', 3216), ('girl', 4590), ('pussy', 9316), ('bomb', 1175), ('sex middle', 10470), ('middle eastern', 7595), ('eastern girl', 3217), ('girl pussy', 4614), ('pussy bomb', 9317), ('last', 6465), ('line', 6695), ('

In [49]:
len(id2word)

13795

In [50]:
lda = models.LdaModel(corpus=corpus, num_topics=2, minimum_probability=0.2, id2word=id2word, passes=4)

2018-05-30 14:53:38,835 : INFO : using symmetric alpha at 0.5
2018-05-30 14:53:38,837 : INFO : using symmetric eta at 0.5
2018-05-30 14:53:38,842 : INFO : using serial LDA version on this node
2018-05-30 14:53:38,849 : INFO : running online (multi-pass) LDA training, 2 topics, 4 passes over the supplied corpus of 4689 documents, updating model once every 2000 documents, evaluating perplexity every 4689 documents, iterating 50x with a convergence threshold of 0.001000
2018-05-30 14:53:38,866 : INFO : PROGRESS: pass 0, at document #2000/4689
2018-05-30 14:53:40,500 : INFO : merging changes from 2000 documents into a model of 4689 documents
2018-05-30 14:53:40,506 : INFO : topic #0 (0.500): 0.005*"love" + 0.003*"time" + 0.003*"day" + 0.003*"kai" + 0.003*"world" + 0.002*"bitch" + 0.002*"people" + 0.002*"money" + 0.002*"yes" + 0.002*"imma"
2018-05-30 14:53:40,508 : INFO : topic #1 (0.500): 0.003*"girl" + 0.003*"thomas" + 0.003*"time" + 0.002*"think" + 0.002*"love" + 0.002*"wonder" + 0.002*"

2018-05-30 14:53:47,935 : INFO : -9.210 per-word bound, 592.3 perplexity estimate based on a held-out corpus of 689 documents with 3799 words
2018-05-30 14:53:47,939 : INFO : PROGRESS: pass 3, at document #4689/4689
2018-05-30 14:53:48,100 : INFO : merging changes from 689 documents into a model of 4689 documents
2018-05-30 14:53:48,105 : INFO : topic #0 (0.500): 0.003*"good" + 0.003*"bitch" + 0.003*"world" + 0.003*"uh" + 0.003*"love" + 0.003*"money" + 0.002*"black" + 0.002*"mean" + 0.002*"day" + 0.002*"time"
2018-05-30 14:53:48,107 : INFO : topic #1 (0.500): 0.005*"everybody" + 0.004*"give" + 0.003*"think" + 0.003*"mind" + 0.003*"everything" + 0.003*"boy" + 0.003*"god" + 0.003*"ever" + 0.003*"people" + 0.003*"finna"
2018-05-30 14:53:48,108 : INFO : topic diff=0.295232, rho=0.397010


In [51]:
lda.print_topics()

2018-05-30 14:53:48,119 : INFO : topic #0 (0.500): 0.003*"good" + 0.003*"bitch" + 0.003*"world" + 0.003*"uh" + 0.003*"love" + 0.003*"money" + 0.002*"black" + 0.002*"mean" + 0.002*"day" + 0.002*"time"
2018-05-30 14:53:48,121 : INFO : topic #1 (0.500): 0.005*"everybody" + 0.004*"give" + 0.003*"think" + 0.003*"mind" + 0.003*"everything" + 0.003*"boy" + 0.003*"god" + 0.003*"ever" + 0.003*"people" + 0.003*"finna"


[(0,
  '0.003*"good" + 0.003*"bitch" + 0.003*"world" + 0.003*"uh" + 0.003*"love" + 0.003*"money" + 0.002*"black" + 0.002*"mean" + 0.002*"day" + 0.002*"time"'),
 (1,
  '0.005*"everybody" + 0.004*"give" + 0.003*"think" + 0.003*"mind" + 0.003*"everything" + 0.003*"boy" + 0.003*"god" + 0.003*"ever" + 0.003*"people" + 0.003*"finna"')]

In [41]:
#Keep adding more phrases/words to stopwords and look at both (1,2) and (2,3) for ngram_range

In [42]:
#1 --> young sinatra (young, working hard, and making a name for himself)
#2 --> logic (positivity, love, change)
#3 --> bobby tarantino (the party/turn up alter-ego of Logic, party, girls, fun, cocky)