# Creating a Document-Term Matrix (DTM)

In [15]:
import pandas as pd
from datetime import datetime
import spacy
from tqdm import tqdm_notebook as tqdm

## Loading and formatting the data

In [33]:
# read the raw lyrics data set from disk into a dataframe
LYRICS_CSV = "lyrics.csv"
first_df = pd.read_csv(LYRICS_CSV)

In [38]:
# use the column named "index" as the song id by renaming it to "song_id"
first_df = first_df.rename(columns={"index": "song_id"})
first_df.shape

(362237, 6)

In [41]:
# discard the rows whose lyrics are missing, then show the remaining size
first_df = first_df.dropna(subset=['lyrics'])
first_df.shape

(266557, 6)

In [42]:
# split the dataframe into song metadata and lyric text; both keep song_id
info_columns = ["song_id", "song", "year", "artist", "genre"]
lyrics_columns = ["song_id", "lyrics"]

info = first_df[info_columns]
lyrics = first_df[lyrics_columns]

# confirm which columns ended up in each piece
for frame in (info, lyrics):
    print(frame.columns)

Index(['song_id', 'song', 'year', 'artist', 'genre'], dtype='object')
Index(['song_id', 'lyrics'], dtype='object')


In [62]:
# load the spaCy language object from the en_core_web_md English model
SPACY_MODEL = "en_core_web_md"
nlp = spacy.load(SPACY_MODEL)

<font color='red'>experimenting with spaCy</font>

In [6]:
# create a test case of one set of lyrics.
# .iloc[0] picks the first remaining row by position; a label lookup ([0])
# would raise a KeyError if the row labelled 0 had been dropped by the
# earlier dropna, because the index is not reset after it.
doc = nlp(lyrics["lyrics"].iloc[0])

# visual divider printed between the different checks
separator = "###################################################"

# check the parts of speech etc
for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_, token.shape_, token.is_alpha, token.is_stop)

print(separator)

# check the tokenization
for token in doc:
    print(token.text)

print(separator)

# check the named entities in the document
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

print(separator)

# check if the words have vectors
for token in doc:
    print(token.text, token.has_vector, token.vector_norm, token.is_oov)

Oh oh INTJ UH intj Xx True False
baby baby NOUN NN npadvmod xxxx True False
, , PUNCT , punct , False False
how how ADV WRB advmod xxx True True
you -PRON- PRON PRP nsubj xxx True True
doing do VERB VBG ROOT xxxx True True
? ? PUNCT . punct ? False False

 
 SPACE _SP  
 False False
You -PRON- PRON PRP nsubj Xxx True True
know know VERB VBP parataxis xxxx True False
I -PRON- PRON PRP nsubj X True True
'm be AUX VBP aux 'x False True
gon go VERB VBG ROOT xxx True False
na to PART TO aux xx True False
cut cut VERB VB xcomp xxx True False
right right ADV RB dobj xxxx True False
to to ADP IN prep xx True True
the the DET DT det xxx True True
chase chase NOUN NN pobj xxxx True False

 
 SPACE _SP  
 False False
Some some DET DT det Xxxx True True
women woman NOUN NNS nsubjpass xxxx True False
were be AUX VBD auxpass xxxx True True
made make VERB VBN ROOT xxxx True True
but but CCONJ CC cc xxx True True
me -PRON- PRON PRP conj xx True True
, , PUNCT , punct , False False
myself -PRON- PRON P

 SPACE _SP  
 False False
Boy boy INTJ UH ROOT Xxx True False
you -PRON- PRON PRP dative xxx True True
a a DET DT det x True True
site site NOUN NN dobj xxxx True False
to to PART TO aux xx True True
see see VERB VB relcl xxx True True
, , PUNCT , punct , False False
kind kind ADV RB advmod xxxx True False
of of ADV RB advmod xx True True
something something PRON NN dobj xxxx True True
like like SCONJ IN prep xxxx True False
me -PRON- PRON PRP pobj xx True True

 
 SPACE _SP  
 False False
It -PRON- PRON PRP nsubj Xx True True
's be AUX VBZ ccomp 'x False True
too too ADV RB advmod xxx True True
big big ADJ JJ acomp xxx True False
, , PUNCT , punct , False False
it -PRON- PRON PRP nsubj xx True True
's be AUX VBZ ROOT 'x False True
too too ADV RB advmod xxx True True
wide wide ADJ JJ acomp xxxx True False

 
 SPACE _SP  
 False False
It -PRON- PRON PRP nsubj Xx True True
's be AUX VBZ ccomp 'x False True
too too ADV RB advmod xxx True True
strong strong ADJ JJ acomp xxxx True False
, ,

? True 5.1608233 False

 False 0.0 False
You True 5.1979666 False
know True 5.160699 False
I True 6.4231944 False
'm True 5.9417286 False
gon True 7.6101074 False
na True 7.9835854 False
cut True 5.9731746 False
right True 5.2088556 False
to True 4.74484 False
the True 4.70935 False
chase True 5.7777185 False

 False 0.0 False
Some True 5.0450377 False
women True 7.2206335 False
were True 6.003382 False
made True 4.963541 False
but True 4.903002 False
me True 5.75488 False
, True 5.094723 False
myself True 5.9297504 False

 False 0.0 False
I True 6.4231944 False
like True 4.78322 False
to True 4.74484 False
think True 5.335493 False
that True 4.8260193 False
I True 6.4231944 False
was True 5.4562387 False
created True 5.5150514 False
for True 4.8435082 False
a True 5.306696 False
special True 5.4631042 False
purpose True 5.2435203 False

 False 0.0 False
You True 5.1979666 False
know True 5.160699 False
, True 5.094723 False
what True 5.135811 False
's True 5.1889863 False
more True 5.

In [7]:
# word-vector check on its own: for every token print its text, whether it
# has a vector, the vector norm, and the out-of-vocabulary flag
for token in doc:
    vector_info = (token.text, token.has_vector, token.vector_norm, token.is_oov)
    print(*vector_info)

Oh True 6.044095 False
baby True 6.911526 False
, True 5.094723 False
how True 5.2509694 False
you True 5.1979666 False
doing True 5.571892 False
? True 5.1608233 False

 False 0.0 False
You True 5.1979666 False
know True 5.160699 False
I True 6.4231944 False
'm True 5.9417286 False
gon True 7.6101074 False
na True 7.9835854 False
cut True 5.9731746 False
right True 5.2088556 False
to True 4.74484 False
the True 4.70935 False
chase True 5.7777185 False

 False 0.0 False
Some True 5.0450377 False
women True 7.2206335 False
were True 6.003382 False
made True 4.963541 False
but True 4.903002 False
me True 5.75488 False
, True 5.094723 False
myself True 5.9297504 False

 False 0.0 False
I True 6.4231944 False
like True 4.78322 False
to True 4.74484 False
think True 5.335493 False
that True 4.8260193 False
I True 6.4231944 False
was True 5.4562387 False
created True 5.5150514 False
for True 4.8435082 False
a True 5.306696 False
special True 5.4631042 False
purpose True 5.2435203 False

 Fal

I True 6.4231944 False
walk True 6.524474 False
like True 4.78322 False
this True 5.0461264 False
'cause True 5.8536057 False
I True 6.4231944 False
can True 5.132161 False
back True 5.1515756 False
it True 4.9409766 False
up True 4.6057925 False

 False 0.0 False
It True 4.9409766 False
's True 5.1889863 False
too True 5.173234 False
big True 5.7743006 False
, True 5.094723 False
it True 4.9409766 False
's True 5.1889863 False
too True 5.173234 False
wide True 5.9643054 False

 False 0.0 False
It True 4.9409766 False
's True 5.1889863 False
too True 5.173234 False
strong True 5.7850513 False
, True 5.094723 False
it True 4.9409766 False
wo True 5.6285934 False
n't True 5.2911263 False
fit True 5.7956285 False

 False 0.0 False
It True 4.9409766 False
's True 5.1889863 False
too True 5.173234 False
much True 5.084145 False
, True 5.094723 False
it True 4.9409766 False
's True 5.1889863 False
too True 5.173234 False
tough True 5.931958 False

 False 0.0 False
He True 6.080851 False
talk

## Creating a tokenizer pipeline

In [67]:
# pipeline component that swaps every token for its lemma
def lemmatizer(doc):
    """Rebuild a document from the lemmas of its tokens.

    Tokens whose lemma is the '-PRON-' placeholder (what pronouns become
    after lemmatization) are left out. The remaining lemmas are joined with
    single spaces and the resulting string is tokenized again with the
    module-level ``nlp.make_doc``.

    Args:
        doc: iterable of tokens exposing a ``lemma_`` attribute.

    Returns:
        The object produced by ``nlp.make_doc`` for the joined lemma string.

    NOTE(review): ``make_doc`` presumably only tokenizes, so annotations set
    by earlier pipeline components would not be carried over -- confirm.
    """
    lemmas = []
    for token in doc:
        # skip the pronoun placeholder
        if token.lemma_ == '-PRON-':
            continue
        lemmas.append(token.lemma_)

    lemma_text = u' '.join(lemmas)
    return nlp.make_doc(lemma_text)

def remove_punct(doc):
    """Return the texts of all non-punctuation tokens in ``doc``.

    Plain strings (``token.text``) are returned rather than token objects,
    because strings are what Gensim needs.

    Args:
        doc: iterable of tokens exposing ``text`` and ``is_punct``.

    Returns:
        list of str, in the original token order.
    """
    kept_texts = []
    for token in doc:
        if not token.is_punct:
            kept_texts.append(token.text)
    return kept_texts


# add_pipe adds the function to the tokenizer pipeline: the lemmatizer runs
# right after the 'ner' component and the punctuation filter runs last.
# add_pipe raises when a component with the same name is already registered,
# so when this cell is re-run the existing component is replaced in place
# (keeping its position and picking up a redefined function) instead.
if "lemmatizer" in nlp.pipe_names:
    nlp.replace_pipe("lemmatizer", lemmatizer)
else:
    nlp.add_pipe(lemmatizer, name='lemmatizer', after='ner')

if "punct" in nlp.pipe_names:
    nlp.replace_pipe("punct", remove_punct)
else:
    nlp.add_pipe(remove_punct, name="punct", last=True)

In [68]:
# pull the lyrics column out of the lyrics dataframe as a Series and display it
doc = lyrics["lyrics"]
doc

0         Oh baby, how you doing?\nYou know I'm gonna cu...
1         playin' everything so easy,\nit's like you see...
2         If you search\nFor tenderness\nIt isn't hard t...
3         Oh oh oh I, oh oh oh I\n[Verse 1:]\nIf I wrote...
4         Party the people, the people the party it's po...
                                ...                        
362232    I gotta say\nBoy, after only just a couple of ...
362233    I helped you find her diamond ring\nYou made m...
362234    Look at the couple in the corner booth\nLooks ...
362235    When I fly off this mortal earth\nAnd I'm meas...
362236    I heard from a friend of a friend of a friend ...
Name: lyrics, Length: 266557, dtype: object

In [69]:
# count the missing entries in the lyrics Series (0 means nothing is missing)
doc.isnull().sum()

0

In [70]:
# go through each song, run its lyrics through the spaCy pipeline and collect
# the results in order.
# The loop variable gets its own name so the `doc` Series is not overwritten:
# reusing `doc` would leave it bound to the last lyric string, and re-running
# this cell would then iterate over that string's characters.
doc_list = [nlp(song_lyrics) for song_lyrics in tqdm(doc)]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


HBox(children=(FloatProgress(value=0.0, max=266557.0), HTML(value='')))




In [87]:
# check a list of tokens: display the pipeline output for the first song
doc_list[0]

['oh',
 'baby',
 '\n ',
 'know',
 'cut',
 'right',
 'chase',
 '\n ',
 'woman',
 '\n ',
 'like',
 'think',
 'create',
 'special',
 'purpose',
 '\n ',
 'know',
 'special',
 'feel',
 '\n ',
 'baby',
 'let',
 'lose',
 '\n ',
 'need',
 'work',
 'boss',
 '\n ',
 'real',
 'want',
 'feel',
 '\n ',
 'consider',
 'lucky',
 'big',
 'deal',
 '\n ',
 'key',
 'heart',
 '\n ',
 'need',
 'open',
 'body',
 '\n ',
 'secret',
 'know',
 'inside',
 '\n ',
 'need',
 'lie',
 '\n ',
 'big',
 'wide',
 '\n ',
 'strong',
 'fit',
 '\n ',
 'tough',
 '\n ',
 'talk',
 'like',
 '\n ',
 'big',
 'ego',
 'huge',
 'ego',
 '\n ',
 'love',
 'big',
 'ego',
 '\n ',
 'walk',
 'like',
 '\n ',
 'usually',
 'humble',
 'right',
 'choose',
 '\n ',
 'leave',
 'blue',
 '\n ',
 'arrogant',
 'confident',
 '\n ',
 'decide',
 'find',
 'work',
 '\n ',
 'damn',
 'know',
 'kill',
 'leg',
 '\n ',
 'thigh',
 '\n ',
 'matter',
 'fact',
 'smile',
 'maybe',
 'eye',
 '\n ',
 'boy',
 'site',
 'kind',
 'like',
 '\n ',
 'big',
 'wide',
 '\n ',
 'st

In [94]:
# The token lists line up with the songs purely by position, and an inner
# merge on the index silently drops unmatched rows, so fail loudly if the two
# do not have the same length.
assert len(doc_list) == len(first_df), (
    "expected one token list per song, got {} token lists for {} songs".format(
        len(doc_list), len(first_df)
    )
)

# give the original df a fresh 0..n-1 index so it lines up with the token df
# (reassigned rather than modified in place, so re-running the cell is harmless)
first_df = first_df.reset_index(drop=True)

# turn the list into a named series, then into a one-column dataframe;
# it already carries a default 0..n-1 index
temp = pd.Series(doc_list, name="tokens").to_frame()

# merge the tokens df onto the original df using the indices
df_with_tokens = pd.merge(first_df, temp, left_index=True, right_index=True)

In [95]:
# write the combined dataframe to a pipe-delimited text file, without the row index
TOKENS_PATH = "lyrics_with_tokens.txt"
df_with_tokens.to_csv(TOKENS_PATH, sep="|", index=False)

In [96]:
# pd.read_csv("lyrics_with_tokens.txt", sep="|")

Unnamed: 0,song_id,song,year,artist,genre,lyrics,tokens
0,0,ego-remix,2009,beyonce-knowles,Pop,"Oh baby, how you doing?\nYou know I'm gonna cu...","['oh', 'baby', '\n ', 'know', 'cut', 'right', ..."
1,1,then-tell-me,2009,beyonce-knowles,Pop,"playin' everything so easy,\nit's like you see...","['playin', 'easy', '\n ', 'like', 'sure', '\n ..."
2,2,honesty,2009,beyonce-knowles,Pop,If you search\nFor tenderness\nIt isn't hard t...,"['search', '\n ', 'tenderness', '\n ', 'hard',..."
3,3,you-are-my-rock,2009,beyonce-knowles,Pop,"Oh oh oh I, oh oh oh I\n[Verse 1:]\nIf I wrote...","['oh', 'oh', 'oh', 'oh', 'oh', 'oh', '\n ', 'v..."
4,4,black-culture,2009,beyonce-knowles,Pop,"Party the people, the people the party it's po...","['party', 'people', 'people', 'party', 'pop', ..."
...,...,...,...,...,...,...,...
266552,362232,who-am-i-drinking-tonight,2012,edens-edge,Country,"I gotta say\nBoy, after only just a couple of ...","['got', '\n ', 'boy', 'couple', 'date', '\n ',..."
266553,362233,liar,2012,edens-edge,Country,I helped you find her diamond ring\nYou made m...,"['help', 'find', 'diamond', 'ring', '\n ', 'tr..."
266554,362234,last-supper,2012,edens-edge,Country,Look at the couple in the corner booth\nLooks ...,"['look', 'couple', 'corner', 'booth', '\n ', '..."
266555,362235,christ-alone-live-in-studio,2012,edens-edge,Country,When I fly off this mortal earth\nAnd I'm meas...,"['fly', 'mortal', 'earth', '\n ', 'measure', '..."


## Create the TF-IDF matrix

In [72]:
from sklearn.feature_extraction.text import TfidfVectorizer

# create a dummy function to return the tokens as is
def dummy_fun(doc):
    """Identity function: return ``doc`` unchanged.

    It is passed to TfidfVectorizer below as both the tokenizer and the
    preprocessor, because the documents were already tokenized with spaCy.
    """
    return doc

# configure the tfidf model. The documents are already lists of tokens made
# with spaCy, so the tokenizer and the preprocessor are both the pass-through
# dummy function, and no token pattern is set.
tfidf_options = {
    "analyzer": 'word',
    "tokenizer": dummy_fun,
    "preprocessor": dummy_fun,
    "token_pattern": None,
}
tfidf = TfidfVectorizer(**tfidf_options)

In [78]:
# fit the model to the tokens, and display the vocab
# (fit_transform learns the vocabulary from doc_list and returns the weighted
#  document-term matrix; vocabulary_ maps each term to its column index)
tfidf_vector = tfidf.fit_transform(doc_list)
tfidf.vocabulary_

{'oh': 408903,
 'baby': 202708,
 '\n ': 82,
 'know': 352733,
 'cut': 247580,
 'right': 450505,
 'chase': 230908,
 'woman': 539618,
 'like': 365907,
 'think': 502793,
 'create': 244719,
 'special': 482234,
 'purpose': 437055,
 'feel': 287405,
 'let': 364243,
 'lose': 368975,
 'need': 399181,
 'work': 540147,
 'boss': 217576,
 'real': 443005,
 'want': 534372,
 'consider': 240858,
 'lucky': 370020,
 'big': 212165,
 'deal': 252763,
 'key': 349369,
 'heart': 318344,
 'open': 411739,
 'body': 215930,
 'secret': 465109,
 'inside': 334629,
 'lie': 365212,
 'wide': 537896,
 'strong': 488288,
 'fit': 290278,
 'tough': 507918,
 'talk': 497152,
 'ego': 271941,
 'huge': 325484,
 'love': 369271,
 'walk': 534007,
 'usually': 522245,
 'humble': 325786,
 'choose': 233266,
 'leave': 362922,
 'blue': 215326,
 'arrogant': 196697,
 'confident': 239980,
 'decide': 253265,
 'find': 289593,
 'damn': 250812,
 'kill': 350265,
 'leg': 363306,
 'thigh': 502695,
 'matter': 378831,
 'fact': 285004,
 'smile': 477215

In [79]:
# check the shape of the tfidf matrix: (number of documents, number of vocabulary terms)
tfidf_vector.shape

(266557, 569352)

In [83]:
import scipy.sparse

# turn the sparse matrix into a dataframe backed by sparse columns
# (no column names are passed, so the columns are labelled by integer position,
#  not by vocabulary term)
pd.DataFrame.sparse.from_spmatrix(tfidf_vector)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,569342,569343,569344,569345,569346,569347,569348,569349,569350,569351
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
266552,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
266553,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
266554,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
266555,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Use GloVe to train embeddings on the data