# Creating a DTM

In [None]:
import pandas as pd
from datetime import datetime
import spacy
from tqdm import tqdm_notebook as tqdm
import ast
import gensim

### First Time Creation of Tokens

In [None]:
# Load the raw lyrics data set from "lyrics.csv" (path is relative to the
# notebook's working directory).
first_df = pd.read_csv("lyrics.csv")

In [None]:
# Give each song an id by relabelling the "index" column as "song_id",
# then show the frame's dimensions.
first_df = first_df.rename(columns={"index": "song_id"})
first_df.shape

In [None]:
# Discard songs that have no lyrics, then show how many rows remain.
first_df = first_df.dropna(subset=['lyrics'])
first_df.shape

In [None]:
# Split the frame in two: song metadata and lyric text, both carrying song_id.
metadata_columns = ["song_id", "song", "year", "artist", "genre"]
lyric_columns = ["song_id", "lyrics"]

info = first_df[metadata_columns]
lyrics = first_df[lyric_columns]

# Print the column index of each part as a quick sanity check.
for frame in (info, lyrics):
    print(frame.columns)

In [None]:
# Load spaCy's "en_core_web_md" model. `nlp` is the pipeline object that the
# later cells call to process lyrics.
nlp = spacy.load("en_core_web_md")

Experimenting with spaCy on a single song to inspect its tokens, named entities, and word vectors.

In [None]:
# Create a test case from the first set of lyrics still present in the frame.
# Positional access (.iloc[0]) is used because the earlier dropna may have
# removed the row labelled 0, in which case a label lookup raises KeyError.
doc = nlp(lyrics["lyrics"].iloc[0])

separator = "###################################################"

# check the parts of speech etc
for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_, token.shape_, token.is_alpha, token.is_stop)

print(separator)

# check the tokenization
for token in doc:
    print(token.text)

print(separator)

# check the named entities in the document
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

print(separator)

# check if the words have vectors
for token in doc:
    print(token.text, token.has_vector, token.vector_norm, token.is_oov)

In [None]:
# For every token: its text, whether it has a word vector, the vector norm,
# and whether it is out of vocabulary.
for tok in doc:
    print(tok.text, tok.has_vector, tok.vector_norm, tok.is_oov)

Creating a tokenizer pipeline: lemmatize each token, then strip punctuation.

In [None]:
# define the lemmatizer function
def lemmatizer(doc):
    """Rebuild ``doc`` from the lemmas of its tokens.

    Tokens whose lemma is the '-PRON-' placeholder are left out. The remaining
    lemmas are joined with single spaces and passed to ``nlp.make_doc``, whose
    result is returned.
    """
    kept_lemmas = []
    for token in doc:
        lemma = token.lemma_
        if lemma == '-PRON-':
            continue
        kept_lemmas.append(lemma)
    return nlp.make_doc(u' '.join(kept_lemmas))

def remove_punct(doc):
    """Return the text of every non-punctuation token in ``doc``.

    The result is a list of plain strings (``token.text``) rather than token
    objects, because Gensim needs strings.
    """
    kept_text = []
    for token in doc:
        if token.is_punct:
            continue
        kept_text.append(token.text)
    return kept_text


# add_pipe adds the function to the tokenizer
# The lemmatizer is inserted after the 'ner' component and remove_punct is
# placed last, so remove_punct's list of strings is presumably what nlp(text)
# returns from here on.
# NOTE(review): passing the function object itself to add_pipe looks like the
# spaCy v2 calling convention — confirm against the installed spaCy version.
nlp.add_pipe(lemmatizer,name='lemmatizer',after='ner')
nlp.add_pipe(remove_punct, name="punct", last=True)

In [None]:
# Get the column of lyrics. This rebinds `doc`, which until now held the
# single spaCy document used in the experimenting cells.
doc = lyrics.lyrics
doc

In [None]:
# Count the missing entries in the lyrics column.
doc.isna().sum()

In [None]:
# laptop time to run = 
# desktop time to run = 

# Run every song through the pipeline and collect the results in order.
# The loop variable has its own name so the `doc` Series is not overwritten by
# the last lyric string; reusing `doc` made this cell fail to reproduce its
# result when run a second time.
doc_list = [nlp(song_lyrics) for song_lyrics in tqdm(doc)]

In [None]:
# Inspect the pipeline output for the first song.
doc_list[0]

In [None]:
# Wrap the per-song results in a one-column frame named "tokens".
temp = pd.Series(doc_list).to_frame(name="tokens")

# Give both frames a fresh positional index so rows pair up one-to-one.
temp.reset_index(drop=True, inplace=True)
first_df.reset_index(drop=True, inplace=True)

# Join the tokens onto the original frame, matching rows on the indices.
df_with_tokens = first_df.merge(temp, left_index=True, right_index=True)

In [None]:
# Save the merged frame as pipe-delimited text, leaving out the index column.
df_with_tokens.to_csv("lyrics_with_tokens.txt", sep="|", index=False)

### Loading in the Saved Tokens

In [None]:
# Read back the pipe-delimited file written by the token-creation section.
df = pd.read_csv("lyrics_with_tokens.txt", sep="|")

In [None]:
# Display the loaded frame.
df

Lyrics list and genre list

In [None]:
# Each saved "tokens" value is read back as text; ast.literal_eval turns every
# one into a Python object again.
lyrics_list = df['tokens'].to_list()
lyr_list = [ast.literal_eval(serialized) for serialized in lyrics_list]

lyr_list

In [None]:
# Pull the genre column out as a plain list, and keep a separate copy of it.
genre_list = df['genre'].to_list()
gen_list = list(genre_list)

gen_list

### <font color='red'>create the tfidf (skip for now)</font>

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# create a dummy function to return the tokens as is
def dummy_fun(doc):
    """Return ``doc`` unchanged (identity function)."""
    return doc

# Build the tf-idf model. The tokens were already produced with spaCy, so both
# the preprocessor and the tokenizer are the pass-through dummy function, and
# no token pattern is set.
tfidf = TfidfVectorizer(
    token_pattern=None,
    preprocessor=dummy_fun,
    tokenizer=dummy_fun,
    analyzer='word',
)

In [None]:
# fit the model to the tokens, and print the vocab
# `lyr_list` (parsed from the saved token file) is used instead of `doc_list`,
# which only exists when the first-time token creation cells were run in this
# kernel; the later embedding cells read `lyr_list` as well.
tfidf_vector = tfidf.fit_transform(lyr_list)
tfidf.vocabulary_

In [None]:
# Check the dimensions of the fitted tf-idf result.
tfidf_vector.shape

In [None]:
import scipy.sparse

# Turn the sparse matrix into a DataFrame for display; the result is not
# assigned to a name.
pd.DataFrame.sparse.from_spmatrix(tfidf_vector)

### Use Word2Vec to train embeddings on the data

In [None]:
# Turn the lyrics into a list of sentences (assuming each line is the
# equivalent of a sentence).

# Every finished line, across all songs, is collected here.
result = []

for song_tokens in lyr_list:
    current_line = []

    for tok in song_tokens:
        # A newline marker closes the current line and starts a new one;
        # the marker itself is not kept.
        if tok == '\n ':
            result.append(current_line)
            current_line = []
            continue
        current_line.append(tok)

    # Whatever follows the last marker is the song's final line.
    result.append(current_line)

result

In [None]:
# Remove chorus markers from the data: these are single-item sentences whose
# only value is 'chorus'.
# The list is rebuilt rather than popped from while looping over it. Popping
# during iteration shifts the remaining items left, so the sentence right after
# each removed marker was never examined and back-to-back markers survived.
result = [sentence for sentence in result if sentence != ['chorus']]

In [None]:
# Case-fold every token in every sentence.
folded = []
for sentence in result:
    folded.append([word.casefold() for word in sentence])
result = folded
result

In [None]:
import gensim

# Train a Word2Vec model on the list of sentences in `result`, passing
# min_count=5 and workers=4.
# NOTE(review): no seed is passed here, so the trained vectors presumably
# differ between runs — confirm whether reproducibility matters.
model = gensim.models.Word2Vec(result, min_count=5, workers=4)

In [None]:
# Look up the trained vector for 'hello' through the model's word vectors
# (`model.wv`). Indexing the model object directly is deprecated in gensim 3.x
# and not supported in gensim 4.
model.wv['hello']

### Use Doc2Vec to train embeddings on the data

### Create a Hierarchical Attention Network (HAN)