In [1]:
import re
import pandas as pd
import numpy as np

import gensim
from gensim.models import Word2Vec
from gensim.scripts.glove2word2vec import glove2word2vec
import gensim.downloader as api
from gensim.parsing.preprocessing import remove_stopwords

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# NLTK English stop-word corpus (the data behind nltk.corpus.stopwords).
nltk.download('stopwords')
# Tokenizer data; presumably what word_tokenize needs on recent NLTK releases — confirm.
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nccru\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\nccru\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [2]:
# Read only the first 1000 rows of the lyrics CSV (nrows=1000); the path is relative.
df = pd.read_csv('song_lyrics.csv', nrows=1000)

In [3]:
# df.head()

In [4]:
### clean up current lyrics
### remove chorus:, intro:, etc.
### stop words, punctuation

In [None]:
def remove_between_brackets(text):
    """Strip every ``[...]`` span, brackets included, from ``text``.

    Only square brackets are handled. Matching is non-greedy and stays on one
    line: ``.`` does not match a newline here, so a ``[`` whose closing ``]``
    sits on a later line is left untouched.
    """
    bracketed_span = re.compile(r'\[.*?\]')
    return bracketed_span.sub('', text)

In [6]:
def cleanse_lyrics(df):
    """Add cleaned and tokenized versions of the ``'lyrics'`` column to ``df``.

    Steps, in order: drop ``[...]`` section tags, lower-case, remove stop words
    with gensim's ``remove_stopwords``, then tokenize with NLTK's
    ``word_tokenize``.

    Args:
        df: DataFrame with a ``'lyrics'`` column. Missing values are treated as
            empty text so a single NaN row does not abort the whole run.

    Returns:
        The same ``df`` object (modified in place) with two new columns:
        ``'cleaned_lyrics'`` (str) and ``'tokenized_text'`` (list of tokens).
    """
    cleaned = (
        df['lyrics']
        .fillna('')  # re.sub / remove_stopwords raise on float NaN
        .apply(remove_between_brackets)
        .str.lower()
        .apply(remove_stopwords)
    )

    df['cleaned_lyrics'] = cleaned
    df['tokenized_text'] = cleaned.apply(word_tokenize)

    return df

In [7]:
# Adds the 'cleaned_lyrics' and 'tokenized_text' columns, then previews the first rows.
df = cleanse_lyrics(df)
df.head()

Unnamed: 0,title,tag,artist,year,views,features,lyrics,id,language_cld3,language_ft,language,cleaned_lyrics,tokenized_text
0,Killa Cam,rap,Cam'ron,2004,173166,"{""Cam\\'ron"",""Opera Steve""}","[Chorus: Opera Steve & Cam'ron]\nKilla Cam, Ki...",1,en,en,en,"killa cam, killa cam, cam killa cam, killa cam...","[killa, cam, ,, killa, cam, ,, cam, killa, cam..."
1,Can I Live,rap,JAY-Z,1996,468624,{},"[Produced by Irv Gotti]\n\n[Intro]\nYeah, hah,...",3,en,en,en,"yeah, hah, yeah, roc-a-fella invite somethin' ...","[yeah, ,, hah, ,, yeah, ,, roc-a-fella, invite..."
2,Forgive Me Father,rap,Fabolous,2003,4743,{},Maybe cause I'm eatin\nAnd these bastards fien...,4,en,en,en,maybe cause i'm eatin bastards fiend grub carr...,"[maybe, cause, i, 'm, eatin, bastards, fiend, ..."
3,Down and Out,rap,Cam'ron,2004,144404,"{""Cam\\'ron"",""Kanye West"",""Syleena Johnson""}",[Produced by Kanye West and Brian Miller]\n\n[...,5,en,en,en,"ugh, killa! baby! kanye, 1970s heron flow, huh...","[ugh, ,, killa, !, baby, !, kanye, ,, 1970s, h..."
4,Fly In,rap,Lil Wayne,2005,78271,{},"[Intro]\nSo they ask me\n""Young boy\nWhat you ...",6,en,en,en,"ask ""young boy gon' second time around? gon' c...","[ask, ``, young, boy, gon, ', second, time, ar..."


In [8]:
# # clean your sentences
# stopwords = [YOUR_STOPWORDS_HERE]
# cleaned_sentences = []
# for sentence in sentences:
#   cleaned = [word.lower() for word in sentence]
#   cleaned = [word for word in cleaned if word not in stopwords]
#   cleaned_sentences.append(cleaned)

# Collect the per-song token lists; this list is the corpus handed to the embedding functions below.
sentences = df['tokenized_text'].tolist()
# base_model = Word2Vec(vector_size=100, min_count=5)
# base_model.build_vocab(sentences)
# total_examples = base_model.corpus_count

In [9]:
# base_model.train(sentences, total_examples=total_examples, epochs=base_model.epochs) 

In [10]:
# list(w for w in base_model.wv.index_to_key)[:5]

In [11]:
# base_model.wv.vectors[:5]

In [12]:
def apply_word2vec(sentences, vector_size=100, min_count=5):
    """Train a Word2Vec model on tokenized lyrics.

    Args:
        sentences: Re-iterable collection of token lists (for example
            ``df['tokenized_text'].tolist()``). It is walked once to build the
            vocabulary and again during training, so a one-shot generator
            will not work.
        vector_size: Dimensionality of the learned vectors.
        min_count: Tokens seen fewer than this many times are dropped from the
            vocabulary.

    Returns:
        The ``Word2Vec`` model. Vectors are in ``model.wv.vectors``, row-aligned
        with the tokens in ``model.wv.index_to_key``.
    """
    base_model = Word2Vec(vector_size=vector_size, min_count=min_count)
    base_model.build_vocab(sentences)

    # build_vocab only allocates randomly initialised vectors; without a
    # training pass the returned embeddings say nothing about the corpus.
    # An empty vocabulary (tiny corpus / high min_count) cannot be trained, so
    # in that case the model is returned as built.
    if base_model.wv.index_to_key:
        base_model.train(
            sentences,
            total_examples=base_model.corpus_count,
            epochs=base_model.epochs,
        )
    return base_model

In [13]:
# Pass the token lists, not the DataFrame: iterating a DataFrame yields its
# column labels, which Word2Vec would then treat as the "sentences".
model = apply_word2vec(sentences)

### Embedding: GloVe

In [14]:
def apply_glove(sentences, model="glove-wiki-gigaword-100"):
    """Fine-tune GloVe-initialised embeddings on tokenized lyrics.

    Loads the pretrained vectors named by ``model`` through
    ``gensim.downloader``, builds a Word2Vec model over the union of the corpus
    vocabulary and the GloVe vocabulary, seeds every GloVe word with its
    pretrained vector, and then trains on ``sentences``.

    Args:
        sentences: Re-iterable collection of token lists.
        model: Name of a gensim-downloader vector set. The list of available
            names is printed on every call.

    Returns:
        The trained ``Word2Vec`` model.
    """
    print("Models available for use:")
    print(list(api.info()['models'].keys()))

    glove_model = api.load(model)

    ### initialize model
    # Use the pretrained dimensionality so the vectors can be copied in below,
    # whichever vector set was requested.
    base_model = Word2Vec(vector_size=glove_model.vector_size, min_count=1)
    base_model.build_vocab(sentences)
    # Captured now because the vocabulary update below rescans a one-sentence
    # "corpus" and would otherwise leave corpus_count describing that instead.
    total_examples = base_model.corpus_count

    # build_vocab expects an iterable of token lists. A flat list of words would
    # be read as one "sentence" per word and split into characters, so the whole
    # GloVe vocabulary is wrapped as a single sentence.
    base_model.build_vocab([list(glove_model.index_to_key)], update=True)

    # Seed the pretrained weights: the vocabulary update only adds the words,
    # their vectors would otherwise start from random values.
    key_to_index = base_model.wv.key_to_index
    matched = [
        (key_to_index[word], glove_row)
        for glove_row, word in enumerate(glove_model.index_to_key)
        if word in key_to_index
    ]
    if matched:
        target_rows, glove_rows = zip(*matched)
        base_model.wv.vectors[list(target_rows)] = glove_model.vectors[list(glove_rows)]

    base_model.train(sentences, total_examples=total_examples, epochs=base_model.epochs)

    return base_model

In [15]:
# Load the "glove-wiki-gigaword-100" vectors through gensim's downloader.
# NOTE(review): apply_glove() loads its own copy by name and no visible cell reads this global — confirm it is still needed.
glove_model = api.load("glove-wiki-gigaword-100")



In [17]:
# Bind the returned model to a name so later cells can use it; the trailing
# expression still displays it as the cell output.
glove_w2v_model = apply_glove(sentences=sentences)
glove_w2v_model

Models available for use:
['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


<gensim.models.word2vec.Word2Vec at 0x282a5cf6250>

In [18]:
### apply_glove and the embedding matrix are not related to each other. They are separate implementations; which one to use depends on the format needed downstream.

In [19]:
def create_glove_matrix(df, glove_path='glove.42B.300d.txt', embedding_dim=300):
    """Build an embedding matrix of GloVe vectors for the lyric vocabulary.

    A Keras ``Tokenizer`` is fitted on ``df['cleaned_lyrics']``; row ``i`` of the
    result holds the GloVe vector of the word whose tokenizer index is ``i``.
    Rows for words missing from the GloVe file stay all-zero, as does row 0
    if tokenizer indices start at 1 (which the ``+ 1`` sizing assumes).

    Args:
        df: DataFrame with a ``'cleaned_lyrics'`` column of strings.
        glove_path: Plain-text GloVe file with one ``word v1 ... vN`` entry per
            line, components separated by single spaces.
        embedding_dim: Number of components per vector in ``glove_path``.

    Returns:
        ``numpy.ndarray`` of shape ``(len(word_index) + 1, embedding_dim)``.
    """
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(df['cleaned_lyrics'])
    word_index = tokenizer.word_index

    embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))

    # Stream the file and keep only vectors for words in the corpus vocabulary,
    # instead of first loading every GloVe entry into an in-memory dict.
    with open(glove_path, encoding='utf-8') as glove_file:
        for line in glove_file:
            # Split from the right so a token that itself contains whitespace
            # characters cannot shift the numeric fields.
            fields = line.rstrip().rsplit(' ', embedding_dim)
            if len(fields) != embedding_dim + 1:
                continue  # malformed or wrong-dimension line
            row = word_index.get(fields[0])
            if row is not None:
                embedding_matrix[row] = np.asarray(fields[1:], dtype='float32')

    return embedding_matrix

### Embedding: BERT and DistilBERT

In [23]:
#import library
from transformers import DistilBertTokenizer, DistilBertModel
import torch
# import numpy as np

#load DistilBERT tokenizer and a pretrained model to avoid training from scratch
# NOTE(review): this rebinds the global `model` that an earlier cell assigned from apply_word2vec — confirm nothing later expects the Word2Vec object.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')

  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [24]:
#tokenize and obtain embeddings
def get_lyrics_embedding(lyrics):
    """Return the DistilBERT first-token ([CLS]) embedding of ``lyrics``.

    Relies on the notebook-level ``tokenizer`` and ``model``. The input is
    truncated to at most 512 tokens and encoded as PyTorch tensors. The hidden
    state at position 0 is moved to the CPU, stripped of size-1 dimensions and
    returned as a NumPy array.
    """
    encoded = tokenizer(
        lyrics,
        truncation=True,
        padding=True,
        max_length=512,
        return_tensors="pt",
    )

    # Inference only, so autograd tracking is switched off for the forward pass.
    with torch.no_grad():
        forward_result = model(**encoded)

    # Position 0 along the sequence axis is the [CLS] token.
    first_token_state = forward_result.last_hidden_state[:, 0]
    return first_token_state.detach().cpu().squeeze().numpy()

# Embed each song with one get_lyrics_embedding call per row, using the raw 'lyrics' column (not 'cleaned_lyrics') of df.
embeddings = np.array([get_lyrics_embedding(lyric) for lyric in df['lyrics']])


### Bag of Words Model

### TF-IDF

### Question: Are we using Keras / PyTorch?
The answer may change the format and implementation of the embedding methods above.