In [None]:
import re
import pandas as pd
import numpy as np

import gensim
from gensim.models import Word2Vec
from gensim.scripts.glove2word2vec import glove2word2vec
import gensim.downloader as api
from gensim.parsing.preprocessing import remove_stopwords

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Download the NLTK data packages used by this notebook: the 'stopwords' corpus
# (read through nltk.corpus.stopwords) and 'punkt_tab'
# (presumably the tokenizer tables word_tokenize needs -- TODO confirm).
nltk.download('stopwords')
nltk.download('punkt_tab')

In [None]:
# Read only the first 1000 rows of the lyrics CSV (presumably to keep iteration fast;
# raise or drop nrows for a full run).
df = pd.read_csv('song_lyrics.csv', nrows=1000)

In [None]:
# df.head()

In [None]:
### clean up current lyrics
### remove chorus:, intro:, etc.
### stop words, punctuation

In [None]:
def remove_between_brackets(text):
  """Strip square-bracketed annotations such as ``[Chorus]`` from a lyric string.

  Every ``[...]`` span (brackets included) is removed with a non-greedy match,
  so several tags in one string are removed independently. The pattern does
  not cross newlines, and other bracket types -- (), {} -- are left untouched.

  params: text -> raw lyric string; missing values (None / NaN) are accepted
  returns: text without the bracketed spans, or '' when text is not a string
  """
  # pandas represents empty CSV cells as float NaN, which re.sub would reject
  # with a TypeError; treat any non-string value as "no lyrics".
  if not isinstance(text, str):
    return ''
  return re.sub(r'\[.*?\]', '', text)

In [None]:
def cleanse_lyrics(df):
    """Add cleaned and tokenized versions of the 'lyrics' column to df.

    Steps, in order: drop bracketed section tags with remove_between_brackets,
    lower-case, remove stop words with gensim's remove_stopwords, then split
    into tokens with NLTK's word_tokenize. Punctuation is not removed.

    params: df -> DataFrame with a 'lyrics' column
    returns: the same DataFrame object (modified in place) with two added
             columns, 'cleaned_lyrics' and 'tokenized_text'
    """
    # Stop-word removal is done by gensim's remove_stopwords; the NLTK stop-word
    # list that used to be loaded into a local here was never read, so it is gone.
    cleaned = (
        df['lyrics']
        .apply(remove_between_brackets)
        .str.lower()                      # lower-case before stop-word matching
        .apply(remove_stopwords)
    )
    df['cleaned_lyrics'] = cleaned
    df['tokenized_text'] = cleaned.apply(word_tokenize)

    return df

In [None]:
# Add the 'cleaned_lyrics' and 'tokenized_text' columns, then preview the first rows.
df = cleanse_lyrics(df)
df.head()

In [None]:
# # clean your sentences
# stopwords = [YOUR_STOPWORDS_HERE]
# cleaned_sentences = []
# for sentence in sentences:
#   cleaned = [word.lower() for word in sentence]
#   cleaned = [word for word in cleaned if word not in stopwords]
#   cleaned_sentences.append(cleaned)

# collect the per-song token lists into a plain Python list; this is the corpus
# passed to the embedding functions below (no model is built on this line)
sentences = df['tokenized_text'].tolist()
# base_model = Word2Vec(vector_size=100, min_count=5)
# base_model.build_vocab(sentences)
# total_examples = base_model.corpus_count

In [None]:
# base_model.train(sentences, total_examples=total_examples, epochs=base_model.epochs) 

In [None]:
# list(w for w in base_model.wv.index_to_key)[:5]

In [None]:
# base_model.wv.vectors[:5]

In [None]:
def apply_word2vec(sentences, vector_size=100, min_count=5):
  """
  apply_word2vec
  Build and train a gensim Word2Vec model on tokenized lyrics.

  params: sentences -> 'tokenized_text': a re-iterable of token lists, e.g.
                       df['tokenized_text'].tolist(). A DataFrame that has a
                       'tokenized_text' column is also accepted.
          vector_size -> dimensionality of the learned vectors (default 100)
          min_count -> words seen fewer times than this are dropped (default 5)
  returns: trained word2vec model

  Access vectors from base_model.wv.vectors and base_model.wv.index_to_key
  """
  # Iterating a DataFrame yields its column labels, not its rows, so a frame
  # passed straight through would produce a vocabulary made of the characters
  # of the column names. Pull the token lists out instead.
  if isinstance(sentences, pd.DataFrame):
    sentences = sentences['tokenized_text'].tolist()

  base_model = Word2Vec(vector_size=vector_size, min_count=min_count)
  base_model.build_vocab(sentences)
  # build_vocab only creates the vocabulary and randomly initialised weights;
  # the vectors carry no meaning until train() has run.
  base_model.train(sentences, total_examples=base_model.corpus_count, epochs=base_model.epochs)
  return base_model

In [None]:
# Pass the token lists, not the DataFrame: iterating a DataFrame yields its
# column names, which Word2Vec would treat as sequences of characters.
# NOTE: the name `model` is rebound to the DistilBERT model further down.
model = apply_word2vec(sentences)

### Embedding: GloVe

In [None]:
def apply_glove(sentences, model="glove-wiki-gigaword-100"):
    """Build a Word2Vec model seeded with pretrained GloVe vectors and fine-tune it.

    params: sentences -> re-iterable of token lists, e.g. df['tokenized_text'].tolist()
            model -> name of a GloVe model known to gensim's downloader API
    returns: gensim Word2Vec model whose vocabulary is the corpus vocabulary
             plus the GloVe vocabulary; GloVe words start from their pretrained
             vectors and corpus words are then trained on `sentences`
    """
    print("Models available for use:")
    print(list(gensim.downloader.info()['models'].keys()))

    glove_model = api.load(model)

    ### initialize model
    # Match the pretrained dimensionality so `model` names other than the
    # 100-d default also work.
    base_model = Word2Vec(vector_size=glove_model.vector_size, min_count=1)
    base_model.build_vocab(sentences)
    # Saved now because the second build_vocab call below resets corpus_count.
    total_examples = base_model.corpus_count

    ### add the GloVe vocabulary
    # build_vocab expects an iterable of sentences (lists of tokens). A flat word
    # list makes every word a "sentence" and adds its characters to the
    # vocabulary, so wrap the whole GloVe word list as a single sentence.
    base_model.build_vocab([list(glove_model.index_to_key)], update=True)

    ### copy the pretrained weights
    # Extending the vocabulary only allocates randomly initialised rows; without
    # this step the GloVe vectors themselves would never reach the model.
    key_to_index = base_model.wv.key_to_index
    for word in glove_model.index_to_key:
        row = key_to_index.get(word)
        if row is not None:
            base_model.wv.vectors[row] = glove_model[word]

    base_model.train(sentences, total_examples=total_examples, epochs=base_model.epochs)

    return base_model

In [None]:
# Load the pretrained GloVe vectors named "glove-wiki-gigaword-100" through gensim's downloader API.
# NOTE(review): apply_glove loads its own copy internally, so this module-level
# object is not the one that function uses.
glove_model = api.load("glove-wiki-gigaword-100")

In [None]:
# Build the GloVe-extended Word2Vec model from the lyric token lists.
# The returned model is only displayed as the cell output; it is not stored in a variable.
apply_glove(sentences = sentences)

In [None]:
### apply_glove and create_glove_matrix are not related to each other. They are separate implementations; use whichever one produces the format you need.

In [None]:
def create_glove_matrix(df, glove_path='glove.42B.300d.txt', embedding_dim=300):
    """Build an embedding matrix for the lyric vocabulary from a GloVe text file.

    A Keras Tokenizer is fitted on df['cleaned_lyrics']; row i of the result
    holds the GloVe vector of the word the tokenizer indexed as i. Row 0 and
    the rows of words missing from the GloVe file stay all-zero.

    params: df -> DataFrame with a 'cleaned_lyrics' text column
            glove_path -> GloVe file in text format, one
                          "<word> <v1> ... <vN>" entry per line
            embedding_dim -> number of vector components per entry in glove_path
    returns: numpy array of shape (vocab_size + 1, embedding_dim)

    Only the matrix is returned. Padded input sequences are not produced here;
    the earlier in-function padding (hard-coded maxlen) was computed and then
    discarded, so it has been removed.
    """
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(df['cleaned_lyrics'])
    word_index = tokenizer.word_index

    # Tokenizer indices start at 1 (0 is reserved), hence the extra row.
    embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))

    # Stream the file and keep only vectors for words in our vocabulary,
    # instead of first loading every GloVe entry into memory.
    with open(glove_path, encoding='utf-8') as f:
        for line in f:
            # Split from the right so a token that itself contains whitespace
            # cannot shift the numeric columns.
            parts = line.rstrip().rsplit(' ', embedding_dim)
            if len(parts) != embedding_dim + 1:
                continue  # blank or malformed line
            row = word_index.get(parts[0])
            if row is not None:
                embedding_matrix[row] = np.asarray(parts[1:], dtype='float32')

    return embedding_matrix

### Embedding: BERT and DistilBERT

In [None]:
#import library
from transformers import DistilBertTokenizer, DistilBertModel
import torch
# import numpy as np

#load the DistilBERT tokenizer and pretrained weights to avoid training from scratch
#NOTE(review): this rebinds the module-level name `model`, which earlier in the
#notebook held the result of apply_word2vec
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')

In [None]:
#tokenize and obtain embeddings
def get_lyrics_embedding(lyrics):
    """Return the DistilBERT first-token ([CLS]) embedding of lyrics as a NumPy array.

    Relies on the module-level `tokenizer` and `model`. Input longer than
    512 tokens is truncated.
    """
    encoded = tokenizer(
        lyrics,
        truncation=True,
        padding=True,
        max_length=512,       # DistilBERT has a max token limit of 512
        return_tensors="pt",  # PyTorch tensors, the input format the model takes
    )

    # Inference only: gradients are not needed, so skip tracking them to save memory
    with torch.no_grad():
        result = model(**encoded)

    # Position 0 of the last hidden state is the [CLS] token. Detach it, move it
    # to the CPU and drop size-1 dimensions so it converts cleanly to NumPy.
    first_token = result.last_hidden_state[:, 0]
    return first_token.detach().cpu().squeeze().numpy()

#embed each song's lyrics one at a time and stack the results into one array;
#df holds only the rows loaded with nrows in the read_csv cell
#NOTE(review): this reads the raw df['lyrics'] column, not 'cleaned_lyrics' -- confirm that is intended
embeddings = np.array([get_lyrics_embedding(lyric) for lyric in df['lyrics']])


### Bag of Words Model

### TF-IDF

### Question: Are we using Keras / PyTorch?
This may change the format and implementation of the current method of embedding.