In [15]:
import pandas as pd
from gensim.models import Word2Vec
import gensim.downloader as api
import nltk
from nltk.tokenize import word_tokenize

# Fetch the 'punkt_tab' NLTK resource (presumably the tokenizer data word_tokenize relies on — confirm).
# The call is a no-op when the package is already up-to-date, as the cell output shows.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\pokem\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [6]:
# Load only the first 10,000 rows of the lyrics CSV (path is relative to the notebook's working directory)
df = pd.read_csv('song_lyrics.csv', nrows=10000)

In [7]:
# Preview the first five rows to check the columns that were loaded
df.head()

Unnamed: 0,title,tag,artist,year,views,features,lyrics,id,language_cld3,language_ft,language
0,Killa Cam,rap,Cam'ron,2004,173166,"{""Cam\\'ron"",""Opera Steve""}","[Chorus: Opera Steve & Cam'ron]\nKilla Cam, Ki...",1,en,en,en
1,Can I Live,rap,JAY-Z,1996,468624,{},"[Produced by Irv Gotti]\n\n[Intro]\nYeah, hah,...",3,en,en,en
2,Forgive Me Father,rap,Fabolous,2003,4743,{},Maybe cause I'm eatin\nAnd these bastards fien...,4,en,en,en
3,Down and Out,rap,Cam'ron,2004,144404,"{""Cam\\'ron"",""Kanye West"",""Syleena Johnson""}",[Produced by Kanye West and Brian Miller]\n\n[...,5,en,en,en
4,Fly In,rap,Lil Wayne,2005,78271,{},"[Intro]\nSo they ask me\n""Young boy\nWhat you ...",6,en,en,en


In [None]:
### TODO: clean up the current lyrics
### TODO: remove stop words

In [10]:
# Run NLTK's word_tokenize over each song's lyrics and keep the token lists in a new column
df['tokenized_text'] = df["lyrics"].map(word_tokenize)
# Display the new column as the cell's output
df['tokenized_text']

0       [[, Chorus, :, Opera, Steve, &, Cam'ron, ], Ki...
1       [[, Produced, by, Irv, Gotti, ], [, Intro, ], ...
2       [Maybe, cause, I, 'm, eatin, And, these, basta...
3       [[, Produced, by, Kanye, West, and, Brian, Mil...
4       [[, Intro, ], So, they, ask, me, '', Young, bo...
                              ...                        
9995    [[, Hook, ], Wish, I, would, a, died, for, you...
9996    [Yeah, Studio, rap, productions, (, this, is, ...
9997    [[, Verse, 1, ], Hey, it, 's, the, Martin, and...
9998    [Yeah, Some, people, wonder, ,, ya, know, ?, T...
9999    [[, Intro, :, Bizzy, Bone, ], Rest, in, peace,...
Name: tokenized_text, Length: 10000, dtype: object

In [19]:
# Disabled template for an optional cleaning pass (lower-casing + stop-word removal).
# NOTE(review): it reads `sentences`, which is only assigned further down in this cell,
# and `stopwords` still holds a placeholder — both need resolving before enabling it.
# # clean your sentences
# stopwords = [YOUR_STOPWORDS_HERE]
# cleaned_sentences = []
# for sentence in sentences:
#   cleaned = [word.lower() for word in sentence]
#   cleaned = [word for word in cleaned if word not in stopwords]
#   cleaned_sentences.append(cleaned)

# build a word2vec model on your dataset
# Only the vocabulary is built in this cell; no train() call is made here.
sentences = df['tokenized_text'].tolist()  # one token list per song
base_model = Word2Vec(vector_size=300, min_count=5)
base_model.build_vocab(sentences)
total_examples = base_model.corpus_count  # kept so a later train() call can pass it as total_examples

In [None]:
def apply_word2vec(sentences, vector_size=300, min_count=5):
  """
  apply_word2vec
  Build the vocabulary AND train a Word2Vec model on tokenized sentences.

  params: sentences -> iterable of token lists (e.g. the 'tokenized_text' column)
          vector_size -> dimensionality of the word vectors (default 300)
          min_count -> minimum word count passed to Word2Vec (default 5)
  returns: trained word2vec model
  """
  # The corpus is iterated more than once (vocabulary scan, then training epochs),
  # so materialise it in case the caller handed in a one-shot iterator.
  corpus = list(sentences)

  # Supplying `sentences` to the constructor builds the vocabulary and trains in one go;
  # building the vocabulary alone would leave the vectors at their random initial values.
  return Word2Vec(sentences=corpus, vector_size=vector_size, min_count=min_count)

In [12]:
# Pass the tokenized lyrics (one token list per song), not the whole DataFrame:
# iterating a DataFrame yields its column names, so the vocabulary would be built from those instead.
model = apply_word2vec(df['tokenized_text'])

### vector representation of the word 'sentence'
# NOTE(review): this lookup raises KeyError if 'sentence' is not in the model's vocabulary
vector = model.wv["sentence"]
print(vector)

### Embedding: GloVe

In [None]:
# Load the pretrained GloVe vectors here so this cell does not rely on a name that was never defined
glove_vectors = api.load("glove-twitter-25")

# add GloVe's vocabulary to the model
# (gensim 4 exposes the vocabulary as `key_to_index`; the old `.vocab` attribute was removed)
# NOTE(review): only the words are passed in here, not GloVe's pretrained weights. Every GloVe word
# also appears just once in this single pseudo-sentence, so with min_count=5 they may be pruned — confirm.
base_model.build_vocab([list(glove_vectors.key_to_index.keys())], update=True)

# train on your data
base_model.train(sentences, total_examples=total_examples, epochs=base_model.epochs)
base_model_wv = base_model.wv

In [16]:
# Load the "glove-twitter-25" pretrained vectors through gensim's downloader API
model_glove_twitter = api.load("glove-twitter-25")



In [17]:
# Sanity check: the 10 entries most similar to "pelosi", returned as (word, similarity score) pairs
model_glove_twitter.most_similar("pelosi",topn=10)

[('clegg', 0.9653650522232056),
 ('miliband', 0.9515050053596497),
 ('bachmann', 0.9484400749206543),
 ('mcconnell', 0.9416398406028748),
 ('carney', 0.9340257048606873),
 ('coulter', 0.9311323165893555),
 ('boehner', 0.9286302328109741),
 ('santorum', 0.9269059300422668),
 ('farage', 0.9193653464317322),
 ('mourdock', 0.9186689853668213)]

### Embedding: DistilBERT

In [None]:
#import library
from transformers import DistilBertTokenizer, DistilBertModel
import torch
import numpy as np

#load DistilBERT tokenizer and a pretrained model to avoid training from scratch
# NOTE(review): `model` is also the name given to the Word2Vec model in an earlier cell;
# this assignment rebinds it, so the Word2Vec model is no longer reachable under that name.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')

In [None]:
#tokenize and obtain embeddings
def get_lyrics_embedding(lyrics):
    """
    Embed lyrics with DistilBERT and return the vector at the first token position.

    params: lyrics -> text handed to the tokenizer (truncated to at most 512 tokens)
    returns: NumPy array taken from position 0 of the model's last hidden state,
             with any size-1 dimensions squeezed out
    """
    encoded = tokenizer(
        lyrics,
        return_tensors = "pt", #ask the tokenizer for PyTorch tensors to feed straight into the model
        max_length = 512, #DistilBERT has a max token limit of 512
        truncation = True,
        padding = True,
        )

    #inference only, so skip gradient tracking to save memory
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state

    #position 0 of every sequence is the CLS token
    first_token = hidden_states[:, 0]

    #detach -> move to CPU -> drop extra dimensions -> convert to a NumPy array
    return first_token.detach().cpu().squeeze().numpy()

#embed each loaded lyric one at a time, then collect the vectors into a single NumPy array
embeddings = np.array(list(map(get_lyrics_embedding, df['lyrics'])))
