In [1]:
%%capture
from tqdm.notebook import tqdm
tqdm().pandas()

In [2]:
import pandas as pd
import numpy as np
import sklearn 
import matplotlib.pyplot as plt

In [3]:
from ftfy import fix_text
import threading
from concurrent.futures import ThreadPoolExecutor
import os

filename = 'data/fulldataset.csv'
sample_frac = 1.0

df = pd.read_csv(filename,quotechar="\"")
df = df.sample(frac=sample_frac, random_state=1)
df.reset_index(inplace=True, drop=True)
num_rows = len(df.index)
print("num rows", num_rows)

num rows 516174


In [4]:
# sanity check cell, skip this cell (and the next one) if training on full data
df = pd.read_csv('sanity_check.csv')
sample_frac = 1.0
df

Unnamed: 0,Band,Lyrics,Song
0,Mariah Carry,I don't want a lot for Christmas\nThere is jus...,All I Want for Christmas is You
1,Wham!,Last Christmas I gave you my heart\nBut the ve...,Last Christmas
2,Idk,"Jingle bells, jingle bells, jingle all the way...",Jingle Bells
3,Johnny Mathis,"Jingle bell, jingle bell Jingle bell rock Jing...",Jingle Bell Rock
4,Lee Greenwood,If tomorrow all the things were gone\nI worked...,God Bless the USA
5,Samuel Ward,Oh beautiful for heroes proved\nIn liberating ...,America the Beautiful
6,Francis Scott Key,"Oh, say can you see by the dawn’s early light\...",Star Spangled Banner


In [5]:
pickle_filename = f"data/lyrec_df_{sample_frac}.pkl"
if os.path.isfile(pickle_filename):
    print("Found pickle file for current config, using that")
    df = pd.read_pickle(pickle_filename)
else:
    print("Cleaning dataset, no cache available")
    pbar = tqdm(total=(num_rows))
    def clean(row):
        try:
            lyric = fix_text(df['Lyrics'][row])
            band = fix_text(df['Band'][row])
            song = fix_text(df['Song'][row])
            df['Lyrics'][row] = lyric
            df['Band'][row] = band
            df['Song'][row] = song
        except Exception:
            print("Error cleaning up row:", row,"dropping instead")
            df.drop(index=row)
        pbar.update(1)

    executor = ThreadPoolExecutor(10)
    threads = []
    for row in df.index:
        t = executor.submit(clean, (row))
        threads.append(t)

    for t in threads:
        t.result()

df.reset_index(inplace=True, drop=True)
df

Cleaning dataset, no cache available


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=516174.0), HTML(value='')))


Error cleaning up row: 18082 dropping instead


IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Error cleaning up row: 457340 dropping instead
Error cleaning up row: 515163 dropping instead


Unnamed: 0,Band,Lyrics,Song
0,Reba McEntire,The greatest man I never knew lived just down ...,The Greatest Man I Never Knew
1,Vanessa Carlton,Stealing glances through the key hole\nIn a br...,Hear the Bells
2,Snoop Dogg,"What it do,\nComing at you live, it's your boy...",Pronto
3,Snoop Dogg,Another smoke session up in this motherfucker\...,Buss 'N Rocks
4,The Shirelles,"Everybody, twist\n\nThey're twisting in Clevel...",Twisting USA
...,...,...,...
516169,Erika Paul,Sassy Sarah On The Saxophone\n\nWords & Music ...,Sassy Sarah on the Saxophone [Funk Fusion]
516170,Billy Currington,This porch light\nI can keep it on all night\n...,Jonesin'
516171,Barbra Streisand,I'm dreaming tonight\nOf a place I love\nEven ...,I'll Be Home for Christmas
516172,Daniel Johnston,"Without you, I'll be doing fine\nWithout you, ...",Without You


In [6]:
# save dataset to pickle
df.to_pickle(pickle_filename)

## Attempt at BERT pairwise similarities

In [None]:
from sentence_transformers import SentenceTransformer

sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')
document_embeddings = None
got_from_file = False
filepath = f"data/lyrec_embeddings_{sample_frac}.pkl"

if os.path.isfile(filepath):
    got_from_file = True
    with open(filepath, 'rb') as f:
        document_embeddings = np.load(f)
else:
    document_embeddings = sbert_model.encode(df['Lyrics'], show_progress_bar=True, device="cpu")

if not got_from_file:
    with open(filepath, 'wb') as f:
        np.save(document_embeddings, filepath)      

HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=16131.0), HTML(value='')))

In [6]:
from sklearn.metrics import pairwise
print(document_embeddings.shape)
pairwise_similarities=pairwise.cosine_similarity(document_embeddings)
pairwise_differences=pairwise.euclidean_distances(document_embeddings)

def most_similar(doc_id,similarity_matrix,matrix):
    print (f'Song: {df.iloc[doc_id]["Song"]} - {df.iloc[doc_id]["Band"]}')
    print ('\n')
    print ('Similar Songs:')
    if matrix=='Cosine Similarity':
        similar_ix=np.argsort(similarity_matrix[doc_id])[::-1]
    elif matrix=='Euclidean Distance':
        similar_ix=np.argsort(similarity_matrix[doc_id])
    for ix in similar_ix[:5]:
#         if ix==doc_id:
#             continue
        print('\n')
        print (f'Similar Song: {df.iloc[ix]["Song"]} - {df.iloc[ix]["Band"]}')
        print (f'{matrix} : {similarity_matrix[doc_id][ix]}')

for i in range(7):
    most_similar(i,pairwise_similarities,'Cosine Similarity')
    print("---")


NameError: name 'document_embeddings' is not defined

## Bag of Words

In [8]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
import re
from operator import itemgetter
from sklearn.metrics import pairwise
# Define dictionary utility function
BAG_SIZE = 100
def get_top_values(d,N = BAG_SIZE):
    return list(sorted(d.items(), key = itemgetter(1), reverse = True)[:N])

# Preprocess songs

# set stop words in english
stop_words = set(stopwords.words('english')) 

# creating  "master" dictionary
word2count = {}

for row in df.index:
    song = df['Lyrics'][row]

    # remove stopwords and clean up each song
    song = song.lower()
    song = re.sub(r'\W', ' ', song)
    song = re.sub(r'\s+', ' ', song)
    song = re.sub(r'\r|\n', ' ', song)

    words = nltk.word_tokenize(song)
    for word in words:

        if word not in stop_words:

            if word not in word2count.keys():
                word2count[word] = 1
            else:
                word2count[word] += 1

word2count_list = get_top_values(word2count)
word2count = dict(word2count_list)
word_ranks = {w2c[0]: r for r, w2c in enumerate(word2count_list)}

# Now pass through again and create vectors

song_counts = np.zeros([len(df.index), BAG_SIZE])

for song_index in df.index:

    song = df['Lyrics'][song_index]

    # remove stopwords and clean up each song
    song = song.lower()
    song = re.sub(r'\W', ' ', song)
    song = re.sub(r'\s+', ' ', song)
    song = re.sub(r'\r|\n', ' ', song)

    words = nltk.word_tokenize(song)
    for word in words:
        if word in word_ranks:
            word_idx = word_ranks[word]
            song_counts[song_index][word_idx] += 1

print(word2count)
print(song_counts)


pairwise_similarities=pairwise.cosine_similarity(song_counts)
pairwise_differences=pairwise.euclidean_distances(song_counts)


def most_similar(doc_id,similarity_matrix,matrix):
    print (f'Song: {df.iloc[doc_id]["Song"]} - {df.iloc[doc_id]["Band"]}')
    print ('Similar Songs:')
    
    if matrix=='Cosine Similarity':
        similar_ix=np.argsort(similarity_matrix[doc_id])[::-1]
    elif matrix=='Euclidean Distance':
        similar_ix=np.argsort(similarity_matrix[doc_id])
    for ix in similar_ix[:5]:
#         if ix==doc_id:
#             continue
        print('\n')
        print (f'Similar Song: {df.iloc[ix]["Song"]} - {df.iloc[ix]["Band"]}')
        print (f'{matrix} : {similarity_matrix[doc_id][ix]}')

for i in range(7):
    most_similar(i,pairwise_similarities,'Cosine Similarity')
    print("---")
    #most_similar(0,pairwise_differences,'Euclidean Distance')


{'jingle': 49, 'christmas': 23, 'oh': 21, 'bells': 20, 'gave': 20, 'bell': 19, 'want': 16, 'one': 15, 'baby': 14, 'horse': 14, 'sleigh': 13, 'special': 13, 'away': 12, 'know': 11, 'fun': 11, 'next': 10, 'time': 10, 'heart': 9, 'someone': 9, 'god': 9, 'way': 9, 'ride': 9, 'open': 9, 'year': 8, 'give': 8, 'land': 8, 'america': 8, 'day': 7, 'cause': 7, 'last': 7, 'save': 7, 'love': 7, 'rock': 7, 'free': 7, 'sea': 7, 'right': 6, 'tears': 6, 'still': 6, 'er': 6, 'home': 6, 'make': 5, 'wish': 5, 'shining': 5, 'bright': 5, 'stand': 5, 'need': 4, 'could': 4, 'ever': 4, 'go': 4, 'ring': 4, 'sing': 4, 'today': 4, 'american': 4, 'star': 4, 'spangled': 4, 'banner': 4, 'wave': 4, 'brave': 4, 'lot': 3, 'underneath': 3, 'come': 3, 'true': 3, 'snow': 3, 'tonight': 3, 'air': 3, 'see': 3, 'fool': 3, 'night': 3, 'proud': 3, 'least': 3, 'forget': 3, 'men': 3, 'died': 3, 'gladly': 3, 'defend': 3, 'doubt': 3, 'bless': 3, 'u': 3, 'say': 3, 'may': 3, 'thy': 3, 'shed': 3, 'grace': 3, 'thee': 3, 'lord': 3, 'thi

In [None]:
# save matrix as pickle
with open(f'data/lyrec_similarity_mat_{sample_frac}.pkl', 'wb') as f:
    np.save(f, pairwise_similarities)