In [1]:
%%capture
from tqdm.notebook import tqdm
tqdm().pandas()

In [2]:
import pandas as pd
import numpy as np
import sklearn 
import matplotlib.pyplot as plt

In [3]:
from ftfy import fix_text
import threading
from concurrent.futures import ThreadPoolExecutor
import os

filename = 'data/fulldataset.csv'
sample_frac = 1.0
using_sanity_check = False

df = pd.read_csv(filename,quotechar="\"")
df = df.sample(frac=sample_frac, random_state=1)
df.reset_index(inplace=True, drop=True)
num_rows = len(df.index)
print("num rows", num_rows)

num rows 516174


In [4]:
# sanity check cell, skip this cell if training on full data
if using_sanity_check:
    df = pd.read_csv('sanity_check.csv')
    sample_frac = 1.0
df

Unnamed: 0,Band,Lyrics,Song
0,Reba McEntire,The greatest man I never knew lived just down ...,The Greatest Man I Never Knew
1,Vanessa Carlton,Stealing glances through the key hole\r\nIn a ...,Hear the Bells
2,Snoop Dogg,"What it do,\r\nComing at you live, it's your b...",Pronto
3,Snoop Dogg,Another smoke session up in this motherfucker\...,Buss 'N Rocks
4,The Shirelles,"Everybody, twist\r\n\r\nThey're twisting in Cl...",Twisting USA
...,...,...,...
516169,Erika Paul,Sassy Sarah On The Saxophone\r\n\r\nWords & Mu...,Sassy Sarah on the Saxophone [Funk Fusion]
516170,Billy Currington,This porch light\r\nI can keep it on all night...,Jonesin'
516171,Barbra Streisand,I'm dreaming tonight\r\nOf a place I love\r\nE...,I'll Be Home for Christmas
516172,Daniel Johnston,"Without you, I'll be doing fine\r\nWithout you...",Without You


In [5]:
if not using_sanity_check:
    pickle_filename = f"data/lyrec_df_{sample_frac}.pkl"
    if os.path.isfile(pickle_filename):
        print("Found pickle file for current config, using that")
        df = pd.read_pickle(pickle_filename)
    else:
        print("Cleaning dataset, no cache available")
        pbar = tqdm(total=(num_rows))
        def clean(row):
            try:
                lyric = fix_text(df['Lyrics'][row])
                band = fix_text(df['Band'][row])
                song = fix_text(df['Song'][row])
                df['Lyrics'][row] = lyric
                df['Band'][row] = band
                df['Song'][row] = song
            except Exception:
                print("Error cleaning up row:", row,"dropping instead")
                df.drop(index=row)
            pbar.update(1)

        executor = ThreadPoolExecutor(10)
        threads = []
        for row in df.index:
            t = executor.submit(clean, (row))
            threads.append(t)

        for t in threads:
            t.result()

df.reset_index(inplace=True, drop=True)
df['Lyrics'].astype(str)
df

Found pickle file for current config, using that


Unnamed: 0,Band,Lyrics,Song
0,Reba McEntire,The greatest man I never knew lived just down ...,The Greatest Man I Never Knew
1,Vanessa Carlton,Stealing glances through the key hole\nIn a br...,Hear the Bells
2,Snoop Dogg,"What it do,\nComing at you live, it's your boy...",Pronto
3,Snoop Dogg,Another smoke session up in this motherfucker\...,Buss 'N Rocks
4,The Shirelles,"Everybody, twist\n\nThey're twisting in Clevel...",Twisting USA
...,...,...,...
516169,Erika Paul,Sassy Sarah On The Saxophone\n\nWords & Music ...,Sassy Sarah on the Saxophone [Funk Fusion]
516170,Billy Currington,This porch light\nI can keep it on all night\n...,Jonesin'
516171,Barbra Streisand,I'm dreaming tonight\nOf a place I love\nEven ...,I'll Be Home for Christmas
516172,Daniel Johnston,"Without you, I'll be doing fine\nWithout you, ...",Without You


In [6]:
# save dataset to pickle
if not using_sanity_check:
    df.to_pickle(pickle_filename)
df.dtypes

Band      object
Lyrics    object
Song      object
dtype: object

## Attempt at BERT pairwise similarities

In [7]:
from sentence_transformers import SentenceTransformer

sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')
document_embeddings = None
got_from_file = False
filepath = f"data/lyrec_embeddings_{sample_frac}.pkl.npy"

if os.path.isfile(filepath):
    got_from_file = True
    document_embeddings = np.load(filepath)
else:
    document_embeddings = sbert_model.encode(df['Lyrics'], show_progress_bar=True, device="cpu")

if not got_from_file:
    np.save(filepath, document_embeddings)      

In [9]:
from sklearn.metrics import pairwise
print(document_embeddings.shape)

# generate similarities on subset of dataframe (doing the full 500K requires like 2000 GB of RAM!!!)
subset_df = df.sample(frac=0.02, random_state=1)
subset_embeddings = document_embeddings[subset_df.index]
print(subset_embeddings.shape)

pairwise_similarities=pairwise.cosine_similarity(subset_embeddings)
# pairwise_differences=pairwise.euclidean_distances(subset_embeddings)

def most_similar(doc_id,similarity_matrix,matrix):
    print (f'Song: {df.iloc[doc_id]["Song"]} - {df.iloc[doc_id]["Band"]}')
    print ('\n')
    print ('Similar Songs:')
    if matrix=='Cosine Similarity':
        similar_ix=np.argsort(similarity_matrix[doc_id])[::-1]
    elif matrix=='Euclidean Distance':
        similar_ix=np.argsort(similarity_matrix[doc_id])
    for ix in similar_ix[:5]:
#         if ix==doc_id:
#             continue
        print('\n')
        print (f'Similar Song: {df.iloc[ix]["Song"]} - {df.iloc[ix]["Band"]}')
        print (f'{matrix} : {similarity_matrix[doc_id][ix]}')

for i in range(7):
    most_similar(i,pairwise_similarities,'Cosine Similarity')
    print("---")


(516174, 768)
(10323, 768)
Song: The Greatest Man I Never Knew - Reba McEntire


Similar Songs:


Similar Song: The Greatest Man I Never Knew - Reba McEntire
Cosine Similarity : 0.9999997615814209


Similar Song: Bombed - Mark Lanegan
Cosine Similarity : 0.8760876655578613


Similar Song: Baby You're a Rich Man - The Beatles
Cosine Similarity : 0.8560428023338318


Similar Song: Demarrage Hold Up - Kaaris
Cosine Similarity : 0.8514242172241211


Similar Song: Step by Step - Ocean Colour Scene
Cosine Similarity : 0.8487228751182556
---
Song: Hear the Bells - Vanessa Carlton


Similar Songs:


Similar Song: Hear the Bells - Vanessa Carlton
Cosine Similarity : 0.9999998807907104


Similar Song: Land of Youth (Tir Na NÓg) - Moya Brennan
Cosine Similarity : 0.9084211587905884


Similar Song: Amar y Vivir - Claudia Sierra
Cosine Similarity : 0.9006235599517822


Similar Song: Temptation - Sammy Kaye
Cosine Similarity : 0.8936922550201416


Similar Song: Here Comes the Night - Bob Welch
Cosin

## Bag of Words

In [None]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
import re
from operator import itemgetter
from sklearn.metrics import pairwise
# Define dictionary utility function
BAG_SIZE = 100
def get_top_values(d,N = BAG_SIZE):
    return list(sorted(d.items(), key = itemgetter(1), reverse = True)[:N])

# Preprocess songs

# set stop words in english
stop_words = set(stopwords.words('english')) 

# creating  "master" dictionary
word2count = {}

for row in df.index:
    song = df['Lyrics'][row]

    # remove stopwords and clean up each song
    song = song.lower()
    song = re.sub(r'\W', ' ', song)
    song = re.sub(r'\s+', ' ', song)
    song = re.sub(r'\r|\n', ' ', song)

    words = nltk.word_tokenize(song)
    for word in words:

        if word not in stop_words:

            if word not in word2count.keys():
                word2count[word] = 1
            else:
                word2count[word] += 1

word2count_list = get_top_values(word2count)
word2count = dict(word2count_list)
word_ranks = {w2c[0]: r for r, w2c in enumerate(word2count_list)}

# Now pass through again and create vectors

song_counts = np.zeros([len(df.index), BAG_SIZE])

for song_index in df.index:

    song = df['Lyrics'][song_index]

    # remove stopwords and clean up each song
    song = song.lower()
    song = re.sub(r'\W', ' ', song)
    song = re.sub(r'\s+', ' ', song)
    song = re.sub(r'\r|\n', ' ', song)

    words = nltk.word_tokenize(song)
    for word in words:
        if word in word_ranks:
            word_idx = word_ranks[word]
            song_counts[song_index][word_idx] += 1

print(word2count)
print(song_counts)


pairwise_similarities=pairwise.cosine_similarity(song_counts)
pairwise_differences=pairwise.euclidean_distances(song_counts)


def most_similar(doc_id,similarity_matrix,matrix):
    print (f'Song: {df.iloc[doc_id]["Song"]} - {df.iloc[doc_id]["Band"]}')
    print ('Similar Songs:')
    
    if matrix=='Cosine Similarity':
        similar_ix=np.argsort(similarity_matrix[doc_id])[::-1]
    elif matrix=='Euclidean Distance':
        similar_ix=np.argsort(similarity_matrix[doc_id])
    for ix in similar_ix[:5]:
#         if ix==doc_id:
#             continue
        print('\n')
        print (f'Similar Song: {df.iloc[ix]["Song"]} - {df.iloc[ix]["Band"]}')
        print (f'{matrix} : {similarity_matrix[doc_id][ix]}')

for i in range(7):
    most_similar(i,pairwise_similarities,'Cosine Similarity')
    print("---")
    #most_similar(0,pairwise_differences,'Euclidean Distance')


In [None]:
# save matrix as pickle
with open(f'data/lyrec_similarity_mat_{sample_frac}.pkl', 'wb') as f:
    np.save(f, pairwise_similarities)