In [1]:
%%capture
from tqdm.notebook import tqdm

# Register the `progress_apply` / `progress_map` helpers on pandas objects.
# `pandas` is a classmethod, so it is called on the class itself; instantiating
# `tqdm()` first would create an empty progress bar that is never closed.
tqdm.pandas()

In [2]:
import pandas as pd
import numpy as np
import sklearn 
import matplotlib.pyplot as plt
from ftfy import fix_text
import threading
import os

# Input CSV and the fraction of its rows to work with
filename = 'fulldataset.csv'
sample_frac = 0.01

# Read the CSV, draw a reproducible random sample (fixed seed) and renumber
# the sampled rows from 0 so labels and positions coincide.
df = (
    pd.read_csv(filename, quotechar='"')
    .sample(frac=sample_frac, random_state=1)
    .reset_index(drop=True)
)
num_rows = df.shape[0]
print("num rows", num_rows)

num rows 5162


In [3]:
# Cache file for the cleaned sample; the name is keyed on the sampling fraction.
pickle_filename = f"lyrec_df_{sample_frac}.pkl"


def clean(df, row):
    """Repair the text of one row's 'Lyrics', 'Band' and 'Song' fields in place.

    Args:
        df: DataFrame holding the three text columns; modified in place.
        row: Index label of the row to clean.

    Returns:
        True when all three fields were fixed and written back. False when
        `fix_text` raised for any of them; the row is then left untouched
        and the caller is expected to drop it.
    """
    try:
        # Fix every field before writing anything back, so a failing row is
        # never left half-cleaned.
        lyric = fix_text(df.at[row, 'Lyrics'])
        band = fix_text(df.at[row, 'Band'])
        song = fix_text(df.at[row, 'Song'])
    except Exception:
        print("Error cleaning up row:", row,"dropping instead")
        return False
    # `.at` writes straight into `df`. Chained `df[col][row] = value`
    # assignment can end up writing to a temporary copy instead.
    df.at[row, 'Lyrics'] = lyric
    df.at[row, 'Band'] = band
    df.at[row, 'Song'] = song
    return True


if os.path.isfile(pickle_filename):
    print("Found pickle file for current config, using that")
    # NOTE: unpickling can execute arbitrary code; only load cache files
    # that this notebook wrote itself.
    df = pd.read_pickle(pickle_filename)
else:
    print("Cleaning dataset, no cache available")
    # Rows are cleaned in a plain loop. Spawning one thread per row created
    # thousands of threads that all mutated the same DataFrame concurrently.
    failed_rows = []
    for row in tqdm(df.index):
        if not clean(df, row):
            failed_rows.append(row)
    # `DataFrame.drop` returns a new frame, so the result must be kept for
    # the rows to actually disappear.
    df = df.drop(index=failed_rows)

df = df.reset_index(drop=True)
# Keep the row count in sync with the frame: rows may have been dropped
# above, or the cached frame may differ in length from the fresh sample.
num_rows = len(df.index)
df

Cleaning dataset, no cache available


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5162.0), HTML(value='')))

Unnamed: 0,Band,Lyrics,Song
0,Reba McEntire,The greatest man I never knew lived just down ...,The Greatest Man I Never Knew
1,Vanessa Carlton,Stealing glances through the key hole\nIn a br...,Hear the Bells
2,Snoop Dogg,"What it do,\nComing at you live, it's your boy...",Pronto
3,Snoop Dogg,Another smoke session up in this motherfucker\...,Buss 'N Rocks
4,The Shirelles,"Everybody, twist\n\nThey're twisting in Clevel...",Twisting USA
...,...,...,...
5157,Asia,Whenever your shadow falls on stony ground\nAn...,Wherever You Are
5158,John Prine,I got kicked off Noah's Ark\nI turn my cheek t...,Sweet Revenge
5159,Travis,In the church one day you will get hurt\nIn th...,Some Sad Song
5160,4 Strings,The waves are rolling in\nI can smell the sea\...,All Around the World


In [4]:
# Write the current dataframe to the pickle cache file
df.to_pickle(path=pickle_filename)

## Attempt at BERT pairwise similarities

In [6]:
from sentence_transformers import SentenceTransformer

sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')

# Embeddings cache, keyed on the sampling fraction. Despite the .pkl suffix
# the file is written with np.save, so it is read back with np.load.
embeddings_filename = f'lyrec_embeddings_{sample_frac}.pkl'
document_embeddings = None
if os.path.isfile(embeddings_filename):
    print("Using existing pickle")
    # Open read-only: opening with 'wb' here would truncate the cache
    # before it could be loaded.
    with open(embeddings_filename, 'rb') as f:
        document_embeddings = np.load(f)
else:
    # Hand the encoder a plain list of strings rather than a pandas Series.
    document_embeddings = sbert_model.encode(df['Lyrics'].tolist(), show_progress_bar=True, device="cpu")
# NOTE(review): the original remark here was garbled; it presumably read
# "having difficulty using/installing this library" - confirm with the author.

HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=162.0), HTML(value='')))




In [9]:
# Store the embeddings on disk with np.save (the file keeps its .pkl suffix)
embeddings_path = f'lyrec_embeddings_{sample_frac}.pkl'
with open(embeddings_path, mode='wb') as out_file:
    np.save(out_file, document_embeddings)

In [7]:
from sklearn.metrics import pairwise
# Report the dimensions of the embedding matrix
print(document_embeddings.shape)

# Compare every embedding with every other one under both metrics
pairwise_differences = pairwise.euclidean_distances(document_embeddings)
pairwise_similarities = pairwise.cosine_similarity(document_embeddings)

from sklearn.feature_extraction.text import TfidfVectorizer

# Learn the TF-IDF vocabulary from the lyrics and vectorise them in one call
tfidfvectoriser = TfidfVectorizer()
tfidf_vectors = tfidfvectoriser.fit_transform(df.Lyrics)

def most_similar(doc_id, similarity_matrix, matrix, top_n=5):
    """Print the songs ranked closest to one song by a pairwise matrix.

    Song and band names are looked up by position in the module-level `df`.
    The query song itself is not skipped, so it takes part in the ranking.

    Args:
        doc_id: Position of the query song.
        similarity_matrix: Pairwise matrix; row `doc_id` holds the query
            song's score against every song.
        matrix: 'Cosine Similarity' (highest score first) or
            'Euclidean Distance' (lowest score first). Also printed as the
            label next to each score.
        top_n: Number of ranked songs to print. Defaults to 5.

    Raises:
        ValueError: If `matrix` is not one of the two supported names.
    """
    scores = similarity_matrix[doc_id]
    # Validate the metric before printing anything; an unknown name used to
    # leave `similar_ix` unbound and fail later with UnboundLocalError.
    if matrix=='Cosine Similarity':
        similar_ix=np.argsort(scores)[::-1]
    elif matrix=='Euclidean Distance':
        similar_ix=np.argsort(scores)
    else:
        raise ValueError(
            f"Unknown matrix type {matrix!r}; expected 'Cosine Similarity' or 'Euclidean Distance'"
        )

    print (f'Song: {df.iloc[doc_id]["Song"]} - {df.iloc[doc_id]["Band"]}')
    print ('\n')
    print ('Similar Songs:')
    for ix in similar_ix[:top_n]:
        print('\n')
        print (f'Similar Song: {df.iloc[ix]["Song"]} - {df.iloc[ix]["Band"]}')
        print (f'{matrix} : {scores[ix]}')

# Rank the neighbours of the first song under each metric
most_similar(doc_id=0, similarity_matrix=pairwise_similarities, matrix='Cosine Similarity')
print("---")
most_similar(doc_id=0, similarity_matrix=pairwise_differences, matrix='Euclidean Distance')

(5162, 768)
Song: The Greatest Man I Never Knew - Reba McEntire


Similar Songs:


Similar Song: The Greatest Man I Never Knew - Reba McEntire
Cosine Similarity : 1.0


Similar Song: Bonny - Prefab Sprout
Cosine Similarity : 0.8753452301025391


Similar Song: Chatting Today - Thin Lizzy
Cosine Similarity : 0.8715394735336304


Similar Song: A Most Peculiar Man [Live] - Simon & Garfunkel
Cosine Similarity : 0.8692291975021362


Similar Song: Almost Said Goodbye [DVD] - Peter Frampton
Cosine Similarity : 0.8509968519210815
---
Song: The Greatest Man I Never Knew - Reba McEntire


Similar Songs:


Similar Song: The Greatest Man I Never Knew - Reba McEntire
Euclidean Distance : 0.0


Similar Song: Bonny - Prefab Sprout
Euclidean Distance : 7.809667110443115


Similar Song: Chatting Today - Thin Lizzy
Euclidean Distance : 8.06407642364502


Similar Song: A Most Peculiar Man [Live] - Simon & Garfunkel
Euclidean Distance : 8.12240982055664


Similar Song: Almost Said Goodbye [DVD] - Peter Fra

## Bag of Words

In [6]:
import nltk
from nltk.corpus import stopwords
# Fetch the NLTK resources this cell relies on: the stop-word lists read via
# `stopwords.words('english')` and the 'punkt' tokenizer data
# (presumably what `nltk.word_tokenize` needs - confirm for the installed NLTK version).
nltk.download('stopwords')
nltk.download('punkt')
import re
from operator import itemgetter
from sklearn.metrics import pairwise
# Dictionary helper: keep only the highest-valued entries
BAG_SIZE = 100
def get_top_values(d,N = BAG_SIZE):
    """Return up to N (key, value) pairs of `d`, largest value first.

    Entries with equal values keep the relative order they have in `d`.
    """
    ranked = sorted(d.items(), key=lambda pair: pair[1], reverse=True)
    return ranked[:N]

# Preprocess songs

# set stop words in english
stop_words = set(stopwords.words('english'))


def _tokenize_lyrics(song):
    """Lower-case a lyric, blank out non-word characters and split it into tokens."""
    song = song.lower()
    # \W also matches \r and \n, so no separate newline pass is needed.
    song = re.sub(r'\W', ' ', song)
    song = re.sub(r'\s+', ' ', song)
    return nltk.word_tokenize(song)


# creating "master" dictionary: corpus-wide count of every non-stop-word
word2count = {}

for song in df['Lyrics']:
    for word in _tokenize_lyrics(song):
        if word not in stop_words:
            word2count[word] = word2count.get(word, 0) + 1

# Keep only the most frequent words; a word's rank doubles as its column
# index in the count matrix below.
word2count_list = get_top_values(word2count)
word2count = dict(word2count_list)
word_ranks = {word: rank for rank, (word, _count) in enumerate(word2count_list)}

# Now pass through again and create vectors

# One row per song *position*. The matrix is sized from the frame itself and
# filled by position, so it stays aligned with `df.iloc` lookups even when
# the index labels or a previously stored row count do not match the frame.
song_counts = np.zeros([len(df.index), BAG_SIZE])

for song_pos, song in enumerate(df['Lyrics']):
    for word in _tokenize_lyrics(song):
        # Stop words never made it into `word_ranks`, so they are skipped
        # here without a separate check.
        word_idx = word_ranks.get(word)
        if word_idx is not None:
            song_counts[song_pos][word_idx] += 1

print(word2count)
print(song_counts)


# Cosine similarity between every pair of bag-of-words count vectors
pairwise_similarities = pairwise.cosine_similarity(X=song_counts)
# pairwise_differences=pairwise.euclidean_distances(song_counts)


def most_similar(doc_id, similarity_matrix, matrix, top_n=5):
    """Print the songs ranked closest to one song by a pairwise matrix.

    Song and band names are looked up by position in the module-level `df`.
    The query song itself is not skipped, so it takes part in the ranking.

    Args:
        doc_id: Position of the query song.
        similarity_matrix: Pairwise matrix; row `doc_id` holds the query
            song's score against every song.
        matrix: 'Cosine Similarity' (highest score first) or
            'Euclidean Distance' (lowest score first). Also printed as the
            label next to each score.
        top_n: Number of ranked songs to print. Defaults to 5.

    Raises:
        ValueError: If `matrix` is not one of the two supported names.
    """
    scores = similarity_matrix[doc_id]
    # Validate the metric before printing anything; an unknown name used to
    # leave `similar_ix` unbound and fail later with UnboundLocalError.
    if matrix=='Cosine Similarity':
        similar_ix=np.argsort(scores)[::-1]
    elif matrix=='Euclidean Distance':
        similar_ix=np.argsort(scores)
    else:
        raise ValueError(
            f"Unknown matrix type {matrix!r}; expected 'Cosine Similarity' or 'Euclidean Distance'"
        )

    print (f'Song: {df.iloc[doc_id]["Song"]} - {df.iloc[doc_id]["Band"]}')
    print ('Similar Songs:')
    for ix in similar_ix[:top_n]:
        print('\n')
        print (f'Similar Song: {df.iloc[ix]["Song"]} - {df.iloc[ix]["Band"]}')
        print (f'{matrix} : {scores[ix]}')
            
# Rank the neighbours of the first song by cosine similarity
most_similar(doc_id=0, similarity_matrix=pairwise_similarities, matrix='Cosine Similarity')
print("---")
# most_similar(0,pairwise_differences,'Euclidean Distance')


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/george/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/george/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


{'love': 73729, 'know': 58822, 'like': 55336, 'got': 51522, 'oh': 51155, 'na': 40133, 'get': 38660, 'one': 36915, 'go': 36237, 'que': 35033, 'time': 34482, 'la': 33735, 'let': 33129, 'see': 32970, 'baby': 32736, 'never': 31985, 'want': 30519, 'come': 29883, 'de': 29258, 'yeah': 28277, 'way': 26474, 'say': 26126, 'cause': 25475, 'make': 25312, 'back': 23866, 'take': 23324, 'gon': 22378, 'heart': 22250, 'life': 21894, 'away': 20811, 'day': 20372, 'feel': 20075, 'night': 19821, 'right': 19090, 'man': 19002, 'could': 18932, 'tell': 18332, 'need': 17837, 'world': 16843, 'give': 16365, 'good': 16024, 'girl': 16003, 'little': 15752, 'think': 14813, 'well': 14719, 'still': 14128, 'un': 13877, 'keep': 13800, 'long': 13786, 'en': 13736, 'ya': 13674, 'tu': 13622, 'eyes': 13622, 'te': 13539, 'around': 13510, 'every': 13347, 'look': 13235, 'wan': 13120, 'said': 12964, 'el': 12636, 'would': 12469, 'us': 12271, 'find': 12222, 'home': 12097, 'ever': 11897, 'mind': 11876, 'yo': 11797, 'always': 11566, 

In [None]:
# save matrix as pickle
# (written with np.save; the file keeps its .pkl suffix)
similarity_path = f'lyrec_similarity_mat_{sample_frac}.pkl'
with open(similarity_path, mode='wb') as out_file:
    np.save(out_file, pairwise_similarities)
# with open(f'lyrec_differences_mat_{sample_frac}.pkl', 'wb') as f:
#     np.save(f, pairwise_similarities)