In [1]:
import pandas as pd
import numpy as np
import sklearn 
import matplotlib.pyplot as plt

In [2]:
filename = 'first1000.csv'
df = pd.read_csv(filename,quotechar="\"")
df


Unnamed: 0,Band,Lyrics,Song
0,Elijah Blake,"No, no\r\nI ain't ever trapped out the bando\r...",Everyday
1,Elijah Blake,"The drinks go down and smoke goes up, I feel m...",Live Till We Die
2,Elijah Blake,She don't live on planet Earth no more\r\nShe ...,The Otherside
3,Elijah Blake,"Trippin' off that Grigio, mobbin', lights low\...",Pinot
4,Elijah Blake,"I see a midnight panther, so gallant and so br...",Shadows & Diamonds
...,...,...,...
994,Judy Garland,"Day in, day out\r\nThe same old hoodoo follows...","Day In, Day Out"
995,Judy Garland,When a dove is in love\r\nWith a doll of a dov...,I Could Go on Singin' ('Til the Cows Come Home)
996,Judy Garland,You've got me where you want me\r\nAnd I hope ...,You Got Me Where You Want Me
997,Judy Garland,"If you're ever in a jam, here I am \r\nIf you'...",Friendship (Medley)


## Attempt at BERT pairwise similarities

In [4]:
from sentence_transformers import SentenceTransformer

sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')

document_embeddings = sbert_model.encode(df['Lyrics'])
# document_embeddingsing difficulty using/installing this library

In [6]:
from sklearn.metrics import pairwise
print(document_embeddings.shape)
pairwise_similarities=pairwise.cosine_similarity(document_embeddings)
pairwise_differences=pairwise.euclidean_distances(document_embeddings)

from sklearn.feature_extraction.text import TfidfVectorizer 

tfidfvectoriser=TfidfVectorizer()
tfidfvectoriser.fit(df.Lyrics)
tfidf_vectors=tfidfvectoriser.transform(df.Lyrics)

def most_similar(doc_id,similarity_matrix,matrix):
    print (f'Song: {df.iloc[doc_id]["Song"]} - {df.iloc[doc_id]["Band"]}')
    print ('\n')
    print ('Similar Songs:')
    if matrix=='Cosine Similarity':
        similar_ix=np.argsort(similarity_matrix[doc_id])[::-1]
    elif matrix=='Euclidean Distance':
        similar_ix=np.argsort(similarity_matrix[doc_id])
    for ix in similar_ix[:5]:
        if ix==doc_id:
            continue
        print('\n')
        print (f'Similar Song: {df.iloc[ix]["Song"]} - {df.iloc[ix]["Band"]}')
        print (f'{matrix} : {similarity_matrix[doc_id][ix]}')

most_similar(0,pairwise_similarities,'Cosine Similarity')
print("---")
most_similar(0,pairwise_differences,'Euclidean Distance')

(999, 768)
Song: Everyday - Elijah Blake


Similar Songs:


Similar Song: Fascinating Rhythm - Ella Fitzgerald
Cosine Similarity : 0.8106637001037598


Similar Song: Fascinating Rhythm [From the Film:  Girl Crazy] - Judy Garland
Cosine Similarity : 0.7968353033065796


Similar Song: Fascinating Rhythm [Mono Mix][*] - Ella Fitzgerald
Cosine Similarity : 0.7901540398597717


Similar Song: Everybody Knows - Elizabeth & the Catapult
Cosine Similarity : 0.7596976161003113
---
Song: Everyday - Elijah Blake


Similar Songs:


Similar Song: Fascinating Rhythm - Ella Fitzgerald
Euclidean Distance : 9.550185203552246


Similar Song: Fascinating Rhythm [From the Film:  Girl Crazy] - Judy Garland
Euclidean Distance : 9.717370986938477


Similar Song: Fascinating Rhythm [Mono Mix][*] - Ella Fitzgerald
Euclidean Distance : 10.056916236877441


Similar Song: Together - Ella Eyre
Euclidean Distance : 10.776580810546875


## Bag of Words

Just as a preface, this is broken

In [7]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
import re
from operator import itemgetter
from sklearn.metrics import pairwise
# Define dictionary utility function
BAG_SIZE = 100
def get_top_values(d,N = BAG_SIZE):
    return dict(sorted(d.items(), key = itemgetter(1), reverse = True)[:N])

# Preprocess songs

# set stop words in english
stop_words = set(stopwords.words('english')) 

# creating  "master" dictionary
word2count = {}

for row in df.index:
    song = df['Lyrics'][row]

    # remove stopwords and clean up each song
    song = song.lower()
    song = re.sub(r'\W', ' ', song)
    song = re.sub(r'\s+', ' ', song)

    words = nltk.word_tokenize(song)
    for word in words:

        if word not in stop_words:

            if word not in word2count.keys():
                word2count[word] = 1
            else:
                word2count[word] += 1

word2count = get_top_values(word2count)
word_ranks = {k: v for v, k in enumerate(list(word2count.keys()))}

# Now pass through again and create vectors

song_counts = np.zeros([1000, BAG_SIZE])

for song_index in df.index:

    song = df['Lyrics'][song_index]

    # remove stopwords and clean up each song
    song = song.lower()
    song = re.sub(r'\W', ' ', song)
    song = re.sub(r'\s+', ' ', song)

    for word in words:

        if word in word_ranks:

            word_idx = word_ranks[word]
            song_counts[song_idx][word_idx] += 1

print(word2count)
print(song_counts)


pairwise_similarities=pairwise.cosine_similarity(song_counts)
pairwise_differences=pairwise.euclidean_distances(song_counts)


def most_similar(doc_id,similarity_matrix,matrix):
    print (f'Song: {df.iloc[doc_id]["Song"]} - {df.iloc[doc_id]["Band"]}')
    print ('Similar Songs:')
    
    if matrix=='Cosine Similarity':
        similar_ix=np.argsort(similarity_matrix[doc_id])[::-1]
    elif matrix=='Euclidean Distance':
        similar_ix=np.argsort(similarity_matrix[doc_id])
    for ix in similar_ix[:5]:
        if ix==doc_id:
            continue
        print('\n')
        print (f'Similar Song: {df.iloc[ix]["Song"]} - {df.iloc[ix]["Band"]}')
        print (f'{matrix} : {similarity_matrix[doc_id][ix]}')
            
most_similar(0,pairwise_similarities,'Cosine Similarity')
print("---")
most_similar(0,pairwise_differences,'Euclidean Distance')


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/george/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /Users/george/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


NameError: name 'song_idx' is not defined

[    0.     0.     0.     0.     0.     0.     0.     0.  2997.   999.
     0.     0.     0.  6993.     0.     0.   999.     0.     0.  2997.
     0.     0.     0.     0.     0.     0.     0.     0.     0.     0.
     0.     0.   999.  1998.     0.     0.     0.   999.     0.     0.
     0.     0.     0.     0.     0.     0.   999.     0.     0.     0.
     0.     0.     0.     0.     0.     0.     0.     0.     0.     0.
     0.     0.     0.     0.     0.  4995.     0.   999.     0.     0.
     0.     0.     0.     0.     0.     0.   999.     0.     0.     0.
     0.     0.     0.     0.     0.     0.  1998.  1998.   999.     0.
     0. 14985.  1998.     0.     0.     0.     0.     0.     0.     0.
     0.     0.     0.   999.     0.     0.     0.     0.     0.     0.
     0.     0.     0.     0.     0.     0.     0.     0.     0.     0.
     0.     0.     0.     0.     0.     0.     0.     0.     0.     0.
     0.     0.   999.     0.     0.     0.     0.     0.     0.     0.
     0