In [1]:
import os
import re
import time

import pandas as pd
import numpy as np

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer, PorterStemmer

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
porter = PorterStemmer()

from datasketch import MinHash, MinHashLSHForest

from tqdm import tqdm
from multiprocessing import Pool
from gensim.models.word2vec import Word2Vec
import gensim.downloader as api
from sentence_transformers import SentenceTransformer, util
from textblob import TextBlob

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/sreekiranv/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/sreekiranv/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/sreekiranv/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
lyrics_df = pd.read_csv("/home/sreekiranv/Highlight_PDF/lyrics/lyrics_embeds.csv")
user_df = pd.read_csv("/home/sreekiranv/Highlight_PDF/lyrics/user_playlist_data.csv")
print("Shape of user playlist data", user_df.shape)
print("Shape of lyrics data", lyrics_df.shape)

Shape of user playlist data (4441715, 26)
Shape of lyrics data (107554, 7)


In [None]:
class PreProcessing():
    
    def __init__(self,text):
        
        self.text = text

    def remove_urls(self):
        self.new_text = ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"," ",self.text).split())
        return self.new_text

    def text_lowercase(self):
        """
        Lower Text
        """
        self.new_text = self.text.lower()
        return self.new_text

    def remove_numbers(self):
        """
        Remove numbers from corpus
        """

        self.new_text = re.sub(r'\d+', '', self.new_text)
        return self.new_text

    def remove_punctuation(self):
        """
        Remove punctuations from corpus
        """

        self.new_text = re.sub(r'[^\w\s]','',self.new_text)
        return self.new_text

    def tokenize(self):
        """
        Tokenize phrases and tokens
        """

        self.new_text = word_tokenize(self.new_text)
        return self.new_text

    def remove_stopwords(self):
        """
        Remove stopwords from corpus
        """

        self.new_text = [i for i in self.new_text if not i in stop_words]
        return self.new_text

    def lemmatize(self):
        """
        Extract root words - Lemmatization
        """

        self.new_text = [lemmatizer.lemmatize(token) for token in self.new_text]
        return self.new_text

    def stemming(text):
        """
        Extract root words - Stemming
        """
        self.new_text = [porter.stem(token) for token in self.new_text]
        return self.new_text
        
    def remove_letters(self,size):
        """
        Remove words less than size n
        """

        self.new_text = [i for i in self.new_text if len(i)> size]
        return self.new_text
    
def create_tokens(phrase,merge=False,k=2):
    """
    Apply pre-processing and tokenize words
    """

    preprocess = PreProcessing(phrase)
    
    preprocess.text_lowercase()
    preprocess.remove_punctuation()
    preprocess.remove_numbers()
    preprocess.tokenize()
    preprocess.remove_stopwords()
    preprocess.remove_letters(k)
    processed_tokens = preprocess.lemmatize()

    if merge:
        processed_tokens = " ".join(processed_tokens)
    
    return processed_tokens

In [3]:
# Creating Word embeddings, sentiment and polarity scores

l_final = lyrics_df[lyrics_df['lyrics'].notna()]
l_final = l_final[['id', 'artists', 'name', 'lyrics']]



model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

lyrics = l_final['lyrics'].values.tolist()
embeddings = model.encode(lyrics)
vecs = [i.tolist() for i in embeddings]
l_final['vectors'] = vecs



def getSubjectivity(review):
    return TextBlob(review).sentiment.subjectivity
def getPolarity(review):
    return TextBlob(review).sentiment.polarity

# function to analyze the reviews
def analysis(score):
    if score < 0:
        return 'Negative'
    elif score == 0:
        return 'Neutral'
    else:
        return 'Positive'

l_final['Sentiment'] = l_final['lyrics'].apply(getSubjectivity)
l_final['Polarity'] = l_final['lyrics'].apply(getPolarity)


def get_embedding(text):
    embeddings = model.encode(text)
    return embeddings

embeddings = []
with Pool(10) as spool:
    for d in tqdm(spool.imap_unordered(get_embedding, lyrics), total=len(lyrics)):
        embeddings.append(d)
        pass
spool.close()
spool.join()

"\nl_final = lyrics_df[lyrics_df['lyrics'].notna()]\nl_final = l_final[['id', 'artists', 'name', 'lyrics']]\n\n\n\nmodel = SentenceTransformer('paraphrase-MiniLM-L6-v2')\n\nlyrics = l_final['lyrics'].values.tolist()\nembeddings = model.encode(lyrics)\nvecs = [i.tolist() for i in embeddings]\nl_final['vectors'] = vecs\n\n\n\ndef getSubjectivity(review):\n    return TextBlob(review).sentiment.subjectivity\ndef getPolarity(review):\n    return TextBlob(review).sentiment.polarity\n\n# function to analyze the reviews\ndef analysis(score):\n    if score < 0:\n        return 'Negative'\n    elif score == 0:\n        return 'Neutral'\n    else:\n        return 'Positive'\n\nl_final['Sentiment'] = l_final['lyrics'].apply(getSubjectivity)\nl_final['Polarity'] = l_final['lyrics'].apply(getPolarity)\n\n\ndef get_embedding(text):\n    embeddings = model.encode(text)\n    return embeddings\n\nembeddings = []\nwith Pool(10) as spool:\n    for d in tqdm(spool.imap_unordered(get_embedding, lyrics), tot

In [24]:
rx = re.compile(r'(-?\d+\.?\d*|abc)', re.VERBOSE)
embeddings = [[float(i) for i in rx.findall(j)] for j in lyrics_df['vectors']]
x = [len(i) for i in embeddings]
lyrics_df['vec_len'] = x
lyrics_df = lyrics_df[lyrics_df['vec_len']==384].reset_index(drop=True)
embeddings = [[float(i) for i in rx.findall(j)] for j in lyrics_df['vectors']]

In [25]:
from scipy.sparse import csr_matrix
embed_mat = csr_matrix(embeddings)

x = user_df.groupby('user_id')['artistname'].count().sort_values().reset_index(name="count")
ids = user_df[user_df['user_id'].isin(['24dd04d5cb76d236ce4062d7f5ff82e3',"84b0cd6e3fe13609af340bb7341d3487","f38586cc60d63ae2687e5262d540b995"])]
user_songs = ids[['user_id',"trackname"]].drop_duplicates().reset_index(drop=True)
user_df[user_df['user_id']=="84b0cd6e3fe13609af340bb7341d3487"]

In [61]:
user_df[user_df['user_id']=="84b0cd6e3fe13609af340bb7341d3487"]

Unnamed: 0,user_id,artistname,trackname,playlistname,acousticness,artists,danceability,duration_ms,energy,explicit,...,name,popularity,release_date,speechiness,tempo,valence,year,gender,age,country
935451,84b0cd6e3fe13609af340bb7341d3487,Cesare Cremonini,Hello!,Cremonini,0.735,Andrew Rannells Josh Gad Rory O Malley Kevin D...,0.658,172027,0.506,0,...,Hello!,55,2011-05-17,0.246,130.491,0.654,2011,M,13,UK
935452,84b0cd6e3fe13609af340bb7341d3487,Cesare Cremonini,I Love You,Cremonini,0.539,Barney,0.615,62533,0.234,0,...,I Love You,44,2000-01-01,0.0447,96.972,0.167,2000,M,13,UK
935453,84b0cd6e3fe13609af340bb7341d3487,Francesco Guccini,Farewell,Guccini Studio,0.704,Bernard Herrmann,0.15,199039,0.094,0,...,Farewell,4,1947,0.0437,101.273,0.0353,1947,M,13,UK
935454,84b0cd6e3fe13609af340bb7341d3487,Radiohead,Creep,Musica del (mio) momento,0.0193,TLC,0.811,268533,0.458,0,...,Creep,66,1994-11-15,0.0402,92.94,0.797,1994,M,13,UK
935455,84b0cd6e3fe13609af340bb7341d3487,Francesco Guccini,Farewell,Musica del (mio) momento,0.704,Bernard Herrmann,0.15,199039,0.094,0,...,Farewell,4,1947,0.0437,101.273,0.0353,1947,M,13,UK
935456,84b0cd6e3fe13609af340bb7341d3487,Gianna Nannini,Ti Voglio Tanto Bene,Musica del (mio) momento,0.982,Beniamino Gigli,0.312,173800,0.251,0,...,Ti Voglio Tanto Bene,0,1926-01-01,0.0303,98.815,0.174,1926,M,13,UK
935457,84b0cd6e3fe13609af340bb7341d3487,P!nk,Try,Musica del (mio) momento,0.807,Colbie Caillat,0.603,224573,0.376,0,...,Try,58,2014-09-30,0.0288,139.995,0.538,2014,M,13,UK


In [29]:
user_songs

Unnamed: 0,user_id,trackname
0,84b0cd6e3fe13609af340bb7341d3487,Hello!
1,84b0cd6e3fe13609af340bb7341d3487,I Love You
2,84b0cd6e3fe13609af340bb7341d3487,Farewell
3,84b0cd6e3fe13609af340bb7341d3487,Creep
4,84b0cd6e3fe13609af340bb7341d3487,Ti Voglio Tanto Bene
5,84b0cd6e3fe13609af340bb7341d3487,Try
6,f38586cc60d63ae2687e5262d540b995,Blame It On Me
7,f38586cc60d63ae2687e5262d540b995,Gimme Some Lovin'
8,f38586cc60d63ae2687e5262d540b995,Love
9,f38586cc60d63ae2687e5262d540b995,Sambalero


In [30]:
lyrics_df[lyrics_df['name']=="Hello!"].index[0]

57490

In [31]:
user_songs['trackname'].unique()

array(['Hello!', 'I Love You', 'Farewell', 'Creep',
       'Ti Voglio Tanto Bene', 'Try', 'Blame It On Me',
       "Gimme Some Lovin'", 'Love', 'Sambalero', 'Sun Is Shining',
       'Caring Is Creepy', 'Heartattack And Vine', 'Heartbeats',
       'Somebody That I Used To Know'], dtype=object)

In [11]:
from sklearn.metrics.pairwise import cosine_similarity
cosine_similarity(embeddings[0:2])

array([[1.        , 0.20268135],
       [0.20268135, 1.        ]])

In [100]:
from scipy import spatial

data_list = []

for song in user_songs['trackname'].unique():
    try:
        idx = lyrics_df[lyrics_df['name']==song].index[0]
        song_embed = embeddings[idx]

        scores = []
        for i in embeddings:
            try:
                scores.append(1 - spatial.distance.cosine(song_embed, i))
            except:
                scores.append(0)
        del scores[idx]
        res_idx = sorted(range(len(scores)), key = lambda sub: scores[sub])[-5:]
        sub_df = lyrics_df.iloc[res_idx]
        sub_df['similarity_score'] = pd.Series(scores)[res_idx].values
        sub_df['song'] = song
    except:
        sub_df = pd.DataFrame({"song":[song]*5})
    data_list.append(sub_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [102]:
output_df = pd.concat(data_list)
output_df = output_df[["song","similarity_score","Polarity","Sentiment","name"]]
reco_df = pd.merge(user_songs,output_df, left_on = "trackname", right_on = "song", how = "left")
reco_df = reco_df.sort_values(by = ["user_id","similarity_score","Polarity","Sentiment"], ascending = [True, False, False, False])
user_scores = pd.merge(user_songs,lyrics_df[["name","Sentiment","Polarity"]], left_on = "trackname", right_on = "name", how = "left")
user_sub = user_scores.groupby(['user_id']).agg({"Sentiment":"mean","Polarity":"mean"}).reset_index()

In [129]:
user_sub

Unnamed: 0,user_id,Sentiment,Polarity
0,24dd04d5cb76d236ce4062d7f5ff82e3,0.487503,0.046177
1,84b0cd6e3fe13609af340bb7341d3487,0.522872,0.137443
2,f38586cc60d63ae2687e5262d540b995,0.469113,0.054016


In [155]:
reco_df

Unnamed: 0,user_id,user_playlist,song,similarity_score,Polarity,Sentiment,recommended_song
70,24dd04d5cb76d236ce4062d7f5ff82e3,Somebody That I Used To Know,Somebody That I Used To Know,1.0,0.169702,0.448780,Sweet Leaf - 2014 Remaster
74,24dd04d5cb76d236ce4062d7f5ff82e3,Somebody That I Used To Know,Somebody That I Used To Know,1.0,0.120519,0.442287,Pride and the Badge
73,24dd04d5cb76d236ce4062d7f5ff82e3,Somebody That I Used To Know,Somebody That I Used To Know,1.0,-0.240720,0.594444,Blue on Black
71,24dd04d5cb76d236ce4062d7f5ff82e3,Somebody That I Used To Know,Somebody That I Used To Know,1.0,-0.283333,0.595833,California Sun
72,24dd04d5cb76d236ce4062d7f5ff82e3,Somebody That I Used To Know,Somebody That I Used To Know,1.0,-0.283974,0.521795,Sieve-Fisted Find
...,...,...,...,...,...,...,...
45,f38586cc60d63ae2687e5262d540b995,Sambalero,Sambalero,,,,
46,f38586cc60d63ae2687e5262d540b995,Sambalero,Sambalero,,,,
47,f38586cc60d63ae2687e5262d540b995,Sambalero,Sambalero,,,,
48,f38586cc60d63ae2687e5262d540b995,Sambalero,Sambalero,,,,


In [157]:
df1 = reco_df[(reco_df['user_id']== "84b0cd6e3fe13609af340bb7341d3487")]
df2 = df1[(df1['Sentiment'] > 0.522872)]
x1 = df2.iloc[:5]
x1['Rank'] = [1,2,3,4,5]
x1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


Unnamed: 0,user_id,user_playlist,song,similarity_score,Polarity,Sentiment,recommended_song,Rank
9,84b0cd6e3fe13609af340bb7341d3487,I Love You,I Love You,0.693975,0.733333,0.866667,The Christmas Song,1
26,84b0cd6e3fe13609af340bb7341d3487,Try,Try,0.683918,0.25182,0.609674,Heyma,2
8,84b0cd6e3fe13609af340bb7341d3487,I Love You,I Love You,0.682793,0.266176,0.535294,Smiling Faces Sometimes,3
7,84b0cd6e3fe13609af340bb7341d3487,I Love You,I Love You,0.674626,-0.013917,0.675482,Detroit Rock City,4
3,84b0cd6e3fe13609af340bb7341d3487,Hello!,Hello!,0.513266,0.104246,0.561294,This Year,5


In [159]:
df1 = reco_df[(reco_df['user_id']== "24dd04d5cb76d236ce4062d7f5ff82e3")]
df2 = df1[(df1['Sentiment'] > 0.487503)]
x2 = df2.iloc[:5]
x2['Rank'] = [1,2,3,4,5]
x2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


Unnamed: 0,user_id,user_playlist,song,similarity_score,Polarity,Sentiment,recommended_song,Rank
73,24dd04d5cb76d236ce4062d7f5ff82e3,Somebody That I Used To Know,Somebody That I Used To Know,1.0,-0.24072,0.594444,Blue on Black,1
71,24dd04d5cb76d236ce4062d7f5ff82e3,Somebody That I Used To Know,Somebody That I Used To Know,1.0,-0.283333,0.595833,California Sun,2
72,24dd04d5cb76d236ce4062d7f5ff82e3,Somebody That I Used To Know,Somebody That I Used To Know,1.0,-0.283974,0.521795,Sieve-Fisted Find,3
66,24dd04d5cb76d236ce4062d7f5ff82e3,Heartbeats,Heartbeats,0.721598,0.225,0.64375,Separate Ways (Worlds Apart),4
65,24dd04d5cb76d236ce4062d7f5ff82e3,Heartbeats,Heartbeats,0.721122,0.156667,0.626667,Spend The Night - Live at Continental NYC 2003,5


In [160]:
df1 = reco_df[(reco_df['user_id']== "f38586cc60d63ae2687e5262d540b995")]
df2 = df1[(df1['Sentiment'] > 0.469113)]
x3 = df2.iloc[:5]
x3['Rank'] = [1,2,3,4,5]
x3

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


Unnamed: 0,user_id,user_playlist,song,similarity_score,Polarity,Sentiment,recommended_song,Rank
44,f38586cc60d63ae2687e5262d540b995,Love,Love,1.0,0.120132,0.478683,Have You Ever Seen The Rain,1
54,f38586cc60d63ae2687e5262d540b995,Sun Is Shining,Sun Is Shining,1.0,0.054491,0.591456,Elegy - 2004 Remaster,2
53,f38586cc60d63ae2687e5262d540b995,Sun Is Shining,Sun Is Shining,0.828182,0.001981,0.535125,Me Volví A Acordar De Ti,3
50,f38586cc60d63ae2687e5262d540b995,Sun Is Shining,Sun Is Shining,0.813455,0.094444,0.633333,Didn't I Get This Last Year?,4
32,f38586cc60d63ae2687e5262d540b995,Blame It On Me,Blame It On Me,0.769986,-0.023438,0.5,all the kids are depressed,5


### LSH

In [5]:
lyrics = lyrics_df['lyrics'].tolist()
filtered_tokens = []
with Pool(40) as spool:
    for d in tqdm(spool.imap_unordered(create_tokens, lyrics), total=len(lyrics)):
        filtered_tokens.append(d)
        pass
spool.close()
spool.join()

100%|██████████| 107554/107554 [02:34<00:00, 698.09it/s] 


In [6]:
lyrics_df['text'] = filtered_tokens

In [7]:
lyrics_df.head()

Unnamed: 0,id,artists,name,lyrics,vectors,Sentiment,Polarity,text
0,0BEO6nHi1rmTOPiEZvCIDW,Ka Koula,Ta oula sou,[Verse 1] I put a spell on you Because you're ...,"[0.17677061259746552, 0.14915935695171356, 0.1...",0.524921,0.326032,"[verse, put, spell, youre, mine, stop, thing, ..."
1,0DH1IROKoPK5XTglUt9Pq0,Justrock,Schumacher,"Stretch and Bobbito ft. The Bad Seed, Icon, Me...","[-0.0025917813181877136, -0.08715461939573288,...",0.563669,-0.16928,"[produced, pete, rock, chorus, pete, rock, na,..."
2,0HVjPaxbyfFcg8Rh0plyo5,Takis Nikolaou,Mparmpaouzos,"[00:00:00] Oh shit, I guess we're starting The...","[0.09928365796804428, -0.2250249683856964, 0.0...",0.515986,0.087665,"[read, add, change, pagehighest, upvotes, buck..."
3,0OM9aSti0UOwN9yuz1m85y,Vas lis Bes ris Tourkovas lis,Sirtó prevezániko,"(Chacalcolik - Round 01) Chacalcolik, aka l'an...","[-0.21848070621490479, 0.3851667046546936, -0....",0.445756,-0.000977,"[round, loaded, lux, power, people, back, main..."
4,0UYplqEm5qa4hkEo64C0TW,Athanas a Alexandropo lou,"I photoúla, beráti","[Paroles de ""Pinocchio"" ft. Gato & Damso] [In...","[-0.26637059450149536, 0.2075779139995575, -0....",0.291071,-0.185714,"[last, updated, mstoctoberoctober, jesse, powe..."


In [8]:
def get_forest(data, perms):
    start_time = time.time()
    
    minhash = []
    
    for text in data['text']:
        tokens = preprocess(text)
        m = MinHash(num_perm=perms)
        for s in tokens:
            m.update(s.encode('utf8'))
        minhash.append(m)
        
    forest = MinHashLSHForest(num_perm=perms)
    
    for i,m in enumerate(minhash):
        forest.add(i,m)
        
    forest.index()
    
    print('It took %s seconds to build forest.' %(time.time()-start_time))
    
    return forest

def predict(text, database, perms, num_results, forest):
    start_time = time.time()
    
    tokens = preprocess(text)
    m = MinHash(num_perm=perms)
    for s in tokens:
        m.update(s.encode('utf8'))
        
    idx_array = np.array(forest.query(m, num_results))
    if len(idx_array) == 0:
        return None # if your query is empty, return none
    
    result = database.iloc[idx_array]['name']
    
    print('It took %s seconds to query forest.' %(time.time()-start_time))
    
    return result

In [None]:
def hash_text(text,perms=1):
    minhash = []
    m = MinHash(num_perm=perms)
    for s in text:
        m.update(s.encode('utf8'))
    minhash.append(m)
    return minhash

In [None]:
hashes = []
with Pool(40) as spool:
    for d in tqdm(spool.imap_unordered(hash_text, filtered_tokens[:100]), total=len(filtered_tokens[:100])):
        hashes.append(d)
        pass
spool.close()
spool.join()

In [None]:
num_recommendations = 5
song = "Schumacher"
result = predict(song, db, permutations, num_recommendations, forest)
print('\n Top Recommendation(s) is(are) \n', result)