In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import tqdm
import re

import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

In [13]:
df_spotify_lyrics = pd.read_csv('spotify_millsongdata.csv')
df_spotify_lyrics.sample(5)

Unnamed: 0,artist,song,link,text
10147,Katy Perry,Choose Your Battles,/k/katy+perry/choose+your+battles_21069179.html,You are my hurt locker lover \r\nKeep me walk...
15588,Paul McCartney,Just Because,/p/paul+mccartney/just+because_20225762.html,"Well, well, well, \r\nJust because you think ..."
5915,Faith Hill,Just To Hear You Say That You Love Me,/f/faith+hill/just+to+hear+you+say+that+you+lo...,If I could win your heart \r\nIf you'd let me...
15221,Outkast,Babylon,/o/outkast/babylon_20103839.html,I came into this world high as a bird \r\nFro...
7978,Hooverphonic,Satin Doll,/h/hooverphonic/satin+doll_20065987.html,This scary face on the wall \r\nHis watch kil...


In [14]:
df_spotify_lyrics = df_spotify_lyrics.sample(10000)

In [15]:
df_spotify_lyrics.isnull().sum()

artist    0
song      0
link      0
text      0
dtype: int64

In [16]:
df_spotify_lyrics.duplicated().sum()

0

In [17]:
df_spotify_lyrics.shape

(10000, 4)

In [18]:
df_spotify_lyrics.drop(columns=['link'], inplace=True)

In [19]:
df_spotify_lyrics.head()

Unnamed: 0,artist,song,text
51429,Scorpions,Every Minute Every Day,Well it seems that I was lost \r\nIn intoxica...
4400,Donna Summer,I Will Survive,At First I was afriad I was petrified kept \r...
55242,Virgin Steele,Don't Close Your Eyes,"Don't go, don't leave me here alone \r\nWith ..."
31626,Enrique Iglesias,Back To You,I've been down \r\nI've been beat \r\nI've b...
43631,Michael Bolton,We're Not Making Love Anymore,"Here we are, just goin' through the motions on..."


In [20]:
df_spotify_lyrics.reset_index(drop=True, inplace=True)

In [21]:
df_spotify_lyrics.head()

Unnamed: 0,artist,song,text
0,Scorpions,Every Minute Every Day,Well it seems that I was lost \r\nIn intoxica...
1,Donna Summer,I Will Survive,At First I was afriad I was petrified kept \r...
2,Virgin Steele,Don't Close Your Eyes,"Don't go, don't leave me here alone \r\nWith ..."
3,Enrique Iglesias,Back To You,I've been down \r\nI've been beat \r\nI've b...
4,Michael Bolton,We're Not Making Love Anymore,"Here we are, just goin' through the motions on..."


In [22]:
# Text Preprocessing:
# 1) lowercasing
# 2) Removing html tags and other expressions
# 3) Removing Puntuation
# 4) Spelling correction
# 5) Removing Stopwords
# 6) Lemmatization/Stemming
# 7) Tokenization

In [23]:
# lowercasing
df_spotify_lyrics['text'] = df_spotify_lyrics['text'].apply(lambda x: x.lower())

In [24]:
df_spotify_lyrics['text'][3]

"i've been down  \r\ni've been beat  \r\ni've been so that i could not speak  \r\ni've been so lost that i could not see  \r\ni wanted things that were out of reach  \r\nthen i found you and help me through and you show me  \r\nwhat to do  \r\nthat's why i'm comin' back to you  \r\n  \r\nlike a star that guides a ship across the ocean  \r\nthat's how your love can take me home back to you  \r\nand if i wish upon that star that someday i'll be where you are  \r\ncause i know that day is comin' soon yeah i'm comin' back to you  \r\n  \r\nyou've been alone but you did not show it  \r\nyou've been in pain when i did not know it  \r\nyou let me do what i needed to  \r\nyou where there when i needed you  \r\n  \r\nmighta let you down  \r\nmighta messed you round  \r\nbut you never change you point of view  \r\nthat's why i'm comin back to you.........\r\n\r\n"

In [25]:
# Removing html tags
import re
def remove_htmltags(txt):
    pattern = re.compile('<.*?>')
    return pattern.sub(r'',txt)

df_spotify_lyrics['text'] = df_spotify_lyrics['text'].apply(remove_htmltags)

In [26]:
def remove_regex(txt):
    pattern = re.compile('[\r\n]+')
    return pattern.sub(r'',txt)

df_spotify_lyrics['text'] = df_spotify_lyrics['text'].apply(remove_regex)

In [27]:
df_spotify_lyrics['text'][3]

"i've been down  i've been beat  i've been so that i could not speak  i've been so lost that i could not see  i wanted things that were out of reach  then i found you and help me through and you show me  what to do  that's why i'm comin' back to you    like a star that guides a ship across the ocean  that's how your love can take me home back to you  and if i wish upon that star that someday i'll be where you are  cause i know that day is comin' soon yeah i'm comin' back to you    you've been alone but you did not show it  you've been in pain when i did not know it  you let me do what i needed to  you where there when i needed you    mighta let you down  mighta messed you round  but you never change you point of view  that's why i'm comin back to you........."

In [28]:
# Removing punctuations
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [29]:
exclude = string.punctuation
def remove_punc(txt):
    return txt.translate(str.maketrans('','', exclude))

df_spotify_lyrics['text'] = df_spotify_lyrics['text'].apply(remove_punc)

In [30]:
tqdm.pandas()
# spelling correction
from textblob import TextBlob
def spel_cor(txt):
    txtblb = TextBlob(txt)
    return txtblb.correct().string

df_spotify_lyrics['text'] = df_spotify_lyrics['text'].progress_apply(spel_cor)

100%|███████████████████████████████████| 10000/10000 [6:44:52<00:00,  2.43s/it]


In [31]:
#stopwords
from nltk.corpus import stopwords
stopwords.words('english')

def remove_stopwords(txt):
    new_txt = []
    for w in txt.split():
        if w in stopwords.words('english'):
            new_txt.append('')
        else:
            new_txt.append(w)
    return ' '.join(new_txt)

df_spotify_lyrics['text'] = df_spotify_lyrics['text'].apply(remove_stopwords)

In [34]:
import nltk
from nltk.stem import WordNetLemmatizer

def lemmatize(text):
    wordnet_lemma = WordNetLemmatizer()
    punctuation = '?:!.,;'
    lem = []
    
    sent_word = nltk.word_tokenize(text)
    for w in sent_word:
        if w not in punctuation:
            lem.append(wordnet_lemma.lemmatize(w))
            
    return ' '.join(lem)

df_spotify_lyrics['text'] = df_spotify_lyrics['text'].progress_apply(lemmatize)

100%|███████████████████████████████████| 10000/10000 [00:03<00:00, 3234.22it/s]


In [35]:
df_spotify_lyrics['text'][0]

'well seems lost intoxicated night following trace ecstasy future past present alive like prisoner chained dream drowning feeling possession rule till morning kill creature night read lipstick message mirror truth gone realize without life like song without music without world wont need love every minute need love every day need love every minute need love come back stay well know lost nightmare life following flash fantasy feeling exploded id left reality behind addicted power dream without life like road lead nowhere without world wont feel need love every minute need love every day need love every minute need love come back stay'

In [38]:
from nltk import sent_tokenize
import gensim
from gensim.utils import simple_preprocess

story = []
for doc in df_spotify_lyrics['text']:
    raw_sent = sent_tokenize(doc)
    for sent in raw_sent:
        story.append(simple_preprocess(sent))

In [39]:
model = gensim.models.Word2Vec(
    window = 10,
    min_count = 2)

model.build_vocab(story)

In [40]:
model.train(story, total_examples=model.corpus_count, epochs= model.epochs)

(4685713, 5383345)

In [41]:
model.wv.index_to_key

['love',
 'dont',
 'know',
 'like',
 'oh',
 'time',
 'got',
 'go',
 'give',
 'one',
 'get',
 'want',
 'see',
 'come',
 'baby',
 'let',
 'say',
 'never',
 'make',
 'yeah',
 'way',
 'night',
 'take',
 'day',
 'ill',
 'cause',
 'feel',
 'heart',
 'life',
 'back',
 'well',
 'donna',
 'need',
 'away',
 'right',
 'could',
 'girl',
 'man',
 'tell',
 'thing',
 'world',
 'little',
 'chorus',
 'aunt',
 'eye',
 'good',
 'think',
 'said',
 'long',
 'keep',
 'look',
 'wont',
 'home',
 'would',
 'around',
 'every',
 'nothing',
 'still',
 'mind',
 'find',
 'anna',
 'always',
 'dream',
 'hand',
 'hear',
 'ever',
 'light',
 'cry',
 'hey',
 'hold',
 'gutta',
 'gone',
 'much',
 'believe',
 'better',
 'something',
 'face',
 'going',
 'boy',
 'la',
 'everything',
 'really',
 'feeling',
 'friend',
 'turn',
 'call',
 'tonight',
 'old',
 'live',
 'try',
 'made',
 'put',
 'id',
 'place',
 'people',
 'last',
 'another',
 'leave',
 'head',
 'alone',
 'new',
 'stay',
 'run',
 'hard',
 'coming',
 'show',
 'word',


In [42]:
model.wv['touch']

array([ 0.85556954,  0.80403876, -0.5799133 ,  2.157911  , -0.06291752,
       -0.79848593, -0.5518212 ,  1.5456254 , -1.6095718 , -0.30140883,
        1.799928  ,  1.7779415 , -2.9499729 ,  0.42505038, -0.44563225,
        1.4839821 ,  0.4025599 , -0.70364213,  0.23643167, -0.20767981,
       -0.10041285, -0.36842972, -0.70606595,  0.11227178,  0.2000336 ,
       -1.4328538 , -1.8491517 ,  2.6237078 , -2.7602255 ,  0.33889124,
       -1.0337923 ,  0.89929885, -1.1444167 ,  0.1320258 , -0.32441503,
       -0.95940113,  1.0633048 , -0.19023094,  1.1773819 ,  1.0901102 ,
        0.03482237, -2.0500684 ,  0.79589605,  0.2434444 ,  1.230167  ,
        0.82516223,  0.29196653, -0.7240777 ,  2.2009106 ,  1.5828495 ,
       -1.9538383 ,  0.33239785, -0.68160844,  0.76618373,  0.0786422 ,
        0.63139015,  0.8393283 , -1.7127674 , -0.9891505 ,  0.32632843,
        0.07847245, -1.0099546 ,  1.4549843 , -1.6710777 , -0.57812476,
        3.5078712 ,  1.2378346 ,  1.1661932 , -1.7118393 ,  1.01

In [53]:
def document_vector(doc):
    doc = [word for word in doc.split() if word in model.wv.index_to_key]
    return np.mean(model.wv[doc], axis=0)

# When you set axis=0, the operation is applied column-wise across the rows of the array.
#.      It operates along the first axis, which is typically the vertical axis in a 2D array.
# When you use axis=1 in a NumPy function like np.mean(), the operation is applied row-wise across the columns of the array. 
#       In other words, it operates along the second axis, which is typically the horizontal axis in a 2D array.

In [54]:
document_vector(df_spotify_lyrics['text'][0])

array([ 0.04750453, -0.8294919 ,  0.43110248,  0.65020746, -0.21500133,
       -0.82213116,  0.00151731,  0.828463  , -0.8376896 , -0.4398146 ,
        0.35101175,  0.58949494, -0.7812034 ,  0.54321176,  0.12932776,
       -0.03060713,  0.2531629 ,  0.06347384, -0.67183137, -0.31927574,
        0.6649782 , -0.22541367,  0.61443675,  0.1640586 ,  0.42775196,
       -0.13882816, -0.4441272 ,  0.7546618 , -0.6600979 ,  0.27395803,
        0.08682233, -0.44627452,  0.4670278 , -0.5980858 , -0.15199226,
       -0.08926564,  0.64209473,  0.42930746, -0.49154812, -0.55687356,
        0.39556727,  0.32033512,  0.02601005,  0.09679578,  0.30347863,
        0.02589785, -0.4853221 ,  0.01142661,  0.36417407,  0.42228287,
       -0.254038  , -0.02409875, -0.6274225 ,  0.33309883,  0.19318515,
       -0.01718237, -0.22036767,  0.03245667, -0.06796312,  0.44734555,
       -0.10104336, -0.08076698,  0.86316717, -0.00814562, -0.56034225,
        0.83093876,  0.36929598,  0.6575853 , -0.57968515,  0.65

In [55]:
dox = []
for d in tqdm(df_spotify_lyrics['text'].values):
    dox.append(document_vector(d))

100%|███████████████████████████████████| 10000/10000 [00:07<00:00, 1261.90it/s]


In [56]:
dox = np.array(dox)

In [57]:
dox.shape

(10000, 100)

In [64]:
from sklearn.metrics.pairwise import cosine_similarity
similar_wv = cosine_similarity(dox)

In [65]:
similar_wv[2]                           # cosine similarity of 3rd review with every review

array([0.8122626 , 0.8392058 , 1.0000002 , ..., 0.5569336 , 0.8094466 ,
       0.71388286], dtype=float32)

In [69]:
df_spotify_lyrics[(df_spotify_lyrics['song']=='I Will Survive') & (df_spotify_lyrics['artist'] == 'Donna Summer')]

Unnamed: 0,artist,song,text
1,Donna Summer,I Will Survive,first afraid petrified kept think going live w...


In [77]:
def recommender_wv(song_name, artist_name):
    idx = df_spotify_lyrics[(df_spotify_lyrics['song'] == song_name) & 
                            (df_spotify_lyrics['artist']== artist_name)].index[0]
    
    # distance is a list of tuples. Each tuple contains index and the similiarity score
    distance = sorted(list(enumerate(similar_wv[idx])), reverse=True, key=lambda x: x[1])
    
    songs = []
    for s_id in distance[1:10]:
        songs.append(df_spotify_lyrics.iloc[s_id[0]].song)
    
    return songs

In [78]:
recommender_wv('I Will Survive', 'Donna Summer')

['I Will Survive',
 'No Looking Back',
 "It's Over",
 "Mickey's Malt Liquor",
 "Don't It",
 'Locomotive',
 'Shaking You',
 'Walk Away From Love',
 'Spinning The Wheel']

In [79]:
import pickle
pickle.dump(similar_wv, open('similarity_lyrics_wv', 'wb'))
pickle.dump(df_spotify_lyrics, open('df_spotify_lyrics','wb'))