In [189]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [190]:
df_spotify_lyrics = pd.read_csv('spotify_millsongdata.csv')

In [191]:
df_spotify_lyrics.head()

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [192]:
df_spotify_lyrics = df_spotify_lyrics.sample(15000)

In [193]:
df_spotify_lyrics.isnull().sum()

artist    0
song      0
link      0
text      0
dtype: int64

In [194]:
df_spotify_lyrics.duplicated().sum()

0

In [195]:
df_spotify_lyrics.shape

(15000, 4)

In [196]:
df_spotify_lyrics.drop(columns=['link'], inplace=True)

In [197]:
df_spotify_lyrics.head()

Unnamed: 0,artist,song,text
2565,Cher,I Still Haven't Found What I'm Looking For,I have climbed highest mountain \r\nI have ru...
47839,Pearl Jam,Of The Girl,"Oh, he deals 'em off, off the top, ties 'em of..."
5815,Face To Face,The New Way,"You're alright, you're alive, you're the numbe..."
52266,Stevie Wonder,Ain't That Love,"Now, baby when you sigh \r\n(when you sigh) ..."
50246,Ray Boltz,Nobody's Home,Nobody's Home \r\nWords and music by Ray Bolt...


In [198]:
df_spotify_lyrics.reset_index(drop=True, inplace=True)

In [199]:
df_spotify_lyrics.head()

Unnamed: 0,artist,song,text
0,Cher,I Still Haven't Found What I'm Looking For,I have climbed highest mountain \r\nI have ru...
1,Pearl Jam,Of The Girl,"Oh, he deals 'em off, off the top, ties 'em of..."
2,Face To Face,The New Way,"You're alright, you're alive, you're the numbe..."
3,Stevie Wonder,Ain't That Love,"Now, baby when you sigh \r\n(when you sigh) ..."
4,Ray Boltz,Nobody's Home,Nobody's Home \r\nWords and music by Ray Bolt...


In [200]:
# Text Preprocessing:
# 1) lowercasing
# 2) Removing html tags and other expressions
# 3) Removing Puntuation
# 4) Spelling correction
# 5) Removing Stopwords
# 6) Lemmatization/Stemming
# 7) Tokenization

In [201]:
# lowercasing
df_spotify_lyrics['text'] = df_spotify_lyrics['text'].apply(lambda x: x.lower())

In [202]:
df_spotify_lyrics['text'][3]

"now, baby when you sigh  \r\n(when you sigh)  \r\ni wanna sigh with you  \r\nwhen you cry  \r\n(when you cry)  \r\ni wanna cry some, too  \r\n  \r\nnow, ain't that love  \r\n(ain't that love)  \r\noh, ain't that love that i feel  \r\n(that i feel)  \r\nin my heart for you  \r\n  \r\nwhen your friends  \r\n(when your friends)  \r\nturn their back on you  \r\ni'll be here  \r\n(i'll be here)  \r\njust to see you through  \r\n  \r\nnow, ain't that love  \r\n(ain't that love)  \r\nain't that love  \r\n(ain't that love)  \r\nain't that love, oh now  \r\nain't that love  \r\nthat i feel in my heart for you  \r\n  \r\nnow when you walk  \r\n(when you walk)  \r\ni wanna walk with you  \r\nwhen you talk  \r\n(when you talk)  \r\nwanna talk some, too  \r\n  \r\nnow, ain't that love  \r\n(ain't that love)  \r\noh, ain't that love that i feel  \r\n(that i feel)  \r\nin my heart for you  \r\n  \r\nif you ever  \r\n(if you ever)  \r\never need a friend  \r\ni'll be with you  \r\n(be with you)  \r\n

In [203]:
# Removing html tags
import re
def remove_htmltags(txt):
    pattern = re.compile('<.*?>')
    return pattern.sub(r'',txt)

df_spotify_lyrics['text'] = df_spotify_lyrics['text'].apply(remove_htmltags)

In [204]:
def remove_regex(txt):
    pattern = re.compile('[\r\n]+')
    return pattern.sub(r'',txt)

df_spotify_lyrics['text'] = df_spotify_lyrics['text'].apply(remove_regex)

In [205]:
df_spotify_lyrics['text'][3]

"now, baby when you sigh  (when you sigh)  i wanna sigh with you  when you cry  (when you cry)  i wanna cry some, too    now, ain't that love  (ain't that love)  oh, ain't that love that i feel  (that i feel)  in my heart for you    when your friends  (when your friends)  turn their back on you  i'll be here  (i'll be here)  just to see you through    now, ain't that love  (ain't that love)  ain't that love  (ain't that love)  ain't that love, oh now  ain't that love  that i feel in my heart for you    now when you walk  (when you walk)  i wanna walk with you  when you talk  (when you talk)  wanna talk some, too    now, ain't that love  (ain't that love)  oh, ain't that love that i feel  (that i feel)  in my heart for you    if you ever  (if you ever)  ever need a friend  i'll be with you  (be with you)  yes, until the end    now, ain't that love  (ain't that love)  ain't that love  (ain't that love)  don't you know, baby  don't you know, i  (oh, oh, oh, oh)  ain't that love,  baby tha

In [206]:
# Removing punctuations
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [207]:
exclude = string.punctuation
def remove_punc(txt):
    return txt.translate(str.maketrans('','', exclude))

df_spotify_lyrics['text'] = df_spotify_lyrics['text'].apply(remove_punc)

In [187]:
from tqdm import tqdm
tqdm.pandas()

In [130]:
# spelling correction
from textblob import TextBlob
def spel_cor(txt):
    txtblb = TextBlob(txt)
    return txtblb.correct().string

df_spotify_lyrics['text'] = df_spotify_lyrics['text'].progress_apply(spel_cor)

100%|███████████████████████████████████| 15000/15000 [3:22:34<00:00,  1.23it/s]


In [208]:
#stopwords
from nltk.corpus import stopwords
stopwords.words('english')

def remove_stopwords(txt):
    new_txt = []
    for w in txt.split():
        if w in stopwords.words('english'):
            new_txt.append('')
        else:
            new_txt.append(w)
    return ' '.join(new_txt)

df_spotify_lyrics['text'] = df_spotify_lyrics['text'].apply(remove_stopwords)


In [213]:
# Stemming
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
def stem_words(txt):
    return ' '.join([ps.stem(w) for w in txt.split()])

df_spotify_lyrics['text'] = df_spotify_lyrics['text'].apply(stem_words)

In [214]:
df_spotify_lyrics['text'][15]

'son heir shyness crimin vulgar son heir noth particular oh shut mouth say go thing wrong way im human need love like everyon like everyon son heir shyness crimin vulgar son heir noth particular noth particular say gonna happen exactli mean ive alreadi wait long hope gone there club youd like go might meet someon realli love go leav go home cri wanna die'

In [219]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()

matrix = tfidf.fit_transform(df_spotify_lyrics['text']).toarray()

In [221]:
from sklearn.metrics.pairwise import cosine_similarity
similar = cosine_similarity(matrix)

In [229]:
similar[2]                           # cosine similarity of 3rd review with every review

array([0.00644592, 0.04139237, 1.        , ..., 0.07331797, 0.01530703,
       0.00949046])

In [253]:
df_spotify_lyrics[df_spotify_lyrics['song']=='Of The Girl'].index[0]

1

In [256]:
# Recommendation function

def recommender(song_name):
    idx = df_spotify_lyrics[df_spotify_lyrics['song']==song_name].index[0]
    distance = sorted(list(enumerate(similar[idx])),reverse=True, key=lambda x: x[1])    # distance is a list of tuples. Each tuple contains index and the similiarity score
    
    song = []
    for s_id in distance[1:8]:
        song.append(df_spotify_lyrics.iloc[s_id[0]].song)
    return song

In [258]:
df_spotify_lyrics.sample(10)

Unnamed: 0,artist,song,text
2717,John Prine,Grandpa Was A Carpenter,grandpa wore suit dinner nearli everi day part...
5692,Grateful Dead,Can't Come Down,well im fli desert street wrap mother wine she...
5636,Youth Of Today,One Family,much anticip futur gener think kid today grow ...
260,America,Only Game In Town,spectat line avenu player step right cue everi...
3041,Ocean Colour Scene,Fleeting Mind,brillianc fleet mind chime like voic foreign c...
14359,Eric Clapton,Bad Influence,use get anyth want oh im chang man babi friend...
12017,Stone Temple Pilots,Same On The Inside,send messag world cant seem find word one want...
7679,Helloween,Revolution Now,well think realli time revolut destroy cost lo...
4791,Tragically Hip,A Beautiful Thing,ulcer silenc perspect come way alway ransom ra...
74,Morrissey,Black Cloud,one love stand near one love everywher woo amu...


In [261]:
recommender('Of The Girl')

['More Fool You',
 'Girls And Boys',
 'Million Dollar Bill',
 "Don't Make 'Em Like You",
 'In The End',
 'Sing Together',
 'Pavlove']

In [262]:
import pickle
pickle.dump(similar, open('similarity', 'wb'))
pickle.dump(df_spotify_lyrics, open('df_spotify_lyrics','wb'))