In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("spotify_millsongdata.csv")

In [3]:
df.head(5)

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [4]:
df.tail(5)

Unnamed: 0,artist,song,link,text
57645,Ziggy Marley,Good Old Days,/z/ziggy+marley/good+old+days_10198588.html,Irie days come on play \r\nLet the angels fly...
57646,Ziggy Marley,Hand To Mouth,/z/ziggy+marley/hand+to+mouth_20531167.html,Power to the workers \r\nMore power \r\nPowe...
57647,Zwan,Come With Me,/z/zwan/come+with+me_20148981.html,all you need \r\nis something i'll believe \...
57648,Zwan,Desire,/z/zwan/desire_20148986.html,northern star \r\nam i frightened \r\nwhere ...
57649,Zwan,Heartsong,/z/zwan/heartsong_20148991.html,come in \r\nmake yourself at home \r\ni'm a ...


In [5]:
df.shape

(57650, 4)

In [6]:
df.isnull().sum()

artist    0
song      0
link      0
text      0
dtype: int64

In [7]:
df =df.sample(5000).drop('link', axis=1).reset_index(drop=True)

In [8]:
df.head(10)

Unnamed: 0,artist,song,text
0,Enigma,Total Eclipse Of The Moon,Miles away from light at noon \r\nTotal eclip...
1,Beach Boys,Little Bird,Little bird up in a tree \r\nLooked down and ...
2,Nirvana,Tourette's,"Moderate Rock \r\n \r\nMay day, every day, m..."
3,Waterboys,Suffer,[Chorus] \r\nI'm gonna suffer for you no long...
4,"Harry Connick, Jr.","On The Atchison, Topeka, And The Santa Fe",Do you hear that whistle down the line? \r\nI...
5,Travis,I Forget My Name,Turn me on to something \r\nI don't want to b...
6,Rihanna,Take A Bow,How 'bout a round of applause? \r\nStandin' o...
7,Heart,Desire Walks On,Stand at the window \r\nPull back the lace \...
8,Indigo Girls,Secure Yourself,"In the ink of an eye I saw you bleed, \r\nThr..."
9,Oscar Hammerstein,School Song,We work and work \r\nFrom week to week \r\nA...


In [9]:
df['text'][0]

"Miles away from light at noon  \r\nTotal eclipse of the moon  \r\nMany reasons to believe in life  \r\nJust listen what it's telling you  \r\n  \r\nCome and have a look inside  \r\nTotal eclipse of the moon  \r\nDon't be childish, don't be so cruel  \r\nI'm feeling just lonely without  \r\nWithout you  \r\n  \r\nI can see the wide horizons  \r\nBut debts have to be paid  \r\nOur ways will cross again someday  \r\nBelieve, and come back to you  \r\nI'll see you soon  \r\n  \r\nTime doesn't say hello  \r\nTotal eclipse of the moon  \r\nMany nights in our lives before  \r\nI was dreaming to be just beside  \r\nBeside you  \r\n  \r\nTotal eclipse of the moon  \r\nI'll see you soon\r\n\r\n"

In [10]:
# df = df.sample(5000)

In [11]:
df.shape

(5000, 3)

Text Cleaning/ Text Preprocessing

In [12]:
df['text'] = df['text'].str.lower().replace(r'^\w\s', ' ').replace(r'\n', ' ', regex = True)

In [13]:
import nltk
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

def tokenization(txt):
    tokens = nltk.word_tokenize(txt)
    stemming = [stemmer.stem(w) for w in tokens]
    return " ".join(stemming)

In [14]:
df['text'] = df['text'].apply(lambda x: tokenization(x))

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [16]:
tfidvector = TfidfVectorizer(analyzer='word',stop_words='english')
matrix = tfidvector.fit_transform(df['text'])
similarity = cosine_similarity(matrix)

In [17]:
similarity[0]

array([1.        , 0.01801509, 0.        , ..., 0.01557565, 0.04638671,
       0.01032466])

In [18]:
df[df['song'] == 'Crying Over You']

Unnamed: 0,artist,song,text
2045,UB40,Crying Over You,cri over you in the morn cri over you in the e...


In [19]:
def recommendation(song_df):
    idx = df[df['song'] == song_df].index[0]
    distances = sorted(list(enumerate(similarity[idx])),reverse=True,key=lambda x:x[1])
    
    songs = []
    for m_id in distances[1:21]:
        songs.append(df.iloc[m_id[0]].song)
        
    return songs

In [21]:
recommendation('I Forget My Name')

['Open My Eyes',
 "I Can't Forget",
 'One Year, Six Months',
 'Bones',
 'End Of All Time',
 'Cold Love',
 'I Know You Too Well',
 'Things I Never Said (Japaneses Bonus Track)',
 'One Thing',
 'You Belong To Me',
 'Last Night On Earth',
 'Birthday Song',
 'What I Did For Love',
 "Just Couldn't Wait",
 'Come Back',
 'If You Work Away',
 'Thieves Like Us',
 'This I Promise You',
 "Woman's Touch",
 'Gold Rush']

In [22]:
import pickle
pickle.dump(similarity,open('similarity.pkl','wb'))
pickle.dump(df,open('df.pkl','wb'))