In [14]:
import numpy as np
import pandas as pd

In [15]:
df = pd.read_csv('songdata.csv')
df.head(3)

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \nAnd..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \nTouch me gentl..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \nWhy I had t...


In [16]:
df.shape

(57650, 4)

In [17]:
df = df.sample(n=5000).drop('link',axis=1).reset_index(drop=True)

In [18]:
df

Unnamed: 0,artist,song,text
0,Incubus,Vitamin,I'm born \nI'm alive \nI breathe \nIn a mom...
1,Britney Spears,Ooh La La,You don't have to look like a movie star \nOo...
2,Kinks,Jack The Idiot Dunce,"Who's the fool with the cross-eyed stare, \nT..."
3,Styx,More Love For The Money,The cats were on the table \nThey were waitin...
4,Paul McCartney,"Hi, Hi, Hi","Well, when I met you at the station \nYou wer..."
...,...,...,...
4995,Jennifer Lopez,Charades,"Break it, you buy it \nNothing's for free \n..."
4996,Dan Fogelberg,Scarecrow's Dream,Seldom seen \nA scarecrow's dream \nI hang i...
4997,Allman Brothers Band,Melissa,"Crossroads, seem to come and go, yeah. \nThe ..."
4998,The Monkees,Goin' Down,Sock it to me \nFloatin' down the river \nWi...


In [19]:
# CLEANING 
df['text'][0]

"I'm born  \nI'm alive  \nI breathe  \nIn a moment or two I realize,  \nThat the sphere upon which I reside,  \nIs asleep on its feet.  \nShould I go back to sleep?  \nWe orbit the sun  \nI grow up  \nMy open eyes see  \n  \nA zombified, somnambulist society.  \nLeaving us as vitamins  \nFor the hibernating human animal.  \nDo you see what I mean?  \nYou stare at me like a vitamin  \nOn the surface you hate,  \nBut you know you need me.  \n  \nI'll come dressed as any pill you deem fit.  \nWhatever helps you swallow truth  \nAll the more easily.  \n  \nAnd I wonder, will you digest me?  \nInto the sleep machine I won't plug in,  \nIn fact I'd rather die before I will comply.  \nTo you, my friend,  \nI write the reason I still live,  \n'cause in my mind it's set the vitamin is ripe to give  \nComing closer to another 2000 years  \nYou and I will pry  \nThe closed eye of the sleep machine\n\n"

In [25]:
df['text'] = df['text'].str.lower().replace(r'[^\w\s]','').replace(r'\n',' ', regex=True)


In [26]:
df['text'][0]

"i'm born   i'm alive   i breathe   in a moment or two i realize,   that the sphere upon which i reside,   is asleep on its feet.   should i go back to sleep?   we orbit the sun   i grow up   my open eyes see      a zombified, somnambulist society.   leaving us as vitamins   for the hibernating human animal.   do you see what i mean?   you stare at me like a vitamin   on the surface you hate,   but you know you need me.      i'll come dressed as any pill you deem fit.   whatever helps you swallow truth   all the more easily.      and i wonder, will you digest me?   into the sleep machine i won't plug in,   in fact i'd rather die before i will comply.   to you, my friend,   i write the reason i still live,   'cause in my mind it's set the vitamin is ripe to give   coming closer to another 2000 years   you and i will pry   the closed eye of the sleep machine  "

In [33]:

import nltk
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
nltk.download('punkt')
nltk.download('punkt_tab')

def tokenization(txt):
    tokens = nltk.word_tokenize(txt)
    stemming = [stemmer.stem(w) for w in tokens]
    return " ".join(stemming)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


In [34]:
df['text'] = df['text'].apply(lambda x: tokenization(x))


In [37]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [38]:
tfidvector = TfidfVectorizer(analyzer='word',stop_words='english')
matrix = tfidvector.fit_transform(df['text'])
similarity = cosine_similarity(matrix)

In [39]:
similarity[0]


array([1.        , 0.02742392, 0.00606288, ..., 0.01434507, 0.02201942,
       0.        ])

In [60]:
def recommendation(song_df):
    matches = df[df['song'] == song_df]
    if matches.empty:
        return []
    
    idx = matches.index[0]
    distances = sorted(list(enumerate(similarity[idx])), reverse=True, key=lambda x: x[1])
    
    songs = [df.iloc[m_id[0]].song for m_id in distances[1:21]]
    return songs


In [66]:
recommendation('You Talk A Lot')


['Perfect World',
 'Somebody Knows',
 'Talk Me Down',
 'The Way',
 "Please, Don't Talk About Me When I'm Gone",
 "Feel's Like Home",
 "I Can't Hold Out",
 'Talk',
 'Had A Lot Of Love Last Night',
 'Mystery Song',
 "Please Don't Talk About Me When I'm Gone",
 'Harmonic',
 'Talk To Me',
 'The Way You Move',
 "I'd Rather Believe In You",
 'Territorial Pissings',
 'I Can Feel Love',
 "That's A Man's Way",
 'Waiting For Girl Like You',
 'Just The Way']

In [None]:
# save model
import pickle
pickle.dump(similarity,open('similarity.pkl','wb'))
pickle.dump(df,open('df.pkl','wb'))