In [21]:
import pandas as pd
import numpy as np

In [22]:
df = pd.read_csv('songdata.csv')
df.head(3)

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \nAnd..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \nTouch me gentl..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \nWhy I had t...


In [23]:
df.shape

(57650, 4)

In [24]:
df = df.sample(n=5000).drop('link', axis=1).reset_index(drop=True)

In [25]:
df.shape

(5000, 3)

In [26]:
df['text'] = df['text'].str.lower().replace(r'[^\w\s]','').replace(r'\n',' ', regex=True)

In [27]:
df['text'][0]



In [28]:
import nltk
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

def tokenization(txt):
    tokens = nltk.word_tokenize(txt)
    stemming = [stemmer.stem(w) for w in tokens]
    return " ".join(stemming)

In [29]:
df['text']

0       [hook]   uh, uh yeah, yeah, oh   rap's new gen...
1       you were born with a face that would let you g...
2       where are those happy days, they seem so hard ...
3       i wish i , i wish i was in richmond   i do, i ...
4       sun comes up on this new morning   shifting sh...
                              ...                        
4995    what the hell am i?   thousand eyes, a fly   l...
4996    i, i who have nothing   i, i who have no one  ...
4997    yesterday feels like running away   feels like...
4998    don't be shy   just let your feelings roll on ...
4999    lady, are you crying, do the tears belong to m...
Name: text, Length: 5000, dtype: object

In [30]:
df['text'] = df['text'].apply(lambda x: tokenization(x))

In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [32]:
tfidvector = TfidfVectorizer(analyzer='word',stop_words='english')
matrix = tfidvector.fit_transform(df['text'])
similarity = cosine_similarity(matrix)

In [33]:
similarity[0]

array([1.        , 0.02344482, 0.01089358, ..., 0.04845822, 0.03643961,
       0.03230482])

In [34]:
df['song'][0]

'Next Generation'

In [35]:
df[df['song']=='Heart Of Hearts']

Unnamed: 0,artist,song,text
1360,Electric Light Orchestra,Heart Of Hearts,vers they say there 's gold under the stone th...


# recommedation function

In [36]:
def recommendation(song_df):
    idx = df[df['song'] == song_df].index[0]
    distances = sorted(list(enumerate(similarity[idx])),reverse=True,key=lambda x:x[1])
    
    songs = []
    for m_id in distances[1:21]:
        songs.append(df.iloc[m_id[0]].song)
        
    return songs

In [38]:
recommendation('Heart Of Hearts')

['Heart To Heart',
 'Not Enough Romance',
 'Give Your Heart A Break',
 "You'll Be In My Heart",
 'Heart Of Stone',
 'Heart Of Gold',
 'Take It To Heart',
 'Slave',
 'Had A Dream',
 'Love Needs A Heart',
 'Never Give Your Heary',
 'Heart Full Of Soul',
 'Heart Of Gold',
 'Kissing A Fool',
 'Have You Ever',
 'Straight From The Heart',
 'Breaking Hearts',
 'The Heart Never Learns',
 "Don't Be Cruel",
 'Anyone Who Had A Heart']

In [39]:
import pickle
pickle.dump(similarity,open('similarity.pkl','wb'))
pickle.dump(df,open('df.pkl','wb'))