In [7]:
import pandas as pd

In [8]:
df = pd.read_csv("spotify_millsongdata.csv")

In [9]:
df.head(5)

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [10]:
df.shape

(57650, 4)

In [11]:
df.isnull().sum()

artist    0
song      0
link      0
text      0
dtype: int64

In [12]:
df = df.sample(5000).drop('link', axis=1).reset_index(drop=True)

In [13]:
df.head(5)

Unnamed: 0,artist,song,text
0,Elvis Presley,El Toro,There's a legend of a famous matador \r\nWho ...
1,Willie Nelson,Harbor Lights,I saw the harbor lights \r\nThey only told me...
2,Kid Rock,Flying High,You know I spent a little time out in Malibu ...
3,Air Supply,Chances,There's a chance you will be there \r\nI'd li...
4,Dolly Parton,Before The Next Teardrop Falls,If she brings you happiness \r\nThen I wish y...


In [14]:
df["text"][0]

"There's a legend of a famous matador  \r\nWho went to meet El Toro  \r\nThough he fought as he had never done before  \r\nHe could not beat, El Toro  \r\nThe bull El Toro, brought him defeat and pain  \r\nAnd to his sorrow, the matador knew shame  \r\nThey said time would never heal the many scars  \r\nBrought by the great, El Toro  \r\nAnd the bitterness that burned deep in his heart  \r\nCaused him to hate, El Toro  \r\nThe bull El Toro, brought him defeat and pain  \r\nAnd to his sorrow, the matador knew shame  \r\nSo one night, when no-one was on sight  \r\nThe matador, went to finish the score  \r\nIn the lonely fields, beneath the pale moonlight  \r\nHe fought the bull...and they fought once more  \r\nWhen they found the matador and saw him dying  \r\nHe'd never see tomorrow  \r\nNow they say that on the spot where he was lying  \r\nStill walks the proud El Toro  \r\nThe bull El Toro, brought him defeat and pain  \r\nAnd to his sorrow, the matador knew shame  \r\n\r\n"

In [15]:
df.shape

(5000, 3)

Cleaning and processing the dataset.

In [16]:
df['text'] = df['text'].str.lower().replace(r'^\w\s', ' ').replace(r'\n', ' ', regex = True)

In [17]:
df.head(5)

Unnamed: 0,artist,song,text
0,Elvis Presley,El Toro,there's a legend of a famous matador \r who w...
1,Willie Nelson,Harbor Lights,i saw the harbor lights \r they only told me ...
2,Kid Rock,Flying High,you know i spent a little time out in malibu ...
3,Air Supply,Chances,there's a chance you will be there \r i'd lik...
4,Dolly Parton,Before The Next Teardrop Falls,if she brings you happiness \r then i wish yo...


In [18]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\adhik\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [19]:
import nltk
from nltk.stem import PorterStemmer


stemmer = PorterStemmer()

def token(txt):
    tokens = nltk.word_tokenize(txt)
    stemmed_tokens = [stemmer.stem(w) for w in tokens]
    return " ".join(stemmed_tokens)


In [20]:
token("Your are beautiful")

'your are beauti'

In [21]:
df['text'].apply(lambda x: token(x))

0       there 's a legend of a famou matador who went ...
1       i saw the harbor light they onli told me we we...
2       you know i spent a littl time out in malibu i ...
3       there 's a chanc you will be there i 'd like t...
4       if she bring you happi then i wish you both th...
                              ...                        
4995    i should have told him that i need him when i ...
4996    go . disjoint we scare ourselv with all that w...
4997    now here come the great music thing call 'hold...
4998    [ nicki minaj : ] damn right a bitch qualifi i...
4999    could n't keep a secret got a concret skull co...
Name: text, Length: 5000, dtype: object

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [23]:
tfid = TfidfVectorizer(analyzer='word', stop_words='english')

In [24]:
matrix = tfid.fit_transform(df['text'])

In [25]:
similer = cosine_similarity(matrix)

In [26]:
similer[0]

array([1.00000000e+00, 2.79386493e-02, 3.31587977e-04, ...,
       2.05170014e-03, 1.25989798e-03, 0.00000000e+00])

In [28]:
df[df['song'] =='Flying High'].index[0]

2

Recommender Function

In [37]:
def recommender(song_name):
    idx = df[df['song']==song_name].index[0]
    distance = sorted(list(enumerate(similer[idx])), reverse=True, key = lambda x:x[1])
    song = []
    for s_id in distance[1:20]:
        song.append(df.iloc[s_id[0]].song)
    return song


In [38]:
recommender("Flying High")

['Power Of Love',
 "She's So High Above Me",
 'One-Eyed, One-Horned Flying Purple People Eater',
 'Bob Marley',
 "Anyone Who Isn't Me Tonight",
 "Ain't Nobody's Business",
 'Feel Like Flying',
 'Good Life',
 'High Time',
 'Head Down',
 'I Lift You High',
 'Be Good Johnny',
 'Thunderbird',
 'Eagle',
 'Jacksonville Kid',
 'Carried Away',
 'Good Good Time',
 'How High',
 "I'd Like That"]

In [40]:
import pickle

In [43]:
pickle.dump(similer, open("similarity", "wb"))

In [44]:
pickle.dump(df, open("df", "wb"))