In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("spotify_millsongdata.csv")

In [3]:
df.head(5)

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [4]:
df.isnull().sum()

artist    0
song      0
link      0
text      0
dtype: int64

In [5]:
df = df.drop('link', axis=1).reset_index(drop=True)

In [6]:
df.shape

(57650, 3)

TEXT PREPROCESSING

In [7]:
df['text'] = df['text'].str.lower().replace(r'^a-ZA-Z0-9',' ').replace(r'\n',' ', regex=True)

In [8]:
import nltk
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

def tokenization(txt):
    tokens = nltk.word_tokenize(txt)
    stemming = [stemmer.stem(w) for w in tokens]
    return " ".join(stemming)

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [10]:
tfidvector = TfidfVectorizer(analyzer='word', stop_words='english')
matrix = tfidvector.fit_transform(df['text'].sample(1000))
similarity = cosine_similarity(matrix)

In [11]:
similarity[0]

array([1.00000000e+00, 5.77654287e-03, 8.39659317e-03, 1.13853901e-02,
       2.00536414e-04, 6.57515526e-02, 7.12032491e-03, 1.35093193e-03,
       4.03453870e-03, 0.00000000e+00, 1.33570551e-02, 1.39705857e-01,
       4.11069654e-03, 1.28187250e-02, 2.21762944e-02, 4.49391408e-02,
       3.68560628e-02, 1.59812999e-02, 4.90767843e-04, 5.23705396e-02,
       9.46699502e-03, 2.56829152e-03, 4.87376907e-03, 2.35076470e-02,
       1.09081504e-03, 3.71615625e-01, 7.53180483e-03, 3.80063686e-03,
       1.84056178e-04, 6.86260795e-03, 7.60846441e-02, 0.00000000e+00,
       4.49497480e-02, 9.06217650e-03, 3.71491411e-03, 0.00000000e+00,
       3.47353179e-02, 0.00000000e+00, 9.14697304e-02, 2.17825271e-02,
       1.81878533e-04, 3.73392323e-02, 9.40673876e-03, 6.30106943e-02,
       8.05056963e-03, 2.31195453e-02, 3.56425056e-02, 2.94751899e-03,
       8.67019991e-04, 3.24427770e-03, 1.53844320e-03, 1.12121885e-02,
       0.00000000e+00, 5.68766055e-03, 1.11034361e-02, 5.09402342e-02,
      

In [12]:
df[df['song'] == 'Crying Over You']

Unnamed: 0,artist,song,text
9,ABBA,Crying Over You,i'm waitin' for you baby \r i'm sitting all a...
19891,UB40,Crying Over You,crying over you in the morning \r crying over...


RECOMMENDER FUNCTION

In [13]:
def recommendation(song_df):
    idx = df[df['song'] == song_df].index[0]
    distances = sorted(list(enumerate(similarity[idx])),reverse=True,key=lambda x:x[1])
    songs = []
    for m_id in distances[1:21]:
        songs.append(df.iloc[m_id[0]].song)
        
    return songs

In [14]:
recommendation('Crying Over You')

["Ain't It Strange",
 'All In This Together',
 'Tennessee River',
 'Lost In Space',
 'Sunset',
 'One Of Us',
 'Melt My Heart To Stone',
 'Mountain Top',
 '17',
 'Chasing The Rainbow',
 'Carrot Juice Is Murder',
 'Shut Up And Dance',
 'Hold',
 '(The System Of) Doctor Tarr And Professor Fether',
 'Miracles',
 'Private Hell',
 'Somebody Turned On The Light',
 'The Best Damn Thing',
 'Christmas In Dixie',
 'Peaches And Cream']

In [15]:
import pickle
pickle.dump(similarity,open('similarity.pkl','wb'))
pickle.dump(df,open('df.pkl','wb'))