In [1]:
import pandas as pd

In [2]:
# Load the lyrics dataset; the relative path is resolved against the
# notebook's current working directory.
songs = pd.read_csv(filepath_or_buffer="spotify_millsongdata.csv")

In [3]:
# Display the first five rows (the DataFrame.head default) as a quick look at
# the loaded columns.
songs.head()

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [4]:
# Downsample the dataset: the pairwise similarity matrix built later has one
# row and one column per song, so its size grows quadratically with row count.
SAMPLE_SIZE = 10000
# Fixed seed so Restart & Run All yields the same subset (and therefore the
# same pickled artefacts) every time.
RANDOM_SEED = 42

songs = (
    songs
    # Cap at the number of available rows so a smaller CSV does not raise.
    .sample(n=min(SAMPLE_SIZE, len(songs)), random_state=RANDOM_SEED)
    # errors='ignore' keeps the cell re-runnable after 'link' is already gone.
    .drop(columns='link', errors='ignore')
    # Relabel rows 0..n-1: recommend() uses the row label as a position in
    # the similarity matrix.
    .reset_index(drop=True)
)

In [5]:
# Sanity check on the sampled frame; the recorded output is (10000, 3).
songs.shape

(10000, 3)

In [6]:
# Normalise the lyrics: lower-case everything, then turn each newline
# character into a single space.
songs['text'] = (
    songs['text']
    .str.lower()
    .replace(to_replace=r'\n', value=' ', regex=True)
)

In [7]:
import nltk
from nltk.stem.porter import PorterStemmer
# nltk.word_tokenize needs the Punkt tokenizer data. Older NLTK releases load
# it from the 'punkt' package, while recent releases look for 'punkt_tab'
# instead, so request both. nltk.download reports (rather than raises) when a
# package is unavailable, so the extra request is harmless on older versions.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

# A single shared Porter stemmer instance, reused by stem_lyrics for every token.
stemmer = PorterStemmer()

def stem_lyrics(text):
    """Tokenize ``text`` and return its stemmed tokens joined by single spaces.

    Tokens come from ``nltk.word_tokenize``; each one is passed through the
    module-level ``stemmer`` before being re-joined into one string.
    """
    stemmed_tokens = map(stemmer.stem, nltk.word_tokenize(text))
    return " ".join(stemmed_tokens)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [8]:
# Replace each lyric with its stemmed form, one element at a time.
# Note: this overwrites 'text', so re-running the cell stems the text again.
songs['text'] = songs['text'].map(stem_lyrics)

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Vectorise the stemmed lyrics with word-level TF-IDF, using scikit-learn's
# built-in English stop-word list.
vectorizer = TfidfVectorizer(stop_words='english', analyzer='word')
tfidf_matrix = vectorizer.fit_transform(raw_documents=songs['text'])

# With only X given, every row of tfidf_matrix is compared against every
# other row, giving one similarity score per pair of songs.
similarity_matrix = cosine_similarity(X=tfidf_matrix)

In [10]:
def recommend(song_title, top_n=20):
    """Return the titles of the songs whose lyrics are most similar to a song.

    Parameters
    ----------
    song_title : str
        Exact title to look up in ``songs['song']``. If several rows share
        the title, the first matching row is used.
    top_n : int, default 20
        Maximum number of recommendations to return.

    Returns
    -------
    list
        Up to ``top_n`` titles ordered from most to least similar according
        to ``similarity_matrix``. The queried row itself is never included.

    Raises
    ------
    IndexError
        If no row of ``songs`` has the given title.
    """
    # Look the song up by row position (not index label) so the lookup stays
    # aligned with similarity_matrix rows whatever the frame's index is.
    matches = (songs['song'] == song_title).to_numpy().nonzero()[0]
    if len(matches) == 0:
        raise IndexError(f"song {song_title!r} not found in songs['song']")
    idx = int(matches[0])

    # Exclude the query row explicitly instead of assuming it sorts first:
    # another row with identical lyrics ties at the top score and, with a
    # stable sort, an earlier duplicate would otherwise push the query song
    # itself into the results.
    candidates = [
        (pos, score)
        for pos, score in enumerate(similarity_matrix[idx])
        if pos != idx
    ]
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    return [songs['song'].iloc[pos] for pos, _ in candidates[:top_n]]

In [11]:
import pickle
# Persist the similarity matrix and the processed song table.
# The context managers guarantee each file is flushed and closed even if
# pickling raises part-way through.
with open('similarity.pkl', 'wb') as similarity_file:
    pickle.dump(similarity_matrix, similarity_file)

with open('df.pkl', 'wb') as songs_file:
    pickle.dump(songs, songs_file)