In [4]:
import pandas as pd

In [5]:
data = pd.read_csv("spotify_millsongdata.csv")

In [6]:
data = data.sample(5000).drop('link', axis=1).reset_index(drop=True)

Test Cleaning

In [7]:
data['text'] = data['text'].str.lower().replace(r'^\w\s','').replace(r'\n',' ', regex = True)

In [8]:
data

Unnamed: 0,artist,song,text
0,Harry Belafonte,When The Saint Go Marching In,"oh, when the saints, go marching in, \r yes w..."
1,John Denver,Ponies,somewhere out on the prairie \r is the greate...
2,Nazareth,Sweet Little Rock And Roller,by: chuck berry as done by nazareth \r ninete...
3,Wilson Phillips,Impulsive,"the secret of love, i thought i understood \r..."
4,Billy Joel,You Can Make Me Free,you can make me free \r you can make me smile...
...,...,...,...
4995,Overkill,The Years Of Decay,out on this road takes its toll \r and you pa...
4996,Hank Snow,Gentle On My Mind,it's knowing that your door is always open and...
4997,Drake,Take Care,know you've been hurt by someone else \r i ca...
4998,U2,Springhill Mining Disaster,"in the town of springhill, nova scotia \r dow..."


In [9]:
data['text']

0       oh, when the saints, go marching in,  \r yes w...
1       somewhere out on the prairie  \r is the greate...
2       by: chuck berry as done by nazareth  \r ninete...
3       the secret of love, i thought i understood  \r...
4       you can make me free  \r you can make me smile...
                              ...                        
4995    out on this road takes its toll  \r and you pa...
4996    it's knowing that your door is always open and...
4997    know you've been hurt by someone else  \r i ca...
4998    in the town of springhill, nova scotia  \r dow...
4999    we sat on the couch a playin'  \r touch the to...
Name: text, Length: 5000, dtype: object

In [10]:
import nltk
from nltk.stem.porter import PorterStemmer

In [11]:
stemmer = PorterStemmer()

In [12]:
def token(txt):
    tokens = nltk.word_tokenize(txt)
    a = [stemmer.stem(w) for w in tokens]
    return " ".join(a)

In [13]:
data['text'].apply(lambda x : token(x))

0       oh , when the saint , go march in , ye when th...
1       somewher out on the prairi is the greatest cow...
2       by : chuck berri as done by nazareth nineteeen...
3       the secret of love , i thought i understood th...
4       you can make me free you can make me smile you...
                              ...                        
4995    out on thi road take it toll and you pay for e...
4996    it 's know that your door is alway open and yo...
4997    know you 've been hurt by someon els i can tel...
4998    in the town of springhil , nova scotia down in...
4999    we sat on the couch a playin ' touch the tongu...
Name: text, Length: 5000, dtype: object

Reduced multiple similar words into one

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [15]:
tfid = TfidfVectorizer(analyzer='word', stop_words='english')

In [16]:
sparse_matrix = tfid.fit_transform(data['text'])

In [17]:
similarity = cosine_similarity(sparse_matrix)

Recommender Function

In [18]:
def recommender(song_name):
    index = data[data['song'] == song_name].index[0]
    distance = sorted(list(enumerate(similarity[index])), reverse = True, key = lambda x:x[1])
    song = []
    for s_id in distance[1:5]:
        song.append(data.iloc[s_id[0]].song)
    return song

In [19]:
import pickle

In [20]:
pickle.dump(similarity, open("similarity", "wb"))

In [21]:
pickle.dump(data, open("data", "wb"))