In [2]:
import numpy as np
import pandas as pd

In [3]:
df=pd.read_csv('songdata.csv')

In [4]:
df.head(3)

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \nAnd..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \nTouch me gentl..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \nWhy I had t...


In [5]:
df.shape

(57650, 4)

In [6]:
df=df.sample(n=10000)
df=df.drop('link',axis=1)
df=df.reset_index(drop=True)

In [7]:
df.shape

(10000, 3)

CLEANING

In [8]:
df['song'][0]

'Hatfield'

In [9]:
df['text'][0]

'Wide awake in San Diego  \nSmallest root shrinking dry  \nThe fish are swimming closer inside Lake Morena  \nStill get no rain from the sky  \nMen were firing cannons  \nHoping smoke might tear an angel\'s eyes  \nHeard the stories of shooting arrows  \nTearin\' open the clouds  \nBut Indians shoot the best, and  \nThe Indians they don\'t like us, much  \n  \nHatfield  \nYou made rain for L.A.  \nWe\'ve got ten grand  \nFor you to go cook us some rain  \n  \nScience from the cooking pot mixing up with the air  \nFeeling thunder  \nNights since they have started  \nNow the clouds won\'t stay apart  \nA little California voodoo  \nCare of Hatfield and his brother  \nNow the horses won\'t race where the down\'s turned to mud  \nStreams and rivers are growing  \nAnd my boots are filling up  \nWater\'s from back this way  \nLook at them smiling, cooking and smiling  \n  \nHatfield  \nMade rain for L.A.  \nWell, "Hot damn",  \nPeople swear with one walk in this rain  \n  \nFamilies on porch

In [10]:
df['text']=df['text'].str.lower().replace(r'[^\w\s]','').replace(r'\n','',regex=True)          #here \w=word,\s=string,^=negation(replace everything except words and string)

In [11]:
df['text'][0]

'wide awake in san diego  smallest root shrinking dry  the fish are swimming closer inside lake morena  still get no rain from the sky  men were firing cannons  hoping smoke might tear an angel\'s eyes  heard the stories of shooting arrows  tearin\' open the clouds  but indians shoot the best, and  the indians they don\'t like us, much    hatfield  you made rain for l.a.  we\'ve got ten grand  for you to go cook us some rain    science from the cooking pot mixing up with the air  feeling thunder  nights since they have started  now the clouds won\'t stay apart  a little california voodoo  care of hatfield and his brother  now the horses won\'t race where the down\'s turned to mud  streams and rivers are growing  and my boots are filling up  water\'s from back this way  look at them smiling, cooking and smiling    hatfield  made rain for l.a.  well, "hot damn",  people swear with one walk in this rain    families on porches  the children are smiling  the owners are mad, owners are cryin

TOKENIZATION (Changes all the form of word into base word eg.: loved,loving into love)

In [23]:
import nltk
from nltk.stem.porter import PorterStemmer

ps=PorterStemmer()

def tokenization(txt):
    tokens= nltk.word_tokenize(txt)
    stemming=[ps.stem(i) for i in tokens]
    return " ".join(stemming)

In [13]:
df['text']=df['text'].apply(lambda x: tokenization(x))

In [14]:
df['text']

0       wide awak in san diego smallest root shrink dr...
1       vers 1 justic beckon for those who glori in th...
2       just one glanc and i never had a chanc there a...
3       i wa convert to the other side from the day i ...
4       stand here alon with you wonder what it is tha...
                              ...                        
9995    a sens of doom the warn thi is our christ retu...
9996    featur deep e written by alan parson and david...
9997    you thought she 'd care for you and so you act...
9998    you consid me the young apprentic caught betwe...
9999    sleigh bell ring are you listen in the lane sn...
Name: text, Length: 10000, dtype: object

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [16]:
tfid=TfidfVectorizer(stop_words='english')      #ignore common english words such as this,is,to etc
matrix=tfid.fit_transform(df['text'])           #fit the vectorizer and transform into tf-idf vectors

In [17]:
matrix.shape

(10000, 25709)

In [18]:
similarity=cosine_similarity(matrix)

In [19]:
similarity[0]

array([1.        , 0.01433101, 0.01829757, ..., 0.02050291, 0.00550511,
       0.0126527 ])

In [20]:
def recommendation(song):
    idx= df[df['song']==song].index[0]
    distance=sorted(list(enumerate(similarity[idx])),reverse=False, key=lambda x:x[1])
    
    songs=[]
    for i in distance[1:21]:
        songs.append(df.iloc[i[0]].song)
    return songs

In [71]:
import pickle
pickle.dump(df,open('df.pkl','wb'))                        #5k samples
pickle.dump(similarity,open('similarity.pkl','wb'))

In [None]:
import gzip
import pickle
with open('similarity.pkl','rb') as file:
    original_data=pickle.load(file)
with gzip.open('compressed_similarity.pickle.gz', 'wb') as file:
    pickle.dump(original_data, file)

In [21]:
df['song'].values

array(['Hatfield', 'Lest My Labor Be In Vain', 'True Romance', ...,
       'Crazy Heart', 'Wrapped Around Your Finger', 'Winter Wonderland'],
      dtype=object)