In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('songdata.csv')
df.head(3)

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \nAnd..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \nTouch me gentl..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \nWhy I had t...


In [3]:
df.shape

(57650, 4)

In [4]:
df = df.sample(n=5000).drop('link', axis=1).reset_index(drop=True)

In [5]:
df

Unnamed: 0,artist,song,text
0,Cheap Trick,I Love You Honey But I Hate Your Friends,I love you honey but i hate your friends \nI ...
1,Kirsty Maccoll,Autumngirlsoup,"I'm an autumn girl, flying over london \nWith..."
2,Whitesnake,Love To Keep You Warm,"If you want love, \nYou gotta be cool \nAnd ..."
3,Britney Spears,Am I A Sinner,Keep telling myself \nGet out of my mind \nT...
4,Ray Charles,I Chose To Sing The Blues,I could have been a gambler \n'Cause I'm good...
...,...,...,...
4995,Fifth Harmony,Them Girls Be Like,[Dinah] \nDo my... Look fat? (or naw) \nShou...
4996,Nat King Cole,Brush Those Tears From Your Eyes,Brush those tears from your eyes \nAnd try to...
4997,Cyndi Lauper,A Part Hate,Somber sister \nThis is a strange and bitter ...
4998,Xavier Rudd,Messages,So come sit down \nWill you talk with me now?...


In [6]:
df.shape

(5000, 3)

In [7]:
df['text'] = df['text'].str.lower().replace(r'[^\w\s]','').replace(r'\n',' ', regex=True)

In [8]:
df['text'][0]

"i love you honey but i hate your friends   i love you honey but they'll be the end of me   oh yeah   i love you honey but i hate those friends      that fat cat frank got a heart of gold   he's got a head of lead, he's young but he acts old   that limp wristed two-fisted diplomat   better draw a map, to see where he's at   around and round when he rambles on   'bout the latest deal we should be in on   we shouldn't give him the time of day   he doesn't give a damn if we sink or swim      i love you honey but i hate your friends   i love you honey but they'll be the end of me   oh yeah   i love you honey but i hate your friends      did some toot, yeah we had a blow   look man, no holes, real nose   when he says hi he really means the moon   he was there long before armstrong   he stays loose, he says, fill her up   for eternal youth from those swiss docs.   he's thirty but he feels like sixteen   check it out: yep, hundred-n-sixteen!      i love you honey but i hate your friends   i l

In [9]:
import nltk
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

def tokenization(txt):
    tokens = nltk.word_tokenize(txt)
    stemming = [stemmer.stem(w) for w in tokens]
    return " ".join(stemming)


In [10]:
df['text'] = df['text'].apply(lambda x: tokenization(x))

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
# In this line, you import the cosine_similarity function from scikit-learn's metrics.pairwise module.
from sklearn.metrics.pairwise import cosine_similarity

In [12]:
tfidvector = TfidfVectorizer(analyzer='word',stop_words='english')
matrix = tfidvector.fit_transform(df['text'])
similarity = cosine_similarity(matrix)

In [13]:
similarity[0]

array([1.        , 0.03442288, 0.10518085, ..., 0.286477  , 0.01147514,
       0.00803664])

In [14]:
df['song'][0]

'I Love You Honey But I Hate Your Friends'

In [15]:
df[df['song']== 'Missing My Nemesis']

Unnamed: 0,artist,song,text
3286,Phineas And Ferb,Missing My Nemesis,"without your scheme my life it seem is empti ,..."


In [16]:
filtered_df = df[df['song']=='']
if not filtered_df.empty:
    first_index = filtered_df.index[0]
    # Do something with first_index
else:
    # Handle the case when the DataFrame is empty
    print("Filtered DataFrame is empty.")



Filtered DataFrame is empty.


In [17]:
sorted(list(enumerate(similarity[0])),reverse=False,key=lambda x:x[1])

[(82, 0.0),
 (113, 0.0),
 (131, 0.0),
 (160, 0.0),
 (399, 0.0),
 (935, 0.0),
 (965, 0.0),
 (1222, 0.0),
 (1242, 0.0),
 (1828, 0.0),
 (2010, 0.0),
 (2019, 0.0),
 (2143, 0.0),
 (2160, 0.0),
 (2478, 0.0),
 (2922, 0.0),
 (2964, 0.0),
 (3408, 0.0),
 (3409, 0.0),
 (3565, 0.0),
 (3688, 0.0),
 (3775, 0.0),
 (3955, 0.0),
 (3977, 0.0),
 (4010, 0.0),
 (4019, 0.0),
 (4054, 0.0),
 (4238, 0.0),
 (4287, 0.0),
 (4289, 0.0),
 (4380, 0.0),
 (4530, 0.0),
 (4824, 0.0),
 (4933, 0.0),
 (4984, 0.0),
 (4190, 0.0001388581220021685),
 (1677, 0.00017346082612802025),
 (1281, 0.00017346398886362277),
 (2965, 0.00021898650600784617),
 (2783, 0.0002925937004541426),
 (3734, 0.00031579288067962093),
 (4142, 0.0003453095479542185),
 (1959, 0.00036031586178633874),
 (4694, 0.0003931724384609483),
 (4660, 0.0003946843335537887),
 (1453, 0.00043420904746006703),
 (1205, 0.0004640734625407226),
 (787, 0.0005122770637409909),
 (1190, 0.0005157824126009552),
 (1401, 0.0005419585359418198),
 (2747, 0.0005748793111602283),
 

# recommedation function

In [18]:
def recommendation(song_df):
    idx = df[df['song'] == song_df].index[0]
    distances = sorted(list(enumerate(similarity[idx])),reverse=True,key=lambda x:x[1])
    
    songs = []
    for m_id in distances[1:21]:
        songs.append(df.iloc[m_id[0]].song)
        
    return songs

In [19]:
def recommendation(song_df):
    idx = df[df['song'] == song_df].index[0]
    distances = sorted(list(enumerate(similarity[idx])), reverse=True, key=lambda x: x[1])
    songs = []


In [20]:
def recommendation(song_df):
    matching_songs = df[df['song'] == song_df]
    
    if matching_songs.empty:
        print(f"No matching songs found for '{song_df}'.")
        return []  # Return an empty list if no matching songs found
    
    idx = matching_songs.index[0]
    distances = sorted(list(enumerate(similarity[idx])), reverse=True, key=lambda x: x[1])
    songs = []
    # Rest of your code for populating the 'songs' list
    return songs


In [21]:
import pickle
pickle.dump(similarity,open('similarity.pkl','wb'))
pickle.dump(df,open('df.pkl','wb'))