In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the lyrics dataset from the working directory and preview the first ten rows.
df = pd.read_csv(filepath_or_buffer='spotify_millsongdata.csv')
df.head(n=10)

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...
5,ABBA,Burning My Bridges,/a/abba/burning+my+bridges_20003011.html,"Well, you hoot and you holler and you make me ..."
6,ABBA,Cassandra,/a/abba/cassandra_20002811.html,Down in the street they're all singing and sho...
7,ABBA,Chiquitita,/a/abba/chiquitita_20002978.html,"Chiquitita, tell me what's wrong \r\nYou're e..."
8,ABBA,Crazy World,/a/abba/crazy+world_20003013.html,I was out with the morning sun \r\nCouldn't s...
9,ABBA,Crying Over You,/a/abba/crying+over+you_20177611.html,I'm waitin' for you baby \r\nI'm sitting all ...


In [3]:
# Dataset dimensions as (rows, columns).
print(df.shape)
# Count of missing values per column; the recorded output is zero for every column.
print(df.isna().sum())

(57650, 4)
artist    0
song      0
link      0
text      0
dtype: int64


In [4]:
# Work on a 10,000-song subset and drop the unused 'link' column.
# A fixed random_state makes the subset (and everything derived from it: the
# TF-IDF matrix, the similarity matrix, the pickles) reproducible across
# Restart & Run All; without it every run samples different songs.
df = df.sample(n=10000, random_state=42).drop(columns='link').reset_index(drop=True)

In [5]:
# Text cleaning and preprocessing.
# Lowercase the lyrics, turn every run of characters that is not a lowercase
# letter or digit (punctuation, \r, \n, tabs, repeated blanks) into a single
# space, then trim leading/trailing blanks.
#
# This yields the same strings as the earlier five-step chain. That chain's first
# step, .replace(r'^\w\s', ''), was called without regex=True, so pandas treated
# the pattern as a literal whole-cell value and it never matched anything; the
# separate '\n' pass was likewise already covered by the non-alphanumeric pass.
df['text'] = (
    df['text']
    .str.lower()
    .str.replace(r'[^a-z0-9]+', ' ', regex=True)
    .str.strip()
)
df.head(5)

Unnamed: 0,artist,song,text
0,Natalie Cole,I'm Catching Hell,tonight i i just want to talk to the ladies oh...
1,Dusty Springfield,Nothing,baby if a wall of stone were built around your...
2,Chaka Khan,Smokin' Room,here we are alone in this old smokin room agai...
3,Leonard Cohen,Famous Blue Raincoat,it s four in the morning the end of december i...
4,Overkill,Hellish Pride,ain t nobody listening ain t nobody there some...


In [6]:
import nltk
from nltk.stem.porter import PorterStemmer
# Tokenizer data for nltk.word_tokenize. Older NLTK releases read the 'punkt'
# package; recent releases look for 'punkt_tab' instead and raise LookupError
# when it is missing, so fetch both to keep the notebook runnable on either.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [7]:
# Single Porter stemmer instance, reused by token() for every word.
stemmer = PorterStemmer()

In [8]:
def token(txt):
    """Tokenize ``txt`` and return its Porter-stemmed words joined by single spaces.

    Args:
        txt: Text to process.

    Returns:
        A string of the stemmed tokens, in their original order, separated
        by one space each.
    """
    words = nltk.word_tokenize(txt)
    # Uses the module-level ``stemmer`` so one instance serves every call.
    return " ".join(map(stemmer.stem, words))

In [9]:
# Sanity check: 'beautiful' and 'beauty' should reduce to the same stem.
token("you are beautiful, beauty")

'you are beauti , beauti'

In [10]:
# Replace each lyric with its stemmed form; token() maps one string to one string.
df['text'] = df['text'].apply(token)

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [12]:
# Word-level TF-IDF with the built-in English stop-word list, vocabulary capped
# at 1000 terms.
# NOTE(review): the lyrics are stemmed before this step, so some stop words may
# no longer match the list - confirm whether that matters for the results.
tfid = TfidfVectorizer(
    analyzer='word',
    stop_words='english',
    max_features=1000,
)


In [13]:
# Fit the vectorizer on the stemmed lyrics and convert the result to a dense array
# (one row per song).
# NOTE(review): cosine_similarity is documented to accept sparse input as well,
# so the .toarray() conversion may be avoidable memory - confirm before changing.
matrix = tfid.fit_transform(df['text']).toarray()


In [14]:
# Pairwise cosine similarity between all rows of `matrix`:
# similer[i][j] is the score between song i and song j.
similer = cosine_similarity(matrix)
# Peek at the scores of the first song against every song.
similer[0]

array([1.        , 0.15296077, 0.14067906, ..., 0.11059501, 0.05518126,
       0.13628693])

In [15]:
def recommend(song_name, top_n=5):
    """Return the titles of the songs most similar to ``song_name``.

    The lookup is case-insensitive and ignores surrounding whitespace. An exact
    title match is preferred; if there is none, the first title that contains
    the query as a substring is used. Reads the module-level ``df`` (column
    ``'song'``) and ``similer`` (row ``i`` holds the scores of song ``i``).

    Args:
        song_name: Title, or part of a title, to look up.
        top_n: Maximum number of recommendations to return (default 5).

    Returns:
        A list of up to ``top_n`` song titles ordered from most to least
        similar, never containing the matched song itself. If the query is
        empty or no title matches, the single-element list
        ``["Song not found in the dataset"]``.
    """
    query = song_name.strip().lower()
    # An empty query is a substring of every title; treat it as "not found"
    # instead of silently recommending for the first row.
    if not query:
        return ["Song not found in the dataset"]

    # Normalise the titles once and reuse them for both match strategies.
    titles = df['song'].str.strip().str.lower()

    hits = np.flatnonzero((titles == query).to_numpy())
    if hits.size == 0:
        # No exact title: fall back to a literal (non-regex) substring search.
        hits = np.flatnonzero(
            titles.str.contains(query, regex=False, na=False).to_numpy()
        )
    if hits.size == 0:
        return ["Song not found in the dataset"]
    pos = int(hits[0])

    scores = np.asarray(similer[pos])
    # Stable descending order: ties keep their original row order.
    ranked = np.argsort(-scores, kind='stable')
    # Remove the query song by position rather than assuming it sorts first;
    # with tied scores (e.g. duplicate lyrics) another row can outrank it.
    ranked = ranked[ranked != pos][:max(int(top_n), 0)]
    return df['song'].iloc[ranked].tolist()


In [16]:
# Example query: recommendations for a song whose title matches 'In Your Eyes'.
recommend('In Your Eyes')

['A Thousand Words',
 'More Than Words',
 'Eye To Eye',
 'Cherish',
 'In A Manner Of Speaking (Trandy Mix)']

In [19]:
import pickle
# Persist the similarity matrix and the processed DataFrame to disk.
# The context managers make sure each file is flushed and closed even if
# pickling fails part-way, instead of leaving the handle to the garbage collector.
with open('similarity.pkl', 'wb') as similarity_file:
    pickle.dump(similer, similarity_file)
with open('df.pkl', 'wb') as frame_file:
    pickle.dump(df, frame_file)