# Importing Libraries and Data Collection

In [1]:
import pandas as pd
import numpy as np
import sklearn

In [2]:
df = pd.read_csv("spotify_millsongdata_Recommendation.csv")
df.head()

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [3]:
df.shape

(57650, 4)

In [4]:
# checking the null values
df.isnull().sum()

artist    0
song      0
link      0
text      0
dtype: int64

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57650 entries, 0 to 57649
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   artist  57650 non-null  object
 1   song    57650 non-null  object
 2   link    57650 non-null  object
 3   text    57650 non-null  object
dtypes: object(4)
memory usage: 1.8+ MB


In [6]:
# you try as much as possible to work with little number of columns, or shrink them together.
# when dealing with a recommender system.
# dropping the link column
df.drop('link', axis = 1, inplace = True)

In [7]:
# Taking just the sample of the dataset for simplicity purpose
df = df.sample(15000)

In [8]:
df

Unnamed: 0,artist,song,text
13122,Miley Cyrus,Lilac Wine,I lost myself on a cool damp night \r\nI gave...
35829,Hillsong,Cry Of The Broken,Lord I come \r\nLord I thank you \r\nFor you...
15562,Paul McCartney,Flaming Pie,Making love underneath the bed \r\nShooting s...
16781,Radiohead,Cuttooth,I will lead a wallpaper life \r\nOr run away ...
9329,Jimmy Buffett,High Cumberland Jubilee,Maybe leaving's better in the afternoon \r\nW...
...,...,...,...
50247,Ray Boltz,Not Long Ago,"Not Long Ago \r\nWords by Ray Boltz, Music by..."
7645,Heart,Never,Hey baby I'm talking to you \r\nStop yourself...
38753,Journey,Dixie Highway,"""Well, since we're recording tonight, \r\nAnd..."
43598,Michael Bolton,Recondita Armonia,(Tosca)(What strange and lovely harmony) \r\n...


In [9]:
# Resetting the index
df.reset_index(drop=True, inplace=True)

In [10]:
df

Unnamed: 0,artist,song,text
0,Miley Cyrus,Lilac Wine,I lost myself on a cool damp night \r\nI gave...
1,Hillsong,Cry Of The Broken,Lord I come \r\nLord I thank you \r\nFor you...
2,Paul McCartney,Flaming Pie,Making love underneath the bed \r\nShooting s...
3,Radiohead,Cuttooth,I will lead a wallpaper life \r\nOr run away ...
4,Jimmy Buffett,High Cumberland Jubilee,Maybe leaving's better in the afternoon \r\nW...
...,...,...,...
14995,Ray Boltz,Not Long Ago,"Not Long Ago \r\nWords by Ray Boltz, Music by..."
14996,Heart,Never,Hey baby I'm talking to you \r\nStop yourself...
14997,Journey,Dixie Highway,"""Well, since we're recording tonight, \r\nAnd..."
14998,Michael Bolton,Recondita Armonia,(Tosca)(What strange and lovely harmony) \r\n...


In [11]:
df.head()

Unnamed: 0,artist,song,text
0,Miley Cyrus,Lilac Wine,I lost myself on a cool damp night \r\nI gave...
1,Hillsong,Cry Of The Broken,Lord I come \r\nLord I thank you \r\nFor you...
2,Paul McCartney,Flaming Pie,Making love underneath the bed \r\nShooting s...
3,Radiohead,Cuttooth,I will lead a wallpaper life \r\nOr run away ...
4,Jimmy Buffett,High Cumberland Jubilee,Maybe leaving's better in the afternoon \r\nW...


In [12]:
df.duplicated().sum()

0

In [13]:
df['text'][0]

"I lost myself on a cool damp night  \r\nI gave myself in that misty light  \r\nWas hypnotized by a strange delight  \r\nUnder a lilac tree  \r\n  \r\nI made wine from the lilac tree  \r\nPut my heart in it's recipe  \r\nIt makes me see what I want to see  \r\nAnd be what I want to be  \r\n  \r\nWhen I think more than I want to think  \r\nDo things I never should do  \r\nI drink much more that I ought to drink  \r\nBecause it brings me back you  \r\n  \r\nLilac wine is sweet and heady  \r\nLike my love  \r\nLilac wine, I feel unsteady  \r\nLike my love  \r\n  \r\nListen to me, I cannot see clearly  \r\nIsn't that she, coming to me  \r\nNearly here  \r\n  \r\nLilac wine is sweet and heady  \r\nWhere's my love  \r\nLilac wine, I feel unsteady  \r\nWhere's my love  \r\n  \r\nListen to me, why is everything so hazy  \r\nIsn't that she, or am I just going crazy, dear  \r\nLilac wine, I feel unready for my love  \r\nFeel unready, for my love\r\n\r\n"

In [14]:
df.shape

(15000, 3)

# Text Cleaning / Text Preprocessing

In [15]:
# removing all the regular expressions
# str.lower(): converts all the text in the 'text' column to lowercase
# replace(r'^\w\s', ' '): using a regular expression to find and replace patterns in the text.  it's looking for a single word followed by a space at the start of a string.
# It replaces any newline character found with a single space.
df['text'] = df['text'].str.lower().replace(r'^\w\s', ' ').replace(r'\n', ' ', regex = True)

In [16]:
# pattern of \r: a carriage return is used to indicate the end of a line and move the cursor to the beginning of the next line
# Windows uses both a carriage return \r and a line feed \n (often written as \r\n together), while Unix-based systems like Linux and macOS typically use just a line feed \n.
df['text'][0]

"i lost myself on a cool damp night  \r i gave myself in that misty light  \r was hypnotized by a strange delight  \r under a lilac tree  \r   \r i made wine from the lilac tree  \r put my heart in it's recipe  \r it makes me see what i want to see  \r and be what i want to be  \r   \r when i think more than i want to think  \r do things i never should do  \r i drink much more that i ought to drink  \r because it brings me back you  \r   \r lilac wine is sweet and heady  \r like my love  \r lilac wine, i feel unsteady  \r like my love  \r   \r listen to me, i cannot see clearly  \r isn't that she, coming to me  \r nearly here  \r   \r lilac wine is sweet and heady  \r where's my love  \r lilac wine, i feel unsteady  \r where's my love  \r   \r listen to me, why is everything so hazy  \r isn't that she, or am i just going crazy, dear  \r lilac wine, i feel unready for my love  \r feel unready, for my love\r \r "

In [17]:
# removing all the carriage return characters and the spaces from the text column.
# strip(): removes any leading or trailing whitespace from each string element in the 'text' column 
# using the replace() method to remove carriage return characters (\r).
df['text'] = df['text'].str.strip().str.replace('\r', '')
df['text'][0]

"i lost myself on a cool damp night   i gave myself in that misty light   was hypnotized by a strange delight   under a lilac tree      i made wine from the lilac tree   put my heart in it's recipe   it makes me see what i want to see   and be what i want to be      when i think more than i want to think   do things i never should do   i drink much more that i ought to drink   because it brings me back you      lilac wine is sweet and heady   like my love   lilac wine, i feel unsteady   like my love      listen to me, i cannot see clearly   isn't that she, coming to me   nearly here      lilac wine is sweet and heady   where's my love   lilac wine, i feel unsteady   where's my love      listen to me, why is everything so hazy   isn't that she, or am i just going crazy, dear   lilac wine, i feel unready for my love   feel unready, for my love"

In [18]:
df.head()

Unnamed: 0,artist,song,text
0,Miley Cyrus,Lilac Wine,i lost myself on a cool damp night i gave my...
1,Hillsong,Cry Of The Broken,lord i come lord i thank you for your love...
2,Paul McCartney,Flaming Pie,making love underneath the bed shooting star...
3,Radiohead,Cuttooth,i will lead a wallpaper life or run away to ...
4,Jimmy Buffett,High Cumberland Jubilee,maybe leaving's better in the afternoon when...


In [19]:
# Stemming is the process of reducing words to their root or base form. 
#  we are using this algorithm to stem words in our text data.
# "running" is stemmed to its base form "run" using the Porter stemming algorithm.
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize

In [20]:
stemmer = PorterStemmer()

# Content Based Filtering
#####  Creating a function that is preparing the text data for analysis or comparison based on the content of the songs' lyrics.

In [21]:
# A function that tokenizes input text, performs stemming on each token, and then returns the resulting stemmed tokens as a single string.
# split a piece of text (a sentence or a paragraph) into individual words or tokens. Before stemming the words.
def token(txt):
    tokens = nltk.word_tokenize(txt) # Tokenize the input text
    stemmed = [stemmer.stem(w) for w in tokens]
    return " ".join(stemmed) # results in a single string

In [22]:
token("you are beautiful, beauty")

'you are beauti , beauti'

In [23]:
# applying our function
df['text'].apply(lambda x: token(x))

0        i lost myself on a cool damp night i gave myse...
1        lord i come lord i thank you for your love for...
2        make love underneath the bed shoot star from a...
3        i will lead a wallpap life or run away to the ...
4        mayb leav 's better in the afternoon when the ...
                               ...                        
14995    not long ago word by ray boltz , music by stev...
14996    hey babi i 'm talk to you stop yourself and li...
14997    `` well , sinc we 're record tonight , and we ...
14998    ( tosca ) ( what strang and love harmoni ) rec...
14999    the grass is high and the work is done send th...
Name: text, Length: 15000, dtype: object

In [24]:
# convert into vector
# converting text into numerical representations and then calculating how similar these numerical representations are using the cosine similarity metric.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [25]:
tfid = TfidfVectorizer(analyzer='word', stop_words='english')

In [26]:
matrix = tfid.fit_transform(df['text'])
matrix

<15000x41425 sparse matrix of type '<class 'numpy.float64'>'
	with 806623 stored elements in Compressed Sparse Row format>

In [27]:
# cosine values
similar = cosine_similarity(matrix)

In [28]:
similar[0]

array([1.        , 0.0151156 , 0.00395909, ..., 0.05871802, 0.00790803,
       0.02256163])

In [29]:
df[df['song'] == "Cry Of The Broken"]

Unnamed: 0,artist,song,text
1,Hillsong,Cry Of The Broken,lord i come lord i thank you for your love...
2690,Hillsong United,Cry Of The Broken,lord i come lord i thank you for your love...


In [30]:
df[df['song'] == 'Cry Of The Broken'].index[0]

1

# Recommender Function

In [31]:
# The similarity values are sorted in descending order, so the most similar songs come first.
def recommender(song_name):
    idx = df[df['song'] == song_name].index[0] # This line finds the index of the input song in the DataFrame df.
    distance = sorted(list(enumerate(similar[idx])), reverse=True, key = lambda x: x[1]) # passing the similarity array into the list
    song = [] # 
    for s in distance[1:5]:
        song.append(df.iloc[s[0]].song)
    return song

In [32]:
recommender("Cry Of The Broken")

['Cry Of The Broken', 'Broken Down', 'Of A Broken Heart', 'Planet Telex']

In [33]:
import pickle as pk
pk.dump(similar, open("music", "wb"))

In [34]:
pk.dump(df, open("data", "wb"))