In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Read the Spotify lyrics CSV into a DataFrame and show its (rows, columns) shape.
DATA_PATH = "dataset/spotify_millsongdata.csv"
df = pd.read_csv(DATA_PATH)
df.shape

(57650, 4)

In [3]:
df.head()

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [4]:
# Tidy the raw frame without in-place mutation so this cell can be re-run
# safely: errors="ignore" keeps the drop from raising once "link" is gone,
# and renaming an already-renamed column is a no-op.
# NOTE: de-duplicating on "song" alone keeps only the first row for each
# title, so same-titled songs by different artists are discarded as well.
# The index is rebuilt afterwards so that row labels equal row positions.
df = (
    df.drop(columns=["link"], errors="ignore")
    .rename(columns={"text": "lyrics"})
    .drop_duplicates(subset="song")
    .reset_index(drop=True)
)

In [5]:
df

Unnamed: 0,artist,song,lyrics
0,ABBA,Ahe's My Kind Of Girl,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante","Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,Making somebody happy is a question of give an...
...,...,...,...
44819,Zebrahead,Mental Health,Let's go \r\nThe lights are on but there is n...
44820,Zebrahead,The Setup,Lie to me \r\nTell me that everything will be...
44821,Ziggy Marley,Freedom Road,"That's why I'm marching, yes, I'm marching, \..."
44822,Ziggy Marley,G7,Seven richest countries in the world \r\nThem...


In [6]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ahmadalaik/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
import re
from nltk.corpus import stopwords

_STOPWORDS_CACHE = None


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    global _STOPWORDS_CACHE
    if _STOPWORDS_CACHE is None:
        _STOPWORDS_CACHE = frozenset(stopwords.words("english"))
    return _STOPWORDS_CACHE


def cleaning(text, stops=None):
    """Normalise raw lyrics into a lowercase, stop-word-free string.

    Every character that is not an ASCII letter is replaced by a space, the
    text is lowercased and split on whitespace, and the tokens found in
    ``stops`` are dropped before the rest is re-joined with single spaces.

    Args:
        text: Raw lyrics. Anything that is not a ``str`` (for example the
            NaN pandas uses for a missing cell) is treated as empty.
        stops: Optional collection of lowercase tokens to remove. Defaults
            to NLTK's English stop-word list, which is loaded once and
            reused instead of being rebuilt on every call.

    Returns:
        The cleaned text; ``""`` when nothing survives the filtering.
    """
    if not isinstance(text, str):
        return ""
    if stops is None:
        stops = _english_stopwords()
    # Punctuation, digits and line breaks all become token separators.
    tokens = re.sub("[^a-zA-Z]", " ", text).lower().split()
    return " ".join(token for token in tokens if token not in stops)

In [8]:
# Replace each song's raw lyrics with the output of cleaning().
df["lyrics"] = df["lyrics"].map(cleaning)

In [9]:
df

Unnamed: 0,artist,song,lyrics
0,ABBA,Ahe's My Kind Of Girl,look face wonderful face means something speci...
1,ABBA,"Andante, Andante",take easy please touch gently like summer even...
2,ABBA,As Good As New,never know go put lousy rotten show boy tough ...
3,ABBA,Bang,making somebody happy question give take learn...
4,ABBA,Bang-A-Boomerang,making somebody happy question give take learn...
...,...,...,...
44819,Zebrahead,Mental Health,let go lights one home yeah type guy left alon...
44820,Zebrahead,The Setup,lie tell everything right lie say mean anythin...
44821,Ziggy Marley,Freedom Road,marching yes marching marching freedom road no...
44822,Ziggy Marley,G7,seven richest countries world little meeting p...


In [10]:
# TF-IDF vectoriser whose vocabulary is capped at 10,000 terms (max_features).
tfidf_vector = TfidfVectorizer(max_features=10000)

In [11]:
# Fit the vectoriser on the lyrics column and build the song-by-term TF-IDF matrix.
tfidf_matrix = tfidf_vector.fit_transform(df["lyrics"])

In [12]:
tfidf_matrix.shape

(44824, 10000)

In [13]:
# Cosine similarity of every song's TF-IDF row against every other row.
# NOTE(review): the result is square in the number of songs, so for the
# 44,824 rows reported above it holds about 2 billion scores (roughly 16 GB
# if stored as dense float64 -- confirm the dtype). Scoring only the row that
# is needed, e.g. cosine_similarity(tfidf_matrix[i], tfidf_matrix), would
# avoid materialising the full matrix.
cosine_similar = cosine_similarity(tfidf_matrix)

In [14]:
# Pick a seed song and take the index label of the first row with that title.
song_user_likes = "Story Of My Life"
title_mask = (df["song"] == song_user_likes).to_numpy()
song_index = df.index[title_mask][0]

In [15]:
song_index

13328

In [16]:
# Pair every row position with its similarity score against the chosen song.
similar_songs = list(enumerate(cosine_similar[song_index]))

In [17]:
similar_songs[:5]

[(0, 0.003943723916110239),
 (1, 0.018681173172608194),
 (2, 0.02752370697165793),
 (3, 0.012858374594299667),
 (4, 0.012175632573813206)]

In [18]:
# Order the (position, score) pairs from most to least similar.
sorted_similar_songs = sorted(similar_songs, key=lambda x:x[1], reverse=True)

In [19]:
sorted_similar_songs[:5]

[(13328, 0.9999999999999998),
 (26307, 0.6238524673961828),
 (40650, 0.5224213679176957),
 (21586, 0.4788786413298322),
 (8522, 0.4369630126361331)]

In [20]:
# Print the titles of entries 1..10 of the ranking. Entry 0 is skipped
# because it is expected to be the query song itself (similarity ~1.0).
# Distinct loop names keep the `similar_songs` list from being overwritten
# by a title string, and `.loc` fetches the title directly instead of
# scanning the whole index with a boolean mask for every row.
for row_label, _score in sorted_similar_songs[1:11]:
    print(df.loc[row_label, "song"])

Neverending Story
Half A Person
It's The Same Old Story
The Story Of Life
Parody
Story Of Your Life Is In Your Face
Scotland's Story
The Same Old Story
Same Old Song And Dance
Let's Make A Long Story Longer


In [21]:
def recommended_songs(song_user_likes, top_n=10, songs=None, similarity=None):
    """Print the titles of the songs most similar to ``song_user_likes``.

    Candidates are ranked by descending similarity score (ties keep their
    original row order) and the query song itself is removed by position, so
    it is never recommended even when another row has an identical score.

    Args:
        song_user_likes: Exact title to look up in the ``"song"`` column.
            When several rows share the title, the first one is used.
        top_n: Maximum number of titles to print. Defaults to 10; values
            below 1 print nothing.
        songs: DataFrame with a ``"song"`` column whose row order matches
            ``similarity``. Defaults to the notebook-level ``df``.
        similarity: Dense square array of pairwise similarity scores.
            Defaults to the notebook-level ``cosine_similar``.

    Raises:
        IndexError: If no row has the requested title.
    """
    songs = df if songs is None else songs
    similarity = cosine_similar if similarity is None else similarity

    # The similarity matrix is addressed by row position, so resolve the
    # title to a position rather than to an index label.
    matches = np.flatnonzero((songs["song"] == song_user_likes).to_numpy())
    if matches.size == 0:
        raise IndexError(f"song not found in dataset: {song_user_likes!r}")
    song_index = int(matches[0])

    scores = np.asarray(similarity[song_index]).ravel()
    # A stable sort on the negated scores gives descending order while
    # keeping equal scores in their original row order.
    ranked = np.argsort(-scores, kind="stable")
    # Drop the query song explicitly instead of assuming it sorts first.
    ranked = ranked[ranked != song_index][:max(top_n, 0)]

    for title in songs["song"].iloc[ranked]:
        print(title)

In [22]:
recommended_songs("Yellow")

I Shine, You Shine
Shine Your Way
Shine It On
Didn't He Shine
I Think I See The Light
Love Shine Down
This Little Light Of Mine
My Little Love Lights
Shine On
Light From Your Lighthouse
