In [1]:
import pandas as pd
import boto3
import math


In [2]:
# Fetch the song dataset CSV object from S3.
s3 = boto3.resource('s3')
bucket = s3.Bucket("spotify-song-recommender-data")
s3_obj = bucket.Object('light_spotify_dataset.csv').get()

# The 'Body' entry of the S3 response is handed straight to read_csv.
song_data = pd.read_csv(s3_obj['Body'])

# Quick sanity check: dimensions, column names, then the first three rows.
print(song_data.shape)
print(song_data.columns)
song_data.head(3)


(236988, 18)
Index(['artist', 'song', 'emotion', 'variance', 'Genre', 'Release Date', 'Key',
       'Tempo', 'Loudness', 'Explicit', 'Popularity', 'Energy', 'Danceability',
       'Positiveness', 'Speechiness', 'Liveness', 'Acousticness',
       'Instrumentalness'],
      dtype='object')


Unnamed: 0,artist,song,emotion,variance,Genre,Release Date,Key,Tempo,Loudness,Explicit,Popularity,Energy,Danceability,Positiveness,Speechiness,Liveness,Acousticness,Instrumentalness
0,ABBA,She's My Kind Of Girl,joy,0.447619,pop,2014,F Maj,128,-6.0,No,31,78,56,60,3,31,7,0
1,ABBA,"Andante, Andante",love,0.202222,pop,1980,A# Maj,102,-10.72,No,59,36,52,38,2,7,68,0
2,ABBA,As Good As New,sadness,0.300881,pop,1979,E Maj,139,-5.7,No,50,78,85,97,3,8,20,2


In [3]:
# Square root of half the row count; this appears to be the heuristic behind
# the KMeans cluster count used later in the notebook.
half_row_count = song_data.shape[0] / 2
print(math.sqrt(half_row_count))

344.2295745574456


In [4]:
# List the distinct labels in the 'Explicit' column before encoding it.
explicit_labels = song_data['Explicit'].unique()
print(explicit_labels)

['No' 'Yes']


In [5]:
# Encode the 'Explicit' flag ('No' / 'Yes') as a numeric 0/1 feature.
explicitDict = {
    'No': 0,
    'Yes': 1,
}
# Opt in to pandas' future behaviour: replace() no longer downcasts silently,
# which means the replaced column would otherwise stay object-dtype.
pd.set_option('future.no_silent_downcasting', True)
# Cast explicitly so the column holds real integers instead of Python ints in
# an object column. Re-running this cell is safe: replace() leaves values that
# are already 0/1 untouched, and the cast is then a no-op.
song_data['Explicit'] = song_data['Explicit'].replace(explicitDict).astype('int64')
print(song_data['Explicit'].unique())

[0 1]


In [6]:
# Build the feature matrix for clustering by removing the text/categorical
# columns. These must all exist, so a renamed or missing column fails loudly.
X = song_data.drop(columns=['artist', 'song', 'emotion', 'Key', 'Genre'])
# The KMeans cell further down adds a 'cluster' column to song_data. If this
# cell is re-run afterwards, strip that label so it never leaks into the
# features (and so the feature names still match what predict() is given).
X = X.drop(columns='cluster', errors='ignore')

In [7]:
from sklearn.cluster import KMeans

# Derive the number of clusters from the sqrt(n / 2) rule of thumb instead of
# hard-coding it, so it tracks the dataset size (344 for the 236,988-row file).
# max(1, ...) keeps the value valid for very small inputs.
n_clusters = max(1, int(math.sqrt(len(X) / 2)))

# random_state is fixed so the cluster assignment is reproducible across runs.
kmeans = KMeans(init="k-means++", n_clusters=n_clusters, n_init=4, random_state=1)
kmeans.fit(X)

# labels_ has one entry per row of X, in row order; store it next to the songs.
song_data['cluster'] = kmeans.labels_

In [8]:
# List the distinct values of the 'Genre' column.
genre_values = song_data['Genre'].unique()
print(genre_values)

['pop' 'comedy' 'rock' ... '90s,dance,House' 'sad,Love,slow'
 'noise rock,Unknown,Unknown']


In [9]:
# Preview a slice of ten clusters (positions 200-209 of the sorted cluster ids)
# by printing up to five song titles from each.
clusterIds = sorted(song_data['cluster'].unique())
for cluster in clusterIds[200:210]:
    print(f"\nCluster #{cluster}")
    inCluster = song_data['cluster'] == cluster
    songsInCluster = song_data.loc[inCluster].head(5)
    for songName in songsInCluster['song']:
        print(f" - {songName}")


Cluster #200
 - Fire
 - Lonely Road
 - Hold Ya Headz High
 - On Deck
 - Spin

Cluster #201
 - His Eye Is On The Sparrow
 - Islands In The Stream
 - Turn The Page
 - Main Chick
 - O

Cluster #202
 - Gimme Gimme Gimme
 - SOS
 - Janie's Got A Gun
 - Red House
 - Sweet Emotion

Cluster #203
 - Rock N Roll
 - I Got The Girl
 - The Prisoner
 - Everything You're Not
 - The Screen Behind The Mirror

Cluster #204
 - Capital Radio One
 - Every Little Bit Hurts
 - The Language
 - Disgusting
 - Get Free ColeWorld

Cluster #205
 - Can't Let Go
 - Walking In The Air
 - I Will Wait For You
 - Captive
 - A Foggy Day

Cluster #206
 - Fall To Pieces
 - You Never Satisfy Me
 - Eyes On Me
 - My Destiny
 - Paint It Black

Cluster #207
 - Seasons Of Wither
 - Hot Love
 - Love To Hate
 - If You Can See Me
 - Story To Tell

Cluster #208
 - Do I Ever Cross Your Mind?
 - All By Myself
 - I Hate You Then I Love You
 - I Love You Goodbye
 - After All

Cluster #209
 - Mr. Bake-O
 - I Will Be
 - I'll Never Stop Lo

In [12]:
# Take the song at positional row 4456 and ask the fitted model for its cluster.
# The double brackets keep the selection as a one-row DataFrame.
datapoint = song_data.iloc[[4456]]

# Remove the text/categorical columns and the 'cluster' label in one call,
# leaving only the remaining feature columns for predict().
filteredDatapoint = datapoint.drop(
    columns=['artist', 'song', 'emotion', 'Key', 'Genre', 'cluster']
)

print(filteredDatapoint)
print(datapoint)

test = kmeans.predict(filteredDatapoint)
print(test)

      variance  Release Date  Tempo  Loudness Explicit  Popularity  Energy  \
4456  0.411265          2024    162     -2.18        1          80      92   

      Danceability  Positiveness  Speechiness  Liveness  Acousticness  \
4456            40            56           38        13            19   

      Instrumentalness  
4456                 0  
     artist             song emotion  variance Genre  Release Date     Key  \
4456  Drake  Best I Ever Had     joy  0.411265   rap          2024  C# Maj   

      Tempo  Loudness Explicit  Popularity  Energy  Danceability  \
4456    162     -2.18        1          80      92            40   

      Positiveness  Speechiness  Liveness  Acousticness  Instrumentalness  \
4456            56           38        13            19                 0   

      cluster  
4456       47  
[47]


In [13]:
# Show the first five songs that share the predicted cluster.
predictedCluster = test[0]
clusterMates = song_data.loc[song_data['cluster'] == predictedCluster]
print(clusterMates.head(5))

           artist             song  emotion  variance    Genre  Release Date  \
2690  Chris Brown           Erased  sadness  0.136596  hip-hop          2007   
2731  Chris Brown        Kiss Kiss    anger  0.104077  hip-hop          2007   
4456        Drake  Best I Ever Had      joy  0.411265      rap          2024   
8036     Ice Cube   Hood Mentality    anger -0.071877      rap          2008   
8789       J Cole       Cole World      joy  0.049735      rap          2011   

         Key  Tempo  Loudness Explicit  Popularity  Energy  Danceability  \
2690  A# min    140     -3.39        0          74      66            73   
2731  A# min    140     -3.39        0          74      66            73   
4456  C# Maj    162     -2.18        1          80      92            40   
8036   A Maj    146     -5.00        1          48      84            76   
8789   F min    157     -8.82        1          56      77            70   

      Positiveness  Speechiness  Liveness  Acousticness  Instr