In [1]:
import pandas as pd
import boto3
import math


In [2]:
s3 = boto3.resource('s3')

s3_obj = s3.Bucket("spotify-song-recommender-data").Object('light_spotify_dataset.csv').get()

song_data = pd.read_csv(s3_obj['Body'])
print(song_data.shape)
print(song_data.columns)
song_data.head(3)


(236988, 18)
Index(['artist', 'song', 'emotion', 'variance', 'Genre', 'Release Date', 'Key',
       'Tempo', 'Loudness', 'Explicit', 'Popularity', 'Energy', 'Danceability',
       'Positiveness', 'Speechiness', 'Liveness', 'Acousticness',
       'Instrumentalness'],
      dtype='object')


Unnamed: 0,artist,song,emotion,variance,Genre,Release Date,Key,Tempo,Loudness,Explicit,Popularity,Energy,Danceability,Positiveness,Speechiness,Liveness,Acousticness,Instrumentalness
0,ABBA,She's My Kind Of Girl,joy,0.447619,pop,2014,F Maj,128,-6.0,No,31,78,56,60,3,31,7,0
1,ABBA,"Andante, Andante",love,0.202222,pop,1980,A# Maj,102,-10.72,No,59,36,52,38,2,7,68,0
2,ABBA,As Good As New,sadness,0.300881,pop,1979,E Maj,139,-5.7,No,50,78,85,97,3,8,20,2


In [3]:
# turning Explicit into a binary
explicitDict = {
    'No': 0,
    'Yes': 1,
}
pd.set_option('future.no_silent_downcasting', True)
song_data['Explicit'] = song_data['Explicit'].replace(explicitDict)
print(song_data['Explicit'].unique())

[0 1]


In [4]:
from sklearn.preprocessing import StandardScaler
X = song_data.drop('artist', axis=1).drop('song', axis=1).drop('emotion', axis=1).drop('Key', axis=1).drop('Genre',axis=1)
# scaler = StandardScaler()
# X = scaler.fit_transform(X)
X = StandardScaler().fit_transform(X)

In [5]:
from sklearn.cluster import KMeans

kmeans = KMeans(init="k-means++", n_clusters = 344, n_init=4, random_state=1)
kmeans.fit(X)
song_data['cluster'] = kmeans.labels_

In [6]:
for cluster in sorted(song_data['cluster'].unique())[200:210]:
    print(f"\nCluster #{cluster}")
    songsInCluster = song_data[song_data['cluster'] == cluster].head(5)
    for songName in songsInCluster['song']:
        print(f" - {songName}")


Cluster #200
 - I Am The City
 - Hey Darling
 - Mercy, Mercy
 - Just Another Woman
 - Clear Across America Tonight

Cluster #201
 - Devil's Food
 - The Pause Of Mr. Claus
 - Come To The Supermarket
 - I'm The Pied Piper
 - Magic Transistor Radio

Cluster #202
 - Sippin' On Sunshine
 - Make You Happy
 - Beautiful Dream
 - I Wanna Be Loved
 - I'm Alright

Cluster #203
 - Cecilia
 - I'm Ready
 - Luv Lies
 - Road Runner
 - Sedona Sunrise

Cluster #204
 - Steve Polychronopolous
 - The Adventures Of The Cow
 - Falling In Love
 - Take It
 - She Knows

Cluster #205
 - Say (All I Need)
 - Verge
 - Best I Could
 - Don't Forget To Remember
 - Never Once

Cluster #206
 - Sleigh Ride
 - Feel Good
 - Loving Kind
 - Better To Have Loved
 - Gorgeous

Cluster #207
 - Fly
 - Joy To The World
 - Come Fly With Me
 - One Good Love
 - Dreaming

Cluster #208
 - Dancing Queen
 - I Should Have Known Better
 - Smoke 2 Joints
 - The Passenger
 - It's So Easy (To Fall In Love)

Cluster #209
 - Pick Me
 - Best Fr

In [7]:
datapoint = song_data.iloc[[4456]]
filteredDatapoint = datapoint.drop('artist', axis=1).drop('song', axis=1).drop('emotion', axis=1).drop('Key', axis=1).drop('Genre',axis=1).drop('cluster', axis=1)
print(datapoint)
test = kmeans.predict(filteredDatapoint)
print(test)

     artist             song emotion  variance Genre  Release Date     Key  \
4456  Drake  Best I Ever Had     joy  0.411265   rap          2024  C# Maj   

      Tempo  Loudness Explicit  Popularity  Energy  Danceability  \
4456    162     -2.18        1          80      92            40   

      Positiveness  Speechiness  Liveness  Acousticness  Instrumentalness  \
4456            56           38        13            19                 0   

      cluster  
4456      339  
[72]




In [8]:
print(song_data[song_data['cluster'] == test[0]].head(5))

                    artist                  song   emotion  variance  \
57867              theMIND     Animated Ambition       joy  0.833514   
58357  mike. fka mike stud          Captain Hook       joy  0.833514   
58821        iLoveMakonnen               Tuesday       joy  0.833514   
58824        iLoveMakonnen               No Maam       joy  0.833514   
58827        iLoveMakonnen  Down 4 So Long Remix  surprise  0.833514   

                             Genre  Release Date     Key  Tempo  Loudness  \
57867  Ghetto Sage,Unknown,Unknown          2021   G min    149     -5.19   
58357      Unknown,Unknown,Unknown          2020  A# min    164     -5.00   
58821                         soul          2014   C Maj    140     -6.00   
58824                         soul          2014   C Maj    140     -6.00   
58827                         soul          2014   C Maj    140     -6.00   

      Explicit  Popularity  Energy  Danceability  Positiveness  Speechiness  \
57867        1          7

In [12]:
import joblib
# exporting model
joblib.dump(kmeans, './models/test_model_joblib')

['./models/test_model_joblib']

In [13]:
# importing model
model = joblib.load('./models/test_model_joblib')

In [11]:
test = model.predict(filteredDatapoint)
print(test)

[72]


